├── .gitignore
├── LICENSE
├── README.md
├── README_zh.md
├── URLs
    ├── dispatcher.py
    ├── gunicorn_conf.py
    ├── start_gunicorn.sh
    ├── transformers_url.py
    ├── transformers_url_m.py
    ├── vllm_url.py
    └── vllm_url_m.py
├── __init__.py
├── configs
    ├── make_config.py
    └── show_datasets.py
├── data_process.py
├── datasets
    ├── __init__.py
    ├── afqmc
    │   ├── config
    │   │   ├── afqmc_gen.json
    │   │   └── afqmc_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── agieval
    │   ├── config
    │   │   ├── aqua-rat_gen.json
    │   │   ├── aqua-rat_ppl.json
    │   │   ├── gaokao-biology_gen.json
    │   │   ├── gaokao-biology_ppl.json
    │   │   ├── gaokao-chemistry_gen.json
    │   │   ├── gaokao-chemistry_ppl.json
    │   │   ├── gaokao-chinese_gen.json
    │   │   ├── gaokao-chinese_ppl.json
    │   │   ├── gaokao-english_gen.json
    │   │   ├── gaokao-english_ppl.json
    │   │   ├── gaokao-geography_gen.json
    │   │   ├── gaokao-geography_ppl.json
    │   │   ├── gaokao-history_gen.json
    │   │   ├── gaokao-history_ppl.json
    │   │   ├── gaokao-mathcloze_gen.json
    │   │   ├── gaokao-mathqa_gen.json
    │   │   ├── gaokao-mathqa_ppl.json
    │   │   ├── gaokao-physics_gen.json
    │   │   ├── jec-qa-ca_gen.json
    │   │   ├── jec-qa-kd_gen.json
    │   │   ├── logiqa-en_gen.json
    │   │   ├── logiqa-en_ppl.json
    │   │   ├── logiqa-zh_gen.json
    │   │   ├── logiqa-zh_ppl.json
    │   │   ├── lsat-ar_gen.json
    │   │   ├── lsat-ar_ppl.json
    │   │   ├── lsat-lr_gen.json
    │   │   ├── lsat-lr_ppl.json
    │   │   ├── lsat-rc_gen.json
    │   │   ├── lsat-rc_ppl.json
    │   │   ├── math_gen.json
    │   │   ├── sat-en-without-passage_gen.json
    │   │   ├── sat-en-without-passage_ppl.json
    │   │   ├── sat-en_gen.json
    │   │   ├── sat-en_ppl.json
    │   │   ├── sat-math_gen.json
    │   │   └── sat-math_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_ppl_v0.py
    ├── arc-c
    │   ├── config
    │   │   ├── arc-c_gen.json
    │   │   └── arc-c_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── arc-e
    │   ├── config
    │   │   ├── arc-e_gen.json
    │   │   └── arc-e_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── ax-b
    │   ├── config
    │   │   ├── ax-b_gen.json
    │   │   └── ax-b_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── ax-g
    │   ├── config
    │   │   ├── ax-g_gen.json
    │   │   └── ax-g_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── bbh-cot
    │   ├── config
    │   │   ├── boolean-expressions_gen.json
    │   │   ├── causal-judgement_gen.json
    │   │   ├── date-understanding_gen.json
    │   │   ├── disambiguation-qa_gen.json
    │   │   ├── dyck-languages_gen.json
    │   │   ├── formal-fallacies_gen.json
    │   │   ├── geometric-shapes_gen.json
    │   │   ├── hyperbaton_gen.json
    │   │   ├── logical-deduction-five-objects_gen.json
    │   │   ├── logical-deduction-seven-objects_gen.json
    │   │   ├── logical-deduction-three-objects_gen.json
    │   │   ├── movie-recommendation_gen.json
    │   │   ├── multistep-arithmetic-two_gen.json
    │   │   ├── navigate_gen.json
    │   │   ├── object-counting_gen.json
    │   │   ├── penguins-in-a-table_gen.json
    │   │   ├── reasoning-about-colored-objects_gen.json
    │   │   ├── ruin-names_gen.json
    │   │   ├── salient-translation-error-detection_gen.json
    │   │   ├── snarks_gen.json
    │   │   ├── sports-understanding_gen.json
    │   │   ├── temporal-sequences_gen.json
    │   │   ├── tracking-shuffled-objects-five-objects_gen.json
    │   │   ├── tracking-shuffled-objects-seven-objects_gen.json
    │   │   ├── tracking-shuffled-objects-three-objects_gen.json
    │   │   ├── web-of-lies_gen.json
    │   │   └── word-sorting_gen.json
    │   ├── cot-prompts
    │   │   ├── boolean_expressions.txt
    │   │   ├── causal_judgement.txt
    │   │   ├── date_understanding.txt
    │   │   ├── disambiguation_qa.txt
    │   │   ├── dyck_languages.txt
    │   │   ├── formal_fallacies.txt
    │   │   ├── geometric_shapes.txt
    │   │   ├── hyperbaton.txt
    │   │   ├── logical_deduction_five_objects.txt
    │   │   ├── logical_deduction_seven_objects.txt
    │   │   ├── logical_deduction_three_objects.txt
    │   │   ├── movie_recommendation.txt
    │   │   ├── multistep_arithmetic_two.txt
    │   │   ├── navigate.txt
    │   │   ├── object_counting.txt
    │   │   ├── penguins_in_a_table.txt
    │   │   ├── reasoning_about_colored_objects.txt
    │   │   ├── ruin_names.txt
    │   │   ├── salient_translation_error_detection.txt
    │   │   ├── snarks.txt
    │   │   ├── sports_understanding.txt
    │   │   ├── temporal_sequences.txt
    │   │   ├── tracking_shuffled_objects_five_objects.txt
    │   │   ├── tracking_shuffled_objects_seven_objects.txt
    │   │   ├── tracking_shuffled_objects_three_objects.txt
    │   │   ├── web_of_lies.txt
    │   │   └── word_sorting.txt
    │   ├── make_dataset.py
    │   └── transform_gen_cot.py
    ├── bbh
    │   ├── config
    │   │   ├── boolean-expressions_gen.json
    │   │   ├── causal-judgement_gen.json
    │   │   ├── date-understanding_gen.json
    │   │   ├── disambiguation-qa_gen.json
    │   │   ├── dyck-languages_gen.json
    │   │   ├── formal-fallacies_gen.json
    │   │   ├── geometric-shapes_gen.json
    │   │   ├── hyperbaton_gen.json
    │   │   ├── logical-deduction-five-objects_gen.json
    │   │   ├── logical-deduction-seven-objects_gen.json
    │   │   ├── logical-deduction-three-objects_gen.json
    │   │   ├── movie-recommendation_gen.json
    │   │   ├── multistep-arithmetic-two_gen.json
    │   │   ├── navigate_gen.json
    │   │   ├── object-counting_gen.json
    │   │   ├── penguins-in-a-table_gen.json
    │   │   ├── reasoning-about-colored-objects_gen.json
    │   │   ├── ruin-names_gen.json
    │   │   ├── salient-translation-error-detection_gen.json
    │   │   ├── snarks_gen.json
    │   │   ├── sports-understanding_gen.json
    │   │   ├── temporal-sequences_gen.json
    │   │   ├── tracking-shuffled-objects-five-objects_gen.json
    │   │   ├── tracking-shuffled-objects-seven-objects_gen.json
    │   │   ├── tracking-shuffled-objects-three-objects_gen.json
    │   │   ├── web-of-lies_gen.json
    │   │   └── word-sorting_gen.json
    │   ├── make_dataset.py
    │   └── transform_gen_v0.py
    ├── boolq
    │   ├── config
    │   │   ├── boolq_gen.json
    │   │   └── boolq_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── bustm
    │   ├── config
    │   │   ├── bustm_gen.json
    │   │   └── bustm_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── c3
    │   ├── config
    │   │   ├── dialog_gen.json
    │   │   ├── dialog_ppl.json
    │   │   ├── mixed_gen.json
    │   │   └── mixed_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── cb
    │   ├── config
    │   │   ├── cb_gen.json
    │   │   └── cb_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── ceval
    │   ├── config
    │   │   ├── accountant_gen.json
    │   │   ├── advanced-mathematics_gen.json
    │   │   ├── art-studies_gen.json
    │   │   ├── basic-medicine_gen.json
    │   │   ├── business-administration_gen.json
    │   │   ├── chinese-language-and-literature_gen.json
    │   │   ├── civil-servant_gen.json
    │   │   ├── clinical-medicine_gen.json
    │   │   ├── college-chemistry_gen.json
    │   │   ├── college-economics_gen.json
    │   │   ├── college-physics_gen.json
    │   │   ├── college-programming_gen.json
    │   │   ├── computer-architecture_gen.json
    │   │   ├── computer-network_gen.json
    │   │   ├── discrete-mathematics_gen.json
    │   │   ├── education-science_gen.json
    │   │   ├── electrical-engineer_gen.json
    │   │   ├── environmental-impact-assessment-engineer_gen.json
    │   │   ├── fire-engineer_gen.json
    │   │   ├── high-school-biology_gen.json
    │   │   ├── high-school-chemistry_gen.json
    │   │   ├── high-school-chinese_gen.json
    │   │   ├── high-school-geography_gen.json
    │   │   ├── high-school-history_gen.json
    │   │   ├── high-school-mathematics_gen.json
    │   │   ├── high-school-physics_gen.json
    │   │   ├── high-school-politics_gen.json
    │   │   ├── ideological-and-moral-cultivation_gen.json
    │   │   ├── law_gen.json
    │   │   ├── legal-professional_gen.json
    │   │   ├── logic_gen.json
    │   │   ├── mao-zedong-thought_gen.json
    │   │   ├── marxism_gen.json
    │   │   ├── metrology-engineer_gen.json
    │   │   ├── middle-school-biology_gen.json
    │   │   ├── middle-school-chemistry_gen.json
    │   │   ├── middle-school-geography_gen.json
    │   │   ├── middle-school-history_gen.json
    │   │   ├── middle-school-mathematics_gen.json
    │   │   ├── middle-school-physics_gen.json
    │   │   ├── middle-school-politics_gen.json
    │   │   ├── modern-chinese-history_gen.json
    │   │   ├── operating-system_gen.json
    │   │   ├── physician_gen.json
    │   │   ├── plant-protection_gen.json
    │   │   ├── probability-and-statistics_gen.json
    │   │   ├── professional-tour-guide_gen.json
    │   │   ├── sports-science_gen.json
    │   │   ├── tax-accountant_gen.json
    │   │   ├── teacher-qualification_gen.json
    │   │   ├── urban-and-rural-planner_gen.json
    │   │   └── veterinary-medicine_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── chid
    │   ├── config
    │   │   ├── chid_gen.json
    │   │   └── chid_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── cluewsc
    │   ├── config
    │   │   ├── cluewsc_gen.json
    │   │   └── cluewsc_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── cmmlu
    │   ├── config
    │   │   ├── agronomy_gen.json
    │   │   ├── anatomy_gen.json
    │   │   ├── ancient-chinese_gen.json
    │   │   ├── arts_gen.json
    │   │   ├── astronomy_gen.json
    │   │   ├── business-ethics_gen.json
    │   │   ├── chinese-civil-service-exam_gen.json
    │   │   ├── chinese-driving-rule_gen.json
    │   │   ├── chinese-food-culture_gen.json
    │   │   ├── chinese-foreign-policy_gen.json
    │   │   ├── chinese-history_gen.json
    │   │   ├── chinese-literature_gen.json
    │   │   ├── chinese-teacher-qualification_gen.json
    │   │   ├── clinical-knowledge_gen.json
    │   │   ├── college-actuarial-science_gen.json
    │   │   ├── college-education_gen.json
    │   │   ├── college-engineering-hydrology_gen.json
    │   │   ├── college-law_gen.json
    │   │   ├── college-mathematics_gen.json
    │   │   ├── college-medical-statistics_gen.json
    │   │   ├── college-medicine_gen.json
    │   │   ├── computer-science_gen.json
    │   │   ├── computer-security_gen.json
    │   │   ├── conceptual-physics_gen.json
    │   │   ├── construction-project-management_gen.json
    │   │   ├── economics_gen.json
    │   │   ├── education_gen.json
    │   │   ├── electrical-engineering_gen.json
    │   │   ├── elementary-chinese_gen.json
    │   │   ├── elementary-commonsense_gen.json
    │   │   ├── elementary-information-and-technology_gen.json
    │   │   ├── elementary-mathematics_gen.json
    │   │   ├── ethnology_gen.json
    │   │   ├── food-science_gen.json
    │   │   ├── genetics_gen.json
    │   │   ├── global-facts_gen.json
    │   │   ├── high-school-biology_gen.json
    │   │   ├── high-school-chemistry_gen.json
    │   │   ├── high-school-geography_gen.json
    │   │   ├── high-school-mathematics_gen.json
    │   │   ├── high-school-physics_gen.json
    │   │   ├── high-school-politics_gen.json
    │   │   ├── human-sexuality_gen.json
    │   │   ├── international-law_gen.json
    │   │   ├── journalism_gen.json
    │   │   ├── jurisprudence_gen.json
    │   │   ├── legal-and-moral-basis_gen.json
    │   │   ├── logical_gen.json
    │   │   ├── machine-learning_gen.json
    │   │   ├── management_gen.json
    │   │   ├── marketing_gen.json
    │   │   ├── marxist-theory_gen.json
    │   │   ├── modern-chinese_gen.json
    │   │   ├── nutrition_gen.json
    │   │   ├── philosophy_gen.json
    │   │   ├── professional-accounting_gen.json
    │   │   ├── professional-law_gen.json
    │   │   ├── professional-medicine_gen.json
    │   │   ├── professional-psychology_gen.json
    │   │   ├── public-relations_gen.json
    │   │   ├── security-study_gen.json
    │   │   ├── sociology_gen.json
    │   │   ├── sports-science_gen.json
    │   │   ├── traditional-chinese-medicine_gen.json
    │   │   ├── virology_gen.json
    │   │   ├── world-history_gen.json
    │   │   └── world-religions_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── cmnli
    │   ├── config
    │   │   ├── cmnli_gen.json
    │   │   └── cmnli_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── commonsenseqa
    │   ├── config
    │   │   ├── commonsenseqa_gen.json
    │   │   └── commonsenseqa_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── copa
    │   ├── config
    │   │   ├── copa_gen.json
    │   │   └── copa_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── drcd
    │   ├── config
    │   │   └── drcd_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── eprstmt
    │   ├── config
    │   │   ├── eprstmt_gen.json
    │   │   └── eprstmt_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── flores
    │   ├── config
    │   │   └── flores_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── gaokaobench
    │   ├── config
    │   │   ├── Biology-MCQs_gen.json
    │   │   ├── Chemistry-MCQs_gen.json
    │   │   ├── Chinese-Lang-and-Usage-MCQs_gen.json
    │   │   ├── Chinese-Modern-Lit_gen.json
    │   │   ├── English-Cloze-Test_gen.json
    │   │   ├── English-Fill-in-Blanks_gen.json
    │   │   ├── English-MCQs_gen.json
    │   │   ├── English-Reading-Comp_gen.json
    │   │   ├── Geography-MCQs_gen.json
    │   │   ├── History-MCQs_gen.json
    │   │   ├── Math-I-MCQs_gen.json
    │   │   ├── Math-II-MCQs_gen.json
    │   │   ├── Physics-MCQs_gen.json
    │   │   └── Political-Science-MCQs_gen.json
    │   ├── make_dataset.py
    │   └── transform_gen_v0.py
    ├── gsm8k
    │   ├── config
    │   │   └── gsm8k_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── hellaswag
    │   ├── config
    │   │   ├── hellaswag_gen.json
    │   │   └── hellaswag_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── humaneval
    │   ├── config
    │   │   └── humaneval_gen.json
    │   ├── make_dataset.py
    │   └── transform_gen_v0.py
    ├── jecqa
    │   ├── config
    │   │   ├── jecqa_gen.json
    │   │   └── jecqa_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── lambada
    │   ├── config
    │   │   └── lambada_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── math
    │   ├── config
    │   │   ├── algebra_gen.json
    │   │   ├── counting-and-probability_gen.json
    │   │   ├── geometry_gen.json
    │   │   ├── intermediate-algebra_gen.json
    │   │   ├── number-theory_gen.json
    │   │   ├── prealgebra_gen.json
    │   │   └── precalculus_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── mbpp-427
    │   ├── config
    │   │   └── mbpp_gen.json
    │   ├── make_dataset.py
    │   └── transform_gen_v0.py
    ├── mbpp
    │   ├── config
    │   │   └── mbpp_gen.json
    │   ├── make_dataset.py
    │   └── transform_gen_v0.py
    ├── medmcqa
    │   ├── config
    │   │   ├── medmcqa_gen.json
    │   │   └── medmcqa_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── medqa-mcmle
    │   ├── config
    │   │   ├── medqa-mcmle_gen.json
    │   │   └── medqa-mcmle_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── medqa-usmle
    │   ├── config
    │   │   ├── medqa-usmle_gen.json
    │   │   └── medqa-usmle_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── mmlu
    │   ├── config
    │   │   ├── abstract-algebra_gen.json
    │   │   ├── anatomy_gen.json
    │   │   ├── astronomy_gen.json
    │   │   ├── business-ethics_gen.json
    │   │   ├── clinical-knowledge_gen.json
    │   │   ├── college-biology_gen.json
    │   │   ├── college-chemistry_gen.json
    │   │   ├── college-computer-science_gen.json
    │   │   ├── college-mathematics_gen.json
    │   │   ├── college-medicine_gen.json
    │   │   ├── college-physics_gen.json
    │   │   ├── computer-security_gen.json
    │   │   ├── conceptual-physics_gen.json
    │   │   ├── econometrics_gen.json
    │   │   ├── electrical-engineering_gen.json
    │   │   ├── elementary-mathematics_gen.json
    │   │   ├── formal-logic_gen.json
    │   │   ├── global-facts_gen.json
    │   │   ├── high-school-biology_gen.json
    │   │   ├── high-school-chemistry_gen.json
    │   │   ├── high-school-computer-science_gen.json
    │   │   ├── high-school-european-history_gen.json
    │   │   ├── high-school-geography_gen.json
    │   │   ├── high-school-government-and-politics_gen.json
    │   │   ├── high-school-macroeconomics_gen.json
    │   │   ├── high-school-mathematics_gen.json
    │   │   ├── high-school-microeconomics_gen.json
    │   │   ├── high-school-physics_gen.json
    │   │   ├── high-school-psychology_gen.json
    │   │   ├── high-school-statistics_gen.json
    │   │   ├── high-school-us-history_gen.json
    │   │   ├── high-school-world-history_gen.json
    │   │   ├── human-aging_gen.json
    │   │   ├── human-sexuality_gen.json
    │   │   ├── international-law_gen.json
    │   │   ├── jurisprudence_gen.json
    │   │   ├── logical-fallacies_gen.json
    │   │   ├── machine-learning_gen.json
    │   │   ├── management_gen.json
    │   │   ├── marketing_gen.json
    │   │   ├── medical-genetics_gen.json
    │   │   ├── miscellaneous_gen.json
    │   │   ├── moral-disputes_gen.json
    │   │   ├── moral-scenarios_gen.json
    │   │   ├── nutrition_gen.json
    │   │   ├── philosophy_gen.json
    │   │   ├── prehistory_gen.json
    │   │   ├── professional-accounting_gen.json
    │   │   ├── professional-law_gen.json
    │   │   ├── professional-medicine_gen.json
    │   │   ├── professional-psychology_gen.json
    │   │   ├── public-relations_gen.json
    │   │   ├── security-studies_gen.json
    │   │   ├── sociology_gen.json
    │   │   ├── us-foreign-policy_gen.json
    │   │   ├── virology_gen.json
    │   │   └── world-religions_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── multirc
    │   ├── config
    │   │   ├── multirc_gen.json
    │   │   └── multirc_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── nq-open
    │   ├── config
    │   │   └── nq-open_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── ocnli-fc
    │   ├── config
    │   │   ├── ocnli-fc_gen.json
    │   │   └── ocnli-fc_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── ocnli
    │   ├── config
    │   │   ├── ocnli_gen.json
    │   │   └── ocnli_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── openbookqa
    │   ├── config
    │   │   ├── openbookqa_gen.json
    │   │   └── openbookqa_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── piqa
    │   ├── config
    │   │   ├── piqa_gen.json
    │   │   └── piqa_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── quac
    │   ├── config
    │   │   └── quac_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── race
    │   ├── config
    │   │   ├── high_gen.json
    │   │   ├── high_ppl.json
    │   │   ├── middle_gen.json
    │   │   └── middle_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── record
    │   ├── config
    │   │   └── record_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── rte
    │   ├── config
    │   │   ├── rte_gen.json
    │   │   └── rte_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── siqa
    │   ├── config
    │   │   ├── siqa_gen.json
    │   │   └── siqa_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── squad
    │   ├── config
    │   │   └── squad_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── storycloze
    │   ├── config
    │   │   ├── storycloze_gen.json
    │   │   └── storycloze_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── strategyqa
    │   ├── config
    │   │   ├── strategyqa_gen.json
    │   │   └── strategyqa_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── summedits
    │   ├── config
    │   │   ├── billsum_gen.json
    │   │   ├── billsum_ppl.json
    │   │   ├── ectsum_gen.json
    │   │   ├── ectsum_ppl.json
    │   │   ├── news_gen.json
    │   │   ├── news_ppl.json
    │   │   ├── podcast_gen.json
    │   │   ├── podcast_ppl.json
    │   │   ├── qmsumm_gen.json
    │   │   ├── qmsumm_ppl.json
    │   │   ├── sales-call_gen.json
    │   │   ├── sales-call_ppl.json
    │   │   ├── sales-email_gen.json
    │   │   ├── sales-email_ppl.json
    │   │   ├── samsum_gen.json
    │   │   ├── samsum_ppl.json
    │   │   ├── scitldr_gen.json
    │   │   ├── scitldr_ppl.json
    │   │   ├── shakespeare_gen.json
    │   │   └── shakespeare_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── theoremqa
    │   ├── config
    │   │   └── theoremqa_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── tnews
    │   ├── config
    │   │   ├── tnews_gen.json
    │   │   └── tnews_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── triviaqa
    │   ├── config
    │   │   ├── web_gen.json
    │   │   └── wikipedia_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── truthfulqa
    │   ├── config
    │   │   ├── mc1_ppl.json
    │   │   └── mc2_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── tydiqa
    │   ├── config
    │   │   └── tydiqa_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── wic
    │   ├── config
    │   │   ├── wic_gen.json
    │   │   └── wic_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── winogender
    │   ├── config
    │   │   ├── female_gen.json
    │   │   ├── female_ppl.json
    │   │   ├── male_gen.json
    │   │   ├── male_ppl.json
    │   │   ├── neutral_gen.json
    │   │   └── neutral_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── winogrande
    │   ├── config
    │   │   ├── winogrande_gen.json
    │   │   └── winogrande_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
    ├── wmt20-en-zh
    │   ├── config
    │   │   ├── news_gen.json
    │   │   └── suites_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    ├── wmt20-zh-en
    │   ├── config
    │   │   ├── news_gen.json
    │   │   └── suites_gen.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   └── transform_gen_v1.py
    └── wsc
    │   ├── config
    │       ├── wsc_gen.json
    │       └── wsc_ppl.json
    │   ├── make_dataset.py
    │   ├── transform_gen_v0.py
    │   ├── transform_gen_v1.py
    │   ├── transform_ppl_v0.py
    │   └── transform_ppl_v1.py
├── docs
    ├── pics
    │   ├── ultraeval_logo_white.jpg
    │   └── ultraeval_pipeline_white.png
    └── tutorials
    │   ├── en
    │       ├── configuration_file
    │       │   ├── config.md
    │       │   ├── make_dataset.md
    │       │   ├── metric.md
    │       │   ├── model_params.md
    │       │   ├── postprocess.md
    │       │   └── transform.md
    │       ├── customization
    │       │   ├── individual_models.md
    │       │   ├── new_config.md
    │       │   ├── new_dataset.md
    │       │   ├── new_metric.md
    │       │   └── new_postprocess.md
    │       ├── deployment_model
    │       │   ├── acceleration.md
    │       │   ├── deployment.md
    │       │   └── model_download.md
    │       ├── evaluation
    │       │   ├── model_instantiation.md
    │       │   └── task_instantiation.md
    │       └── ultraeval.md
    │   └── zh
    │       ├── configuration_file
    │           ├── config.md
    │           ├── make_dataset.md
    │           ├── metric.md
    │           ├── model_params.md
    │           ├── postprocess.md
    │           └── transform.md
    │       ├── customization
    │           ├── individual_models.md
    │           ├── new_config.md
    │           ├── new_dataset.md
    │           ├── new_metric.md
    │           └── new_postprocess.md
    │       ├── deployment_model
    │           ├── acceleration.md
    │           ├── deployment.md
    │           └── model_download.md
    │       ├── evaluation
    │           ├── model_instantiation.md
    │           └── task_instantiation.md
    │       └── ultraeval.md
├── leaderboard
    ├── Duxiaoman-DI
    │   └── XuanYuan-70B
    │   │   ├── afqmc
    │   │       └── afqmc.json
    │   │   ├── arc-c
    │   │       └── arc-c.json
    │   │   ├── arc-e
    │   │       └── arc-e.json
    │   │   ├── ax-b
    │   │       └── ax-b.json
    │   │   ├── ax-g
    │   │       └── ax-g.json
    │   │   ├── bbh
    │   │       ├── boolean-expressions.json
    │   │       ├── causal-judgement.json
    │   │       ├── date-understanding.json
    │   │       ├── disambiguation-qa.json
    │   │       ├── dyck-languages.json
    │   │       ├── formal-fallacies.json
    │   │       ├── geometric-shapes.json
    │   │       ├── hyperbaton.json
    │   │       ├── logical-deduction-five-objects.json
    │   │       ├── logical-deduction-seven-objects.json
    │   │       ├── logical-deduction-three-objects.json
    │   │       ├── movie-recommendation.json
    │   │       ├── multistep-arithmetic-two.json
    │   │       ├── navigate.json
    │   │       ├── object-counting.json
    │   │       ├── penguins-in-a-table.json
    │   │       ├── reasoning-about-colored-objects.json
    │   │       ├── ruin-names.json
    │   │       ├── salient-translation-error-detection.json
    │   │       ├── snarks.json
    │   │       ├── sports-understanding.json
    │   │       ├── temporal-sequences.json
    │   │       ├── tracking-shuffled-objects-five-objects.json
    │   │       ├── tracking-shuffled-objects-seven-objects.json
    │   │       ├── tracking-shuffled-objects-three-objects.json
    │   │       ├── web-of-lies.json
    │   │       └── word-sorting.json
    │   │   ├── boolq
    │   │       └── boolq.json
    │   │   ├── c3
    │   │       ├── dialog.json
    │   │       └── mixed.json
    │   │   ├── ceval
    │   │       ├── accountant.json
    │   │       ├── advanced-mathematics.json
    │   │       ├── art-studies.json
    │   │       ├── basic-medicine.json
    │   │       ├── business-administration.json
    │   │       ├── chinese-language-and-literature.json
    │   │       ├── civil-servant.json
    │   │       ├── clinical-medicine.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-economics.json
    │   │       ├── college-physics.json
    │   │       ├── college-programming.json
    │   │       ├── computer-architecture.json
    │   │       ├── computer-network.json
    │   │       ├── discrete-mathematics.json
    │   │       ├── education-science.json
    │   │       ├── electrical-engineer.json
    │   │       ├── environmental-impact-assessment-engineer.json
    │   │       ├── fire-engineer.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-chinese.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-history.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── ideological-and-moral-cultivation.json
    │   │       ├── law.json
    │   │       ├── legal-professional.json
    │   │       ├── logic.json
    │   │       ├── mao-zedong-thought.json
    │   │       ├── marxism.json
    │   │       ├── metrology-engineer.json
    │   │       ├── middle-school-biology.json
    │   │       ├── middle-school-chemistry.json
    │   │       ├── middle-school-geography.json
    │   │       ├── middle-school-history.json
    │   │       ├── middle-school-mathematics.json
    │   │       ├── middle-school-physics.json
    │   │       ├── middle-school-politics.json
    │   │       ├── modern-chinese-history.json
    │   │       ├── operating-system.json
    │   │       ├── physician.json
    │   │       ├── plant-protection.json
    │   │       ├── probability-and-statistics.json
    │   │       ├── professional-tour-guide.json
    │   │       ├── sports-science.json
    │   │       ├── tax-accountant.json
    │   │       ├── teacher-qualification.json
    │   │       ├── urban-and-rural-planner.json
    │   │       └── veterinary-medicine.json
    │   │   ├── chid
    │   │       └── chid.json
    │   │   ├── cmmlu
    │   │       ├── agronomy.json
    │   │       ├── anatomy.json
    │   │       ├── ancient-chinese.json
    │   │       ├── arts.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── chinese-civil-service-exam.json
    │   │       ├── chinese-driving-rule.json
    │   │       ├── chinese-food-culture.json
    │   │       ├── chinese-foreign-policy.json
    │   │       ├── chinese-history.json
    │   │       ├── chinese-literature.json
    │   │       ├── chinese-teacher-qualification.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-actuarial-science.json
    │   │       ├── college-education.json
    │   │       ├── college-engineering-hydrology.json
    │   │       ├── college-law.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medical-statistics.json
    │   │       ├── college-medicine.json
    │   │       ├── computer-science.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── construction-project-management.json
    │   │       ├── economics.json
    │   │       ├── education.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-chinese.json
    │   │       ├── elementary-commonsense.json
    │   │       ├── elementary-information-and-technology.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── ethnology.json
    │   │       ├── food-science.json
    │   │       ├── genetics.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── journalism.json
    │   │       ├── jurisprudence.json
    │   │       ├── legal-and-moral-basis.json
    │   │       ├── logical.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── marxist-theory.json
    │   │       ├── modern-chinese.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-study.json
    │   │       ├── sociology.json
    │   │       ├── sports-science.json
    │   │       ├── traditional-chinese-medicine.json
    │   │       ├── virology.json
    │   │       ├── world-history.json
    │   │       └── world-religions.json
    │   │   ├── copa
    │   │       └── copa.json
    │   │   ├── eprstmt
    │   │       └── eprstmt.json
    │   │   ├── gsm8k
    │   │       └── gsm8k.json
    │   │   ├── hellaswag
    │   │       └── hellaswag.json
    │   │   ├── humaneval
    │   │       └── humaneval.json
    │   │   ├── lambada
    │   │       └── lambada.json
    │   │   ├── math
    │   │       ├── algebra.json
    │   │       ├── counting-and-probability.json
    │   │       ├── geometry.json
    │   │       ├── intermediate-algebra.json
    │   │       ├── number-theory.json
    │   │       ├── prealgebra.json
    │   │       └── precalculus.json
    │   │   ├── mbpp
    │   │       └── mbpp.json
    │   │   ├── mmlu
    │   │       ├── abstract-algebra.json
    │   │       ├── anatomy.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-biology.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-computer-science.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medicine.json
    │   │       ├── college-physics.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── econometrics.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── formal-logic.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-computer-science.json
    │   │       ├── high-school-european-history.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-government-and-politics.json
    │   │       ├── high-school-macroeconomics.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-microeconomics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-psychology.json
    │   │       ├── high-school-statistics.json
    │   │       ├── high-school-us-history.json
    │   │       ├── high-school-world-history.json
    │   │       ├── human-aging.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── jurisprudence.json
    │   │       ├── logical-fallacies.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── medical-genetics.json
    │   │       ├── miscellaneous.json
    │   │       ├── moral-disputes.json
    │   │       ├── moral-scenarios.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── prehistory.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-studies.json
    │   │       ├── sociology.json
    │   │       ├── us-foreign-policy.json
    │   │       ├── virology.json
    │   │       └── world-religions.json
    │   │   ├── ocnli
    │   │       └── ocnli.json
    │   │   ├── piqa
    │   │       └── piqa.json
    │   │   ├── rte
    │   │       └── rte.json
    │   │   ├── tydiqa
    │   │       └── tydiqa.json
    │   │   └── wic
    │   │       └── wic.json
    ├── OpenAI
    │   ├── GPT3.5
    │   │   ├── afqmc
    │   │   │   └── afqmc.json
    │   │   ├── arc-c
    │   │   │   └── arc-c.json
    │   │   ├── arc-e
    │   │   │   └── arc-e.json
    │   │   ├── ax-b
    │   │   │   └── ax-b.json
    │   │   ├── ax-g
    │   │   │   └── ax-g.json
    │   │   ├── bbh
    │   │   │   ├── boolean-expressions.json
    │   │   │   ├── causal-judgement.json
    │   │   │   ├── date-understanding.json
    │   │   │   ├── disambiguation-qa.json
    │   │   │   ├── dyck-languages.json
    │   │   │   ├── formal-fallacies.json
    │   │   │   ├── geometric-shapes.json
    │   │   │   ├── hyperbaton.json
    │   │   │   ├── logical-deduction-five-objects.json
    │   │   │   ├── logical-deduction-seven-objects.json
    │   │   │   ├── logical-deduction-three-objects.json
    │   │   │   ├── movie-recommendation.json
    │   │   │   ├── multistep-arithmetic-two.json
    │   │   │   ├── navigate.json
    │   │   │   ├── object-counting.json
    │   │   │   ├── penguins-in-a-table.json
    │   │   │   ├── reasoning-about-colored-objects.json
    │   │   │   ├── ruin-names.json
    │   │   │   ├── salient-translation-error-detection.json
    │   │   │   ├── snarks.json
    │   │   │   ├── sports-understanding.json
    │   │   │   ├── temporal-sequences.json
    │   │   │   ├── tracking-shuffled-objects-five-objects.json
    │   │   │   ├── tracking-shuffled-objects-seven-objects.json
    │   │   │   ├── tracking-shuffled-objects-three-objects.json
    │   │   │   ├── web-of-lies.json
    │   │   │   └── word-sorting.json
    │   │   ├── boolq
    │   │   │   └── boolq.json
    │   │   ├── c3
    │   │   │   ├── dialog.json
    │   │   │   └── mixed.json
    │   │   ├── ceval
    │   │   │   ├── accountant.json
    │   │   │   ├── advanced-mathematics.json
    │   │   │   ├── art-studies.json
    │   │   │   ├── basic-medicine.json
    │   │   │   ├── business-administration.json
    │   │   │   ├── chinese-language-and-literature.json
    │   │   │   ├── civil-servant.json
    │   │   │   ├── clinical-medicine.json
    │   │   │   ├── college-chemistry.json
    │   │   │   ├── college-economics.json
    │   │   │   ├── college-physics.json
    │   │   │   ├── college-programming.json
    │   │   │   ├── computer-architecture.json
    │   │   │   ├── computer-network.json
    │   │   │   ├── discrete-mathematics.json
    │   │   │   ├── education-science.json
    │   │   │   ├── electrical-engineer.json
    │   │   │   ├── environmental-impact-assessment-engineer.json
    │   │   │   ├── fire-engineer.json
    │   │   │   ├── high-school-biology.json
    │   │   │   ├── high-school-chemistry.json
    │   │   │   ├── high-school-chinese.json
    │   │   │   ├── high-school-geography.json
    │   │   │   ├── high-school-history.json
    │   │   │   ├── high-school-mathematics.json
    │   │   │   ├── high-school-physics.json
    │   │   │   ├── high-school-politics.json
    │   │   │   ├── ideological-and-moral-cultivation.json
    │   │   │   ├── law.json
    │   │   │   ├── legal-professional.json
    │   │   │   ├── logic.json
    │   │   │   ├── mao-zedong-thought.json
    │   │   │   ├── marxism.json
    │   │   │   ├── metrology-engineer.json
    │   │   │   ├── middle-school-biology.json
    │   │   │   ├── middle-school-chemistry.json
    │   │   │   ├── middle-school-geography.json
    │   │   │   ├── middle-school-history.json
    │   │   │   ├── middle-school-mathematics.json
    │   │   │   ├── middle-school-physics.json
    │   │   │   ├── middle-school-politics.json
    │   │   │   ├── modern-chinese-history.json
    │   │   │   ├── operating-system.json
    │   │   │   ├── physician.json
    │   │   │   ├── plant-protection.json
    │   │   │   ├── probability-and-statistics.json
    │   │   │   ├── professional-tour-guide.json
    │   │   │   ├── sports-science.json
    │   │   │   ├── tax-accountant.json
    │   │   │   ├── teacher-qualification.json
    │   │   │   ├── urban-and-rural-planner.json
    │   │   │   └── veterinary-medicine.json
    │   │   ├── chid
    │   │   │   └── chid.json
    │   │   ├── cmmlu
    │   │   │   ├── agronomy.json
    │   │   │   ├── anatomy.json
    │   │   │   ├── ancient-chinese.json
    │   │   │   ├── arts.json
    │   │   │   ├── astronomy.json
    │   │   │   ├── business-ethics.json
    │   │   │   ├── chinese-civil-service-exam.json
    │   │   │   ├── chinese-driving-rule.json
    │   │   │   ├── chinese-food-culture.json
    │   │   │   ├── chinese-foreign-policy.json
    │   │   │   ├── chinese-history.json
    │   │   │   ├── chinese-literature.json
    │   │   │   ├── chinese-teacher-qualification.json
    │   │   │   ├── clinical-knowledge.json
    │   │   │   ├── college-actuarial-science.json
    │   │   │   ├── college-education.json
    │   │   │   ├── college-engineering-hydrology.json
    │   │   │   ├── college-law.json
    │   │   │   ├── college-mathematics.json
    │   │   │   ├── college-medical-statistics.json
    │   │   │   ├── college-medicine.json
    │   │   │   ├── computer-science.json
    │   │   │   ├── computer-security.json
    │   │   │   ├── conceptual-physics.json
    │   │   │   ├── construction-project-management.json
    │   │   │   ├── economics.json
    │   │   │   ├── education.json
    │   │   │   ├── electrical-engineering.json
    │   │   │   ├── elementary-chinese.json
    │   │   │   ├── elementary-commonsense.json
    │   │   │   ├── elementary-information-and-technology.json
    │   │   │   ├── elementary-mathematics.json
    │   │   │   ├── ethnology.json
    │   │   │   ├── food-science.json
    │   │   │   ├── genetics.json
    │   │   │   ├── global-facts.json
    │   │   │   ├── high-school-biology.json
    │   │   │   ├── high-school-chemistry.json
    │   │   │   ├── high-school-geography.json
    │   │   │   ├── high-school-mathematics.json
    │   │   │   ├── high-school-physics.json
    │   │   │   ├── high-school-politics.json
    │   │   │   ├── human-sexuality.json
    │   │   │   ├── international-law.json
    │   │   │   ├── journalism.json
    │   │   │   ├── jurisprudence.json
    │   │   │   ├── legal-and-moral-basis.json
    │   │   │   ├── logical.json
    │   │   │   ├── machine-learning.json
    │   │   │   ├── management.json
    │   │   │   ├── marketing.json
    │   │   │   ├── marxist-theory.json
    │   │   │   ├── modern-chinese.json
    │   │   │   ├── nutrition.json
    │   │   │   ├── philosophy.json
    │   │   │   ├── professional-accounting.json
    │   │   │   ├── professional-law.json
    │   │   │   ├── professional-medicine.json
    │   │   │   ├── professional-psychology.json
    │   │   │   ├── public-relations.json
    │   │   │   ├── security-study.json
    │   │   │   ├── sociology.json
    │   │   │   ├── sports-science.json
    │   │   │   ├── traditional-chinese-medicine.json
    │   │   │   ├── virology.json
    │   │   │   ├── world-history.json
    │   │   │   └── world-religions.json
    │   │   ├── copa
    │   │   │   └── copa.json
    │   │   ├── eprstmt
    │   │   │   └── eprstmt.json
    │   │   ├── gsm8k
    │   │   │   └── gsm8k.json
    │   │   ├── hellaswag
    │   │   │   └── hellaswag.json
    │   │   ├── humaneval
    │   │   │   └── humaneval.json
    │   │   ├── lambada
    │   │   │   └── lambada.json
    │   │   ├── math
    │   │   │   ├── algebra.json
    │   │   │   ├── counting-and-probability.json
    │   │   │   ├── geometry.json
    │   │   │   ├── intermediate-algebra.json
    │   │   │   ├── number-theory.json
    │   │   │   ├── prealgebra.json
    │   │   │   └── precalculus.json
    │   │   ├── mbpp
    │   │   │   └── mbpp.json
    │   │   ├── mmlu
    │   │   │   ├── abstract-algebra.json
    │   │   │   ├── anatomy.json
    │   │   │   ├── astronomy.json
    │   │   │   ├── business-ethics.json
    │   │   │   ├── clinical-knowledge.json
    │   │   │   ├── college-biology.json
    │   │   │   ├── college-chemistry.json
    │   │   │   ├── college-computer-science.json
    │   │   │   ├── college-mathematics.json
    │   │   │   ├── college-medicine.json
    │   │   │   ├── college-physics.json
    │   │   │   ├── computer-security.json
    │   │   │   ├── conceptual-physics.json
    │   │   │   ├── econometrics.json
    │   │   │   ├── electrical-engineering.json
    │   │   │   ├── elementary-mathematics.json
    │   │   │   ├── formal-logic.json
    │   │   │   ├── global-facts.json
    │   │   │   ├── high-school-biology.json
    │   │   │   ├── high-school-chemistry.json
    │   │   │   ├── high-school-computer-science.json
    │   │   │   ├── high-school-european-history.json
    │   │   │   ├── high-school-geography.json
    │   │   │   ├── high-school-government-and-politics.json
    │   │   │   ├── high-school-macroeconomics.json
    │   │   │   ├── high-school-mathematics.json
    │   │   │   ├── high-school-microeconomics.json
    │   │   │   ├── high-school-physics.json
    │   │   │   ├── high-school-psychology.json
    │   │   │   ├── high-school-statistics.json
    │   │   │   ├── high-school-us-history.json
    │   │   │   ├── high-school-world-history.json
    │   │   │   ├── human-aging.json
    │   │   │   ├── human-sexuality.json
    │   │   │   ├── international-law.json
    │   │   │   ├── jurisprudence.json
    │   │   │   ├── logical-fallacies.json
    │   │   │   ├── machine-learning.json
    │   │   │   ├── management.json
    │   │   │   ├── marketing.json
    │   │   │   ├── medical-genetics.json
    │   │   │   ├── miscellaneous.json
    │   │   │   ├── moral-disputes.json
    │   │   │   ├── moral-scenarios.json
    │   │   │   ├── nutrition.json
    │   │   │   ├── philosophy.json
    │   │   │   ├── prehistory.json
    │   │   │   ├── professional-accounting.json
    │   │   │   ├── professional-law.json
    │   │   │   ├── professional-medicine.json
    │   │   │   ├── professional-psychology.json
    │   │   │   ├── public-relations.json
    │   │   │   ├── security-studies.json
    │   │   │   ├── sociology.json
    │   │   │   ├── us-foreign-policy.json
    │   │   │   ├── virology.json
    │   │   │   └── world-religions.json
    │   │   ├── ocnli
    │   │   │   └── ocnli.json
    │   │   ├── piqa
    │   │   │   └── piqa.json
    │   │   ├── rte
    │   │   │   └── rte.json
    │   │   ├── tydiqa
    │   │   │   └── tydiqa.json
    │   │   └── wic
    │   │   │   └── wic.json
    │   └── GPT4
    │   │   ├── afqmc
    │   │       └── afqmc.json
    │   │   ├── arc-c
    │   │       └── arc-c.json
    │   │   ├── arc-e
    │   │       └── arc-e.json
    │   │   ├── ax-b
    │   │       └── ax-b.json
    │   │   ├── ax-g
    │   │       └── ax-g.json
    │   │   ├── bbh
    │   │       ├── boolean-expressions.json
    │   │       ├── causal-judgement.json
    │   │       ├── date-understanding.json
    │   │       ├── disambiguation-qa.json
    │   │       ├── dyck-languages.json
    │   │       ├── formal-fallacies.json
    │   │       ├── geometric-shapes.json
    │   │       ├── hyperbaton.json
    │   │       ├── logical-deduction-five-objects.json
    │   │       ├── logical-deduction-seven-objects.json
    │   │       ├── logical-deduction-three-objects.json
    │   │       ├── movie-recommendation.json
    │   │       ├── multistep-arithmetic-two.json
    │   │       ├── navigate.json
    │   │       ├── object-counting.json
    │   │       ├── penguins-in-a-table.json
    │   │       ├── reasoning-about-colored-objects.json
    │   │       ├── ruin-names.json
    │   │       ├── salient-translation-error-detection.json
    │   │       ├── snarks.json
    │   │       ├── sports-understanding.json
    │   │       ├── temporal-sequences.json
    │   │       ├── tracking-shuffled-objects-five-objects.json
    │   │       ├── tracking-shuffled-objects-seven-objects.json
    │   │       ├── tracking-shuffled-objects-three-objects.json
    │   │       ├── web-of-lies.json
    │   │       └── word-sorting.json
    │   │   ├── boolq
    │   │       └── boolq.json
    │   │   ├── c3
    │   │       ├── dialog.json
    │   │       └── mixed.json
    │   │   ├── ceval
    │   │       ├── accountant.json
    │   │       ├── advanced-mathematics.json
    │   │       ├── art-studies.json
    │   │       ├── basic-medicine.json
    │   │       ├── business-administration.json
    │   │       ├── chinese-language-and-literature.json
    │   │       ├── civil-servant.json
    │   │       ├── clinical-medicine.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-economics.json
    │   │       ├── college-physics.json
    │   │       ├── college-programming.json
    │   │       ├── computer-architecture.json
    │   │       ├── computer-network.json
    │   │       ├── discrete-mathematics.json
    │   │       ├── education-science.json
    │   │       ├── electrical-engineer.json
    │   │       ├── environmental-impact-assessment-engineer.json
    │   │       ├── fire-engineer.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-chinese.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-history.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── ideological-and-moral-cultivation.json
    │   │       ├── law.json
    │   │       ├── legal-professional.json
    │   │       ├── logic.json
    │   │       ├── mao-zedong-thought.json
    │   │       ├── marxism.json
    │   │       ├── metrology-engineer.json
    │   │       ├── middle-school-biology.json
    │   │       ├── middle-school-chemistry.json
    │   │       ├── middle-school-geography.json
    │   │       ├── middle-school-history.json
    │   │       ├── middle-school-mathematics.json
    │   │       ├── middle-school-physics.json
    │   │       ├── middle-school-politics.json
    │   │       ├── modern-chinese-history.json
    │   │       ├── operating-system.json
    │   │       ├── physician.json
    │   │       ├── plant-protection.json
    │   │       ├── probability-and-statistics.json
    │   │       ├── professional-tour-guide.json
    │   │       ├── sports-science.json
    │   │       ├── tax-accountant.json
    │   │       ├── teacher-qualification.json
    │   │       ├── urban-and-rural-planner.json
    │   │       └── veterinary-medicine.json
    │   │   ├── chid
    │   │       └── chid.json
    │   │   ├── cmmlu
    │   │       ├── agronomy.json
    │   │       ├── anatomy.json
    │   │       ├── ancient-chinese.json
    │   │       ├── arts.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── chinese-civil-service-exam.json
    │   │       ├── chinese-driving-rule.json
    │   │       ├── chinese-food-culture.json
    │   │       ├── chinese-foreign-policy.json
    │   │       ├── chinese-history.json
    │   │       ├── chinese-literature.json
    │   │       ├── chinese-teacher-qualification.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-actuarial-science.json
    │   │       ├── college-education.json
    │   │       ├── college-engineering-hydrology.json
    │   │       ├── college-law.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medical-statistics.json
    │   │       ├── college-medicine.json
    │   │       ├── computer-science.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── construction-project-management.json
    │   │       ├── economics.json
    │   │       ├── education.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-chinese.json
    │   │       ├── elementary-commonsense.json
    │   │       ├── elementary-information-and-technology.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── ethnology.json
    │   │       ├── food-science.json
    │   │       ├── genetics.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── journalism.json
    │   │       ├── jurisprudence.json
    │   │       ├── legal-and-moral-basis.json
    │   │       ├── logical.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── marxist-theory.json
    │   │       ├── modern-chinese.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-study.json
    │   │       ├── sociology.json
    │   │       ├── sports-science.json
    │   │       ├── traditional-chinese-medicine.json
    │   │       ├── virology.json
    │   │       ├── world-history.json
    │   │       └── world-religions.json
    │   │   ├── copa
    │   │       └── copa.json
    │   │   ├── eprstmt
    │   │       └── eprstmt.json
    │   │   ├── gsm8k
    │   │       └── gsm8k.json
    │   │   ├── hellaswag
    │   │       └── hellaswag.json
    │   │   ├── humaneval
    │   │       └── humaneval.json
    │   │   ├── lambada
    │   │       └── lambada.json
    │   │   ├── math
    │   │       ├── algebra.json
    │   │       ├── counting-and-probability.json
    │   │       ├── geometry.json
    │   │       ├── intermediate-algebra.json
    │   │       ├── number-theory.json
    │   │       ├── prealgebra.json
    │   │       └── precalculus.json
    │   │   ├── mbpp
    │   │       └── mbpp.json
    │   │   ├── mmlu
    │   │       ├── abstract-algebra.json
    │   │       ├── anatomy.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-biology.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-computer-science.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medicine.json
    │   │       ├── college-physics.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── econometrics.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── formal-logic.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-computer-science.json
    │   │       ├── high-school-european-history.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-government-and-politics.json
    │   │       ├── high-school-macroeconomics.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-microeconomics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-psychology.json
    │   │       ├── high-school-statistics.json
    │   │       ├── high-school-us-history.json
    │   │       ├── high-school-world-history.json
    │   │       ├── human-aging.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── jurisprudence.json
    │   │       ├── logical-fallacies.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── medical-genetics.json
    │   │       ├── miscellaneous.json
    │   │       ├── moral-disputes.json
    │   │       ├── moral-scenarios.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── prehistory.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-studies.json
    │   │       ├── sociology.json
    │   │       ├── us-foreign-policy.json
    │   │       ├── virology.json
    │   │       └── world-religions.json
    │   │   ├── ocnli
    │   │       └── ocnli.json
    │   │   ├── piqa
    │   │       └── piqa.json
    │   │   ├── rte
    │   │       └── rte.json
    │   │   ├── tydiqa
    │   │       └── tydiqa.json
    │   │   └── wic
    │   │       └── wic.json
    ├── THUDM
    │   └── chatglm2-6b
    │   │   ├── afqmc
    │   │       └── afqmc.json
    │   │   ├── arc-c
    │   │       └── arc-c.json
    │   │   ├── arc-e
    │   │       └── arc-e.json
    │   │   ├── ax-b
    │   │       └── ax-b.json
    │   │   ├── ax-g
    │   │       └── ax-g.json
    │   │   ├── bbh
    │   │       ├── boolean-expressions.json
    │   │       ├── causal-judgement.json
    │   │       ├── date-understanding.json
    │   │       ├── disambiguation-qa.json
    │   │       ├── dyck-languages.json
    │   │       ├── formal-fallacies.json
    │   │       ├── geometric-shapes.json
    │   │       ├── hyperbaton.json
    │   │       ├── logical-deduction-five-objects.json
    │   │       ├── logical-deduction-seven-objects.json
    │   │       ├── logical-deduction-three-objects.json
    │   │       ├── movie-recommendation.json
    │   │       ├── multistep-arithmetic-two.json
    │   │       ├── navigate.json
    │   │       ├── object-counting.json
    │   │       ├── penguins-in-a-table.json
    │   │       ├── reasoning-about-colored-objects.json
    │   │       ├── ruin-names.json
    │   │       ├── salient-translation-error-detection.json
    │   │       ├── snarks.json
    │   │       ├── sports-understanding.json
    │   │       ├── temporal-sequences.json
    │   │       ├── tracking-shuffled-objects-five-objects.json
    │   │       ├── tracking-shuffled-objects-seven-objects.json
    │   │       ├── tracking-shuffled-objects-three-objects.json
    │   │       ├── web-of-lies.json
    │   │       └── word-sorting.json
    │   │   ├── boolq
    │   │       └── boolq.json
    │   │   ├── c3
    │   │       ├── dialog.json
    │   │       └── mixed.json
    │   │   ├── ceval
    │   │       ├── accountant.json
    │   │       ├── advanced-mathematics.json
    │   │       ├── art-studies.json
    │   │       ├── basic-medicine.json
    │   │       ├── business-administration.json
    │   │       ├── chinese-language-and-literature.json
    │   │       ├── civil-servant.json
    │   │       ├── clinical-medicine.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-economics.json
    │   │       ├── college-physics.json
    │   │       ├── college-programming.json
    │   │       ├── computer-architecture.json
    │   │       ├── computer-network.json
    │   │       ├── discrete-mathematics.json
    │   │       ├── education-science.json
    │   │       ├── electrical-engineer.json
    │   │       ├── environmental-impact-assessment-engineer.json
    │   │       ├── fire-engineer.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-chinese.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-history.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── ideological-and-moral-cultivation.json
    │   │       ├── law.json
    │   │       ├── legal-professional.json
    │   │       ├── logic.json
    │   │       ├── mao-zedong-thought.json
    │   │       ├── marxism.json
    │   │       ├── metrology-engineer.json
    │   │       ├── middle-school-biology.json
    │   │       ├── middle-school-chemistry.json
    │   │       ├── middle-school-geography.json
    │   │       ├── middle-school-history.json
    │   │       ├── middle-school-mathematics.json
    │   │       ├── middle-school-physics.json
    │   │       ├── middle-school-politics.json
    │   │       ├── modern-chinese-history.json
    │   │       ├── operating-system.json
    │   │       ├── physician.json
    │   │       ├── plant-protection.json
    │   │       ├── probability-and-statistics.json
    │   │       ├── professional-tour-guide.json
    │   │       ├── sports-science.json
    │   │       ├── tax-accountant.json
    │   │       ├── teacher-qualification.json
    │   │       ├── urban-and-rural-planner.json
    │   │       └── veterinary-medicine.json
    │   │   ├── chid
    │   │       └── chid.json
    │   │   ├── cmmlu
    │   │       ├── agronomy.json
    │   │       ├── anatomy.json
    │   │       ├── ancient-chinese.json
    │   │       ├── arts.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── chinese-civil-service-exam.json
    │   │       ├── chinese-driving-rule.json
    │   │       ├── chinese-food-culture.json
    │   │       ├── chinese-foreign-policy.json
    │   │       ├── chinese-history.json
    │   │       ├── chinese-literature.json
    │   │       ├── chinese-teacher-qualification.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-actuarial-science.json
    │   │       ├── college-education.json
    │   │       ├── college-engineering-hydrology.json
    │   │       ├── college-law.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medical-statistics.json
    │   │       ├── college-medicine.json
    │   │       ├── computer-science.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── construction-project-management.json
    │   │       ├── economics.json
    │   │       ├── education.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-chinese.json
    │   │       ├── elementary-commonsense.json
    │   │       ├── elementary-information-and-technology.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── ethnology.json
    │   │       ├── food-science.json
    │   │       ├── genetics.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── journalism.json
    │   │       ├── jurisprudence.json
    │   │       ├── legal-and-moral-basis.json
    │   │       ├── logical.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── marxist-theory.json
    │   │       ├── modern-chinese.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-study.json
    │   │       ├── sociology.json
    │   │       ├── sports-science.json
    │   │       ├── traditional-chinese-medicine.json
    │   │       ├── virology.json
    │   │       ├── world-history.json
    │   │       └── world-religions.json
    │   │   ├── copa
    │   │       └── copa.json
    │   │   ├── eprstmt
    │   │       └── eprstmt.json
    │   │   ├── gsm8k
    │   │       └── gsm8k.json
    │   │   ├── hellaswag
    │   │       └── hellaswag.json
    │   │   ├── humaneval
    │   │       └── humaneval.json
    │   │   ├── lambada
    │   │       └── lambada.json
    │   │   ├── math
    │   │       ├── algebra.json
    │   │       ├── counting-and-probability.json
    │   │       ├── geometry.json
    │   │       ├── intermediate-algebra.json
    │   │       ├── number-theory.json
    │   │       ├── prealgebra.json
    │   │       └── precalculus.json
    │   │   ├── mbpp
    │   │       └── mbpp.json
    │   │   ├── mmlu
    │   │       ├── abstract-algebra.json
    │   │       ├── anatomy.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-biology.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-computer-science.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medicine.json
    │   │       ├── college-physics.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── econometrics.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── formal-logic.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-computer-science.json
    │   │       ├── high-school-european-history.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-government-and-politics.json
    │   │       ├── high-school-macroeconomics.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-microeconomics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-psychology.json
    │   │       ├── high-school-statistics.json
    │   │       ├── high-school-us-history.json
    │   │       ├── high-school-world-history.json
    │   │       ├── human-aging.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── jurisprudence.json
    │   │       ├── logical-fallacies.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── medical-genetics.json
    │   │       ├── miscellaneous.json
    │   │       ├── moral-disputes.json
    │   │       ├── moral-scenarios.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── prehistory.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-studies.json
    │   │       ├── sociology.json
    │   │       ├── us-foreign-policy.json
    │   │       ├── virology.json
    │   │       └── world-religions.json
    │   │   ├── ocnli
    │   │       └── ocnli.json
    │   │   ├── piqa
    │   │       └── piqa.json
    │   │   ├── rte
    │   │       └── rte.json
    │   │   ├── tydiqa
    │   │       └── tydiqa.json
    │   │   └── wic
    │   │       └── wic.json
    ├── TigerResearch
    │   └── tigerbot-13b-base-v1
    │   │   ├── afqmc
    │   │       └── afqmc.json
    │   │   ├── arc-c
    │   │       └── arc-c.json
    │   │   ├── arc-e
    │   │       └── arc-e.json
    │   │   ├── ax-b
    │   │       └── ax-b.json
    │   │   ├── ax-g
    │   │       └── ax-g.json
    │   │   ├── bbh
    │   │       ├── boolean-expressions.json
    │   │       ├── causal-judgement.json
    │   │       ├── date-understanding.json
    │   │       ├── disambiguation-qa.json
    │   │       ├── dyck-languages.json
    │   │       ├── formal-fallacies.json
    │   │       ├── geometric-shapes.json
    │   │       ├── hyperbaton.json
    │   │       ├── logical-deduction-five-objects.json
    │   │       ├── logical-deduction-seven-objects.json
    │   │       ├── logical-deduction-three-objects.json
    │   │       ├── movie-recommendation.json
    │   │       ├── multistep-arithmetic-two.json
    │   │       ├── navigate.json
    │   │       ├── object-counting.json
    │   │       ├── penguins-in-a-table.json
    │   │       ├── reasoning-about-colored-objects.json
    │   │       ├── ruin-names.json
    │   │       ├── salient-translation-error-detection.json
    │   │       ├── snarks.json
    │   │       ├── sports-understanding.json
    │   │       ├── temporal-sequences.json
    │   │       ├── tracking-shuffled-objects-five-objects.json
    │   │       ├── tracking-shuffled-objects-seven-objects.json
    │   │       ├── tracking-shuffled-objects-three-objects.json
    │   │       ├── web-of-lies.json
    │   │       └── word-sorting.json
    │   │   ├── boolq
    │   │       └── boolq.json
    │   │   ├── c3
    │   │       ├── dialog.json
    │   │       └── mixed.json
    │   │   ├── ceval
    │   │       ├── accountant.json
    │   │       ├── advanced-mathematics.json
    │   │       ├── art-studies.json
    │   │       ├── basic-medicine.json
    │   │       ├── business-administration.json
    │   │       ├── chinese-language-and-literature.json
    │   │       ├── civil-servant.json
    │   │       ├── clinical-medicine.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-economics.json
    │   │       ├── college-physics.json
    │   │       ├── college-programming.json
    │   │       ├── computer-architecture.json
    │   │       ├── computer-network.json
    │   │       ├── discrete-mathematics.json
    │   │       ├── education-science.json
    │   │       ├── electrical-engineer.json
    │   │       ├── environmental-impact-assessment-engineer.json
    │   │       ├── fire-engineer.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-chinese.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-history.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── ideological-and-moral-cultivation.json
    │   │       ├── law.json
    │   │       ├── legal-professional.json
    │   │       ├── logic.json
    │   │       ├── mao-zedong-thought.json
    │   │       ├── marxism.json
    │   │       ├── metrology-engineer.json
    │   │       ├── middle-school-biology.json
    │   │       ├── middle-school-chemistry.json
    │   │       ├── middle-school-geography.json
    │   │       ├── middle-school-history.json
    │   │       ├── middle-school-mathematics.json
    │   │       ├── middle-school-physics.json
    │   │       ├── middle-school-politics.json
    │   │       ├── modern-chinese-history.json
    │   │       ├── operating-system.json
    │   │       ├── physician.json
    │   │       ├── plant-protection.json
    │   │       ├── probability-and-statistics.json
    │   │       ├── professional-tour-guide.json
    │   │       ├── sports-science.json
    │   │       ├── tax-accountant.json
    │   │       ├── teacher-qualification.json
    │   │       ├── urban-and-rural-planner.json
    │   │       └── veterinary-medicine.json
    │   │   ├── chid
    │   │       └── chid.json
    │   │   ├── cmmlu
    │   │       ├── agronomy.json
    │   │       ├── anatomy.json
    │   │       ├── ancient-chinese.json
    │   │       ├── arts.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── chinese-civil-service-exam.json
    │   │       ├── chinese-driving-rule.json
    │   │       ├── chinese-food-culture.json
    │   │       ├── chinese-foreign-policy.json
    │   │       ├── chinese-history.json
    │   │       ├── chinese-literature.json
    │   │       ├── chinese-teacher-qualification.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-actuarial-science.json
    │   │       ├── college-education.json
    │   │       ├── college-engineering-hydrology.json
    │   │       ├── college-law.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medical-statistics.json
    │   │       ├── college-medicine.json
    │   │       ├── computer-science.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── construction-project-management.json
    │   │       ├── economics.json
    │   │       ├── education.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-chinese.json
    │   │       ├── elementary-commonsense.json
    │   │       ├── elementary-information-and-technology.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── ethnology.json
    │   │       ├── food-science.json
    │   │       ├── genetics.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── journalism.json
    │   │       ├── jurisprudence.json
    │   │       ├── legal-and-moral-basis.json
    │   │       ├── logical.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── marxist-theory.json
    │   │       ├── modern-chinese.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-study.json
    │   │       ├── sociology.json
    │   │       ├── sports-science.json
    │   │       ├── traditional-chinese-medicine.json
    │   │       ├── virology.json
    │   │       ├── world-history.json
    │   │       └── world-religions.json
    │   │   ├── copa
    │   │       └── copa.json
    │   │   ├── eprstmt
    │   │       └── eprstmt.json
    │   │   ├── gsm8k
    │   │       └── gsm8k.json
    │   │   ├── hellaswag
    │   │       └── hellaswag.json
    │   │   ├── humaneval
    │   │       └── humaneval.json
    │   │   ├── lambada
    │   │       └── lambada.json
    │   │   ├── math
    │   │       ├── algebra.json
    │   │       ├── counting-and-probability.json
    │   │       ├── geometry.json
    │   │       ├── intermediate-algebra.json
    │   │       ├── number-theory.json
    │   │       ├── prealgebra.json
    │   │       └── precalculus.json
    │   │   ├── mbpp
    │   │       └── mbpp.json
    │   │   ├── mmlu
    │   │       ├── abstract-algebra.json
    │   │       ├── anatomy.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-biology.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-computer-science.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medicine.json
    │   │       ├── college-physics.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── econometrics.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── formal-logic.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-computer-science.json
    │   │       ├── high-school-european-history.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-government-and-politics.json
    │   │       ├── high-school-macroeconomics.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-microeconomics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-psychology.json
    │   │       ├── high-school-statistics.json
    │   │       ├── high-school-us-history.json
    │   │       ├── high-school-world-history.json
    │   │       ├── human-aging.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── jurisprudence.json
    │   │       ├── logical-fallacies.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── medical-genetics.json
    │   │       ├── miscellaneous.json
    │   │       ├── moral-disputes.json
    │   │       ├── moral-scenarios.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── prehistory.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-studies.json
    │   │       ├── sociology.json
    │   │       ├── us-foreign-policy.json
    │   │       ├── virology.json
    │   │       └── world-religions.json
    │   │   ├── ocnli
    │   │       └── ocnli.json
    │   │   ├── piqa
    │   │       └── piqa.json
    │   │   ├── rte
    │   │       └── rte.json
    │   │   ├── tydiqa
    │   │       └── tydiqa.json
    │   │   └── wic
    │   │       └── wic.json
    ├── baichuan-inc
    │   └── Baichuan2-13B-Chat
    │   │   ├── afqmc
    │   │       └── afqmc.json
    │   │   ├── arc-c
    │   │       └── arc-c.json
    │   │   ├── arc-e
    │   │       └── arc-e.json
    │   │   ├── ax-b
    │   │       └── ax-b.json
    │   │   ├── ax-g
    │   │       └── ax-g.json
    │   │   ├── bbh
    │   │       ├── boolean-expressions.json
    │   │       ├── causal-judgement.json
    │   │       ├── date-understanding.json
    │   │       ├── disambiguation-qa.json
    │   │       ├── dyck-languages.json
    │   │       ├── formal-fallacies.json
    │   │       ├── geometric-shapes.json
    │   │       ├── hyperbaton.json
    │   │       ├── logical-deduction-five-objects.json
    │   │       ├── logical-deduction-seven-objects.json
    │   │       ├── logical-deduction-three-objects.json
    │   │       ├── movie-recommendation.json
    │   │       ├── multistep-arithmetic-two.json
    │   │       ├── navigate.json
    │   │       ├── object-counting.json
    │   │       ├── penguins-in-a-table.json
    │   │       ├── reasoning-about-colored-objects.json
    │   │       ├── ruin-names.json
    │   │       ├── salient-translation-error-detection.json
    │   │       ├── snarks.json
    │   │       ├── sports-understanding.json
    │   │       ├── temporal-sequences.json
    │   │       ├── tracking-shuffled-objects-five-objects.json
    │   │       ├── tracking-shuffled-objects-seven-objects.json
    │   │       ├── tracking-shuffled-objects-three-objects.json
    │   │       ├── web-of-lies.json
    │   │       └── word-sorting.json
    │   │   ├── boolq
    │   │       └── boolq.json
    │   │   ├── c3
    │   │       ├── dialog.json
    │   │       └── mixed.json
    │   │   ├── ceval
    │   │       ├── accountant.json
    │   │       ├── advanced-mathematics.json
    │   │       ├── art-studies.json
    │   │       ├── basic-medicine.json
    │   │       ├── business-administration.json
    │   │       ├── chinese-language-and-literature.json
    │   │       ├── civil-servant.json
    │   │       ├── clinical-medicine.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-economics.json
    │   │       ├── college-physics.json
    │   │       ├── college-programming.json
    │   │       ├── computer-architecture.json
    │   │       ├── computer-network.json
    │   │       ├── discrete-mathematics.json
    │   │       ├── education-science.json
    │   │       ├── electrical-engineer.json
    │   │       ├── environmental-impact-assessment-engineer.json
    │   │       ├── fire-engineer.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-chinese.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-history.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── ideological-and-moral-cultivation.json
    │   │       ├── law.json
    │   │       ├── legal-professional.json
    │   │       ├── logic.json
    │   │       ├── mao-zedong-thought.json
    │   │       ├── marxism.json
    │   │       ├── metrology-engineer.json
    │   │       ├── middle-school-biology.json
    │   │       ├── middle-school-chemistry.json
    │   │       ├── middle-school-geography.json
    │   │       ├── middle-school-history.json
    │   │       ├── middle-school-mathematics.json
    │   │       ├── middle-school-physics.json
    │   │       ├── middle-school-politics.json
    │   │       ├── modern-chinese-history.json
    │   │       ├── operating-system.json
    │   │       ├── physician.json
    │   │       ├── plant-protection.json
    │   │       ├── probability-and-statistics.json
    │   │       ├── professional-tour-guide.json
    │   │       ├── sports-science.json
    │   │       ├── tax-accountant.json
    │   │       ├── teacher-qualification.json
    │   │       ├── urban-and-rural-planner.json
    │   │       └── veterinary-medicine.json
    │   │   ├── chid
    │   │       └── chid.json
    │   │   ├── cmmlu
    │   │       ├── agronomy.json
    │   │       ├── anatomy.json
    │   │       ├── ancient-chinese.json
    │   │       ├── arts.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── chinese-civil-service-exam.json
    │   │       ├── chinese-driving-rule.json
    │   │       ├── chinese-food-culture.json
    │   │       ├── chinese-foreign-policy.json
    │   │       ├── chinese-history.json
    │   │       ├── chinese-literature.json
    │   │       ├── chinese-teacher-qualification.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-actuarial-science.json
    │   │       ├── college-education.json
    │   │       ├── college-engineering-hydrology.json
    │   │       ├── college-law.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medical-statistics.json
    │   │       ├── college-medicine.json
    │   │       ├── computer-science.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── construction-project-management.json
    │   │       ├── economics.json
    │   │       ├── education.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-chinese.json
    │   │       ├── elementary-commonsense.json
    │   │       ├── elementary-information-and-technology.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── ethnology.json
    │   │       ├── food-science.json
    │   │       ├── genetics.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── journalism.json
    │   │       ├── jurisprudence.json
    │   │       ├── legal-and-moral-basis.json
    │   │       ├── logical.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── marxist-theory.json
    │   │       ├── modern-chinese.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-study.json
    │   │       ├── sociology.json
    │   │       ├── sports-science.json
    │   │       ├── traditional-chinese-medicine.json
    │   │       ├── virology.json
    │   │       ├── world-history.json
    │   │       └── world-religions.json
    │   │   ├── copa
    │   │       └── copa.json
    │   │   ├── eprstmt
    │   │       └── eprstmt.json
    │   │   ├── gsm8k
    │   │       └── gsm8k.json
    │   │   ├── hellaswag
    │   │       └── hellaswag.json
    │   │   ├── humaneval
    │   │       └── humaneval.json
    │   │   ├── lambada
    │   │       └── lambada.json
    │   │   ├── math
    │   │       ├── algebra.json
    │   │       ├── counting-and-probability.json
    │   │       ├── geometry.json
    │   │       ├── intermediate-algebra.json
    │   │       ├── number-theory.json
    │   │       ├── prealgebra.json
    │   │       └── precalculus.json
    │   │   ├── mbpp
    │   │       └── mbpp.json
    │   │   ├── mmlu
    │   │       ├── abstract-algebra.json
    │   │       ├── anatomy.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-biology.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-computer-science.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medicine.json
    │   │       ├── college-physics.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── econometrics.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── formal-logic.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-computer-science.json
    │   │       ├── high-school-european-history.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-government-and-politics.json
    │   │       ├── high-school-macroeconomics.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-microeconomics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-psychology.json
    │   │       ├── high-school-statistics.json
    │   │       ├── high-school-us-history.json
    │   │       ├── high-school-world-history.json
    │   │       ├── human-aging.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── jurisprudence.json
    │   │       ├── logical-fallacies.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── medical-genetics.json
    │   │       ├── miscellaneous.json
    │   │       ├── moral-disputes.json
    │   │       ├── moral-scenarios.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── prehistory.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-studies.json
    │   │       ├── sociology.json
    │   │       ├── us-foreign-policy.json
    │   │       ├── virology.json
    │   │       └── world-religions.json
    │   │   ├── ocnli
    │   │       └── ocnli.json
    │   │   ├── piqa
    │   │       └── piqa.json
    │   │   ├── rte
    │   │       └── rte.json
    │   │   ├── tydiqa
    │   │       └── tydiqa.json
    │   │   └── wic
    │   │       └── wic.json
    ├── internlm
    │   └── internlm-7b
    │   │   ├── afqmc
    │   │       └── afqmc.json
    │   │   ├── arc-c
    │   │       └── arc-c.json
    │   │   ├── arc-e
    │   │       └── arc-e.json
    │   │   ├── ax-b
    │   │       └── ax-b.json
    │   │   ├── ax-g
    │   │       └── ax-g.json
    │   │   ├── bbh
    │   │       ├── boolean-expressions.json
    │   │       ├── causal-judgement.json
    │   │       ├── date-understanding.json
    │   │       ├── disambiguation-qa.json
    │   │       ├── dyck-languages.json
    │   │       ├── formal-fallacies.json
    │   │       ├── geometric-shapes.json
    │   │       ├── hyperbaton.json
    │   │       ├── logical-deduction-five-objects.json
    │   │       ├── logical-deduction-seven-objects.json
    │   │       ├── logical-deduction-three-objects.json
    │   │       ├── movie-recommendation.json
    │   │       ├── multistep-arithmetic-two.json
    │   │       ├── navigate.json
    │   │       ├── object-counting.json
    │   │       ├── penguins-in-a-table.json
    │   │       ├── reasoning-about-colored-objects.json
    │   │       ├── ruin-names.json
    │   │       ├── salient-translation-error-detection.json
    │   │       ├── snarks.json
    │   │       ├── sports-understanding.json
    │   │       ├── temporal-sequences.json
    │   │       ├── tracking-shuffled-objects-five-objects.json
    │   │       ├── tracking-shuffled-objects-seven-objects.json
    │   │       ├── tracking-shuffled-objects-three-objects.json
    │   │       ├── web-of-lies.json
    │   │       └── word-sorting.json
    │   │   ├── boolq
    │   │       └── boolq.json
    │   │   ├── c3
    │   │       ├── dialog.json
    │   │       └── mixed.json
    │   │   ├── ceval
    │   │       ├── accountant.json
    │   │       ├── advanced-mathematics.json
    │   │       ├── art-studies.json
    │   │       ├── basic-medicine.json
    │   │       ├── business-administration.json
    │   │       ├── chinese-language-and-literature.json
    │   │       ├── civil-servant.json
    │   │       ├── clinical-medicine.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-economics.json
    │   │       ├── college-physics.json
    │   │       ├── college-programming.json
    │   │       ├── computer-architecture.json
    │   │       ├── computer-network.json
    │   │       ├── discrete-mathematics.json
    │   │       ├── education-science.json
    │   │       ├── electrical-engineer.json
    │   │       ├── environmental-impact-assessment-engineer.json
    │   │       ├── fire-engineer.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-chinese.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-history.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── ideological-and-moral-cultivation.json
    │   │       ├── law.json
    │   │       ├── legal-professional.json
    │   │       ├── logic.json
    │   │       ├── mao-zedong-thought.json
    │   │       ├── marxism.json
    │   │       ├── metrology-engineer.json
    │   │       ├── middle-school-biology.json
    │   │       ├── middle-school-chemistry.json
    │   │       ├── middle-school-geography.json
    │   │       ├── middle-school-history.json
    │   │       ├── middle-school-mathematics.json
    │   │       ├── middle-school-physics.json
    │   │       ├── middle-school-politics.json
    │   │       ├── modern-chinese-history.json
    │   │       ├── operating-system.json
    │   │       ├── physician.json
    │   │       ├── plant-protection.json
    │   │       ├── probability-and-statistics.json
    │   │       ├── professional-tour-guide.json
    │   │       ├── sports-science.json
    │   │       ├── tax-accountant.json
    │   │       ├── teacher-qualification.json
    │   │       ├── urban-and-rural-planner.json
    │   │       └── veterinary-medicine.json
    │   │   ├── chid
    │   │       └── chid.json
    │   │   ├── cmmlu
    │   │       ├── agronomy.json
    │   │       ├── anatomy.json
    │   │       ├── ancient-chinese.json
    │   │       ├── arts.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── chinese-civil-service-exam.json
    │   │       ├── chinese-driving-rule.json
    │   │       ├── chinese-food-culture.json
    │   │       ├── chinese-foreign-policy.json
    │   │       ├── chinese-history.json
    │   │       ├── chinese-literature.json
    │   │       ├── chinese-teacher-qualification.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-actuarial-science.json
    │   │       ├── college-education.json
    │   │       ├── college-engineering-hydrology.json
    │   │       ├── college-law.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medical-statistics.json
    │   │       ├── college-medicine.json
    │   │       ├── computer-science.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── construction-project-management.json
    │   │       ├── economics.json
    │   │       ├── education.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-chinese.json
    │   │       ├── elementary-commonsense.json
    │   │       ├── elementary-information-and-technology.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── ethnology.json
    │   │       ├── food-science.json
    │   │       ├── genetics.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── journalism.json
    │   │       ├── jurisprudence.json
    │   │       ├── legal-and-moral-basis.json
    │   │       ├── logical.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── marxist-theory.json
    │   │       ├── modern-chinese.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-study.json
    │   │       ├── sociology.json
    │   │       ├── sports-science.json
    │   │       ├── traditional-chinese-medicine.json
    │   │       ├── virology.json
    │   │       ├── world-history.json
    │   │       └── world-religions.json
    │   │   ├── copa
    │   │       └── copa.json
    │   │   ├── eprstmt
    │   │       └── eprstmt.json
    │   │   ├── gsm8k
    │   │       └── gsm8k.json
    │   │   ├── hellaswag
    │   │       └── hellaswag.json
    │   │   ├── humaneval
    │   │       └── humaneval.json
    │   │   ├── lambada
    │   │       └── lambada.json
    │   │   ├── math
    │   │       ├── algebra.json
    │   │       ├── counting-and-probability.json
    │   │       ├── geometry.json
    │   │       ├── intermediate-algebra.json
    │   │       ├── number-theory.json
    │   │       ├── prealgebra.json
    │   │       └── precalculus.json
    │   │   ├── mbpp
    │   │       └── mbpp.json
    │   │   ├── mmlu
    │   │       ├── abstract-algebra.json
    │   │       ├── anatomy.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-biology.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-computer-science.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medicine.json
    │   │       ├── college-physics.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── econometrics.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── formal-logic.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-computer-science.json
    │   │       ├── high-school-european-history.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-government-and-politics.json
    │   │       ├── high-school-macroeconomics.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-microeconomics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-psychology.json
    │   │       ├── high-school-statistics.json
    │   │       ├── high-school-us-history.json
    │   │       ├── high-school-world-history.json
    │   │       ├── human-aging.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── jurisprudence.json
    │   │       ├── logical-fallacies.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── medical-genetics.json
    │   │       ├── miscellaneous.json
    │   │       ├── moral-disputes.json
    │   │       ├── moral-scenarios.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── prehistory.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-studies.json
    │   │       ├── sociology.json
    │   │       ├── us-foreign-policy.json
    │   │       ├── virology.json
    │   │       └── world-religions.json
    │   │   ├── ocnli
    │   │       └── ocnli.json
    │   │   ├── piqa
    │   │       └── piqa.json
    │   │   ├── rte
    │   │       └── rte.json
    │   │   ├── tydiqa
    │   │       └── tydiqa.json
    │   │   └── wic
    │   │       └── wic.json
    ├── lmsys
    │   └── vicuna-13b-v1.5
    │   │   ├── afqmc
    │   │       └── afqmc.json
    │   │   ├── arc-c
    │   │       └── arc-c.json
    │   │   ├── arc-e
    │   │       └── arc-e.json
    │   │   ├── ax-b
    │   │       └── ax-b.json
    │   │   ├── ax-g
    │   │       └── ax-g.json
    │   │   ├── bbh
    │   │       ├── boolean-expressions.json
    │   │       ├── causal-judgement.json
    │   │       ├── date-understanding.json
    │   │       ├── disambiguation-qa.json
    │   │       ├── dyck-languages.json
    │   │       ├── formal-fallacies.json
    │   │       ├── geometric-shapes.json
    │   │       ├── hyperbaton.json
    │   │       ├── logical-deduction-five-objects.json
    │   │       ├── logical-deduction-seven-objects.json
    │   │       ├── logical-deduction-three-objects.json
    │   │       ├── movie-recommendation.json
    │   │       ├── multistep-arithmetic-two.json
    │   │       ├── navigate.json
    │   │       ├── object-counting.json
    │   │       ├── penguins-in-a-table.json
    │   │       ├── reasoning-about-colored-objects.json
    │   │       ├── ruin-names.json
    │   │       ├── salient-translation-error-detection.json
    │   │       ├── snarks.json
    │   │       ├── sports-understanding.json
    │   │       ├── temporal-sequences.json
    │   │       ├── tracking-shuffled-objects-five-objects.json
    │   │       ├── tracking-shuffled-objects-seven-objects.json
    │   │       ├── tracking-shuffled-objects-three-objects.json
    │   │       ├── web-of-lies.json
    │   │       └── word-sorting.json
    │   │   ├── boolq
    │   │       └── boolq.json
    │   │   ├── c3
    │   │       ├── dialog.json
    │   │       └── mixed.json
    │   │   ├── ceval
    │   │       ├── accountant.json
    │   │       ├── advanced-mathematics.json
    │   │       ├── art-studies.json
    │   │       ├── basic-medicine.json
    │   │       ├── business-administration.json
    │   │       ├── chinese-language-and-literature.json
    │   │       ├── civil-servant.json
    │   │       ├── clinical-medicine.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-economics.json
    │   │       ├── college-physics.json
    │   │       ├── college-programming.json
    │   │       ├── computer-architecture.json
    │   │       ├── computer-network.json
    │   │       ├── discrete-mathematics.json
    │   │       ├── education-science.json
    │   │       ├── electrical-engineer.json
    │   │       ├── environmental-impact-assessment-engineer.json
    │   │       ├── fire-engineer.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-chinese.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-history.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── ideological-and-moral-cultivation.json
    │   │       ├── law.json
    │   │       ├── legal-professional.json
    │   │       ├── logic.json
    │   │       ├── mao-zedong-thought.json
    │   │       ├── marxism.json
    │   │       ├── metrology-engineer.json
    │   │       ├── middle-school-biology.json
    │   │       ├── middle-school-chemistry.json
    │   │       ├── middle-school-geography.json
    │   │       ├── middle-school-history.json
    │   │       ├── middle-school-mathematics.json
    │   │       ├── middle-school-physics.json
    │   │       ├── middle-school-politics.json
    │   │       ├── modern-chinese-history.json
    │   │       ├── operating-system.json
    │   │       ├── physician.json
    │   │       ├── plant-protection.json
    │   │       ├── probability-and-statistics.json
    │   │       ├── professional-tour-guide.json
    │   │       ├── sports-science.json
    │   │       ├── tax-accountant.json
    │   │       ├── teacher-qualification.json
    │   │       ├── urban-and-rural-planner.json
    │   │       └── veterinary-medicine.json
    │   │   ├── chid
    │   │       └── chid.json
    │   │   ├── cmmlu
    │   │       ├── agronomy.json
    │   │       ├── anatomy.json
    │   │       ├── ancient-chinese.json
    │   │       ├── arts.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── chinese-civil-service-exam.json
    │   │       ├── chinese-driving-rule.json
    │   │       ├── chinese-food-culture.json
    │   │       ├── chinese-foreign-policy.json
    │   │       ├── chinese-history.json
    │   │       ├── chinese-literature.json
    │   │       ├── chinese-teacher-qualification.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-actuarial-science.json
    │   │       ├── college-education.json
    │   │       ├── college-engineering-hydrology.json
    │   │       ├── college-law.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medical-statistics.json
    │   │       ├── college-medicine.json
    │   │       ├── computer-science.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── construction-project-management.json
    │   │       ├── economics.json
    │   │       ├── education.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-chinese.json
    │   │       ├── elementary-commonsense.json
    │   │       ├── elementary-information-and-technology.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── ethnology.json
    │   │       ├── food-science.json
    │   │       ├── genetics.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── journalism.json
    │   │       ├── jurisprudence.json
    │   │       ├── legal-and-moral-basis.json
    │   │       ├── logical.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── marxist-theory.json
    │   │       ├── modern-chinese.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-study.json
    │   │       ├── sociology.json
    │   │       ├── sports-science.json
    │   │       ├── traditional-chinese-medicine.json
    │   │       ├── virology.json
    │   │       ├── world-history.json
    │   │       └── world-religions.json
    │   │   ├── copa
    │   │       └── copa.json
    │   │   ├── eprstmt
    │   │       └── eprstmt.json
    │   │   ├── gsm8k
    │   │       └── gsm8k.json
    │   │   ├── hellaswag
    │   │       └── hellaswag.json
    │   │   ├── humaneval
    │   │       └── humaneval.json
    │   │   ├── lambada
    │   │       └── lambada.json
    │   │   ├── math
    │   │       ├── algebra.json
    │   │       ├── counting-and-probability.json
    │   │       ├── geometry.json
    │   │       ├── intermediate-algebra.json
    │   │       ├── number-theory.json
    │   │       ├── prealgebra.json
    │   │       └── precalculus.json
    │   │   ├── mbpp
    │   │       └── mbpp.json
    │   │   ├── mmlu
    │   │       ├── abstract-algebra.json
    │   │       ├── anatomy.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-biology.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-computer-science.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medicine.json
    │   │       ├── college-physics.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── econometrics.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── formal-logic.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-computer-science.json
    │   │       ├── high-school-european-history.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-government-and-politics.json
    │   │       ├── high-school-macroeconomics.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-microeconomics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-psychology.json
    │   │       ├── high-school-statistics.json
    │   │       ├── high-school-us-history.json
    │   │       ├── high-school-world-history.json
    │   │       ├── human-aging.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── jurisprudence.json
    │   │       ├── logical-fallacies.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── medical-genetics.json
    │   │       ├── miscellaneous.json
    │   │       ├── moral-disputes.json
    │   │       ├── moral-scenarios.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── prehistory.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-studies.json
    │   │       ├── sociology.json
    │   │       ├── us-foreign-policy.json
    │   │       ├── virology.json
    │   │       └── world-religions.json
    │   │   ├── ocnli
    │   │       └── ocnli.json
    │   │   ├── piqa
    │   │       └── piqa.json
    │   │   ├── rte
    │   │       └── rte.json
    │   │   ├── tydiqa
    │   │       └── tydiqa.json
    │   │   └── wic
    │   │       └── wic.json
    ├── meta-llama
    │   ├── Llama-2-13b-hf
    │   │   ├── afqmc
    │   │   │   └── afqmc.json
    │   │   ├── arc-c
    │   │   │   └── arc-c.json
    │   │   ├── arc-e
    │   │   │   └── arc-e.json
    │   │   ├── ax-b
    │   │   │   └── ax-b.json
    │   │   ├── ax-g
    │   │   │   └── ax-g.json
    │   │   ├── bbh
    │   │   │   ├── boolean-expressions.json
    │   │   │   ├── causal-judgement.json
    │   │   │   ├── date-understanding.json
    │   │   │   ├── disambiguation-qa.json
    │   │   │   ├── dyck-languages.json
    │   │   │   ├── formal-fallacies.json
    │   │   │   ├── geometric-shapes.json
    │   │   │   ├── hyperbaton.json
    │   │   │   ├── logical-deduction-five-objects.json
    │   │   │   ├── logical-deduction-seven-objects.json
    │   │   │   ├── logical-deduction-three-objects.json
    │   │   │   ├── movie-recommendation.json
    │   │   │   ├── multistep-arithmetic-two.json
    │   │   │   ├── navigate.json
    │   │   │   ├── object-counting.json
    │   │   │   ├── penguins-in-a-table.json
    │   │   │   ├── reasoning-about-colored-objects.json
    │   │   │   ├── ruin-names.json
    │   │   │   ├── salient-translation-error-detection.json
    │   │   │   ├── snarks.json
    │   │   │   ├── sports-understanding.json
    │   │   │   ├── temporal-sequences.json
    │   │   │   ├── tracking-shuffled-objects-five-objects.json
    │   │   │   ├── tracking-shuffled-objects-seven-objects.json
    │   │   │   ├── tracking-shuffled-objects-three-objects.json
    │   │   │   ├── web-of-lies.json
    │   │   │   └── word-sorting.json
    │   │   ├── boolq
    │   │   │   └── boolq.json
    │   │   ├── c3
    │   │   │   ├── dialog.json
    │   │   │   └── mixed.json
    │   │   ├── ceval
    │   │   │   ├── accountant.json
    │   │   │   ├── advanced-mathematics.json
    │   │   │   ├── art-studies.json
    │   │   │   ├── basic-medicine.json
    │   │   │   ├── business-administration.json
    │   │   │   ├── chinese-language-and-literature.json
    │   │   │   ├── civil-servant.json
    │   │   │   ├── clinical-medicine.json
    │   │   │   ├── college-chemistry.json
    │   │   │   ├── college-economics.json
    │   │   │   ├── college-physics.json
    │   │   │   ├── college-programming.json
    │   │   │   ├── computer-architecture.json
    │   │   │   ├── computer-network.json
    │   │   │   ├── discrete-mathematics.json
    │   │   │   ├── education-science.json
    │   │   │   ├── electrical-engineer.json
    │   │   │   ├── environmental-impact-assessment-engineer.json
    │   │   │   ├── fire-engineer.json
    │   │   │   ├── high-school-biology.json
    │   │   │   ├── high-school-chemistry.json
    │   │   │   ├── high-school-chinese.json
    │   │   │   ├── high-school-geography.json
    │   │   │   ├── high-school-history.json
    │   │   │   ├── high-school-mathematics.json
    │   │   │   ├── high-school-physics.json
    │   │   │   ├── high-school-politics.json
    │   │   │   ├── ideological-and-moral-cultivation.json
    │   │   │   ├── law.json
    │   │   │   ├── legal-professional.json
    │   │   │   ├── logic.json
    │   │   │   ├── mao-zedong-thought.json
    │   │   │   ├── marxism.json
    │   │   │   ├── metrology-engineer.json
    │   │   │   ├── middle-school-biology.json
    │   │   │   ├── middle-school-chemistry.json
    │   │   │   ├── middle-school-geography.json
    │   │   │   ├── middle-school-history.json
    │   │   │   ├── middle-school-mathematics.json
    │   │   │   ├── middle-school-physics.json
    │   │   │   ├── middle-school-politics.json
    │   │   │   ├── modern-chinese-history.json
    │   │   │   ├── operating-system.json
    │   │   │   ├── physician.json
    │   │   │   ├── plant-protection.json
    │   │   │   ├── probability-and-statistics.json
    │   │   │   ├── professional-tour-guide.json
    │   │   │   ├── sports-science.json
    │   │   │   ├── tax-accountant.json
    │   │   │   ├── teacher-qualification.json
    │   │   │   ├── urban-and-rural-planner.json
    │   │   │   └── veterinary-medicine.json
    │   │   ├── chid
    │   │   │   └── chid.json
    │   │   ├── cmmlu
    │   │   │   ├── agronomy.json
    │   │   │   ├── anatomy.json
    │   │   │   ├── ancient-chinese.json
    │   │   │   ├── arts.json
    │   │   │   ├── astronomy.json
    │   │   │   ├── business-ethics.json
    │   │   │   ├── chinese-civil-service-exam.json
    │   │   │   ├── chinese-driving-rule.json
    │   │   │   ├── chinese-food-culture.json
    │   │   │   ├── chinese-foreign-policy.json
    │   │   │   ├── chinese-history.json
    │   │   │   ├── chinese-literature.json
    │   │   │   ├── chinese-teacher-qualification.json
    │   │   │   ├── clinical-knowledge.json
    │   │   │   ├── college-actuarial-science.json
    │   │   │   ├── college-education.json
    │   │   │   ├── college-engineering-hydrology.json
    │   │   │   ├── college-law.json
    │   │   │   ├── college-mathematics.json
    │   │   │   ├── college-medical-statistics.json
    │   │   │   ├── college-medicine.json
    │   │   │   ├── computer-science.json
    │   │   │   ├── computer-security.json
    │   │   │   ├── conceptual-physics.json
    │   │   │   ├── construction-project-management.json
    │   │   │   ├── economics.json
    │   │   │   ├── education.json
    │   │   │   ├── electrical-engineering.json
    │   │   │   ├── elementary-chinese.json
    │   │   │   ├── elementary-commonsense.json
    │   │   │   ├── elementary-information-and-technology.json
    │   │   │   ├── elementary-mathematics.json
    │   │   │   ├── ethnology.json
    │   │   │   ├── food-science.json
    │   │   │   ├── genetics.json
    │   │   │   ├── global-facts.json
    │   │   │   ├── high-school-biology.json
    │   │   │   ├── high-school-chemistry.json
    │   │   │   ├── high-school-geography.json
    │   │   │   ├── high-school-mathematics.json
    │   │   │   ├── high-school-physics.json
    │   │   │   ├── high-school-politics.json
    │   │   │   ├── human-sexuality.json
    │   │   │   ├── international-law.json
    │   │   │   ├── journalism.json
    │   │   │   ├── jurisprudence.json
    │   │   │   ├── legal-and-moral-basis.json
    │   │   │   ├── logical.json
    │   │   │   ├── machine-learning.json
    │   │   │   ├── management.json
    │   │   │   ├── marketing.json
    │   │   │   ├── marxist-theory.json
    │   │   │   ├── modern-chinese.json
    │   │   │   ├── nutrition.json
    │   │   │   ├── philosophy.json
    │   │   │   ├── professional-accounting.json
    │   │   │   ├── professional-law.json
    │   │   │   ├── professional-medicine.json
    │   │   │   ├── professional-psychology.json
    │   │   │   ├── public-relations.json
    │   │   │   ├── security-study.json
    │   │   │   ├── sociology.json
    │   │   │   ├── sports-science.json
    │   │   │   ├── traditional-chinese-medicine.json
    │   │   │   ├── virology.json
    │   │   │   ├── world-history.json
    │   │   │   └── world-religions.json
    │   │   ├── copa
    │   │   │   └── copa.json
    │   │   ├── eprstmt
    │   │   │   └── eprstmt.json
    │   │   ├── gsm8k
    │   │   │   └── gsm8k.json
    │   │   ├── hellaswag
    │   │   │   └── hellaswag.json
    │   │   ├── humaneval
    │   │   │   └── humaneval.json
    │   │   ├── lambada
    │   │   │   └── lambada.json
    │   │   ├── math
    │   │   │   ├── algebra.json
    │   │   │   ├── counting-and-probability.json
    │   │   │   ├── geometry.json
    │   │   │   ├── intermediate-algebra.json
    │   │   │   ├── number-theory.json
    │   │   │   ├── prealgebra.json
    │   │   │   └── precalculus.json
    │   │   ├── mbpp
    │   │   │   └── mbpp.json
    │   │   ├── mmlu
    │   │   │   ├── abstract-algebra.json
    │   │   │   ├── anatomy.json
    │   │   │   ├── astronomy.json
    │   │   │   ├── business-ethics.json
    │   │   │   ├── clinical-knowledge.json
    │   │   │   ├── college-biology.json
    │   │   │   ├── college-chemistry.json
    │   │   │   ├── college-computer-science.json
    │   │   │   ├── college-mathematics.json
    │   │   │   ├── college-medicine.json
    │   │   │   ├── college-physics.json
    │   │   │   ├── computer-security.json
    │   │   │   ├── conceptual-physics.json
    │   │   │   ├── econometrics.json
    │   │   │   ├── electrical-engineering.json
    │   │   │   ├── elementary-mathematics.json
    │   │   │   ├── formal-logic.json
    │   │   │   ├── global-facts.json
    │   │   │   ├── high-school-biology.json
    │   │   │   ├── high-school-chemistry.json
    │   │   │   ├── high-school-computer-science.json
    │   │   │   ├── high-school-european-history.json
    │   │   │   ├── high-school-geography.json
    │   │   │   ├── high-school-government-and-politics.json
    │   │   │   ├── high-school-macroeconomics.json
    │   │   │   ├── high-school-mathematics.json
    │   │   │   ├── high-school-microeconomics.json
    │   │   │   ├── high-school-physics.json
    │   │   │   ├── high-school-psychology.json
    │   │   │   ├── high-school-statistics.json
    │   │   │   ├── high-school-us-history.json
    │   │   │   ├── high-school-world-history.json
    │   │   │   ├── human-aging.json
    │   │   │   ├── human-sexuality.json
    │   │   │   ├── international-law.json
    │   │   │   ├── jurisprudence.json
    │   │   │   ├── logical-fallacies.json
    │   │   │   ├── machine-learning.json
    │   │   │   ├── management.json
    │   │   │   ├── marketing.json
    │   │   │   ├── medical-genetics.json
    │   │   │   ├── miscellaneous.json
    │   │   │   ├── moral-disputes.json
    │   │   │   ├── moral-scenarios.json
    │   │   │   ├── nutrition.json
    │   │   │   ├── philosophy.json
    │   │   │   ├── prehistory.json
    │   │   │   ├── professional-accounting.json
    │   │   │   ├── professional-law.json
    │   │   │   ├── professional-medicine.json
    │   │   │   ├── professional-psychology.json
    │   │   │   ├── public-relations.json
    │   │   │   ├── security-studies.json
    │   │   │   ├── sociology.json
    │   │   │   ├── us-foreign-policy.json
    │   │   │   ├── virology.json
    │   │   │   └── world-religions.json
    │   │   ├── ocnli
    │   │   │   └── ocnli.json
    │   │   ├── piqa
    │   │   │   └── piqa.json
    │   │   ├── rte
    │   │   │   └── rte.json
    │   │   ├── tydiqa
    │   │   │   └── tydiqa.json
    │   │   └── wic
    │   │   │   └── wic.json
    │   ├── Llama-2-70b-chat-hf
    │   │   ├── afqmc
    │   │   │   └── afqmc.json
    │   │   ├── arc-c
    │   │   │   └── arc-c.json
    │   │   ├── arc-e
    │   │   │   └── arc-e.json
    │   │   ├── ax-b
    │   │   │   └── ax-b.json
    │   │   ├── ax-g
    │   │   │   └── ax-g.json
    │   │   ├── bbh
    │   │   │   ├── boolean-expressions.json
    │   │   │   ├── causal-judgement.json
    │   │   │   ├── date-understanding.json
    │   │   │   ├── disambiguation-qa.json
    │   │   │   ├── dyck-languages.json
    │   │   │   ├── formal-fallacies.json
    │   │   │   ├── geometric-shapes.json
    │   │   │   ├── hyperbaton.json
    │   │   │   ├── logical-deduction-five-objects.json
    │   │   │   ├── logical-deduction-seven-objects.json
    │   │   │   ├── logical-deduction-three-objects.json
    │   │   │   ├── movie-recommendation.json
    │   │   │   ├── multistep-arithmetic-two.json
    │   │   │   ├── navigate.json
    │   │   │   ├── object-counting.json
    │   │   │   ├── penguins-in-a-table.json
    │   │   │   ├── reasoning-about-colored-objects.json
    │   │   │   ├── ruin-names.json
    │   │   │   ├── salient-translation-error-detection.json
    │   │   │   ├── snarks.json
    │   │   │   ├── sports-understanding.json
    │   │   │   ├── temporal-sequences.json
    │   │   │   ├── tracking-shuffled-objects-five-objects.json
    │   │   │   ├── tracking-shuffled-objects-seven-objects.json
    │   │   │   ├── tracking-shuffled-objects-three-objects.json
    │   │   │   ├── web-of-lies.json
    │   │   │   └── word-sorting.json
    │   │   ├── boolq
    │   │   │   └── boolq.json
    │   │   ├── c3
    │   │   │   ├── dialog.json
    │   │   │   └── mixed.json
    │   │   ├── ceval
    │   │   │   ├── accountant.json
    │   │   │   ├── advanced-mathematics.json
    │   │   │   ├── art-studies.json
    │   │   │   ├── basic-medicine.json
    │   │   │   ├── business-administration.json
    │   │   │   ├── chinese-language-and-literature.json
    │   │   │   ├── civil-servant.json
    │   │   │   ├── clinical-medicine.json
    │   │   │   ├── college-chemistry.json
    │   │   │   ├── college-economics.json
    │   │   │   ├── college-physics.json
    │   │   │   ├── college-programming.json
    │   │   │   ├── computer-architecture.json
    │   │   │   ├── computer-network.json
    │   │   │   ├── discrete-mathematics.json
    │   │   │   ├── education-science.json
    │   │   │   ├── electrical-engineer.json
    │   │   │   ├── environmental-impact-assessment-engineer.json
    │   │   │   ├── fire-engineer.json
    │   │   │   ├── high-school-biology.json
    │   │   │   ├── high-school-chemistry.json
    │   │   │   ├── high-school-chinese.json
    │   │   │   ├── high-school-geography.json
    │   │   │   ├── high-school-history.json
    │   │   │   ├── high-school-mathematics.json
    │   │   │   ├── high-school-physics.json
    │   │   │   ├── high-school-politics.json
    │   │   │   ├── ideological-and-moral-cultivation.json
    │   │   │   ├── law.json
    │   │   │   ├── legal-professional.json
    │   │   │   ├── logic.json
    │   │   │   ├── mao-zedong-thought.json
    │   │   │   ├── marxism.json
    │   │   │   ├── metrology-engineer.json
    │   │   │   ├── middle-school-biology.json
    │   │   │   ├── middle-school-chemistry.json
    │   │   │   ├── middle-school-geography.json
    │   │   │   ├── middle-school-history.json
    │   │   │   ├── middle-school-mathematics.json
    │   │   │   ├── middle-school-physics.json
    │   │   │   ├── middle-school-politics.json
    │   │   │   ├── modern-chinese-history.json
    │   │   │   ├── operating-system.json
    │   │   │   ├── physician.json
    │   │   │   ├── plant-protection.json
    │   │   │   ├── probability-and-statistics.json
    │   │   │   ├── professional-tour-guide.json
    │   │   │   ├── sports-science.json
    │   │   │   ├── tax-accountant.json
    │   │   │   ├── teacher-qualification.json
    │   │   │   ├── urban-and-rural-planner.json
    │   │   │   └── veterinary-medicine.json
    │   │   ├── chid
    │   │   │   └── chid.json
    │   │   ├── cmmlu
    │   │   │   ├── agronomy.json
    │   │   │   ├── anatomy.json
    │   │   │   ├── ancient-chinese.json
    │   │   │   ├── arts.json
    │   │   │   ├── astronomy.json
    │   │   │   ├── business-ethics.json
    │   │   │   ├── chinese-civil-service-exam.json
    │   │   │   ├── chinese-driving-rule.json
    │   │   │   ├── chinese-food-culture.json
    │   │   │   ├── chinese-foreign-policy.json
    │   │   │   ├── chinese-history.json
    │   │   │   ├── chinese-literature.json
    │   │   │   ├── chinese-teacher-qualification.json
    │   │   │   ├── clinical-knowledge.json
    │   │   │   ├── college-actuarial-science.json
    │   │   │   ├── college-education.json
    │   │   │   ├── college-engineering-hydrology.json
    │   │   │   ├── college-law.json
    │   │   │   ├── college-mathematics.json
    │   │   │   ├── college-medical-statistics.json
    │   │   │   ├── college-medicine.json
    │   │   │   ├── computer-science.json
    │   │   │   ├── computer-security.json
    │   │   │   ├── conceptual-physics.json
    │   │   │   ├── construction-project-management.json
    │   │   │   ├── economics.json
    │   │   │   ├── education.json
    │   │   │   ├── electrical-engineering.json
    │   │   │   ├── elementary-chinese.json
    │   │   │   ├── elementary-commonsense.json
    │   │   │   ├── elementary-information-and-technology.json
    │   │   │   ├── elementary-mathematics.json
    │   │   │   ├── ethnology.json
    │   │   │   ├── food-science.json
    │   │   │   ├── genetics.json
    │   │   │   ├── global-facts.json
    │   │   │   ├── high-school-biology.json
    │   │   │   ├── high-school-chemistry.json
    │   │   │   ├── high-school-geography.json
    │   │   │   ├── high-school-mathematics.json
    │   │   │   ├── high-school-physics.json
    │   │   │   ├── high-school-politics.json
    │   │   │   ├── human-sexuality.json
    │   │   │   ├── international-law.json
    │   │   │   ├── journalism.json
    │   │   │   ├── jurisprudence.json
    │   │   │   ├── legal-and-moral-basis.json
    │   │   │   ├── logical.json
    │   │   │   ├── machine-learning.json
    │   │   │   ├── management.json
    │   │   │   ├── marketing.json
    │   │   │   ├── marxist-theory.json
    │   │   │   ├── modern-chinese.json
    │   │   │   ├── nutrition.json
    │   │   │   ├── philosophy.json
    │   │   │   ├── professional-accounting.json
    │   │   │   ├── professional-law.json
    │   │   │   ├── professional-medicine.json
    │   │   │   ├── professional-psychology.json
    │   │   │   ├── public-relations.json
    │   │   │   ├── security-study.json
    │   │   │   ├── sociology.json
    │   │   │   ├── sports-science.json
    │   │   │   ├── traditional-chinese-medicine.json
    │   │   │   ├── virology.json
    │   │   │   ├── world-history.json
    │   │   │   └── world-religions.json
    │   │   ├── copa
    │   │   │   └── copa.json
    │   │   ├── eprstmt
    │   │   │   └── eprstmt.json
    │   │   ├── gsm8k
    │   │   │   └── gsm8k.json
    │   │   ├── hellaswag
    │   │   │   └── hellaswag.json
    │   │   ├── humaneval
    │   │   │   └── humaneval.json
    │   │   ├── lambada
    │   │   │   └── lambada.json
    │   │   ├── math
    │   │   │   ├── algebra.json
    │   │   │   ├── counting-and-probability.json
    │   │   │   ├── geometry.json
    │   │   │   ├── intermediate-algebra.json
    │   │   │   ├── number-theory.json
    │   │   │   ├── prealgebra.json
    │   │   │   └── precalculus.json
    │   │   ├── mbpp
    │   │   │   └── mbpp.json
    │   │   ├── mmlu
    │   │   │   ├── abstract-algebra.json
    │   │   │   ├── anatomy.json
    │   │   │   ├── astronomy.json
    │   │   │   ├── business-ethics.json
    │   │   │   ├── clinical-knowledge.json
    │   │   │   ├── college-biology.json
    │   │   │   ├── college-chemistry.json
    │   │   │   ├── college-computer-science.json
    │   │   │   ├── college-mathematics.json
    │   │   │   ├── college-medicine.json
    │   │   │   ├── college-physics.json
    │   │   │   ├── computer-security.json
    │   │   │   ├── conceptual-physics.json
    │   │   │   ├── econometrics.json
    │   │   │   ├── electrical-engineering.json
    │   │   │   ├── elementary-mathematics.json
    │   │   │   ├── formal-logic.json
    │   │   │   ├── global-facts.json
    │   │   │   ├── high-school-biology.json
    │   │   │   ├── high-school-chemistry.json
    │   │   │   ├── high-school-computer-science.json
    │   │   │   ├── high-school-european-history.json
    │   │   │   ├── high-school-geography.json
    │   │   │   ├── high-school-government-and-politics.json
    │   │   │   ├── high-school-macroeconomics.json
    │   │   │   ├── high-school-mathematics.json
    │   │   │   ├── high-school-microeconomics.json
    │   │   │   ├── high-school-physics.json
    │   │   │   ├── high-school-psychology.json
    │   │   │   ├── high-school-statistics.json
    │   │   │   ├── high-school-us-history.json
    │   │   │   ├── high-school-world-history.json
    │   │   │   ├── human-aging.json
    │   │   │   ├── human-sexuality.json
    │   │   │   ├── international-law.json
    │   │   │   ├── jurisprudence.json
    │   │   │   ├── logical-fallacies.json
    │   │   │   ├── machine-learning.json
    │   │   │   ├── management.json
    │   │   │   ├── marketing.json
    │   │   │   ├── medical-genetics.json
    │   │   │   ├── miscellaneous.json
    │   │   │   ├── moral-disputes.json
    │   │   │   ├── moral-scenarios.json
    │   │   │   ├── nutrition.json
    │   │   │   ├── philosophy.json
    │   │   │   ├── prehistory.json
    │   │   │   ├── professional-accounting.json
    │   │   │   ├── professional-law.json
    │   │   │   ├── professional-medicine.json
    │   │   │   ├── professional-psychology.json
    │   │   │   ├── public-relations.json
    │   │   │   ├── security-studies.json
    │   │   │   ├── sociology.json
    │   │   │   ├── us-foreign-policy.json
    │   │   │   ├── virology.json
    │   │   │   └── world-religions.json
    │   │   ├── ocnli
    │   │   │   └── ocnli.json
    │   │   ├── piqa
    │   │   │   └── piqa.json
    │   │   ├── rte
    │   │   │   └── rte.json
    │   │   ├── tydiqa
    │   │   │   └── tydiqa.json
    │   │   └── wic
    │   │   │   └── wic.json
    │   ├── Llama-2-70b-hf
    │   │   ├── afqmc
    │   │   │   └── afqmc.json
    │   │   ├── arc-c
    │   │   │   └── arc-c.json
    │   │   ├── arc-e
    │   │   │   └── arc-e.json
    │   │   ├── ax-b
    │   │   │   └── ax-b.json
    │   │   ├── ax-g
    │   │   │   └── ax-g.json
    │   │   ├── bbh
    │   │   │   ├── boolean-expressions.json
    │   │   │   ├── causal-judgement.json
    │   │   │   ├── date-understanding.json
    │   │   │   ├── disambiguation-qa.json
    │   │   │   ├── dyck-languages.json
    │   │   │   ├── formal-fallacies.json
    │   │   │   ├── geometric-shapes.json
    │   │   │   ├── hyperbaton.json
    │   │   │   ├── logical-deduction-five-objects.json
    │   │   │   ├── logical-deduction-seven-objects.json
    │   │   │   ├── logical-deduction-three-objects.json
    │   │   │   ├── movie-recommendation.json
    │   │   │   ├── multistep-arithmetic-two.json
    │   │   │   ├── navigate.json
    │   │   │   ├── object-counting.json
    │   │   │   ├── penguins-in-a-table.json
    │   │   │   ├── reasoning-about-colored-objects.json
    │   │   │   ├── ruin-names.json
    │   │   │   ├── salient-translation-error-detection.json
    │   │   │   ├── snarks.json
    │   │   │   ├── sports-understanding.json
    │   │   │   ├── temporal-sequences.json
    │   │   │   ├── tracking-shuffled-objects-five-objects.json
    │   │   │   ├── tracking-shuffled-objects-seven-objects.json
    │   │   │   ├── tracking-shuffled-objects-three-objects.json
    │   │   │   ├── web-of-lies.json
    │   │   │   └── word-sorting.json
    │   │   ├── boolq
    │   │   │   └── boolq.json
    │   │   ├── c3
    │   │   │   ├── dialog.json
    │   │   │   └── mixed.json
    │   │   ├── ceval
    │   │   │   ├── accountant.json
    │   │   │   ├── advanced-mathematics.json
    │   │   │   ├── art-studies.json
    │   │   │   ├── basic-medicine.json
    │   │   │   ├── business-administration.json
    │   │   │   ├── chinese-language-and-literature.json
    │   │   │   ├── civil-servant.json
    │   │   │   ├── clinical-medicine.json
    │   │   │   ├── college-chemistry.json
    │   │   │   ├── college-economics.json
    │   │   │   ├── college-physics.json
    │   │   │   ├── college-programming.json
    │   │   │   ├── computer-architecture.json
    │   │   │   ├── computer-network.json
    │   │   │   ├── discrete-mathematics.json
    │   │   │   ├── education-science.json
    │   │   │   ├── electrical-engineer.json
    │   │   │   ├── environmental-impact-assessment-engineer.json
    │   │   │   ├── fire-engineer.json
    │   │   │   ├── high-school-biology.json
    │   │   │   ├── high-school-chemistry.json
    │   │   │   ├── high-school-chinese.json
    │   │   │   ├── high-school-geography.json
    │   │   │   ├── high-school-history.json
    │   │   │   ├── high-school-mathematics.json
    │   │   │   ├── high-school-physics.json
    │   │   │   ├── high-school-politics.json
    │   │   │   ├── ideological-and-moral-cultivation.json
    │   │   │   ├── law.json
    │   │   │   ├── legal-professional.json
    │   │   │   ├── logic.json
    │   │   │   ├── mao-zedong-thought.json
    │   │   │   ├── marxism.json
    │   │   │   ├── metrology-engineer.json
    │   │   │   ├── middle-school-biology.json
    │   │   │   ├── middle-school-chemistry.json
    │   │   │   ├── middle-school-geography.json
    │   │   │   ├── middle-school-history.json
    │   │   │   ├── middle-school-mathematics.json
    │   │   │   ├── middle-school-physics.json
    │   │   │   ├── middle-school-politics.json
    │   │   │   ├── modern-chinese-history.json
    │   │   │   ├── operating-system.json
    │   │   │   ├── physician.json
    │   │   │   ├── plant-protection.json
    │   │   │   ├── probability-and-statistics.json
    │   │   │   ├── professional-tour-guide.json
    │   │   │   ├── sports-science.json
    │   │   │   ├── tax-accountant.json
    │   │   │   ├── teacher-qualification.json
    │   │   │   ├── urban-and-rural-planner.json
    │   │   │   └── veterinary-medicine.json
    │   │   ├── chid
    │   │   │   └── chid.json
    │   │   ├── cmmlu
    │   │   │   ├── agronomy.json
    │   │   │   ├── anatomy.json
    │   │   │   ├── ancient-chinese.json
    │   │   │   ├── arts.json
    │   │   │   ├── astronomy.json
    │   │   │   ├── business-ethics.json
    │   │   │   ├── chinese-civil-service-exam.json
    │   │   │   ├── chinese-driving-rule.json
    │   │   │   ├── chinese-food-culture.json
    │   │   │   ├── chinese-foreign-policy.json
    │   │   │   ├── chinese-history.json
    │   │   │   ├── chinese-literature.json
    │   │   │   ├── chinese-teacher-qualification.json
    │   │   │   ├── clinical-knowledge.json
    │   │   │   ├── college-actuarial-science.json
    │   │   │   ├── college-education.json
    │   │   │   ├── college-engineering-hydrology.json
    │   │   │   ├── college-law.json
    │   │   │   ├── college-mathematics.json
    │   │   │   ├── college-medical-statistics.json
    │   │   │   ├── college-medicine.json
    │   │   │   ├── computer-science.json
    │   │   │   ├── computer-security.json
    │   │   │   ├── conceptual-physics.json
    │   │   │   ├── construction-project-management.json
    │   │   │   ├── economics.json
    │   │   │   ├── education.json
    │   │   │   ├── electrical-engineering.json
    │   │   │   ├── elementary-chinese.json
    │   │   │   ├── elementary-commonsense.json
    │   │   │   ├── elementary-information-and-technology.json
    │   │   │   ├── elementary-mathematics.json
    │   │   │   ├── ethnology.json
    │   │   │   ├── food-science.json
    │   │   │   ├── genetics.json
    │   │   │   ├── global-facts.json
    │   │   │   ├── high-school-biology.json
    │   │   │   ├── high-school-chemistry.json
    │   │   │   ├── high-school-geography.json
    │   │   │   ├── high-school-mathematics.json
    │   │   │   ├── high-school-physics.json
    │   │   │   ├── high-school-politics.json
    │   │   │   ├── human-sexuality.json
    │   │   │   ├── international-law.json
    │   │   │   ├── journalism.json
    │   │   │   ├── jurisprudence.json
    │   │   │   ├── legal-and-moral-basis.json
    │   │   │   ├── logical.json
    │   │   │   ├── machine-learning.json
    │   │   │   ├── management.json
    │   │   │   ├── marketing.json
    │   │   │   ├── marxist-theory.json
    │   │   │   ├── modern-chinese.json
    │   │   │   ├── nutrition.json
    │   │   │   ├── philosophy.json
    │   │   │   ├── professional-accounting.json
    │   │   │   ├── professional-law.json
    │   │   │   ├── professional-medicine.json
    │   │   │   ├── professional-psychology.json
    │   │   │   ├── public-relations.json
    │   │   │   ├── security-study.json
    │   │   │   ├── sociology.json
    │   │   │   ├── sports-science.json
    │   │   │   ├── traditional-chinese-medicine.json
    │   │   │   ├── virology.json
    │   │   │   ├── world-history.json
    │   │   │   └── world-religions.json
    │   │   ├── copa
    │   │   │   └── copa.json
    │   │   ├── eprstmt
    │   │   │   └── eprstmt.json
    │   │   ├── gsm8k
    │   │   │   └── gsm8k.json
    │   │   ├── hellaswag
    │   │   │   └── hellaswag.json
    │   │   ├── humaneval
    │   │   │   └── humaneval.json
    │   │   ├── lambada
    │   │   │   └── lambada.json
    │   │   ├── math
    │   │   │   ├── algebra.json
    │   │   │   ├── counting-and-probability.json
    │   │   │   ├── geometry.json
    │   │   │   ├── intermediate-algebra.json
    │   │   │   ├── number-theory.json
    │   │   │   ├── prealgebra.json
    │   │   │   └── precalculus.json
    │   │   ├── mbpp
    │   │   │   └── mbpp.json
    │   │   ├── mmlu
    │   │   │   ├── abstract-algebra.json
    │   │   │   ├── anatomy.json
    │   │   │   ├── astronomy.json
    │   │   │   ├── business-ethics.json
    │   │   │   ├── clinical-knowledge.json
    │   │   │   ├── college-biology.json
    │   │   │   ├── college-chemistry.json
    │   │   │   ├── college-computer-science.json
    │   │   │   ├── college-mathematics.json
    │   │   │   ├── college-medicine.json
    │   │   │   ├── college-physics.json
    │   │   │   ├── computer-security.json
    │   │   │   ├── conceptual-physics.json
    │   │   │   ├── econometrics.json
    │   │   │   ├── electrical-engineering.json
    │   │   │   ├── elementary-mathematics.json
    │   │   │   ├── formal-logic.json
    │   │   │   ├── global-facts.json
    │   │   │   ├── high-school-biology.json
    │   │   │   ├── high-school-chemistry.json
    │   │   │   ├── high-school-computer-science.json
    │   │   │   ├── high-school-european-history.json
    │   │   │   ├── high-school-geography.json
    │   │   │   ├── high-school-government-and-politics.json
    │   │   │   ├── high-school-macroeconomics.json
    │   │   │   ├── high-school-mathematics.json
    │   │   │   ├── high-school-microeconomics.json
    │   │   │   ├── high-school-physics.json
    │   │   │   ├── high-school-psychology.json
    │   │   │   ├── high-school-statistics.json
    │   │   │   ├── high-school-us-history.json
    │   │   │   ├── high-school-world-history.json
    │   │   │   ├── human-aging.json
    │   │   │   ├── human-sexuality.json
    │   │   │   ├── international-law.json
    │   │   │   ├── jurisprudence.json
    │   │   │   ├── logical-fallacies.json
    │   │   │   ├── machine-learning.json
    │   │   │   ├── management.json
    │   │   │   ├── marketing.json
    │   │   │   ├── medical-genetics.json
    │   │   │   ├── miscellaneous.json
    │   │   │   ├── moral-disputes.json
    │   │   │   ├── moral-scenarios.json
    │   │   │   ├── nutrition.json
    │   │   │   ├── philosophy.json
    │   │   │   ├── prehistory.json
    │   │   │   ├── professional-accounting.json
    │   │   │   ├── professional-law.json
    │   │   │   ├── professional-medicine.json
    │   │   │   ├── professional-psychology.json
    │   │   │   ├── public-relations.json
    │   │   │   ├── security-studies.json
    │   │   │   ├── sociology.json
    │   │   │   ├── us-foreign-policy.json
    │   │   │   ├── virology.json
    │   │   │   └── world-religions.json
    │   │   ├── ocnli
    │   │   │   └── ocnli.json
    │   │   ├── piqa
    │   │   │   └── piqa.json
    │   │   ├── rte
    │   │   │   └── rte.json
    │   │   ├── tydiqa
    │   │   │   └── tydiqa.json
    │   │   └── wic
    │   │   │   └── wic.json
    │   └── Llama-2-7b-hf
    │   │   ├── afqmc
    │   │       └── afqmc.json
    │   │   ├── arc-c
    │   │       └── arc-c.json
    │   │   ├── arc-e
    │   │       └── arc-e.json
    │   │   ├── ax-b
    │   │       └── ax-b.json
    │   │   ├── ax-g
    │   │       └── ax-g.json
    │   │   ├── bbh
    │   │       ├── boolean-expressions.json
    │   │       ├── causal-judgement.json
    │   │       ├── date-understanding.json
    │   │       ├── disambiguation-qa.json
    │   │       ├── dyck-languages.json
    │   │       ├── formal-fallacies.json
    │   │       ├── geometric-shapes.json
    │   │       ├── hyperbaton.json
    │   │       ├── logical-deduction-five-objects.json
    │   │       ├── logical-deduction-seven-objects.json
    │   │       ├── logical-deduction-three-objects.json
    │   │       ├── movie-recommendation.json
    │   │       ├── multistep-arithmetic-two.json
    │   │       ├── navigate.json
    │   │       ├── object-counting.json
    │   │       ├── penguins-in-a-table.json
    │   │       ├── reasoning-about-colored-objects.json
    │   │       ├── ruin-names.json
    │   │       ├── salient-translation-error-detection.json
    │   │       ├── snarks.json
    │   │       ├── sports-understanding.json
    │   │       ├── temporal-sequences.json
    │   │       ├── tracking-shuffled-objects-five-objects.json
    │   │       ├── tracking-shuffled-objects-seven-objects.json
    │   │       ├── tracking-shuffled-objects-three-objects.json
    │   │       ├── web-of-lies.json
    │   │       └── word-sorting.json
    │   │   ├── boolq
    │   │       └── boolq.json
    │   │   ├── c3
    │   │       ├── dialog.json
    │   │       └── mixed.json
    │   │   ├── ceval
    │   │       ├── accountant.json
    │   │       ├── advanced-mathematics.json
    │   │       ├── art-studies.json
    │   │       ├── basic-medicine.json
    │   │       ├── business-administration.json
    │   │       ├── chinese-language-and-literature.json
    │   │       ├── civil-servant.json
    │   │       ├── clinical-medicine.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-economics.json
    │   │       ├── college-physics.json
    │   │       ├── college-programming.json
    │   │       ├── computer-architecture.json
    │   │       ├── computer-network.json
    │   │       ├── discrete-mathematics.json
    │   │       ├── education-science.json
    │   │       ├── electrical-engineer.json
    │   │       ├── environmental-impact-assessment-engineer.json
    │   │       ├── fire-engineer.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-chinese.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-history.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── ideological-and-moral-cultivation.json
    │   │       ├── law.json
    │   │       ├── legal-professional.json
    │   │       ├── logic.json
    │   │       ├── mao-zedong-thought.json
    │   │       ├── marxism.json
    │   │       ├── metrology-engineer.json
    │   │       ├── middle-school-biology.json
    │   │       ├── middle-school-chemistry.json
    │   │       ├── middle-school-geography.json
    │   │       ├── middle-school-history.json
    │   │       ├── middle-school-mathematics.json
    │   │       ├── middle-school-physics.json
    │   │       ├── middle-school-politics.json
    │   │       ├── modern-chinese-history.json
    │   │       ├── operating-system.json
    │   │       ├── physician.json
    │   │       ├── plant-protection.json
    │   │       ├── probability-and-statistics.json
    │   │       ├── professional-tour-guide.json
    │   │       ├── sports-science.json
    │   │       ├── tax-accountant.json
    │   │       ├── teacher-qualification.json
    │   │       ├── urban-and-rural-planner.json
    │   │       └── veterinary-medicine.json
    │   │   ├── chid
    │   │       └── chid.json
    │   │   ├── cmmlu
    │   │       ├── agronomy.json
    │   │       ├── anatomy.json
    │   │       ├── ancient-chinese.json
    │   │       ├── arts.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── chinese-civil-service-exam.json
    │   │       ├── chinese-driving-rule.json
    │   │       ├── chinese-food-culture.json
    │   │       ├── chinese-foreign-policy.json
    │   │       ├── chinese-history.json
    │   │       ├── chinese-literature.json
    │   │       ├── chinese-teacher-qualification.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-actuarial-science.json
    │   │       ├── college-education.json
    │   │       ├── college-engineering-hydrology.json
    │   │       ├── college-law.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medical-statistics.json
    │   │       ├── college-medicine.json
    │   │       ├── computer-science.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── construction-project-management.json
    │   │       ├── economics.json
    │   │       ├── education.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-chinese.json
    │   │       ├── elementary-commonsense.json
    │   │       ├── elementary-information-and-technology.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── ethnology.json
    │   │       ├── food-science.json
    │   │       ├── genetics.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── journalism.json
    │   │       ├── jurisprudence.json
    │   │       ├── legal-and-moral-basis.json
    │   │       ├── logical.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── marxist-theory.json
    │   │       ├── modern-chinese.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-study.json
    │   │       ├── sociology.json
    │   │       ├── sports-science.json
    │   │       ├── traditional-chinese-medicine.json
    │   │       ├── virology.json
    │   │       ├── world-history.json
    │   │       └── world-religions.json
    │   │   ├── copa
    │   │       └── copa.json
    │   │   ├── eprstmt
    │   │       └── eprstmt.json
    │   │   ├── gsm8k
    │   │       └── gsm8k.json
    │   │   ├── hellaswag
    │   │       └── hellaswag.json
    │   │   ├── humaneval
    │   │       └── humaneval.json
    │   │   ├── lambada
    │   │       └── lambada.json
    │   │   ├── math
    │   │       ├── algebra.json
    │   │       ├── counting-and-probability.json
    │   │       ├── geometry.json
    │   │       ├── intermediate-algebra.json
    │   │       ├── number-theory.json
    │   │       ├── prealgebra.json
    │   │       └── precalculus.json
    │   │   ├── mbpp
    │   │       └── mbpp.json
    │   │   ├── mmlu
    │   │       ├── abstract-algebra.json
    │   │       ├── anatomy.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-biology.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-computer-science.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medicine.json
    │   │       ├── college-physics.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── econometrics.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── formal-logic.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-computer-science.json
    │   │       ├── high-school-european-history.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-government-and-politics.json
    │   │       ├── high-school-macroeconomics.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-microeconomics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-psychology.json
    │   │       ├── high-school-statistics.json
    │   │       ├── high-school-us-history.json
    │   │       ├── high-school-world-history.json
    │   │       ├── human-aging.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── jurisprudence.json
    │   │       ├── logical-fallacies.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── medical-genetics.json
    │   │       ├── miscellaneous.json
    │   │       ├── moral-disputes.json
    │   │       ├── moral-scenarios.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── prehistory.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-studies.json
    │   │       ├── sociology.json
    │   │       ├── us-foreign-policy.json
    │   │       ├── virology.json
    │   │       └── world-religions.json
    │   │   ├── ocnli
    │   │       └── ocnli.json
    │   │   ├── piqa
    │   │       └── piqa.json
    │   │   ├── rte
    │   │       └── rte.json
    │   │   ├── tydiqa
    │   │       └── tydiqa.json
    │   │   └── wic
    │   │       └── wic.json
    ├── mistralai
    │   └── Mistral-7B-v0.1
    │   │   ├── afqmc
    │   │       └── afqmc.json
    │   │   ├── arc-c
    │   │       └── arc-c.json
    │   │   ├── arc-e
    │   │       └── arc-e.json
    │   │   ├── ax-b
    │   │       └── ax-b.json
    │   │   ├── ax-g
    │   │       └── ax-g.json
    │   │   ├── bbh
    │   │       ├── boolean-expressions.json
    │   │       ├── causal-judgement.json
    │   │       ├── date-understanding.json
    │   │       ├── disambiguation-qa.json
    │   │       ├── dyck-languages.json
    │   │       ├── formal-fallacies.json
    │   │       ├── geometric-shapes.json
    │   │       ├── hyperbaton.json
    │   │       ├── logical-deduction-five-objects.json
    │   │       ├── logical-deduction-seven-objects.json
    │   │       ├── logical-deduction-three-objects.json
    │   │       ├── movie-recommendation.json
    │   │       ├── multistep-arithmetic-two.json
    │   │       ├── navigate.json
    │   │       ├── object-counting.json
    │   │       ├── penguins-in-a-table.json
    │   │       ├── reasoning-about-colored-objects.json
    │   │       ├── ruin-names.json
    │   │       ├── salient-translation-error-detection.json
    │   │       ├── snarks.json
    │   │       ├── sports-understanding.json
    │   │       ├── temporal-sequences.json
    │   │       ├── tracking-shuffled-objects-five-objects.json
    │   │       ├── tracking-shuffled-objects-seven-objects.json
    │   │       ├── tracking-shuffled-objects-three-objects.json
    │   │       ├── web-of-lies.json
    │   │       └── word-sorting.json
    │   │   ├── boolq
    │   │       └── boolq.json
    │   │   ├── c3
    │   │       ├── dialog.json
    │   │       └── mixed.json
    │   │   ├── ceval
    │   │       ├── accountant.json
    │   │       ├── advanced-mathematics.json
    │   │       ├── art-studies.json
    │   │       ├── basic-medicine.json
    │   │       ├── business-administration.json
    │   │       ├── chinese-language-and-literature.json
    │   │       ├── civil-servant.json
    │   │       ├── clinical-medicine.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-economics.json
    │   │       ├── college-physics.json
    │   │       ├── college-programming.json
    │   │       ├── computer-architecture.json
    │   │       ├── computer-network.json
    │   │       ├── discrete-mathematics.json
    │   │       ├── education-science.json
    │   │       ├── electrical-engineer.json
    │   │       ├── environmental-impact-assessment-engineer.json
    │   │       ├── fire-engineer.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-chinese.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-history.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── ideological-and-moral-cultivation.json
    │   │       ├── law.json
    │   │       ├── legal-professional.json
    │   │       ├── logic.json
    │   │       ├── mao-zedong-thought.json
    │   │       ├── marxism.json
    │   │       ├── metrology-engineer.json
    │   │       ├── middle-school-biology.json
    │   │       ├── middle-school-chemistry.json
    │   │       ├── middle-school-geography.json
    │   │       ├── middle-school-history.json
    │   │       ├── middle-school-mathematics.json
    │   │       ├── middle-school-physics.json
    │   │       ├── middle-school-politics.json
    │   │       ├── modern-chinese-history.json
    │   │       ├── operating-system.json
    │   │       ├── physician.json
    │   │       ├── plant-protection.json
    │   │       ├── probability-and-statistics.json
    │   │       ├── professional-tour-guide.json
    │   │       ├── sports-science.json
    │   │       ├── tax-accountant.json
    │   │       ├── teacher-qualification.json
    │   │       ├── urban-and-rural-planner.json
    │   │       └── veterinary-medicine.json
    │   │   ├── chid
    │   │       └── chid.json
    │   │   ├── cmmlu
    │   │       ├── agronomy.json
    │   │       ├── anatomy.json
    │   │       ├── ancient-chinese.json
    │   │       ├── arts.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── chinese-civil-service-exam.json
    │   │       ├── chinese-driving-rule.json
    │   │       ├── chinese-food-culture.json
    │   │       ├── chinese-foreign-policy.json
    │   │       ├── chinese-history.json
    │   │       ├── chinese-literature.json
    │   │       ├── chinese-teacher-qualification.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-actuarial-science.json
    │   │       ├── college-education.json
    │   │       ├── college-engineering-hydrology.json
    │   │       ├── college-law.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medical-statistics.json
    │   │       ├── college-medicine.json
    │   │       ├── computer-science.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── construction-project-management.json
    │   │       ├── economics.json
    │   │       ├── education.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-chinese.json
    │   │       ├── elementary-commonsense.json
    │   │       ├── elementary-information-and-technology.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── ethnology.json
    │   │       ├── food-science.json
    │   │       ├── genetics.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── journalism.json
    │   │       ├── jurisprudence.json
    │   │       ├── legal-and-moral-basis.json
    │   │       ├── logical.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── marxist-theory.json
    │   │       ├── modern-chinese.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-study.json
    │   │       ├── sociology.json
    │   │       ├── sports-science.json
    │   │       ├── traditional-chinese-medicine.json
    │   │       ├── virology.json
    │   │       ├── world-history.json
    │   │       └── world-religions.json
    │   │   ├── copa
    │   │       └── copa.json
    │   │   ├── eprstmt
    │   │       └── eprstmt.json
    │   │   ├── gsm8k
    │   │       └── gsm8k.json
    │   │   ├── hellaswag
    │   │       └── hellaswag.json
    │   │   ├── humaneval
    │   │       └── humaneval.json
    │   │   ├── lambada
    │   │       └── lambada.json
    │   │   ├── math
    │   │       ├── algebra.json
    │   │       ├── counting-and-probability.json
    │   │       ├── geometry.json
    │   │       ├── intermediate-algebra.json
    │   │       ├── number-theory.json
    │   │       ├── prealgebra.json
    │   │       └── precalculus.json
    │   │   ├── mbpp
    │   │       └── mbpp.json
    │   │   ├── mmlu
    │   │       ├── abstract-algebra.json
    │   │       ├── anatomy.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-biology.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-computer-science.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medicine.json
    │   │       ├── college-physics.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── econometrics.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── formal-logic.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-computer-science.json
    │   │       ├── high-school-european-history.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-government-and-politics.json
    │   │       ├── high-school-macroeconomics.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-microeconomics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-psychology.json
    │   │       ├── high-school-statistics.json
    │   │       ├── high-school-us-history.json
    │   │       ├── high-school-world-history.json
    │   │       ├── human-aging.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── jurisprudence.json
    │   │       ├── logical-fallacies.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── medical-genetics.json
    │   │       ├── miscellaneous.json
    │   │       ├── moral-disputes.json
    │   │       ├── moral-scenarios.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── prehistory.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-studies.json
    │   │       ├── sociology.json
    │   │       ├── us-foreign-policy.json
    │   │       ├── virology.json
    │   │       └── world-religions.json
    │   │   ├── ocnli
    │   │       └── ocnli.json
    │   │   ├── piqa
    │   │       └── piqa.json
    │   │   ├── rte
    │   │       └── rte.json
    │   │   ├── tydiqa
    │   │       └── tydiqa.json
    │   │   └── wic
    │   │       └── wic.json
    ├── stabilityai
    │   └── StableBeluga2
    │   │   ├── afqmc
    │   │       └── afqmc.json
    │   │   ├── arc-c
    │   │       └── arc-c.json
    │   │   ├── arc-e
    │   │       └── arc-e.json
    │   │   ├── ax-b
    │   │       └── ax-b.json
    │   │   ├── ax-g
    │   │       └── ax-g.json
    │   │   ├── bbh
    │   │       ├── boolean-expressions.json
    │   │       ├── causal-judgement.json
    │   │       ├── date-understanding.json
    │   │       ├── disambiguation-qa.json
    │   │       ├── dyck-languages.json
    │   │       ├── formal-fallacies.json
    │   │       ├── geometric-shapes.json
    │   │       ├── hyperbaton.json
    │   │       ├── logical-deduction-five-objects.json
    │   │       ├── logical-deduction-seven-objects.json
    │   │       ├── logical-deduction-three-objects.json
    │   │       ├── movie-recommendation.json
    │   │       ├── multistep-arithmetic-two.json
    │   │       ├── navigate.json
    │   │       ├── object-counting.json
    │   │       ├── penguins-in-a-table.json
    │   │       ├── reasoning-about-colored-objects.json
    │   │       ├── ruin-names.json
    │   │       ├── salient-translation-error-detection.json
    │   │       ├── snarks.json
    │   │       ├── sports-understanding.json
    │   │       ├── temporal-sequences.json
    │   │       ├── tracking-shuffled-objects-five-objects.json
    │   │       ├── tracking-shuffled-objects-seven-objects.json
    │   │       ├── tracking-shuffled-objects-three-objects.json
    │   │       ├── web-of-lies.json
    │   │       └── word-sorting.json
    │   │   ├── boolq
    │   │       └── boolq.json
    │   │   ├── c3
    │   │       ├── dialog.json
    │   │       └── mixed.json
    │   │   ├── ceval
    │   │       ├── accountant.json
    │   │       ├── advanced-mathematics.json
    │   │       ├── art-studies.json
    │   │       ├── basic-medicine.json
    │   │       ├── business-administration.json
    │   │       ├── chinese-language-and-literature.json
    │   │       ├── civil-servant.json
    │   │       ├── clinical-medicine.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-economics.json
    │   │       ├── college-physics.json
    │   │       ├── college-programming.json
    │   │       ├── computer-architecture.json
    │   │       ├── computer-network.json
    │   │       ├── discrete-mathematics.json
    │   │       ├── education-science.json
    │   │       ├── electrical-engineer.json
    │   │       ├── environmental-impact-assessment-engineer.json
    │   │       ├── fire-engineer.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-chinese.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-history.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── ideological-and-moral-cultivation.json
    │   │       ├── law.json
    │   │       ├── legal-professional.json
    │   │       ├── logic.json
    │   │       ├── mao-zedong-thought.json
    │   │       ├── marxism.json
    │   │       ├── metrology-engineer.json
    │   │       ├── middle-school-biology.json
    │   │       ├── middle-school-chemistry.json
    │   │       ├── middle-school-geography.json
    │   │       ├── middle-school-history.json
    │   │       ├── middle-school-mathematics.json
    │   │       ├── middle-school-physics.json
    │   │       ├── middle-school-politics.json
    │   │       ├── modern-chinese-history.json
    │   │       ├── operating-system.json
    │   │       ├── physician.json
    │   │       ├── plant-protection.json
    │   │       ├── probability-and-statistics.json
    │   │       ├── professional-tour-guide.json
    │   │       ├── sports-science.json
    │   │       ├── tax-accountant.json
    │   │       ├── teacher-qualification.json
    │   │       ├── urban-and-rural-planner.json
    │   │       └── veterinary-medicine.json
    │   │   ├── chid
    │   │       └── chid.json
    │   │   ├── cmmlu
    │   │       ├── agronomy.json
    │   │       ├── anatomy.json
    │   │       ├── ancient-chinese.json
    │   │       ├── arts.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── chinese-civil-service-exam.json
    │   │       ├── chinese-driving-rule.json
    │   │       ├── chinese-food-culture.json
    │   │       ├── chinese-foreign-policy.json
    │   │       ├── chinese-history.json
    │   │       ├── chinese-literature.json
    │   │       ├── chinese-teacher-qualification.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-actuarial-science.json
    │   │       ├── college-education.json
    │   │       ├── college-engineering-hydrology.json
    │   │       ├── college-law.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medical-statistics.json
    │   │       ├── college-medicine.json
    │   │       ├── computer-science.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── construction-project-management.json
    │   │       ├── economics.json
    │   │       ├── education.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-chinese.json
    │   │       ├── elementary-commonsense.json
    │   │       ├── elementary-information-and-technology.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── ethnology.json
    │   │       ├── food-science.json
    │   │       ├── genetics.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-politics.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── journalism.json
    │   │       ├── jurisprudence.json
    │   │       ├── legal-and-moral-basis.json
    │   │       ├── logical.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── marxist-theory.json
    │   │       ├── modern-chinese.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-study.json
    │   │       ├── sociology.json
    │   │       ├── sports-science.json
    │   │       ├── traditional-chinese-medicine.json
    │   │       ├── virology.json
    │   │       ├── world-history.json
    │   │       └── world-religions.json
    │   │   ├── copa
    │   │       └── copa.json
    │   │   ├── eprstmt
    │   │       └── eprstmt.json
    │   │   ├── gsm8k
    │   │       └── gsm8k.json
    │   │   ├── hellaswag
    │   │       └── hellaswag.json
    │   │   ├── humaneval
    │   │       └── humaneval.json
    │   │   ├── lambada
    │   │       └── lambada.json
    │   │   ├── math
    │   │       ├── algebra.json
    │   │       ├── counting-and-probability.json
    │   │       ├── geometry.json
    │   │       ├── intermediate-algebra.json
    │   │       ├── number-theory.json
    │   │       ├── prealgebra.json
    │   │       └── precalculus.json
    │   │   ├── mbpp
    │   │       └── mbpp.json
    │   │   ├── mmlu
    │   │       ├── abstract-algebra.json
    │   │       ├── anatomy.json
    │   │       ├── astronomy.json
    │   │       ├── business-ethics.json
    │   │       ├── clinical-knowledge.json
    │   │       ├── college-biology.json
    │   │       ├── college-chemistry.json
    │   │       ├── college-computer-science.json
    │   │       ├── college-mathematics.json
    │   │       ├── college-medicine.json
    │   │       ├── college-physics.json
    │   │       ├── computer-security.json
    │   │       ├── conceptual-physics.json
    │   │       ├── econometrics.json
    │   │       ├── electrical-engineering.json
    │   │       ├── elementary-mathematics.json
    │   │       ├── formal-logic.json
    │   │       ├── global-facts.json
    │   │       ├── high-school-biology.json
    │   │       ├── high-school-chemistry.json
    │   │       ├── high-school-computer-science.json
    │   │       ├── high-school-european-history.json
    │   │       ├── high-school-geography.json
    │   │       ├── high-school-government-and-politics.json
    │   │       ├── high-school-macroeconomics.json
    │   │       ├── high-school-mathematics.json
    │   │       ├── high-school-microeconomics.json
    │   │       ├── high-school-physics.json
    │   │       ├── high-school-psychology.json
    │   │       ├── high-school-statistics.json
    │   │       ├── high-school-us-history.json
    │   │       ├── high-school-world-history.json
    │   │       ├── human-aging.json
    │   │       ├── human-sexuality.json
    │   │       ├── international-law.json
    │   │       ├── jurisprudence.json
    │   │       ├── logical-fallacies.json
    │   │       ├── machine-learning.json
    │   │       ├── management.json
    │   │       ├── marketing.json
    │   │       ├── medical-genetics.json
    │   │       ├── miscellaneous.json
    │   │       ├── moral-disputes.json
    │   │       ├── moral-scenarios.json
    │   │       ├── nutrition.json
    │   │       ├── philosophy.json
    │   │       ├── prehistory.json
    │   │       ├── professional-accounting.json
    │   │       ├── professional-law.json
    │   │       ├── professional-medicine.json
    │   │       ├── professional-psychology.json
    │   │       ├── public-relations.json
    │   │       ├── security-studies.json
    │   │       ├── sociology.json
    │   │       ├── us-foreign-policy.json
    │   │       ├── virology.json
    │   │       └── world-religions.json
    │   │   ├── ocnli
    │   │       └── ocnli.json
    │   │   ├── piqa
    │   │       └── piqa.json
    │   │   ├── rte
    │   │       └── rte.json
    │   │   ├── tydiqa
    │   │       └── tydiqa.json
    │   │   └── wic
    │   │       └── wic.json
    └── yulan-team
    │   └── YuLan-Chat-2-13b
    │       ├── afqmc
    │           └── afqmc.json
    │       ├── arc-c
    │           └── arc-c.json
    │       ├── arc-e
    │           └── arc-e.json
    │       ├── ax-b
    │           └── ax-b.json
    │       ├── ax-g
    │           └── ax-g.json
    │       ├── bbh
    │           ├── boolean-expressions.json
    │           ├── causal-judgement.json
    │           ├── date-understanding.json
    │           ├── disambiguation-qa.json
    │           ├── dyck-languages.json
    │           ├── formal-fallacies.json
    │           ├── geometric-shapes.json
    │           ├── hyperbaton.json
    │           ├── logical-deduction-five-objects.json
    │           ├── logical-deduction-seven-objects.json
    │           ├── logical-deduction-three-objects.json
    │           ├── movie-recommendation.json
    │           ├── multistep-arithmetic-two.json
    │           ├── navigate.json
    │           ├── object-counting.json
    │           ├── penguins-in-a-table.json
    │           ├── reasoning-about-colored-objects.json
    │           ├── ruin-names.json
    │           ├── salient-translation-error-detection.json
    │           ├── snarks.json
    │           ├── sports-understanding.json
    │           ├── temporal-sequences.json
    │           ├── tracking-shuffled-objects-five-objects.json
    │           ├── tracking-shuffled-objects-seven-objects.json
    │           ├── tracking-shuffled-objects-three-objects.json
    │           ├── web-of-lies.json
    │           └── word-sorting.json
    │       ├── boolq
    │           └── boolq.json
    │       ├── c3
    │           ├── dialog.json
    │           └── mixed.json
    │       ├── ceval
    │           ├── accountant.json
    │           ├── advanced-mathematics.json
    │           ├── art-studies.json
    │           ├── basic-medicine.json
    │           ├── business-administration.json
    │           ├── chinese-language-and-literature.json
    │           ├── civil-servant.json
    │           ├── clinical-medicine.json
    │           ├── college-chemistry.json
    │           ├── college-economics.json
    │           ├── college-physics.json
    │           ├── college-programming.json
    │           ├── computer-architecture.json
    │           ├── computer-network.json
    │           ├── discrete-mathematics.json
    │           ├── education-science.json
    │           ├── electrical-engineer.json
    │           ├── environmental-impact-assessment-engineer.json
    │           ├── fire-engineer.json
    │           ├── high-school-biology.json
    │           ├── high-school-chemistry.json
    │           ├── high-school-chinese.json
    │           ├── high-school-geography.json
    │           ├── high-school-history.json
    │           ├── high-school-mathematics.json
    │           ├── high-school-physics.json
    │           ├── high-school-politics.json
    │           ├── ideological-and-moral-cultivation.json
    │           ├── law.json
    │           ├── legal-professional.json
    │           ├── logic.json
    │           ├── mao-zedong-thought.json
    │           ├── marxism.json
    │           ├── metrology-engineer.json
    │           ├── middle-school-biology.json
    │           ├── middle-school-chemistry.json
    │           ├── middle-school-geography.json
    │           ├── middle-school-history.json
    │           ├── middle-school-mathematics.json
    │           ├── middle-school-physics.json
    │           ├── middle-school-politics.json
    │           ├── modern-chinese-history.json
    │           ├── operating-system.json
    │           ├── physician.json
    │           ├── plant-protection.json
    │           ├── probability-and-statistics.json
    │           ├── professional-tour-guide.json
    │           ├── sports-science.json
    │           ├── tax-accountant.json
    │           ├── teacher-qualification.json
    │           ├── urban-and-rural-planner.json
    │           └── veterinary-medicine.json
    │       ├── chid
    │           └── chid.json
    │       ├── cmmlu
    │           ├── agronomy.json
    │           ├── anatomy.json
    │           ├── ancient-chinese.json
    │           ├── arts.json
    │           ├── astronomy.json
    │           ├── business-ethics.json
    │           ├── chinese-civil-service-exam.json
    │           ├── chinese-driving-rule.json
    │           ├── chinese-food-culture.json
    │           ├── chinese-foreign-policy.json
    │           ├── chinese-history.json
    │           ├── chinese-literature.json
    │           ├── chinese-teacher-qualification.json
    │           ├── clinical-knowledge.json
    │           ├── college-actuarial-science.json
    │           ├── college-education.json
    │           ├── college-engineering-hydrology.json
    │           ├── college-law.json
    │           ├── college-mathematics.json
    │           ├── college-medical-statistics.json
    │           ├── college-medicine.json
    │           ├── computer-science.json
    │           ├── computer-security.json
    │           ├── conceptual-physics.json
    │           ├── construction-project-management.json
    │           ├── economics.json
    │           ├── education.json
    │           ├── electrical-engineering.json
    │           ├── elementary-chinese.json
    │           ├── elementary-commonsense.json
    │           ├── elementary-information-and-technology.json
    │           ├── elementary-mathematics.json
    │           ├── ethnology.json
    │           ├── food-science.json
    │           ├── genetics.json
    │           ├── global-facts.json
    │           ├── high-school-biology.json
    │           ├── high-school-chemistry.json
    │           ├── high-school-geography.json
    │           ├── high-school-mathematics.json
    │           ├── high-school-physics.json
    │           ├── high-school-politics.json
    │           ├── human-sexuality.json
    │           ├── international-law.json
    │           ├── journalism.json
    │           ├── jurisprudence.json
    │           ├── legal-and-moral-basis.json
    │           ├── logical.json
    │           ├── machine-learning.json
    │           ├── management.json
    │           ├── marketing.json
    │           ├── marxist-theory.json
    │           ├── modern-chinese.json
    │           ├── nutrition.json
    │           ├── philosophy.json
    │           ├── professional-accounting.json
    │           ├── professional-law.json
    │           ├── professional-medicine.json
    │           ├── professional-psychology.json
    │           ├── public-relations.json
    │           ├── security-study.json
    │           ├── sociology.json
    │           ├── sports-science.json
    │           ├── traditional-chinese-medicine.json
    │           ├── virology.json
    │           ├── world-history.json
    │           └── world-religions.json
    │       ├── copa
    │           └── copa.json
    │       ├── eprstmt
    │           └── eprstmt.json
    │       ├── gsm8k
    │           └── gsm8k.json
    │       ├── hellaswag
    │           └── hellaswag.json
    │       ├── humaneval
    │           └── humaneval.json
    │       ├── lambada
    │           └── lambada.json
    │       ├── math
    │           ├── algebra.json
    │           ├── counting-and-probability.json
    │           ├── geometry.json
    │           ├── intermediate-algebra.json
    │           ├── number-theory.json
    │           ├── prealgebra.json
    │           └── precalculus.json
    │       ├── mbpp
    │           └── mbpp.json
    │       ├── mmlu
    │           ├── abstract-algebra.json
    │           ├── anatomy.json
    │           ├── astronomy.json
    │           ├── business-ethics.json
    │           ├── clinical-knowledge.json
    │           ├── college-biology.json
    │           ├── college-chemistry.json
    │           ├── college-computer-science.json
    │           ├── college-mathematics.json
    │           ├── college-medicine.json
    │           ├── college-physics.json
    │           ├── computer-security.json
    │           ├── conceptual-physics.json
    │           ├── econometrics.json
    │           ├── electrical-engineering.json
    │           ├── elementary-mathematics.json
    │           ├── formal-logic.json
    │           ├── global-facts.json
    │           ├── high-school-biology.json
    │           ├── high-school-chemistry.json
    │           ├── high-school-computer-science.json
    │           ├── high-school-european-history.json
    │           ├── high-school-geography.json
    │           ├── high-school-government-and-politics.json
    │           ├── high-school-macroeconomics.json
    │           ├── high-school-mathematics.json
    │           ├── high-school-microeconomics.json
    │           ├── high-school-physics.json
    │           ├── high-school-psychology.json
    │           ├── high-school-statistics.json
    │           ├── high-school-us-history.json
    │           ├── high-school-world-history.json
    │           ├── human-aging.json
    │           ├── human-sexuality.json
    │           ├── international-law.json
    │           ├── jurisprudence.json
    │           ├── logical-fallacies.json
    │           ├── machine-learning.json
    │           ├── management.json
    │           ├── marketing.json
    │           ├── medical-genetics.json
    │           ├── miscellaneous.json
    │           ├── moral-disputes.json
    │           ├── moral-scenarios.json
    │           ├── nutrition.json
    │           ├── philosophy.json
    │           ├── prehistory.json
    │           ├── professional-accounting.json
    │           ├── professional-law.json
    │           ├── professional-medicine.json
    │           ├── professional-psychology.json
    │           ├── public-relations.json
    │           ├── security-studies.json
    │           ├── sociology.json
    │           ├── us-foreign-policy.json
    │           ├── virology.json
    │           └── world-religions.json
    │       ├── ocnli
    │           └── ocnli.json
    │       ├── piqa
    │           └── piqa.json
    │       ├── rte
    │           └── rte.json
    │       ├── tydiqa
    │           └── tydiqa.json
    │       └── wic
    │           └── wic.json
├── main.py
├── metrics
    ├── __init__.py
    ├── aggregator.py
    ├── bleu.py
    ├── chrf.py
    ├── exact_match.py
    ├── f1_score.py
    ├── function_execution.py
    ├── gaokaobench_match.py
    ├── gpt4_eval.py
    ├── in_match.py
    ├── log_prob.py
    ├── log_prob_mc2.py
    ├── prefix_match.py
    ├── qa_match.py
    └── rouge.py
├── models
    ├── __init__.py
    ├── general_model.py
    ├── model_params
    │   ├── gpt-3.5-turbo.json
    │   ├── gpt-4.json
    │   ├── vllm_beamsearch.json
    │   ├── vllm_logprobs.json
    │   ├── vllm_sample.json
    │   ├── vllm_sample_bbh.json
    │   └── vllm_sample_v1.json
    └── openai_model.py
├── requirements.txt
├── run_eval.sh
├── scripts
    ├── run_job_base.sh
    ├── run_vllm.sh
    └── run_vllm_ppl.sh
├── setup.py
├── tasks
    ├── __init__.py
    ├── eval_task.py
    ├── instance.py
    ├── postprocess.py
    └── view_task.py
└── utils
    ├── __init__.py
    ├── request.py
    └── utils.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__/
 2 | RawData/
 3 | **/data/
 4 | configs/eval_config.json
 5 | UltraEval.egg-info/
 6 | build/
 7 | logs/
 8 | wip-*
 9 | .idea/
10 | RawData.zip
11 | logs*/
12 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/UltraEval/5d967b4ea5725ab1252904520bcaa87b40165b4b/__init__.py


--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/UltraEval/5d967b4ea5725ab1252904520bcaa87b40165b4b/datasets/__init__.py


--------------------------------------------------------------------------------
/datasets/afqmc/config/afqmc_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "afqmc",
 3 |     "path": "datasets/afqmc/data/afqmc.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/afqmc/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/afqmc/config/afqmc_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "afqmc",
 3 |     "path": "datasets/afqmc/data/afqmc.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/afqmc/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/afqmc/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = (
 6 |         "语句一：“"
 7 |         + data["passage"][0]
 8 |         + "\n语句二：“"
 9 |         + data["passage"][1]
10 |         + "”\n语句一与语句二是关于蚂蚁金融产品的疑问，两者所询问的内容是否完全一致？"
11 |     )
12 |     correct_answer = [
13 |         key for key, value in data["target_scores"].items() if value == 1
14 |     ][0].strip()
15 | 
16 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
17 | 


--------------------------------------------------------------------------------
/datasets/agieval/config/aqua-rat_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "aqua-rat",
 3 |     "path": "datasets/agieval/data/aqua-rat.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_single_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/aqua-rat_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "aqua-rat",
 3 |     "path": "datasets/agieval/data/aqua-rat.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-biology_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-biology",
 3 |     "path": "datasets/agieval/data/gaokao-biology.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_single_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-biology_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-biology",
 3 |     "path": "datasets/agieval/data/gaokao-biology.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-chemistry_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-chemistry",
 3 |     "path": "datasets/agieval/data/gaokao-chemistry.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_single_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-chemistry_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-chemistry",
 3 |     "path": "datasets/agieval/data/gaokao-chemistry.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-chinese_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-chinese",
 3 |     "path": "datasets/agieval/data/gaokao-chinese.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_single_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-chinese_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-chinese",
 3 |     "path": "datasets/agieval/data/gaokao-chinese.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-english_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-english",
 3 |     "path": "datasets/agieval/data/gaokao-english.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_single_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-english_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-english",
 3 |     "path": "datasets/agieval/data/gaokao-english.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-geography_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-geography",
 3 |     "path": "datasets/agieval/data/gaokao-geography.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_single_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-geography_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-geography",
 3 |     "path": "datasets/agieval/data/gaokao-geography.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-history_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-history",
 3 |     "path": "datasets/agieval/data/gaokao-history.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_single_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-history_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-history",
 3 |     "path": "datasets/agieval/data/gaokao-history.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-mathcloze_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-mathcloze",
 3 |     "path": "datasets/agieval/data/gaokao-mathcloze.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_cloze_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-mathqa_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-mathqa",
 3 |     "path": "datasets/agieval/data/gaokao-mathqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_single_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-mathqa_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-mathqa",
 3 |     "path": "datasets/agieval/data/gaokao-mathqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/gaokao-physics_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gaokao-physics",
 3 |     "path": "datasets/agieval/data/gaokao-physics.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_multiple_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/jec-qa-ca_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "jec-qa-ca",
 3 |     "path": "datasets/agieval/data/jec-qa-ca.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_multiple_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/jec-qa-kd_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "jec-qa-kd",
 3 |     "path": "datasets/agieval/data/jec-qa-kd.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_multiple_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/logiqa-en_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "logiqa-en",
 3 |     "path": "datasets/agieval/data/logiqa-en.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_single_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/logiqa-en_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "logiqa-en",
 3 |     "path": "datasets/agieval/data/logiqa-en.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/logiqa-zh_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "logiqa-zh",
 3 |     "path": "datasets/agieval/data/logiqa-zh.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_single_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/logiqa-zh_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "logiqa-zh",
 3 |     "path": "datasets/agieval/data/logiqa-zh.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/lsat-ar_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "lsat-ar",
 3 |     "path": "datasets/agieval/data/lsat-ar.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_single_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/lsat-ar_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "lsat-ar",
 3 |     "path": "datasets/agieval/data/lsat-ar.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/lsat-lr_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "lsat-lr",
 3 |     "path": "datasets/agieval/data/lsat-lr.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_single_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/lsat-lr_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "lsat-lr",
 3 |     "path": "datasets/agieval/data/lsat-lr.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/lsat-rc_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "lsat-rc",
 3 |     "path": "datasets/agieval/data/lsat-rc.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_single_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/lsat-rc_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "lsat-rc",
 3 |     "path": "datasets/agieval/data/lsat-rc.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/math_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "math",
 3 |     "path": "datasets/agieval/data/math.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_cloze_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/sat-en-without-passage_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "sat-en-without-passage",
 3 |     "path": "datasets/agieval/data/sat-en-without-passage.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/sat-en_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "sat-en",
 3 |     "path": "datasets/agieval/data/sat-en.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_single_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/sat-en_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "sat-en",
 3 |     "path": "datasets/agieval/data/sat-en.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/sat-math_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "sat-math",
 3 |     "path": "datasets/agieval/data/sat-math.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "agieval_single_answer_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/agieval/config/sat-math_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "sat-math",
 3 |     "path": "datasets/agieval/data/sat-math.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/agieval/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/arc-c/config/arc-c_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "arc-c",
 3 |     "path": "datasets/arc-c/data/arc-c.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/arc-c/transform_gen_v1.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "qa_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/arc-c/config/arc-c_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "arc-c",
 3 |     "path": "datasets/arc-c/data/arc-c.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/arc-c/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/arc-c/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = "Question: " + data["question"] + "\n"
 6 |     text += "Answer: "
 7 |     correct_answer = [
 8 |         key for key, value in data["target_scores"].items() if value == 1
 9 |     ][0].strip()
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/arc-c/transform_ppl_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     question = f"Question:\n{data['question']}\n"
 6 |     answer_prompt = f"Answer:\n"
 7 |     text = question + answer_prompt
 8 |     processed_correct_answer = correct_answer = [
 9 |         key for key, value in data["target_scores"].items() if value == 1
10 |     ][0].strip()
11 |     return {
12 |         "input": text,
13 |         "output": correct_answer,
14 |         "processed_output": processed_correct_answer,
15 |     }
16 | 


--------------------------------------------------------------------------------
/datasets/arc-e/config/arc-e_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "arc-e",
 3 |     "path": "datasets/arc-e/data/arc-e.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/arc-e/transform_gen_v1.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "qa_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/arc-e/config/arc-e_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "arc-e",
 3 |     "path": "datasets/arc-e/data/arc-e.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/arc-e/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/arc-e/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = "Question: " + data["question"] + "\n"
 6 |     text += "Answer: "
 7 |     correct_answer = [
 8 |         key for key, value in data["target_scores"].items() if value == 1
 9 |     ][0].strip()
10 | 
11 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
12 | 


--------------------------------------------------------------------------------
/datasets/arc-e/transform_ppl_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     question = f"Question:\n{data['question']}\n"
 6 |     answer_prompt = f"Answer:\n"
 7 |     text = question + answer_prompt
 8 |     processed_correct_answer = correct_answer = [
 9 |         key for key, value in data["target_scores"].items() if value == 1
10 |     ][0].strip()
11 |     return {
12 |         "input": text,
13 |         "output": correct_answer,
14 |         "processed_output": processed_correct_answer,
15 |     }
16 | 


--------------------------------------------------------------------------------
/datasets/ax-b/config/ax-b_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "ax-b",
 3 |     "path": "datasets/ax-b/data/ax-b.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/ax-b/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ax-b/config/ax-b_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "ax-b",
 3 |     "path": "datasets/ax-b/data/ax-b.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/ax-b/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ax-b/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"{data['passage'][0]}\n{data['passage'][1]}\nIs the sentence below entailed by the sentence above?\nA. Yes\nB. No\nAnswer: "
 6 |     index_of_correct_answer = list(data["target_scores"].values()).index(1)
 7 |     answers = ["A", "B"]
 8 |     correct_answer = answers[index_of_correct_answer]
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/ax-b/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = (
 6 |         data["passage"][0]
 7 |         + "\n"
 8 |         + data["passage"][1]
 9 |         + "\nIs the sentence below entailed by the sentence above?\n"
10 |     )
11 |     correct_answer = [
12 |         key for key, value in data["target_scores"].items() if value == 1
13 |     ][0].strip()
14 | 
15 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
16 | 


--------------------------------------------------------------------------------
/datasets/ax-g/config/ax-g_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "ax-g",
 3 |     "path": "datasets/ax-g/data/ax-g.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/ax-g/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ax-g/config/ax-g_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "ax-g",
 3 |     "path": "datasets/ax-g/data/ax-g.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/ax-g/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ax-g/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"{data['passage'][0]}\n{data['passage'][1]}\nIs the sentence below entailed by the sentence above?\nA. Yes\nB. No\nAnswer: "
 6 |     index_of_correct_answer = list(data["target_scores"].values()).index(1)
 7 |     answers = ["A", "B"]
 8 |     correct_answer = answers[index_of_correct_answer]
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/ax-g/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = (
 6 |         data["passage"][0]
 7 |         + "\n"
 8 |         + data["passage"][1]
 9 |         + "\nIs the sentence below entailed by the sentence above?\n"
10 |     )
11 |     correct_answer = [
12 |         key for key, value in data["target_scores"].items() if value == 1
13 |     ][0].strip()
14 | 
15 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
16 | 


--------------------------------------------------------------------------------
/datasets/bbh-cot/config/hyperbaton_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "hyperbaton",
 3 |     "path": "datasets/bbh-cot/data/hyperbaton.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh-cot/transform_gen_cot.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": "models/model_params/vllm_sample_bbh.json"
10 |     },
11 |     "postprocess": "bbh_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "in_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh-cot/config/navigate_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "navigate",
 3 |     "path": "datasets/bbh-cot/data/navigate.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh-cot/transform_gen_cot.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": "models/model_params/vllm_sample_bbh.json"
10 |     },
11 |     "postprocess": "bbh_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "in_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh-cot/config/ruin-names_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "ruin-names",
 3 |     "path": "datasets/bbh-cot/data/ruin-names.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh-cot/transform_gen_cot.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": "models/model_params/vllm_sample_bbh.json"
10 |     },
11 |     "postprocess": "bbh_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "in_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh-cot/config/snarks_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "snarks",
 3 |     "path": "datasets/bbh-cot/data/snarks.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh-cot/transform_gen_cot.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": "models/model_params/vllm_sample_bbh.json"
10 |     },
11 |     "postprocess": "bbh_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "in_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh-cot/config/web-of-lies_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "web-of-lies",
 3 |     "path": "datasets/bbh-cot/data/web-of-lies.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh-cot/transform_gen_cot.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": "models/model_params/vllm_sample_bbh.json"
10 |     },
11 |     "postprocess": "bbh_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "in_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh-cot/config/word-sorting_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "word-sorting",
 3 |     "path": "datasets/bbh-cot/data/word-sorting.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh-cot/transform_gen_cot.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": "models/model_params/vllm_sample_bbh.json"
10 |     },
11 |     "postprocess": "bbh_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "in_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/boolean-expressions_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "boolean-expressions",
 3 |     "path": "datasets/bbh/data/boolean-expressions.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/causal-judgement_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "causal-judgement",
 3 |     "path": "datasets/bbh/data/causal-judgement.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/date-understanding_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "date-understanding",
 3 |     "path": "datasets/bbh/data/date-understanding.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/disambiguation-qa_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "disambiguation-qa",
 3 |     "path": "datasets/bbh/data/disambiguation-qa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/dyck-languages_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "dyck-languages",
 3 |     "path": "datasets/bbh/data/dyck-languages.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "bbh_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/formal-fallacies_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "formal-fallacies",
 3 |     "path": "datasets/bbh/data/formal-fallacies.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/geometric-shapes_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "geometric-shapes",
 3 |     "path": "datasets/bbh/data/geometric-shapes.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/hyperbaton_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "hyperbaton",
 3 |     "path": "datasets/bbh/data/hyperbaton.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/logical-deduction-five-objects_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "logical-deduction-five-objects",
 3 |     "path": "datasets/bbh/data/logical-deduction-five-objects.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/logical-deduction-seven-objects_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "logical-deduction-seven-objects",
 3 |     "path": "datasets/bbh/data/logical-deduction-seven-objects.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/logical-deduction-three-objects_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "logical-deduction-three-objects",
 3 |     "path": "datasets/bbh/data/logical-deduction-three-objects.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/movie-recommendation_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "movie-recommendation",
 3 |     "path": "datasets/bbh/data/movie-recommendation.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/multistep-arithmetic-two_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "multistep-arithmetic-two",
 3 |     "path": "datasets/bbh/data/multistep-arithmetic-two.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "bbh_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/navigate_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "navigate",
 3 |     "path": "datasets/bbh/data/navigate.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/object-counting_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "object-counting",
 3 |     "path": "datasets/bbh/data/object-counting.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "bbh_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/penguins-in-a-table_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "penguins-in-a-table",
 3 |     "path": "datasets/bbh/data/penguins-in-a-table.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/reasoning-about-colored-objects_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "reasoning-about-colored-objects",
 3 |     "path": "datasets/bbh/data/reasoning-about-colored-objects.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/ruin-names_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "ruin-names",
 3 |     "path": "datasets/bbh/data/ruin-names.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/snarks_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "snarks",
 3 |     "path": "datasets/bbh/data/snarks.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/sports-understanding_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "sports-understanding",
 3 |     "path": "datasets/bbh/data/sports-understanding.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/temporal-sequences_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "temporal-sequences",
 3 |     "path": "datasets/bbh/data/temporal-sequences.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/web-of-lies_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "web-of-lies",
 3 |     "path": "datasets/bbh/data/web-of-lies.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bbh/config/word-sorting_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "word-sorting",
 3 |     "path": "datasets/bbh/data/word-sorting.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bbh/transform_gen_v0.py",
 6 |     "fewshot": 3,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "bbh_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/boolq/config/boolq_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "boolq",
 3 |     "path": "datasets/boolq/data/boolq.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/boolq/transform_gen_v1.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "qa_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/boolq/config/boolq_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "boolq",
 3 |     "path": "datasets/boolq/data/boolq.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/boolq/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/boolq/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"{data['passage']}\nQuestion: {data['question']}\nA. Yes\nB. No\nAnswer: "
 6 |     index_of_correct_answer = list(data["target_scores"].values()).index(1)
 7 |     answers = ["A", "B"]
 8 |     correct_answer = answers[index_of_correct_answer]
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/boolq/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"{data['passage']}\nQuestion: {data['question']}?\nAnswer: "
 6 |     correct_answer = [
 7 |         key for key, value in data["target_scores"].items() if value == 1
 8 |     ][0].strip()
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/bustm/config/bustm_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "bustm",
 3 |     "path": "datasets/bustm/data/bustm.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bustm/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bustm/config/bustm_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "bustm",
 3 |     "path": "datasets/bustm/data/bustm.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/bustm/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/bustm/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"语句一：“{data['passage'][0]}”\n语句二：“{data['passage'][1]}”\n请判断语句一和语句二说的是否是一个意思？\nA. 相关\nB. 无关\n请从“A”，“B”中进行选择。\n答："
 6 |     index_of_correct_answer = list(data["target_scores"].values()).index(1)
 7 |     answers = ["A", "B"]
 8 |     correct_answer = answers[index_of_correct_answer]
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/bustm/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = (
 6 |         "语句一：“"
 7 |         + data["passage"][0]
 8 |         + "”\n语句二：“"
 9 |         + data["passage"][1]
10 |         + "”\n请判断语句一和语句二说的是否是一个意思？"
11 |     )
12 |     correct_answer = [
13 |         key for key, value in data["target_scores"].items() if value == 1
14 |     ][0].strip()
15 | 
16 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
17 | 


--------------------------------------------------------------------------------
/datasets/c3/config/dialog_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "dialog",
 3 |     "path": "datasets/c3/data/dialog.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/c3/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/c3/config/dialog_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "dialog",
 3 |     "path": "datasets/c3/data/dialog.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/c3/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/c3/config/mixed_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "mixed",
 3 |     "path": "datasets/c3/data/mixed.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/c3/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/c3/config/mixed_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "mixed",
 3 |     "path": "datasets/c3/data/mixed.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/c3/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/c3/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     documents_text = "\n".join(data["passage"])
 6 |     text = f"文章：{documents_text}\n问题：{data['question']}\n答案："
 7 |     correct_answer = [
 8 |         key for key, value in data["target_scores"].items() if value == 1
 9 |     ][0].strip()
10 | 
11 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
12 | 


--------------------------------------------------------------------------------
/datasets/cb/config/cb_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "cb",
 3 |     "path": "datasets/cb/data/cb.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/cb/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cb/config/cb_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "cb",
 3 |     "path": "datasets/cb/data/cb.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/cb/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cb/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"{data['passage'][0]}\n{data['passage'][1]}\nWhat is the relation between the two sentences?\nA. Contradiction\nB. Entailment\nC. Neutral\nAnswer: "
 6 |     index_of_correct_answer = list(data["target_scores"].values()).index(1)
 7 |     answers = ["A", "B", "C"]
 8 |     correct_answer = answers[index_of_correct_answer]
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/cb/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = (
 6 |         data["passage"][0]
 7 |         + "\n"
 8 |         + data["passage"][1]
 9 |         + "\nWhat is the relation between the two sentences? "
10 |     )
11 |     correct_answer = [
12 |         key for key, value in data["target_scores"].items() if value == 1
13 |     ][0].strip()
14 | 
15 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
16 | 


--------------------------------------------------------------------------------
/datasets/ceval/config/accountant_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "accountant",
 3 |     "path": "datasets/ceval/data/accountant.jsonl",
 4 |     "description": "以下是中国关于会计考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/art-studies_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "art-studies",
 3 |     "path": "datasets/ceval/data/art-studies.jsonl",
 4 |     "description": "以下是中国关于艺术研究考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/basic-medicine_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "basic-medicine",
 3 |     "path": "datasets/ceval/data/basic-medicine.jsonl",
 4 |     "description": "以下是中国关于基础医学考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/civil-servant_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "civil-servant",
 3 |     "path": "datasets/ceval/data/civil-servant.jsonl",
 4 |     "description": "以下是中国关于公务员考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/clinical-medicine_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "clinical-medicine",
 3 |     "path": "datasets/ceval/data/clinical-medicine.jsonl",
 4 |     "description": "以下是中国关于临床医学考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/college-chemistry_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "college-chemistry",
 3 |     "path": "datasets/ceval/data/college-chemistry.jsonl",
 4 |     "description": "以下是中国关于大学化学考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/college-economics_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "college-economics",
 3 |     "path": "datasets/ceval/data/college-economics.jsonl",
 4 |     "description": "以下是中国关于大学经济学考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/college-physics_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "college-physics",
 3 |     "path": "datasets/ceval/data/college-physics.jsonl",
 4 |     "description": "以下是中国关于大学物理考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/college-programming_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "college-programming",
 3 |     "path": "datasets/ceval/data/college-programming.jsonl",
 4 |     "description": "以下是中国关于大学编程考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/computer-network_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "computer-network",
 3 |     "path": "datasets/ceval/data/computer-network.jsonl",
 4 |     "description": "以下是中国关于计算机网络考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/education-science_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "education-science",
 3 |     "path": "datasets/ceval/data/education-science.jsonl",
 4 |     "description": "以下是中国关于教育科学考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/electrical-engineer_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "electrical-engineer",
 3 |     "path": "datasets/ceval/data/electrical-engineer.jsonl",
 4 |     "description": "以下是中国关于电气工程师考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/fire-engineer_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "fire-engineer",
 3 |     "path": "datasets/ceval/data/fire-engineer.jsonl",
 4 |     "description": "以下是中国关于消防工程师考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/high-school-biology_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "high-school-biology",
 3 |     "path": "datasets/ceval/data/high-school-biology.jsonl",
 4 |     "description": "以下是中国关于高中生物考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/high-school-chinese_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "high-school-chinese",
 3 |     "path": "datasets/ceval/data/high-school-chinese.jsonl",
 4 |     "description": "以下是中国关于高中语文考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/high-school-history_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "high-school-history",
 3 |     "path": "datasets/ceval/data/high-school-history.jsonl",
 4 |     "description": "以下是中国关于高中历史考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/high-school-physics_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "high-school-physics",
 3 |     "path": "datasets/ceval/data/high-school-physics.jsonl",
 4 |     "description": "以下是中国关于高中物理考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/law_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "law",
 3 |     "path": "datasets/ceval/data/law.jsonl",
 4 |     "description": "以下是中国关于法律考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/legal-professional_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "legal-professional",
 3 |     "path": "datasets/ceval/data/legal-professional.jsonl",
 4 |     "description": "以下是中国关于法律专业人员考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/logic_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "logic",
 3 |     "path": "datasets/ceval/data/logic.jsonl",
 4 |     "description": "以下是中国关于逻辑考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/mao-zedong-thought_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "mao-zedong-thought",
 3 |     "path": "datasets/ceval/data/mao-zedong-thought.jsonl",
 4 |     "description": "以下是中国关于毛泽东思想考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/marxism_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "marxism",
 3 |     "path": "datasets/ceval/data/marxism.jsonl",
 4 |     "description": "以下是中国关于马克思主义考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/metrology-engineer_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "metrology-engineer",
 3 |     "path": "datasets/ceval/data/metrology-engineer.jsonl",
 4 |     "description": "以下是中国关于计量工程师考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/operating-system_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "operating-system",
 3 |     "path": "datasets/ceval/data/operating-system.jsonl",
 4 |     "description": "以下是中国关于操作系统考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/physician_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "physician",
 3 |     "path": "datasets/ceval/data/physician.jsonl",
 4 |     "description": "以下是中国关于医生考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/plant-protection_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "plant-protection",
 3 |     "path": "datasets/ceval/data/plant-protection.jsonl",
 4 |     "description": "以下是中国关于植物保护考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/sports-science_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "sports-science",
 3 |     "path": "datasets/ceval/data/sports-science.jsonl",
 4 |     "description": "以下是中国关于运动科学考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/tax-accountant_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "tax-accountant",
 3 |     "path": "datasets/ceval/data/tax-accountant.jsonl",
 4 |     "description": "以下是中国关于税务会计考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ceval/config/veterinary-medicine_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "veterinary-medicine",
 3 |     "path": "datasets/ceval/data/veterinary-medicine.jsonl",
 4 |     "description": "以下是中国关于兽医学考试的单项选择题，请选出其中的正确答案。",
 5 |     "transform": "datasets/ceval/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/chid/config/chid_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "chid",
 3 |     "path": "datasets/chid/data/chid.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/chid/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/chid/config/chid_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "chid",
 3 |     "path": "datasets/chid/data/chid.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/chid/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/chid/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = "以下句子是否通顺？\n" + data["passage"]
 6 |     correct_answer = [
 7 |         key + "这句话是通顺的。" for key, value in data["target_scores"].items() if value == 1
 8 |     ][0].strip()
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/chid/transform_ppl_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = data["passage"]
 6 |     processed_correct_answer = correct_answer = [
 7 |         key for key, value in data["target_scores"].items() if value == 1
 8 |     ][0].strip()
 9 |     return {
10 |         "input": text,
11 |         "output": correct_answer,
12 |         "processed_output": processed_correct_answer,
13 |     }
14 | 


--------------------------------------------------------------------------------
/datasets/cluewsc/config/cluewsc_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "cluewsc",
 3 |     "path": "datasets/cluewsc/data/cluewsc.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/cluewsc/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cluewsc/config/cluewsc_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "cluewsc",
 3 |     "path": "datasets/cluewsc/data/cluewsc.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/cluewsc/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cluewsc/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"{data['passage']}\n此处，“{data['question'][1]}”是否指代“{data['question'][0]}“？\nA. 是\nB. 否\n请从”A“，”B“中进行选择。\n答："
 6 |     index_of_correct_answer = list(data["target_scores"].values()).index(1)
 7 |     answers = ["A", "B"]
 8 |     correct_answer = answers[index_of_correct_answer]
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/cluewsc/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = (
 6 |         data["passage"]
 7 |         + "\n此处，代词“"
 8 |         + data["question"][1]
 9 |         + "”被用于指代“"
10 |         + data["question"][0]
11 |         + "”吗?请回答是或者否。"
12 |     )
13 |     correct_answer = [
14 |         key for key, value in data["target_scores"].items() if value == 1
15 |     ][0].strip()
16 | 
17 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
18 | 


--------------------------------------------------------------------------------
/datasets/cmmlu/config/agronomy_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "agronomy",
 3 |     "path": "datasets/cmmlu/data/agronomy.jsonl",
 4 |     "description": "以下是关于农学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/anatomy_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "anatomy",
 3 |     "path": "datasets/cmmlu/data/anatomy.jsonl",
 4 |     "description": "以下是关于解剖学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/ancient-chinese_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "ancient-chinese",
 3 |     "path": "datasets/cmmlu/data/ancient-chinese.jsonl",
 4 |     "description": "以下是关于古代汉语的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/arts_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "arts",
 3 |     "path": "datasets/cmmlu/data/arts.jsonl",
 4 |     "description": "以下是关于艺术的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/astronomy_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "astronomy",
 3 |     "path": "datasets/cmmlu/data/astronomy.jsonl",
 4 |     "description": "以下是关于天文学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/business-ethics_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "business-ethics",
 3 |     "path": "datasets/cmmlu/data/business-ethics.jsonl",
 4 |     "description": "以下是关于商业伦理的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/chinese-history_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "chinese-history",
 3 |     "path": "datasets/cmmlu/data/chinese-history.jsonl",
 4 |     "description": "以下是关于中国历史的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/chinese-literature_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "chinese-literature",
 3 |     "path": "datasets/cmmlu/data/chinese-literature.jsonl",
 4 |     "description": "以下是关于中国文学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/clinical-knowledge_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "clinical-knowledge",
 3 |     "path": "datasets/cmmlu/data/clinical-knowledge.jsonl",
 4 |     "description": "以下是关于临床知识的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/college-education_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "college-education",
 3 |     "path": "datasets/cmmlu/data/college-education.jsonl",
 4 |     "description": "以下是关于大学教育的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/college-law_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "college-law",
 3 |     "path": "datasets/cmmlu/data/college-law.jsonl",
 4 |     "description": "以下是关于大学法律的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/college-mathematics_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "college-mathematics",
 3 |     "path": "datasets/cmmlu/data/college-mathematics.jsonl",
 4 |     "description": "以下是关于大学数学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/college-medicine_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "college-medicine",
 3 |     "path": "datasets/cmmlu/data/college-medicine.jsonl",
 4 |     "description": "以下是关于大学医学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/computer-science_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "computer-science",
 3 |     "path": "datasets/cmmlu/data/computer-science.jsonl",
 4 |     "description": "以下是关于计算机科学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/computer-security_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "computer-security",
 3 |     "path": "datasets/cmmlu/data/computer-security.jsonl",
 4 |     "description": "以下是关于计算机安全的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/conceptual-physics_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "conceptual-physics",
 3 |     "path": "datasets/cmmlu/data/conceptual-physics.jsonl",
 4 |     "description": "以下是关于概念物理学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/economics_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "economics",
 3 |     "path": "datasets/cmmlu/data/economics.jsonl",
 4 |     "description": "以下是关于经济学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/education_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "education",
 3 |     "path": "datasets/cmmlu/data/education.jsonl",
 4 |     "description": "以下是关于教育学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/elementary-chinese_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "elementary-chinese",
 3 |     "path": "datasets/cmmlu/data/elementary-chinese.jsonl",
 4 |     "description": "以下是关于初级汉语的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/ethnology_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "ethnology",
 3 |     "path": "datasets/cmmlu/data/ethnology.jsonl",
 4 |     "description": "以下是关于民族学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/food-science_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "food-science",
 3 |     "path": "datasets/cmmlu/data/food-science.jsonl",
 4 |     "description": "以下是关于食品科学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/genetics_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "genetics",
 3 |     "path": "datasets/cmmlu/data/genetics.jsonl",
 4 |     "description": "以下是关于遗传学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/global-facts_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "global-facts",
 3 |     "path": "datasets/cmmlu/data/global-facts.jsonl",
 4 |     "description": "以下是关于全球事实的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/human-sexuality_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "human-sexuality",
 3 |     "path": "datasets/cmmlu/data/human-sexuality.jsonl",
 4 |     "description": "以下是关于人类性学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/international-law_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "international-law",
 3 |     "path": "datasets/cmmlu/data/international-law.jsonl",
 4 |     "description": "以下是关于国际法的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/journalism_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "journalism",
 3 |     "path": "datasets/cmmlu/data/journalism.jsonl",
 4 |     "description": "以下是关于新闻学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/jurisprudence_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "jurisprudence",
 3 |     "path": "datasets/cmmlu/data/jurisprudence.jsonl",
 4 |     "description": "以下是关于法学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/logical_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "logical",
 3 |     "path": "datasets/cmmlu/data/logical.jsonl",
 4 |     "description": "以下是关于逻辑的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/machine-learning_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "machine-learning",
 3 |     "path": "datasets/cmmlu/data/machine-learning.jsonl",
 4 |     "description": "以下是关于机器学习的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/management_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "management",
 3 |     "path": "datasets/cmmlu/data/management.jsonl",
 4 |     "description": "以下是关于管理学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/marketing_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "marketing",
 3 |     "path": "datasets/cmmlu/data/marketing.jsonl",
 4 |     "description": "以下是关于市场营销的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/marxist-theory_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "marxist-theory",
 3 |     "path": "datasets/cmmlu/data/marxist-theory.jsonl",
 4 |     "description": "以下是关于马克思主义理论的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/modern-chinese_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "modern-chinese",
 3 |     "path": "datasets/cmmlu/data/modern-chinese.jsonl",
 4 |     "description": "以下是关于现代汉语的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/nutrition_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "nutrition",
 3 |     "path": "datasets/cmmlu/data/nutrition.jsonl",
 4 |     "description": "以下是关于营养学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/philosophy_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "philosophy",
 3 |     "path": "datasets/cmmlu/data/philosophy.jsonl",
 4 |     "description": "以下是关于哲学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/professional-law_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "professional-law",
 3 |     "path": "datasets/cmmlu/data/professional-law.jsonl",
 4 |     "description": "以下是关于专业法律的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/public-relations_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "public-relations",
 3 |     "path": "datasets/cmmlu/data/public-relations.jsonl",
 4 |     "description": "以下是关于公共关系的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/security-study_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "security-study",
 3 |     "path": "datasets/cmmlu/data/security-study.jsonl",
 4 |     "description": "以下是关于安全研究的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/sociology_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "sociology",
 3 |     "path": "datasets/cmmlu/data/sociology.jsonl",
 4 |     "description": "以下是关于社会学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/sports-science_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "sports-science",
 3 |     "path": "datasets/cmmlu/data/sports-science.jsonl",
 4 |     "description": "以下是关于运动科学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/virology_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "virology",
 3 |     "path": "datasets/cmmlu/data/virology.jsonl",
 4 |     "description": "以下是关于病毒学的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/world-history_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "world-history",
 3 |     "path": "datasets/cmmlu/data/world-history.jsonl",
 4 |     "description": "以下是关于世界历史的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmmlu/config/world-religions_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "world-religions",
 3 |     "path": "datasets/cmmlu/data/world-religions.jsonl",
 4 |     "description": "以下是关于世界宗教的单项选择题，请直接给出正确答案的选项。",
 5 |     "transform": "datasets/cmmlu/transform_gen_v1.py",
 6 |     "fewshot": 5,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmnli/config/cmnli_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "cmnli",
 3 |     "path": "datasets/cmnli/data/cmnli.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/cmnli/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmnli/config/cmnli_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "cmnli",
 3 |     "path": "datasets/cmnli/data/cmnli.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/cmnli/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/cmnli/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"语句一：“{data['passage'][0]}”\n语句二：“{data['passage'][1]}”\n请问这两句话是什么关系？\nA. 矛盾\nB. 无关\nC. 蕴含\n请从“A”，“B”，“C”中进行选择。\n答："
 6 |     index_of_correct_answer = list(data["target_scores"].values()).index(1)
 7 |     answers = ["A", "B", "C"]
 8 |     correct_answer = answers[index_of_correct_answer]
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/cmnli/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = (
 6 |         "语句一：“"
 7 |         + data["passage"][0]
 8 |         + "”\n语句二：“"
 9 |         + data["passage"][1]
10 |         + "”\n请问这两句话是什么关系？"
11 |     )
12 |     correct_answer = [
13 |         key for key, value in data["target_scores"].items() if value == 1
14 |     ][0].strip()
15 | 
16 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
17 | 


--------------------------------------------------------------------------------
/datasets/commonsenseqa/config/commonsenseqa_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "commonsenseqa",
 3 |     "path": "datasets/commonsenseqa/data/commonsenseqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/commonsenseqa/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/commonsenseqa/config/commonsenseqa_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "commonsenseqa",
 3 |     "path": "datasets/commonsenseqa/data/commonsenseqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/commonsenseqa/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/commonsenseqa/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = "Question: " + data["question"] + "\n"
 6 |     text += "Answer: "
 7 | 
 8 |     correct_answer = [
 9 |         key for key, value in data["target_scores"].items() if value == 1
10 |     ][0].strip()
11 | 
12 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
13 | 


--------------------------------------------------------------------------------
/datasets/commonsenseqa/transform_ppl_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     question = f"Question:\n{data['question']}\n"
 6 |     answer_prompt = f"Answer:\n"
 7 |     text = question + answer_prompt
 8 |     processed_correct_answer = correct_answer = [
 9 |         key for key, value in data["target_scores"].items() if value == 1
10 |     ][0].strip()
11 |     return {
12 |         "input": text,
13 |         "output": correct_answer,
14 |         "processed_output": processed_correct_answer,
15 |     }
16 | 


--------------------------------------------------------------------------------
/datasets/copa/config/copa_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "copa",
 3 |     "path": "datasets/copa/data/copa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/copa/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/copa/config/copa_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "copa",
 3 |     "path": "datasets/copa/data/copa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/copa/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/copa/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = (
 6 |         data["passage"]
 7 |         + "\nQuestion: What may be the "
 8 |         + data["question"]
 9 |         + "?\nAnswer: "
10 |     )
11 |     correct_answer = [
12 |         key for key, value in data["target_scores"].items() if value == 1
13 |     ][0].strip()
14 | 
15 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
16 | 


--------------------------------------------------------------------------------
/datasets/drcd/config/drcd_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "drcd",
 3 |     "path": "datasets/drcd/data/drcd.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/drcd/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "exact_match_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/eprstmt/config/eprstmt_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "eprstmt",
 3 |     "path": "datasets/eprstmt/data/eprstmt.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/eprstmt/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/eprstmt/config/eprstmt_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "eprstmt",
 3 |     "path": "datasets/eprstmt/data/eprstmt.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/eprstmt/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/eprstmt/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"内容： {data['passage']}。请对上述内容进行情绪分类。\nA. 消极\nB. 积极\n请从”A“，”B“中进行选择。\n答："
 6 |     index_of_correct_answer = list(data["target_scores"].values()).index(1)
 7 |     answers = ["A", "B"]
 8 |     correct_answer = answers[index_of_correct_answer]
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/eprstmt/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = '内容： "' + data["passage"] + '"。情绪分类：'
 6 |     correct_answer = [
 7 |         key for key, value in data["target_scores"].items() if value == 1
 8 |     ][0].strip()
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/flores/config/flores_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "flores",
 3 |     "path": "datasets/flores/data/flores.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/flores/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "until_return_post",
12 |     "metric": {
13 |         "bleu-4": {
14 |             "evaluation": {
15 |                 "type": "bleu",
16 |                 "tokenizer": "char"
17 |             }
18 |         }
19 |     }
20 | }


--------------------------------------------------------------------------------
/datasets/flores/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"Translate the following English statements to Chinese (Simpl).\nSource: {data['passage'][0]}\nTarget: "
 6 | 
 7 |     return {
 8 |         "input": text,
 9 |         "output": data["passage"][1],
10 |         "processed_output": data["passage"][1],
11 |     }
12 | 


--------------------------------------------------------------------------------
/datasets/gsm8k/config/gsm8k_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "gsm8k",
 3 |     "path": "datasets/gsm8k/data/gsm8k.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/gsm8k/transform_gen_v0.py",
 6 |     "fewshot": 8,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": "models/model_params/vllm_sample_v1.json"
10 |     },
11 |     "postprocess": "gsm8k_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/gsm8k/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | from UltraEval.tasks.postprocess import GSM8KPost
 4 | 
 5 | 
 6 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 7 |     text = f"Question: {data['question']}\nAnswer: "
 8 |     correct_answer = data["answer"]
 9 |     gsm8kp = GSM8KPost()
10 |     _, processed_correct_answer = gsm8kp([], correct_answer)
11 |     return {
12 |         "input": text,
13 |         "output": correct_answer,
14 |         "processed_output": processed_correct_answer,
15 |     }
16 | 


--------------------------------------------------------------------------------
/datasets/hellaswag/config/hellaswag_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "hellaswag",
 3 |     "path": "datasets/hellaswag/data/hellaswag.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/hellaswag/transform_gen_v1.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "qa_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/hellaswag/config/hellaswag_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "hellaswag",
 3 |     "path": "datasets/hellaswag/data/hellaswag.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/hellaswag/transform_ppl_v1.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "general_torch_ppl_norm",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/hellaswag/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = data["question"] + " "
 6 |     correct_answer = [
 7 |         key for key, value in data["target_scores"].items() if value == 1
 8 |     ][0].strip()
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/hellaswag/transform_ppl_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"{data['activity_label']}: {data['question']} "
 6 |     processed_correct_answer = correct_answer = [
 7 |         key for key, value in data["target_scores"].items() if value == 1
 8 |     ][0].strip()
 9 |     return {
10 |         "input": text,
11 |         "output": correct_answer,
12 |         "processed_output": processed_correct_answer,
13 |     }
14 | 


--------------------------------------------------------------------------------
/datasets/humaneval/transform_gen_v0.py:
--------------------------------------------------------------------------------
1 | import random
2 | 
3 | 
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Use the stripped ``prompt`` field as input, with empty references.

    Args:
        data: Record with a string ``prompt`` field.
        num_sample: Unused here.
        r: Unused here.
        dataset_name: Unused here.

    Returns:
        Dict with ``input`` (the prompt without surrounding whitespace) and
        empty-string ``output`` / ``processed_output``.
    """
    prompt = data["prompt"].strip()
    no_reference = ""
    return dict(input=prompt, output=no_reference, processed_output=no_reference)
6 | 


--------------------------------------------------------------------------------
/datasets/jecqa/config/jecqa_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "jecqa",
 3 |     "path": "datasets/jecqa/data/jecqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/jecqa/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/jecqa/config/jecqa_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "jecqa",
 3 |     "path": "datasets/jecqa/data/jecqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/jecqa/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/jecqa/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     prefixes = ["A. ", "B. ", "C. ", "D. "]
 6 |     opt = "\n".join(
 7 |         [prefixes[i] + list(data["target_scores"].keys())[i] for i in range(4)]
 8 |     )
 9 |     text = f"问题：{data['question']}\n选项：{opt}\n答案："
10 |     correct_answer = [
11 |         key for key, value in data["target_scores"].items() if value == 1
12 |     ][0].strip()
13 | 
14 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
15 | 


--------------------------------------------------------------------------------
/datasets/jecqa/transform_ppl_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     question = f"问题：\n{data['question']}\n"
 6 |     answer_prompt = f"答案：\n"
 7 |     text = question + answer_prompt
 8 |     processed_correct_answer = correct_answer = [
 9 |         key for key, value in data["target_scores"].items() if value == 1
10 |     ][0].strip()
11 |     return {
12 |         "input": text,
13 |         "output": correct_answer,
14 |         "processed_output": processed_correct_answer,
15 |     }
16 | 


--------------------------------------------------------------------------------
/datasets/lambada/config/lambada_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "lambada",
 3 |     "path": "datasets/lambada/data/lambada.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/lambada/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/lambada/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     prompt = "Please complete the following sentence:\n" + data["question"]
 6 |     return {
 7 |         "input": prompt,
 8 |         "output": data["answer"],
 9 |         "processed_output": data["answer"],
10 |     }
11 | 


--------------------------------------------------------------------------------
/datasets/lambada/transform_gen_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     instruction = (
 6 |         f"Requirement:\nPlease complete the following context with a single word.\n"
 7 |     )
 8 |     context = f"Context:\n{data['question']} "
 9 |     text = instruction + context
10 |     processed_correct_answer = correct_answer = data["answer"]
11 |     return {
12 |         "input": text,
13 |         "output": correct_answer,
14 |         "processed_output": processed_correct_answer,
15 |     }
16 | 


--------------------------------------------------------------------------------
/datasets/math/config/algebra_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "algebra",
 3 |     "path": "datasets/math/data/algebra.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/math/transform_gen_v0.py",
 6 |     "fewshot": 4,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": "models/model_params/vllm_sample_v1.json"
10 |     },
11 |     "postprocess": "math_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/math/config/geometry_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "geometry",
 3 |     "path": "datasets/math/data/geometry.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/math/transform_gen_v0.py",
 6 |     "fewshot": 4,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": "models/model_params/vllm_sample_v1.json"
10 |     },
11 |     "postprocess": "math_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/math/config/number-theory_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "number-theory",
 3 |     "path": "datasets/math/data/number-theory.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/math/transform_gen_v0.py",
 6 |     "fewshot": 4,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": "models/model_params/vllm_sample_v1.json"
10 |     },
11 |     "postprocess": "math_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/math/config/prealgebra_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "prealgebra",
 3 |     "path": "datasets/math/data/prealgebra.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/math/transform_gen_v0.py",
 6 |     "fewshot": 4,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": "models/model_params/vllm_sample_v1.json"
10 |     },
11 |     "postprocess": "math_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/math/config/precalculus_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "precalculus",
 3 |     "path": "datasets/math/data/precalculus.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/math/transform_gen_v0.py",
 6 |     "fewshot": 4,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": "models/model_params/vllm_sample_v1.json"
10 |     },
11 |     "postprocess": "math_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/mbpp-427/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def rand(n: int, r: random.Random):
 5 |     return int(r.random() * n)
 6 | 
 7 | 
 8 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 9 |     description = data["text"]
10 |     tests = "\n".join(data["test_list"])
11 | 
12 |     return {
13 |         "input": f'"""{description}\n{tests}"""',
14 |         "output": data["code"],
15 |         "processed_output": data["code"],
16 |     }
17 | 


--------------------------------------------------------------------------------
/datasets/mbpp/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def rand(n: int, r: random.Random):
 5 |     return int(r.random() * n)
 6 | 
 7 | 
 8 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 9 |     description = data["text"]
10 |     tests = "\n".join(data["test_list"])
11 | 
12 |     return {
13 |         "input": f'"""{description}\n{tests}"""',
14 |         "output": data["code"],
15 |         "processed_output": data["code"],
16 |     }
17 | 


--------------------------------------------------------------------------------
/datasets/medmcqa/config/medmcqa_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "medmcqa",
 3 |     "path": "datasets/medmcqa/data/medmcqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/medmcqa/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/medmcqa/config/medmcqa_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "medmcqa",
 3 |     "path": "datasets/medmcqa/data/medmcqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/medmcqa/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/medmcqa/transform_ppl_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     question = f"Question:\n{data['question']}\n"
 6 |     answer_prompt = f"Answer:\n"
 7 |     text = question + answer_prompt
 8 |     processed_correct_answer = correct_answer = [
 9 |         key for key, value in data["target_scores"].items() if value == 1
10 |     ][0].strip()
11 |     return {
12 |         "input": text,
13 |         "output": correct_answer,
14 |         "processed_output": processed_correct_answer,
15 |     }
16 | 


--------------------------------------------------------------------------------
/datasets/medqa-mcmle/config/medqa-mcmle_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "medqa-mcmle",
 3 |     "path": "datasets/medqa-mcmle/data/medqa-mcmle.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/medqa-mcmle/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/medqa-mcmle/config/medqa-mcmle_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "medqa-mcmle",
 3 |     "path": "datasets/medqa-mcmle/data/medqa-mcmle.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/medqa-mcmle/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/medqa-mcmle/transform_ppl_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     question = f"问题：\n{data['question']}\n"
 6 |     answer_prompt = f"答案：\n"
 7 |     text = question + answer_prompt
 8 |     processed_correct_answer = correct_answer = [
 9 |         key for key, value in data["target_scores"].items() if value == 1
10 |     ][0].strip()
11 |     return {
12 |         "input": text,
13 |         "output": correct_answer,
14 |         "processed_output": processed_correct_answer,
15 |     }
16 | 


--------------------------------------------------------------------------------
/datasets/medqa-usmle/config/medqa-usmle_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "medqa-usmle",
 3 |     "path": "datasets/medqa-usmle/data/medqa-usmle.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/medqa-usmle/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/medqa-usmle/config/medqa-usmle_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "medqa-usmle",
 3 |     "path": "datasets/medqa-usmle/data/medqa-usmle.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/medqa-usmle/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/medqa-usmle/transform_ppl_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     question = f"Question:\n{data['question']}\n"
 6 |     answer_prompt = f"Answer:\n"
 7 |     text = question + answer_prompt
 8 |     processed_correct_answer = correct_answer = [
 9 |         key for key, value in data["target_scores"].items() if value == 1
10 |     ][0].strip()
11 |     return {
12 |         "input": text,
13 |         "output": correct_answer,
14 |         "processed_output": processed_correct_answer,
15 |     }
16 | 


--------------------------------------------------------------------------------
/datasets/multirc/config/multirc_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "multirc",
 3 |     "path": "datasets/multirc/data/multirc.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/multirc/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/multirc/config/multirc_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "multirc",
 3 |     "path": "datasets/multirc/data/multirc.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/multirc/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/multirc/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"{data['passage']}\nQuestion: {data['question'][0]}\nClaim: {data['question'][1]}\nIs it true?\nA. Yes\nB. No\nAnswer: "
 6 |     index_of_correct_answer = list(data["target_scores"].values()).index(1)
 7 |     answers = ["A", "B"]
 8 |     correct_answer = answers[index_of_correct_answer]
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/multirc/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"{data['passage']}\nQuestion: {data['question'][0]}\nClaim: {data['question'][1]}\nIs it true? "
 6 |     correct_answer = [
 7 |         key for key, value in data["target_scores"].items() if value == 1
 8 |     ][0].strip()
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/nq-open/config/nq-open_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "nq-open",
 3 |     "path": "datasets/nq-open/data/nq-open.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/nq-open/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "exact_match_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/nq-open/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | from UltraEval.tasks.postprocess import ExactMatchPost
 4 | 
 5 | 
 6 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 7 |     text = f"Question: {data['question']}\nAnswer: "
 8 |     correct_answer = data["answer"]
 9 |     emp = ExactMatchPost()
10 |     _, processed_correct_answer = emp([], correct_answer)
11 |     return {
12 |         "input": text,
13 |         "output": correct_answer,
14 |         "processed_output": processed_correct_answer,
15 |     }
16 | 


--------------------------------------------------------------------------------
/datasets/ocnli-fc/config/ocnli-fc_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "ocnli-fc",
 3 |     "path": "datasets/ocnli-fc/data/ocnli-fc.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/ocnli-fc/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ocnli-fc/config/ocnli-fc_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "ocnli-fc",
 3 |     "path": "datasets/ocnli-fc/data/ocnli-fc.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/ocnli-fc/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ocnli-fc/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"阅读文章：{data['passage'][0]}\n根据上文，回答如下问题：{data['passage'][1]}\nA. 错\nB. 可能\nC. 对\n请从“A”，“B”，“C”中进行选择。\n答："
 6 |     index_of_correct_answer = list(data["target_scores"].values()).index(1)
 7 |     answers = ["A", "B", "C"]
 8 |     correct_answer = answers[index_of_correct_answer]
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/ocnli-fc/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"阅读文章：{data['passage'][0]}\n根据上文，回答如下问题：{data['passage'][1]}\n请从“无关”，“蕴含”，“矛盾”中进行选择。\n答："
 6 |     correct_answer = [
 7 |         key for key, value in data["target_scores"].items() if value == 1
 8 |     ][0].strip()
 9 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
10 | 


--------------------------------------------------------------------------------
/datasets/ocnli/config/ocnli_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "ocnli",
 3 |     "path": "datasets/ocnli/data/ocnli.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/ocnli/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ocnli/config/ocnli_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "ocnli",
 3 |     "path": "datasets/ocnli/data/ocnli.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/ocnli/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/ocnli/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"语句一：“{data['passage'][0]}”\n语句二：“{data['passage'][1]}”\n请问这两句话是什么关系？\nA. 矛盾\nB. 无关\nC. 蕴含\n请从“A”，“B”，“C”中进行选择。\n答："
 6 |     index_of_correct_answer = list(data["target_scores"].values()).index(1)
 7 |     answers = ["A", "B", "C"]
 8 |     correct_answer = answers[index_of_correct_answer]
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/ocnli/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = (
 6 |         "语句一：“"
 7 |         + data["passage"][0]
 8 |         + "”\n语句二：“"
 9 |         + data["passage"][1]
10 |         + "”\n请问这两句话是什么关系？"
11 |     )
12 |     correct_answer = [
13 |         key for key, value in data["target_scores"].items() if value == 1
14 |     ][0].strip()
15 | 
16 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
17 | 


--------------------------------------------------------------------------------
/datasets/openbookqa/config/openbookqa_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "openbookqa",
 3 |     "path": "datasets/openbookqa/data/openbookqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/openbookqa/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/openbookqa/config/openbookqa_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "openbookqa",
 3 |     "path": "datasets/openbookqa/data/openbookqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/openbookqa/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/piqa/config/piqa_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "piqa",
 3 |     "path": "datasets/piqa/data/piqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/piqa/transform_gen_v1.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "qa_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/piqa/config/piqa_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "piqa",
 3 |     "path": "datasets/piqa/data/piqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/piqa/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/piqa/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     options = list(data["target_scores"].keys())
 6 |     sol1, sol2 = options
 7 |     text = f"{data['question']}\nA. {sol1}\nB. {sol2}\nAnswer: "
 8 |     index_of_correct_answer = list(data["target_scores"].values()).index(1)
 9 |     answers = ["A", "B"]
10 |     correct_answer = answers[index_of_correct_answer]
11 | 
12 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
13 | 


--------------------------------------------------------------------------------
/datasets/piqa/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = "Question: " + data["question"] + " \n"
 6 |     text += "Answer: "
 7 |     correct_answer = [
 8 |         key for key, value in data["target_scores"].items() if value == 1
 9 |     ][0].strip()
10 | 
11 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
12 | 


--------------------------------------------------------------------------------
/datasets/piqa/transform_ppl_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     question = f"Question:\n{data['question']}\n"
 6 |     answer_prompt = f"Answer:\n"
 7 |     text = question + answer_prompt
 8 |     processed_correct_answer = correct_answer = [
 9 |         key for key, value in data["target_scores"].items() if value == 1
10 |     ][0].strip()
11 |     return {
12 |         "input": text,
13 |         "output": correct_answer,
14 |         "processed_output": processed_correct_answer,
15 |     }
16 | 


--------------------------------------------------------------------------------
/datasets/quac/config/quac_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "quac",
 3 |     "path": "datasets/quac/data/quac.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/quac/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "exact_match_post",
12 |     "metric": {
13 |         "f1_score": {
14 |             "evaluation": {
15 |                 "type": "f1_score"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/race/config/high_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "high",
 3 |     "path": "datasets/race/data/high.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/race/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/race/config/high_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "high",
 3 |     "path": "datasets/race/data/high.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/race/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/race/config/middle_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "middle",
 3 |     "path": "datasets/race/data/middle.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/race/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/race/config/middle_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "middle",
 3 |     "path": "datasets/race/data/middle.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/race/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/record/config/record_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "record",
 3 |     "path": "datasets/record/data/record.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/record/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "exact_match_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/rte/config/rte_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "rte",
 3 |     "path": "datasets/rte/data/rte.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/rte/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/rte/config/rte_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "rte",
 3 |     "path": "datasets/rte/data/rte.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/rte/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/rte/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"{data['passage'][0]}\n{data['passage'][1]}\nIs the sentence below entailed by the sentence above?\nA. Yes\nB. No\nAnswer: "
 6 |     index_of_correct_answer = list(data["target_scores"].values()).index(1)
 7 |     answers = ["A", "B"]
 8 |     correct_answer = answers[index_of_correct_answer]
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/rte/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = (
 6 |         data["passage"][0]
 7 |         + "\n"
 8 |         + data["passage"][1]
 9 |         + "\nIs the sentence below entailed by the sentence above? "
10 |     )
11 |     correct_answer = [
12 |         key for key, value in data["target_scores"].items() if value == 1
13 |     ][0].strip()
14 | 
15 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
16 | 


--------------------------------------------------------------------------------
/datasets/siqa/config/siqa_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "siqa",
 3 |     "path": "datasets/siqa/data/siqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/siqa/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/siqa/config/siqa_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "siqa",
 3 |     "path": "datasets/siqa/data/siqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/siqa/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/siqa/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = "Based on the context: " + data["passage"] + "\n"
 6 |     text += "Question: " + data["question"] + "\n"
 7 |     text += "Answer: "
 8 | 
 9 |     correct_answer = [
10 |         key for key, value in data["target_scores"].items() if value == 1
11 |     ][0].strip()
12 | 
13 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
14 | 


--------------------------------------------------------------------------------
/datasets/squad/config/squad_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "squad",
 3 |     "path": "datasets/squad/data/squad.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/squad/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "exact_match_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/storycloze/config/storycloze_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "storycloze",
 3 |     "path": "datasets/storycloze/data/storycloze.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/storycloze/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/storycloze/config/storycloze_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "storycloze",
 3 |     "path": "datasets/storycloze/data/storycloze.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/storycloze/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/storycloze/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     passage = "\n".join(data["passage"]) + "\n"
 6 |     correct_answer = [
 7 |         key for key, value in data["target_scores"].items() if value == 1
 8 |     ][0]
 9 |     return {
10 |         "input": passage,
11 |         "output": correct_answer,
12 |         "processed_output": correct_answer,
13 |     }
14 | 


--------------------------------------------------------------------------------
/datasets/storycloze/transform_ppl_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     passage = "\n".join(data["passage"])
 6 |     text = f"{passage}\n"
 7 |     processed_correct_answer = correct_answer = [
 8 |         key for key, value in data["target_scores"].items() if value == 1
 9 |     ][0]
10 |     return {
11 |         "input": text,
12 |         "output": correct_answer,
13 |         "processed_output": processed_correct_answer,
14 |     }
15 | 


--------------------------------------------------------------------------------
/datasets/strategyqa/config/strategyqa_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "strategyqa",
 3 |     "path": "datasets/strategyqa/data/strategyqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/strategyqa/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/strategyqa/config/strategyqa_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "strategyqa",
 3 |     "path": "datasets/strategyqa/data/strategyqa.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/strategyqa/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/strategyqa/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     facts_str = ", ".join(data["passage"][2])
 6 | 
 7 |     text = f"Background: {data['passage'][0]} - {data['passage'][1]}\nFact: {facts_str}\nQuestion: {data['question']}\nA. Yes\nB. No\nAnswer: "
 8 |     correct_answer = "A" if data["target_scores"].get("Yes") == 1 else "B"
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/strategyqa/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     facts_str = ", ".join(data["passage"][2])
 6 | 
 7 |     text = f"Background: {data['passage'][0]} - {data['passage'][1]}\nFact: {facts_str}\nQuestion: {data['question']}\nAnswer: "
 8 |     correct_answer = [
 9 |         key for key, value in data["target_scores"].items() if value == 1
10 |     ][0].strip()
11 | 
12 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
13 | 


--------------------------------------------------------------------------------
/datasets/summedits/config/billsum_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "billsum",
 3 |     "path": "datasets/summedits/data/billsum.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/billsum_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "billsum",
 3 |     "path": "datasets/summedits/data/billsum.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/ectsum_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "ectsum",
 3 |     "path": "datasets/summedits/data/ectsum.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/ectsum_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "ectsum",
 3 |     "path": "datasets/summedits/data/ectsum.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/news_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "news",
 3 |     "path": "datasets/summedits/data/news.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/news_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "news",
 3 |     "path": "datasets/summedits/data/news.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/podcast_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "podcast",
 3 |     "path": "datasets/summedits/data/podcast.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/podcast_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "podcast",
 3 |     "path": "datasets/summedits/data/podcast.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/qmsumm_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "qmsumm",
 3 |     "path": "datasets/summedits/data/qmsumm.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/qmsumm_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "qmsumm",
 3 |     "path": "datasets/summedits/data/qmsumm.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/sales-call_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "sales-call",
 3 |     "path": "datasets/summedits/data/sales-call.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/sales-call_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "sales-call",
 3 |     "path": "datasets/summedits/data/sales-call.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/sales-email_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "sales-email",
 3 |     "path": "datasets/summedits/data/sales-email.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/sales-email_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "sales-email",
 3 |     "path": "datasets/summedits/data/sales-email.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/samsum_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "samsum",
 3 |     "path": "datasets/summedits/data/samsum.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/samsum_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "samsum",
 3 |     "path": "datasets/summedits/data/samsum.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/scitldr_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "scitldr",
 3 |     "path": "datasets/summedits/data/scitldr.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/scitldr_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "scitldr",
 3 |     "path": "datasets/summedits/data/scitldr.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/shakespeare_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "shakespeare",
 3 |     "path": "datasets/summedits/data/shakespeare.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/config/shakespeare_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "shakespeare",
 3 |     "path": "datasets/summedits/data/shakespeare.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/summedits/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/summedits/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"\nDocument:\n{data['passage'][0]}\nSummary:\n{data['passage'][1]}\nIs the summary factually consistent with the document? "
 6 |     correct_answer = [
 7 |         key for key, value in data["target_scores"].items() if value == 1
 8 |     ][0].strip()
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/theoremqa/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | from UltraEval.tasks.postprocess import TheoremQAPost
 4 | 
 5 | 
 6 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 7 |     text = f"\nQuestion: {data['question']}\nAnswer: "
 8 |     correct_answer = str(data["answer"][1])
 9 |     tqap = TheoremQAPost()
10 |     _, processed_correct_answer = tqap([], correct_answer)
11 |     return {
12 |         "input": text,
13 |         "output": correct_answer,
14 |         "processed_output": processed_correct_answer,
15 |     }
16 | 


--------------------------------------------------------------------------------
/datasets/tnews/config/tnews_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "tnews",
 3 |     "path": "datasets/tnews/data/tnews.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/tnews/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/tnews/config/tnews_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "tnews",
 3 |     "path": "datasets/tnews/data/tnews.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/tnews/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/tnews/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"{data['question']}\n上述内容属于什么新闻？"
 6 | 
 7 |     correct_answer = [
 8 |         key for key, value in data["target_scores"].items() if value == 1
 9 |     ][0].strip()
10 | 
11 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
12 | 


--------------------------------------------------------------------------------
/datasets/triviaqa/config/web_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "web",
 3 |     "path": "datasets/triviaqa/data/web.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/triviaqa/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "exact_match_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/triviaqa/config/wikipedia_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "wikipedia",
 3 |     "path": "datasets/triviaqa/data/wikipedia.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/triviaqa/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "exact_match_post",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "exact_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/truthfulqa/config/mc1_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "mc1",
 3 |     "path": "datasets/truthfulqa/data/mc1.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/truthfulqa/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/truthfulqa/config/mc2_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "mc2",
 3 |     "path": "datasets/truthfulqa/data/mc2.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/truthfulqa/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob_mc2"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/truthfulqa/transform_ppl_v0.py:
--------------------------------------------------------------------------------
1 | import random
2 | 
3 | 
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a question prompt together with every correct answer.

    All ``data["target_scores"]`` keys whose score equals 1 are collected, in
    dictionary order, so the result may hold zero, one or several answers.

    ``num_sample``, ``r`` and ``dataset_name`` are not used.

    Returns:
        A dict with ``"input"`` (the prompt), ``"output"`` (list of correct
        answers) and ``"processed_output"`` (an equal but separate list).
    """
    correct_answers = [
        key for key, value in data["target_scores"].items() if value == 1
    ]
    text = f"Question: {data['question']}\nAnswer: "

    return {
        "input": text,
        "output": correct_answers,
        # Separate list object: an in-place edit of the processed answers
        # must not silently rewrite the raw "output" list as well.
        "processed_output": list(correct_answers),
    }
9 | 


--------------------------------------------------------------------------------
/datasets/truthfulqa/transform_ppl_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     question = f"Question:\n{data['question']}\n"
 6 |     answer_prompt = f"Answer:\n"
 7 |     text = question + answer_prompt
 8 |     processed_correct_answer = correct_answer = [
 9 |         key for key, value in data["target_scores"].items() if value == 1
10 |     ]
11 | 
12 |     return {
13 |         "input": text,
14 |         "output": correct_answer,
15 |         "processed_output": processed_correct_answer,
16 |     }
17 | 


--------------------------------------------------------------------------------
/datasets/wic/config/wic_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "wic",
 3 |     "path": "datasets/wic/data/wic.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/wic/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/wic/config/wic_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "wic",
 3 |     "path": "datasets/wic/data/wic.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/wic/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/wic/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"Sentence 1: {data['passage'][0]}\nSentence 2: {data['passage'][1]}\nAre '{data['question']}' in the above two sentenses the same?\nA. Yes\nB. No\nAnswer: "
 6 |     index_of_correct_answer = list(data["target_scores"].values()).index(1)
 7 |     answers = ["A", "B"]
 8 |     correct_answer = answers[index_of_correct_answer]
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/datasets/winogender/config/female_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "female",
 3 |     "path": "datasets/winogender/data/female.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/winogender/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/winogender/config/female_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "female",
 3 |     "path": "datasets/winogender/data/female.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/winogender/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/winogender/config/male_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "male",
 3 |     "path": "datasets/winogender/data/male.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/winogender/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/winogender/config/male_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "male",
 3 |     "path": "datasets/winogender/data/male.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/winogender/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/winogender/config/neutral_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "neutral",
 3 |     "path": "datasets/winogender/data/neutral.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/winogender/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/winogender/config/neutral_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "neutral",
 3 |     "path": "datasets/winogender/data/neutral.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/winogender/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/winogender/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = data["passage"] + ' "' + data["question"].capitalize() + '" refers to '
 6 |     output_sentence = next(
 7 |         key for key, value in data["target_scores"].items() if value == 1
 8 |     )
 9 |     return {
10 |         "input": text,
11 |         "output": output_sentence,
12 |         "processed_output": output_sentence,
13 |     }
14 | 


--------------------------------------------------------------------------------
/datasets/winogrande/config/winogrande_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "winogrande",
 3 |     "path": "datasets/winogrande/data/winogrande.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/winogrande/transform_gen_v1.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "qa_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/winogrande/config/winogrande_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "winogrande",
 3 |     "path": "datasets/winogrande/data/winogrande.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/winogrande/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/winogrande/transform_ppl_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     output_sentence = next(
 6 |         key for key, value in data["target_scores"].items() if value == 1
 7 |     )
 8 |     return {
 9 |         "input": data["question"],
10 |         "output": output_sentence,
11 |         "processed_output": output_sentence,
12 |     }
13 | 


--------------------------------------------------------------------------------
/datasets/winogrande/transform_ppl_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = data["question"]
 6 |     processed_correct_answer = correct_answer = next(
 7 |         key for key, value in data["target_scores"].items() if value == 1
 8 |     )
 9 |     return {
10 |         "input": text,
11 |         "output": correct_answer,
12 |         "processed_output": processed_correct_answer,
13 |     }
14 | 


--------------------------------------------------------------------------------
/datasets/wmt20-en-zh/transform_gen_v0.py:
--------------------------------------------------------------------------------
1 | import random
2 | 
3 | 
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build an English-to-Chinese translation prompt for one record.

    The prompt is a Chinese instruction ("please translate the following
    content from English into Chinese"), the source text, and a "translation:"
    cue. The reference is ``data["answer"]`` with a newline appended.

    Args:
        data: Record providing ``question`` (source text) and ``answer``.
        num_sample: Not used by this transform.
        r: Not used by this transform.
        dataset_name: Not used by this transform.

    Returns:
        Dict with ``input`` (the prompt) and ``output`` / ``processed_output``
        (both the newline-terminated reference).
    """
    reference = data["answer"] + "\n"
    source_text = data["question"]
    prompt = f"请将下面这段内容从英文翻译为中文：\n{source_text}\n译文：\n"
    return {
        "input": prompt,
        "output": reference,
        "processed_output": reference,
    }
8 | 


--------------------------------------------------------------------------------
/datasets/wmt20-en-zh/transform_gen_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     question = f"问题：\n如何将下面这句话从英文翻译为中文？\n"
 6 |     context = f"{data['question']}\n"
 7 |     answer_prompt = f"答案：\n"
 8 |     text = question + context + answer_prompt
 9 |     processed_correct_answer = correct_answer = data["answer"]
10 | 
11 |     return {
12 |         "input": text,
13 |         "output": correct_answer,
14 |         "processed_output": processed_correct_answer,
15 |     }
16 | 


--------------------------------------------------------------------------------
/datasets/wmt20-zh-en/transform_gen_v0.py:
--------------------------------------------------------------------------------
1 | import random
2 | 
3 | 
def transform(data, num_sample: int, r: random.Random, dataset_name: str):
    """Build a Chinese-to-English translation prompt for one record.

    The prompt is a Chinese instruction ("please translate the following
    content from Chinese into English"), the source text, and a "translation:"
    cue. The reference is ``data["answer"]`` with a newline appended.

    Args:
        data: Record providing ``question`` (source text) and ``answer``.
        num_sample: Not used by this transform.
        r: Not used by this transform.
        dataset_name: Not used by this transform.

    Returns:
        Dict with ``input`` (the prompt) and ``output`` / ``processed_output``
        (both the newline-terminated reference).
    """
    reference = data["answer"] + "\n"
    source_text = data["question"]
    prompt = f"请将下面这段内容从中文翻译为英文：\n{source_text}\n译文：\n"
    return {
        "input": prompt,
        "output": reference,
        "processed_output": reference,
    }
8 | 


--------------------------------------------------------------------------------
/datasets/wmt20-zh-en/transform_gen_v1.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     question = f"问题：\n如何将下面这句话从中文翻译为英文？\n"
 6 |     context = f"{data['question']}\n"
 7 |     answer_prompt = f"答案：\n"
 8 |     text = question + context + answer_prompt
 9 |     processed_correct_answer = correct_answer = data["answer"]
10 | 
11 |     return {
12 |         "input": text,
13 |         "output": correct_answer,
14 |         "processed_output": processed_correct_answer,
15 |     }
16 | 


--------------------------------------------------------------------------------
/datasets/wsc/config/wsc_gen.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "wsc",
 3 |     "path": "datasets/wsc/data/wsc.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/wsc/transform_gen_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "generate",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "prefix_match"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/wsc/config/wsc_ppl.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "task_name": "wsc",
 3 |     "path": "datasets/wsc/data/wsc.jsonl",
 4 |     "description": "",
 5 |     "transform": "datasets/wsc/transform_ppl_v0.py",
 6 |     "fewshot": 0,
 7 |     "generate": {
 8 |         "method": "loglikelihood",
 9 |         "params": ""
10 |     },
11 |     "postprocess": "",
12 |     "metric": {
13 |         "accuracy": {
14 |             "evaluation": {
15 |                 "type": "log_prob"
16 |             }
17 |         }
18 |     }
19 | }


--------------------------------------------------------------------------------
/datasets/wsc/transform_gen_v0.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | 
 4 | def transform(data, num_sample: int, r: random.Random, dataset_name: str):
 5 |     text = f"Passage: {data['passage']}\nDoes the pronoun # {data['question'][1]} # refer to * {data['question'][0]} *?\nA. Yes\nB. No\nAnswer: "
 6 |     index_of_correct_answer = list(data["target_scores"].values()).index(1)
 7 |     answers = ["A", "B"]
 8 |     correct_answer = answers[index_of_correct_answer]
 9 | 
10 |     return {"input": text, "output": correct_answer, "processed_output": correct_answer}
11 | 


--------------------------------------------------------------------------------
/docs/pics/ultraeval_logo_white.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/UltraEval/5d967b4ea5725ab1252904520bcaa87b40165b4b/docs/pics/ultraeval_logo_white.jpg


--------------------------------------------------------------------------------
/docs/pics/ultraeval_pipeline_white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/UltraEval/5d967b4ea5725ab1252904520bcaa87b40165b4b/docs/pics/ultraeval_pipeline_white.png


--------------------------------------------------------------------------------
/docs/tutorials/en/customization/new_metric.md:
--------------------------------------------------------------------------------
1 | ## Adding a New Evaluation Method
2 | 
3 | - **Writing a Metric File**
4 |   - In the `metrics/` directory, create a new Python file for your evaluation metric, naming it `metrics/{metric_name}.py`.
5 |   - Refer to the [Metric Introduction](../configuration_file/metric.md) tutorial to complete writing the corresponding MetricName class in `metrics/{metric_name}.py`.
6 |   - Register it in `metrics/__init__.py` by adding `from .metric_name import MetricName` and registering your custom evaluation metric class in the `METRICS_REGISTRY` dictionary.


--------------------------------------------------------------------------------
/docs/tutorials/en/customization/new_postprocess.md:
--------------------------------------------------------------------------------
1 | ## Adding a New Post-Processing Method
2 | 
3 | - **Adding a Postprocess Class**
4 |   - In `tasks/postprocess.py`, following the [Postprocess Introduction](../configuration_file/postprocess.md) tutorial, add your custom post-processing class.
5 |   - Register your custom post-processing class in the `POSTPROCESS_REGISTRY` dictionary located in `tasks/postprocess.py`.


--------------------------------------------------------------------------------
/docs/tutorials/zh/customization/new_metric.md:
--------------------------------------------------------------------------------
1 | ## 添加新的评测方法
2 | 
3 | 
4 | - 编写metric文件
5 |   - 在`metrics/`目录下，为你的评测指标创建一个新的Python文件，命名为`metrics/{metric_name}.py`。
6 |   - 参照[【metric介绍】](../configuration_file/metric.md)教程，在 `metrics/{metric_name}.py` 中完成相应MetricName类的编写。
7 |   - 在`metrics/__init__.py`中进行注册，包括添加`from .metric_name import MetricName`，并在`METRICS_REGISTRY`字典中注册你的自定义评测指标类。


--------------------------------------------------------------------------------
/docs/tutorials/zh/customization/new_postprocess.md:
--------------------------------------------------------------------------------
1 | ## 添加新的后处理方法
2 | 
3 | - 添加postprocess类
4 |   - 在`tasks/postprocess.py`中，参照[【 postprocess介绍 】](../configuration_file/postprocess.md)教程，添加自定义后处理类。
5 |   - 在`tasks/postprocess.py`中的`POSTPROCESS_REGISTRY`字典中注册你的自定义后处理类。
6 | 


--------------------------------------------------------------------------------
/docs/tutorials/zh/deployment_model/model_download.md:
--------------------------------------------------------------------------------
 1 | ## 下载模型
 2 | 
 3 | 
 4 | 首次部署某模型时，需要从 HuggingFace 下载模型，根据模型大小，此过程可能需花费 10 分钟至 1 小时。其中下载前需要：
 5 | 
 6 | 1.**登录 Huggingface CLI**：
 7 | 
 8 | 输入以下命令并登录：
 9 | 
10 | ```
11 | huggingface-cli login
12 | ```
13 | 
14 | 2.**输入您的 Token**：
15 | 
16 | 登录时，输入 Huggingface 上的 Token。
17 | 
18 | 3.**下载模型**：
19 | 
20 | 在这里既可以git clone对应的模型，也可以通过之后的[单卡部署](./deployment.md)进行下载。


--------------------------------------------------------------------------------
/docs/tutorials/zh/evaluation/model_instantiation.md:
--------------------------------------------------------------------------------
 1 | ## 模型实例化
 2 | 
 3 | 
 4 | 模型部署成功后，会产生一个URL。首先需要进行实例化。我们在`models`目录下提供了两类脚本，`general_model.py`对应URL实例化, `openai_model.py`对应API服务。其中，定义了`Model`类和请求转接方法`_post_request`。
 5 | 
 6 | - `Model`类包含了初始化、`loglikelihood`和`generate`三个方法。
 7 |   - 初始化使用main文件中传入的model_args，实例化model。
 8 |   - `loglikelihood`和`generate`分别和模型推理的方式是对应的。如果用户评测自己训练的模型，则需要在推理代码中，实现这两部分对应的功能。
 9 | - 在评测过程中，所有的数据会传送到`Model`类中，根据推理方式不同，交给`loglikelihood`或`generate`函数，然后提取传入的`params`和`instance`，通过`_post_request`方法将包装后的数据给到模型进行推理。最后将模型生成的结果，返回到评测过程中，进行后处理。
10 | 
11 | 


--------------------------------------------------------------------------------
/docs/tutorials/zh/ultraeval.md:
--------------------------------------------------------------------------------
 1 | UltraEval是一个开源的基础模型能力评测框架，提供了一套轻量级、易于使用的评测体系。整体框架组织如下图所示:
 2 | 
 3 | <div align="center">
 4 | <p align="center">
 5 | <img src="../../pics/ultraeval_pipeline_white.png" width="800px">
 6 | </p>
 7 | </div>
 8 | 
 9 | 按照操作顺序，共分为【数据准备】、【模型部署】、和【任务评测】三大模块，分别对应
10 | 
11 | * [【配置文件】](./configuration_file/config.md)
12 | * [【模型部署】](./deployment_model/model_download.md)
13 | * [【任务评测】](./evaluation/model_instantiation.md)
14 | 
15 | 此外UltraEval具有很好的扩展性，为了便于用户扩展其他任务或者模型，我们提供了定制化评测流程。
16 | * [【用户个性化设置教程】](./customization/new_dataset.md)
17 | 
18 | 
19 | 


--------------------------------------------------------------------------------
/metrics/in_match.py:
--------------------------------------------------------------------------------
 1 | from typing import Any
 2 | 
 3 | 
 4 | class InMatch:
 5 |     def __init__(
 6 |         self,
 7 |     ):
 8 |         pass
 9 | 
10 |     def __call__(self, doc, ground_truth, results) -> Any:
11 |         if isinstance(ground_truth, str):
12 |             ground_truth = [ground_truth]
13 | 
14 |         return 1.0 if results[0].lower().strip() in ground_truth else 0.0
15 | 


--------------------------------------------------------------------------------
/metrics/prefix_match.py:
--------------------------------------------------------------------------------
 1 | from typing import Any
 2 | 
 3 | 
 4 | class PrefixMatch:
 5 |     def __init__(
 6 |         self,
 7 |     ):
 8 |         pass
 9 | 
10 |     def __call__(self, doc, ground_truth, results) -> Any:
11 |         """Take a single document and the LM input/output/ground_truth.
12 |         Returns the  values of the metric for that one document
13 |         """
14 |         return 1.0 if results[0].strip().startswith(ground_truth.strip()) else 0.0
15 | 


--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
 1 | from . import general_model, openai_model
 2 | 
 3 | MODEL_REGISTRY = {
 4 |     "gpt3_5": openai_model.GPT3_5,
 5 |     "gpt-3.5-turbo": openai_model.GPT3_5,
 6 |     "gpt4": openai_model.GPT4,
 7 |     "gpt-4": openai_model.GPT4,
 8 |     "general": general_model.GeneralModel,
 9 | }
10 | 
11 | 
12 | def get_model(model_name):
13 |     return MODEL_REGISTRY[model_name]
14 | 


--------------------------------------------------------------------------------
/models/model_params/gpt-3.5-turbo.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "temperature": 0.0,
 3 |     "stop": null,
 4 |     "max_tokens": 4097,
 5 |     "request_id": null,
 6 |     "top_p": 1,
 7 |     "presence_penalty": 0,
 8 |     "frequency_penalty": 0,
 9 |     "sampling_num": 1
10 | }


--------------------------------------------------------------------------------
/models/model_params/gpt-4.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "temperature": 0.0,
 3 |     "stop": null,
 4 |     "max_tokens": 1600,
 5 |     "request_id": null,
 6 |     "top_p": 1,
 7 |     "presence_penalty": 0,
 8 |     "frequency_penalty": 0,
 9 |     "sampling_num": 1
10 | }


--------------------------------------------------------------------------------
/models/model_params/vllm_beamsearch.json:
--------------------------------------------------------------------------------
1 | {
2 |     "use_beam_search": true,
3 |     "best_of": 10,
4 |     "temperature": 0,
5 |     "top_p": 1,
6 |     "top_k": -1,
7 |     "early_stopping": "never",
8 |     "sampling_num": 1
9 | }


--------------------------------------------------------------------------------
/models/model_params/vllm_logprobs.json:
--------------------------------------------------------------------------------
1 | {
2 |     "prompt_logprobs": 0,
3 |     "max_tokens": 1
4 | }


--------------------------------------------------------------------------------
/models/model_params/vllm_sample.json:
--------------------------------------------------------------------------------
1 | {
2 |     "temperature": 0.3,
3 |     "top_p": 0.8,
4 |     "max_tokens": 300,
5 |     "sampling_num": 1
6 | }


--------------------------------------------------------------------------------
/models/model_params/vllm_sample_bbh.json:
--------------------------------------------------------------------------------
1 | {
2 |     "temperature": 0.3,
3 |     "top_p": 0.8,
4 |     "max_tokens": 1024,
5 |     "sampling_num": 1,
6 |     "presence_penalty": 0.0
7 | }


--------------------------------------------------------------------------------
/models/model_params/vllm_sample_v1.json:
--------------------------------------------------------------------------------
1 | {
2 |     "temperature": 0.1,
3 |     "top_p": 0.95,
4 |     "max_tokens": 300,
5 |     "sampling_num": 1
6 | }


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | vllm
 2 | flask
 3 | openai
 4 | sacrebleu
 5 | rouge_chinese
 6 | pytablewriter
 7 | gevent
 8 | gunicorn
 9 | tqdm
10 | pynvml
11 | accelerate>=0.20.3


--------------------------------------------------------------------------------
/run_eval.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Runs scripts/run_job_base.sh twice for the same model: first in "gen" mode
# over the generation task list, then, after a 600 second pause, in "ppl"
# mode over the shorter task list.
# NOTE(review): the meaning of the remaining positional arguments (1 1 ... -1)
# is defined by scripts/run_job_base.sh - confirm there.

MODEL="mistralai/Mistral-7B-v0.1"
GEN_TASKS="bbh,mmlu,ceval,cmmlu,humaneval,mbpp-427,gsm8k,math,hellaswag,boolq,piqa,winogrande,arc-e,arc-c"
PPL_TASKS="hellaswag,boolq,piqa,winogrande,arc-e,arc-c"

bash scripts/run_job_base.sh "$MODEL" 1 1 "logs/$MODEL" "$GEN_TASKS" gen -1
sleep 600

# NOTE(review): this log directory carries a trailing "B" compared with the
# first run - confirm it is intentional.
bash scripts/run_job_base.sh "$MODEL" 1 1 "logs/${MODEL}B" "$PPL_TASKS" ppl -1


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open("requirements.txt", 'r', encoding='utf-8') as f:
 4 |     requirements = f.read().strip().splitlines()
 5 | 
 6 | setuptools.setup(
 7 |     name="UltraEval",
 8 |     version="0.1",
 9 |     author="UltraEval Team",
10 |     author_email="",
11 |     description="An open source framework for evaluating foundation models",
12 |     packages=setuptools.find_packages(),
13 |     python_requires=">=3.10",
14 |     install_requires=requirements,
15 | )
16 | 


--------------------------------------------------------------------------------
/tasks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/UltraEval/5d967b4ea5725ab1252904520bcaa87b40165b4b/tasks/__init__.py


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBMB/UltraEval/5d967b4ea5725ab1252904520bcaa87b40165b4b/utils/__init__.py


--------------------------------------------------------------------------------
/utils/request.py:
--------------------------------------------------------------------------------
 1 | REQUEST_RETURN_LENGTHS = {
 2 |     "loglikelihood": 1,
 3 |     "generate": 1,
 4 | }
 5 | 
 6 | 
 7 | class Request:
 8 |     def __init__(self, request_type, instances, params, raw_example):
 9 |         if request_type not in REQUEST_RETURN_LENGTHS.keys():
10 |             raise NotImplementedError(
11 |                 "The request type {} is not implemented!".format(request_type)
12 |             )
13 | 
14 |         self.request_type = request_type
15 |         self.instances = instances
16 |         self.params = params
17 |         self.raw_example = raw_example
18 | 


--------------------------------------------------------------------------------