├── .gitignore
├── LICENSE
├── README.md
├── assets
│   ├── c4_perplexity.png
│   ├── overview.png
│   └── zeroshot.png
├── eval.py
├── latency.py
├── lm-evaluation-harness
│   ├── .coveragerc
│   ├── .flake8
│   ├── .pre-commit-config.yaml
│   ├── CITATION.bib
│   ├── CODEOWNERS
│   ├── LICENSE.md
│   ├── README.md
│   ├── docs
│   │   ├── CONTRIBUTING.md
│   │   ├── README.md
│   │   ├── decontamination.md
│   │   ├── img
│   │   │   └── fewshot_example_gpt3.png
│   │   ├── interface.md
│   │   ├── model_guide.md
│   │   ├── new_task_guide.md
│   │   └── task_guide.md
│   ├── examples
│   │   ├── lm-eval-overview.ipynb
│   │   └── visualize-zeno.ipynb
│   ├── ignore.txt
│   ├── lm_eval
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── api
│   │   │   ├── __init__.py
│   │   │   ├── filter.py
│   │   │   ├── instance.py
│   │   │   ├── metrics.py
│   │   │   ├── model.py
│   │   │   ├── registry.py
│   │   │   ├── samplers.py
│   │   │   └── task.py
│   │   ├── decontamination
│   │   │   ├── __init__.py
│   │   │   ├── archiver.py
│   │   │   ├── decontaminate.py
│   │   │   └── janitor.py
│   │   ├── evaluator.py
│   │   ├── filters
│   │   │   ├── __init__.py
│   │   │   ├── decontamination.py
│   │   │   ├── extraction.py
│   │   │   ├── selection.py
│   │   │   └── transformation.py
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   ├── anthropic_llms.py
│   │   │   ├── dummy.py
│   │   │   ├── gguf.py
│   │   │   ├── huggingface.py
│   │   │   ├── mamba_lm.py
│   │   │   ├── neuron_optimum.py
│   │   │   ├── openai_completions.py
│   │   │   ├── optimum_lm.py
│   │   │   ├── textsynth.py
│   │   │   └── vllm_causallms.py
│   │   ├── prompts
│   │   │   └── __init__.py
│   │   ├── tasks
│   │   │   ├── README.md
│   │   │   ├── __init__.py
│   │   │   ├── anli
│   │   │   │   ├── README.md
│   │   │   │   ├── anli_r1.yaml
│   │   │   │   ├── anli_r2.yaml
│   │   │   │   └── anli_r3.yaml
│   │   │   ├── arc
│   │   │   │   ├── README.md
│   │   │   │   ├── arc_challenge.yaml
│   │   │   │   └── arc_easy.yaml
│   │   │   ├── arithmetic
│   │   │   │   ├── README.md
│   │   │   │   ├── arithmetic_1dc.yaml
│   │   │   │   ├── arithmetic_2da.yaml
│   │   │   │   ├── arithmetic_2dm.yaml
│   │   │   │   ├── arithmetic_2ds.yaml
│   │   │   │   ├── arithmetic_3da.yaml
│   │   │   │   ├── arithmetic_3ds.yaml
│   │   │   │   ├── arithmetic_4da.yaml
│   │   │   │   ├── arithmetic_4ds.yaml
│   │   │   │   ├── arithmetic_5da.yaml
│   │   │   │   └── arithmetic_5ds.yaml
│   │   │   ├── asdiv
│   │   │   │   ├── README.md
│   │   │   │   └── default.yaml
│   │   │   ├── babi
│   │   │   │   ├── README.md
│   │   │   │   └── babi.yaml
│   │   │   ├── bbh
│   │   │   │   ├── README.md
│   │   │   │   ├── _generate_configs.py
│   │   │   │   ├── cot_fewshot
│   │   │   │   │   ├── _cot_fewshot_template_yaml
│   │   │   │   │   ├── boolean_expressions.yaml
│   │   │   │   │   ├── causal_judgement.yaml
│   │   │   │   │   ├── date_understanding.yaml
│   │   │   │   │   ├── disambiguation_qa.yaml
│   │   │   │   │   ├── dyck_languages.yaml
│   │   │   │   │   ├── formal_fallacies.yaml
│   │   │   │   │   ├── geometric_shapes.yaml
│   │   │   │   │   ├── hyperbaton.yaml
│   │   │   │   │   ├── logical_deduction_five_objects.yaml
│   │   │   │   │   ├── logical_deduction_seven_objects.yaml
│   │   │   │   │   ├── logical_deduction_three_objects.yaml
│   │   │   │   │   ├── movie_recommendation.yaml
│   │   │   │   │   ├── multistep_arithmetic_two.yaml
│   │   │   │   │   ├── navigate.yaml
│   │   │   │   │   ├── object_counting.yaml
│   │   │   │   │   ├── penguins_in_a_table.yaml
│   │   │   │   │   ├── reasoning_about_colored_objects.yaml
│   │   │   │   │   ├── ruin_names.yaml
│   │   │   │   │   ├── salient_translation_error_detection.yaml
│   │   │   │   │   ├── snarks.yaml
│   │   │   │   │   ├── sports_understanding.yaml
│   │   │   │   │   ├── temporal_sequences.yaml
│   │   │   │   │   ├── tracking_shuffled_objects_five_objects.yaml
│   │   │   │   │   ├── tracking_shuffled_objects_seven_objects.yaml
│   │   │   │   │   ├── tracking_shuffled_objects_three_objects.yaml
│   │   │   │   │   ├── web_of_lies.yaml
│   │   │   │   │   └── word_sorting.yaml
│   │   │   │   ├── cot_zeroshot
│   │   │   │   │   ├── _cot_zeroshot_template_yaml
│   │   │   │   │   ├── boolean_expressions.yaml
│   │   │   │   │   ├── causal_judgement.yaml
│   │   │   │   │   ├── date_understanding.yaml
│   │   │   │   │   ├── disambiguation_qa.yaml
│   │   │   │   │   ├── dyck_languages.yaml
│   │   │   │   │   ├── formal_fallacies.yaml
│   │   │   │   │   ├── geometric_shapes.yaml
│   │   │   │   │   ├── hyperbaton.yaml
│   │   │   │   │   ├── logical_deduction_five_objects.yaml
│   │   │   │   │   ├── logical_deduction_seven_objects.yaml
│   │   │   │   │   ├── logical_deduction_three_objects.yaml
│   │   │   │   │   ├── movie_recommendation.yaml
│   │   │   │   │   ├── multistep_arithmetic_two.yaml
│   │   │   │   │   ├── navigate.yaml
│   │   │   │   │   ├── object_counting.yaml
│   │   │   │   │   ├── penguins_in_a_table.yaml
│   │   │   │   │   ├── reasoning_about_colored_objects.yaml
│   │   │   │   │   ├── ruin_names.yaml
│   │   │   │   │   ├── salient_translation_error_detection.yaml
│   │   │   │   │   ├── snarks.yaml
│   │   │   │   │   ├── sports_understanding.yaml
│   │   │   │   │   ├── temporal_sequences.yaml
│   │   │   │   │   ├── tracking_shuffled_objects_five_objects.yaml
│   │   │   │   │   ├── tracking_shuffled_objects_seven_objects.yaml
│   │   │   │   │   ├── tracking_shuffled_objects_three_objects.yaml
│   │   │   │   │   ├── web_of_lies.yaml
│   │   │   │   │   └── word_sorting.yaml
│   │   │   │   ├── fewshot
│   │   │   │   │   ├── _fewshot_template_yaml
│   │   │   │   │   ├── boolean_expressions.yaml
│   │   │   │   │   ├── causal_judgement.yaml
│   │   │   │   │   ├── date_understanding.yaml
│   │   │   │   │   ├── disambiguation_qa.yaml
│   │   │   │   │   ├── dyck_languages.yaml
│   │   │   │   │   ├── formal_fallacies.yaml
│   │   │   │   │   ├── geometric_shapes.yaml
│   │   │   │   │   ├── hyperbaton.yaml
│   │   │   │   │   ├── logical_deduction_five_objects.yaml
│   │   │   │   │   ├── logical_deduction_seven_objects.yaml
│   │   │   │   │   ├── logical_deduction_three_objects.yaml
│   │   │   │   │   ├── movie_recommendation.yaml
│   │   │   │   │   ├── multistep_arithmetic_two.yaml
│   │   │   │   │   ├── navigate.yaml
│   │   │   │   │   ├── object_counting.yaml
│   │   │   │   │   ├── penguins_in_a_table.yaml
│   │   │   │   │   ├── reasoning_about_colored_objects.yaml
│   │   │   │   │   ├── ruin_names.yaml
│   │   │   │   │   ├── salient_translation_error_detection.yaml
│   │   │   │   │   ├── snarks.yaml
│   │   │   │   │   ├── sports_understanding.yaml
│   │   │   │   │   ├── temporal_sequences.yaml
│   │   │   │   │   ├── tracking_shuffled_objects_five_objects.yaml
│   │   │   │   │   ├── tracking_shuffled_objects_seven_objects.yaml
│   │   │   │   │   ├── tracking_shuffled_objects_three_objects.yaml
│   │   │   │   │   ├── web_of_lies.yaml
│   │   │   │   │   └── word_sorting.yaml
│   │   │   │   └── zeroshot
│   │   │   │   │   ├── _zeroshot_template_yaml
│   │   │   │   │   ├── boolean_expressions.yaml
│   │   │   │   │   ├── causal_judgement.yaml
│   │   │   │   │   ├── date_understanding.yaml
│   │   │   │   │   ├── disambiguation_qa.yaml
│   │   │   │   │   ├── dyck_languages.yaml
│   │   │   │   │   ├── formal_fallacies.yaml
│   │   │   │   │   ├── geometric_shapes.yaml
│   │   │   │   │   ├── hyperbaton.yaml
│   │   │   │   │   ├── logical_deduction_five_objects.yaml
│   │   │   │   │   ├── logical_deduction_seven_objects.yaml
│   │   │   │   │   ├── logical_deduction_three_objects.yaml
│   │   │   │   │   ├── movie_recommendation.yaml
│   │   │   │   │   ├── multistep_arithmetic_two.yaml
│   │   │   │   │   ├── navigate.yaml
│   │   │   │   │   ├── object_counting.yaml
│   │   │   │   │   ├── penguins_in_a_table.yaml
│   │   │   │   │   ├── reasoning_about_colored_objects.yaml
│   │   │   │   │   ├── ruin_names.yaml
│   │   │   │   │   ├── salient_translation_error_detection.yaml
│   │   │   │   │   ├── snarks.yaml
│   │   │   │   │   ├── sports_understanding.yaml
│   │   │   │   │   ├── temporal_sequences.yaml
│   │   │   │   │   ├── tracking_shuffled_objects_five_objects.yaml
│   │   │   │   │   ├── tracking_shuffled_objects_seven_objects.yaml
│   │   │   │   │   ├── tracking_shuffled_objects_three_objects.yaml
│   │   │   │   │   ├── web_of_lies.yaml
│   │   │   │   │   └── word_sorting.yaml
│   │   │   ├── belebele
│   │   │   │   ├── README.md
│   │   │   │   ├── _default_template_yaml
│   │   │   │   ├── _generate_configs.py
│   │   │   │   ├── belebele_acm_Arab.yaml
│   │   │   │   ├── belebele_afr_Latn.yaml
│   │   │   │   ├── belebele_als_Latn.yaml
│   │   │   │   ├── belebele_amh_Ethi.yaml
│   │   │   │   ├── belebele_apc_Arab.yaml
│   │   │   │   ├── belebele_arb_Arab.yaml
│   │   │   │   ├── belebele_arb_Latn.yaml
│   │   │   │   ├── belebele_ars_Arab.yaml
│   │   │   │   ├── belebele_ary_Arab.yaml
│   │   │   │   ├── belebele_arz_Arab.yaml
│   │   │   │   ├── belebele_asm_Beng.yaml
│   │   │   │   ├── belebele_azj_Latn.yaml
│   │   │   │   ├── belebele_bam_Latn.yaml
│   │   │   │   ├── belebele_ben_Beng.yaml
│   │   │   │   ├── belebele_ben_Latn.yaml
│   │   │   │   ├── belebele_bod_Tibt.yaml
│   │   │   │   ├── belebele_bul_Cyrl.yaml
│   │   │   │   ├── belebele_cat_Latn.yaml
│   │   │   │   ├── belebele_ceb_Latn.yaml
│   │   │   │   ├── belebele_ces_Latn.yaml
│   │   │   │   ├── belebele_ckb_Arab.yaml
│   │   │   │   ├── belebele_dan_Latn.yaml
│   │   │   │   ├── belebele_deu_Latn.yaml
│   │   │   │   ├── belebele_ell_Grek.yaml
│   │   │   │   ├── belebele_eng_Latn.yaml
│   │   │   │   ├── belebele_est_Latn.yaml
│   │   │   │   ├── belebele_eus_Latn.yaml
│   │   │   │   ├── belebele_fin_Latn.yaml
│   │   │   │   ├── belebele_fra_Latn.yaml
│   │   │   │   ├── belebele_fuv_Latn.yaml
│   │   │   │   ├── belebele_gaz_Latn.yaml
│   │   │   │   ├── belebele_grn_Latn.yaml
│   │   │   │   ├── belebele_guj_Gujr.yaml
│   │   │   │   ├── belebele_hat_Latn.yaml
│   │   │   │   ├── belebele_hau_Latn.yaml
│   │   │   │   ├── belebele_heb_Hebr.yaml
│   │   │   │   ├── belebele_hin_Deva.yaml
│   │   │   │   ├── belebele_hin_Latn.yaml
│   │   │   │   ├── belebele_hrv_Latn.yaml
│   │   │   │   ├── belebele_hun_Latn.yaml
│   │   │   │   ├── belebele_hye_Armn.yaml
│   │   │   │   ├── belebele_ibo_Latn.yaml
│   │   │   │   ├── belebele_ilo_Latn.yaml
│   │   │   │   ├── belebele_ind_Latn.yaml
│   │   │   │   ├── belebele_isl_Latn.yaml
│   │   │   │   ├── belebele_ita_Latn.yaml
│   │   │   │   ├── belebele_jav_Latn.yaml
│   │   │   │   ├── belebele_jpn_Jpan.yaml
│   │   │   │   ├── belebele_kac_Latn.yaml
│   │   │   │   ├── belebele_kan_Knda.yaml
│   │   │   │   ├── belebele_kat_Geor.yaml
│   │   │   │   ├── belebele_kaz_Cyrl.yaml
│   │   │   │   ├── belebele_kea_Latn.yaml
│   │   │   │   ├── belebele_khk_Cyrl.yaml
│   │   │   │   ├── belebele_khm_Khmr.yaml
│   │   │   │   ├── belebele_kin_Latn.yaml
│   │   │   │   ├── belebele_kir_Cyrl.yaml
│   │   │   │   ├── belebele_kor_Hang.yaml
│   │   │   │   ├── belebele_lao_Laoo.yaml
│   │   │   │   ├── belebele_lin_Latn.yaml
│   │   │   │   ├── belebele_lit_Latn.yaml
│   │   │   │   ├── belebele_lug_Latn.yaml
│   │   │   │   ├── belebele_luo_Latn.yaml
│   │   │   │   ├── belebele_lvs_Latn.yaml
│   │   │   │   ├── belebele_mal_Mlym.yaml
│   │   │   │   ├── belebele_mar_Deva.yaml
│   │   │   │   ├── belebele_mkd_Cyrl.yaml
│   │   │   │   ├── belebele_mlt_Latn.yaml
│   │   │   │   ├── belebele_mri_Latn.yaml
│   │   │   │   ├── belebele_mya_Mymr.yaml
│   │   │   │   ├── belebele_nld_Latn.yaml
│   │   │   │   ├── belebele_nob_Latn.yaml
│   │   │   │   ├── belebele_npi_Deva.yaml
│   │   │   │   ├── belebele_npi_Latn.yaml
│   │   │   │   ├── belebele_nso_Latn.yaml
│   │   │   │   ├── belebele_nya_Latn.yaml
│   │   │   │   ├── belebele_ory_Orya.yaml
│   │   │   │   ├── belebele_pan_Guru.yaml
│   │   │   │   ├── belebele_pbt_Arab.yaml
│   │   │   │   ├── belebele_pes_Arab.yaml
│   │   │   │   ├── belebele_plt_Latn.yaml
│   │   │   │   ├── belebele_pol_Latn.yaml
│   │   │   │   ├── belebele_por_Latn.yaml
│   │   │   │   ├── belebele_ron_Latn.yaml
│   │   │   │   ├── belebele_rus_Cyrl.yaml
│   │   │   │   ├── belebele_shn_Mymr.yaml
│   │   │   │   ├── belebele_sin_Latn.yaml
│   │   │   │   ├── belebele_sin_Sinh.yaml
│   │   │   │   ├── belebele_slk_Latn.yaml
│   │   │   │   ├── belebele_slv_Latn.yaml
│   │   │   │   ├── belebele_sna_Latn.yaml
│   │   │   │   ├── belebele_snd_Arab.yaml
│   │   │   │   ├── belebele_som_Latn.yaml
│   │   │   │   ├── belebele_sot_Latn.yaml
│   │   │   │   ├── belebele_spa_Latn.yaml
│   │   │   │   ├── belebele_srp_Cyrl.yaml
│   │   │   │   ├── belebele_ssw_Latn.yaml
│   │   │   │   ├── belebele_sun_Latn.yaml
│   │   │   │   ├── belebele_swe_Latn.yaml
│   │   │   │   ├── belebele_swh_Latn.yaml
│   │   │   │   ├── belebele_tam_Taml.yaml
│   │   │   │   ├── belebele_tel_Telu.yaml
│   │   │   │   ├── belebele_tgk_Cyrl.yaml
│   │   │   │   ├── belebele_tgl_Latn.yaml
│   │   │   │   ├── belebele_tha_Thai.yaml
│   │   │   │   ├── belebele_tir_Ethi.yaml
│   │   │   │   ├── belebele_tsn_Latn.yaml
│   │   │   │   ├── belebele_tso_Latn.yaml
│   │   │   │   ├── belebele_tur_Latn.yaml
│   │   │   │   ├── belebele_ukr_Cyrl.yaml
│   │   │   │   ├── belebele_urd_Arab.yaml
│   │   │   │   ├── belebele_urd_Latn.yaml
│   │   │   │   ├── belebele_uzn_Latn.yaml
│   │   │   │   ├── belebele_vie_Latn.yaml
│   │   │   │   ├── belebele_war_Latn.yaml
│   │   │   │   ├── belebele_wol_Latn.yaml
│   │   │   │   ├── belebele_xho_Latn.yaml
│   │   │   │   ├── belebele_yor_Latn.yaml
│   │   │   │   ├── belebele_zho_Hans.yaml
│   │   │   │   ├── belebele_zho_Hant.yaml
│   │   │   │   ├── belebele_zsm_Latn.yaml
│   │   │   │   └── belebele_zul_Latn.yaml
│   │   │   ├── benchmarks
│   │   │   │   ├── flan
│   │   │   │   │   ├── _held_in_template_yaml
│   │   │   │   │   ├── flan_held_in.yaml
│   │   │   │   │   └── flan_held_out.yaml
│   │   │   │   ├── minerva_math.yaml
│   │   │   │   ├── multimedqa
│   │   │   │   │   ├── README.md
│   │   │   │   │   └── multimedqa.yaml
│   │   │   │   ├── pythia.yaml
│   │   │   │   └── t0_eval.yaml
│   │   │   ├── bigbench
│   │   │   │   ├── README.md
│   │   │   │   ├── generate_tasks.py
│   │   │   │   ├── generate_until
│   │   │   │   │   ├── abstract_narrative_understanding.yaml
│   │   │   │   │   ├── anachronisms.yaml
│   │   │   │   │   ├── analogical_similarity.yaml
│   │   │   │   │   ├── analytic_entailment.yaml
│   │   │   │   │   ├── arithmetic.yaml
│   │   │   │   │   ├── ascii_word_recognition.yaml
│   │   │   │   │   ├── authorship_verification.yaml
│   │   │   │   │   ├── auto_categorization.yaml
│   │   │   │   │   ├── auto_debugging.yaml
│   │   │   │   │   ├── bbq_lite_json.yaml
│   │   │   │   │   ├── bridging_anaphora_resolution_barqa.yaml
│   │   │   │   │   ├── causal_judgment.yaml
│   │   │   │   │   ├── cause_and_effect.yaml
│   │   │   │   │   ├── checkmate_in_one.yaml
│   │   │   │   │   ├── chess_state_tracking.yaml
│   │   │   │   │   ├── chinese_remainder_theorem.yaml
│   │   │   │   │   ├── cifar10_classification.yaml
│   │   │   │   │   ├── code_line_description.yaml
│   │   │   │   │   ├── codenames.yaml
│   │   │   │   │   ├── color.yaml
│   │   │   │   │   ├── common_morpheme.yaml
│   │   │   │   │   ├── conceptual_combinations.yaml
│   │   │   │   │   ├── conlang_translation.yaml
│   │   │   │   │   ├── contextual_parametric_knowledge_conflicts.yaml
│   │   │   │   │   ├── crash_blossom.yaml
│   │   │   │   │   ├── crass_ai.yaml
│   │   │   │   │   ├── cryobiology_spanish.yaml
│   │   │   │   │   ├── cryptonite.yaml
│   │   │   │   │   ├── cs_algorithms.yaml
│   │   │   │   │   ├── dark_humor_detection.yaml
│   │   │   │   │   ├── date_understanding.yaml
│   │   │   │   │   ├── disambiguation_qa.yaml
│   │   │   │   │   ├── discourse_marker_prediction.yaml
│   │   │   │   │   ├── disfl_qa.yaml
│   │   │   │   │   ├── dyck_languages.yaml
│   │   │   │   │   ├── elementary_math_qa.yaml
│   │   │   │   │   ├── emoji_movie.yaml
│   │   │   │   │   ├── emojis_emotion_prediction.yaml
│   │   │   │   │   ├── empirical_judgments.yaml
│   │   │   │   │   ├── english_proverbs.yaml
│   │   │   │   │   ├── english_russian_proverbs.yaml
│   │   │   │   │   ├── entailed_polarity.yaml
│   │   │   │   │   ├── entailed_polarity_hindi.yaml
│   │   │   │   │   ├── epistemic_reasoning.yaml
│   │   │   │   │   ├── evaluating_information_essentiality.yaml
│   │   │   │   │   ├── fact_checker.yaml
│   │   │   │   │   ├── fantasy_reasoning.yaml
│   │   │   │   │   ├── few_shot_nlg.yaml
│   │   │   │   │   ├── figure_of_speech_detection.yaml
│   │   │   │   │   ├── formal_fallacies_syllogisms_negation.yaml
│   │   │   │   │   ├── gem.yaml
│   │   │   │   │   ├── gender_inclusive_sentences_german.yaml
│   │   │   │   │   ├── general_knowledge.yaml
│   │   │   │   │   ├── geometric_shapes.yaml
│   │   │   │   │   ├── goal_step_wikihow.yaml
│   │   │   │   │   ├── gre_reading_comprehension.yaml
│   │   │   │   │   ├── hhh_alignment.yaml
│   │   │   │   │   ├── hindi_question_answering.yaml
│   │   │   │   │   ├── hindu_knowledge.yaml
│   │   │   │   │   ├── hinglish_toxicity.yaml
│   │   │   │   │   ├── human_organs_senses.yaml
│   │   │   │   │   ├── hyperbaton.yaml
│   │   │   │   │   ├── identify_math_theorems.yaml
│   │   │   │   │   ├── identify_odd_metaphor.yaml
│   │   │   │   │   ├── implicatures.yaml
│   │   │   │   │   ├── implicit_relations.yaml
│   │   │   │   │   ├── intent_recognition.yaml
│   │   │   │   │   ├── international_phonetic_alphabet_nli.yaml
│   │   │   │   │   ├── international_phonetic_alphabet_transliterate.yaml
│   │   │   │   │   ├── intersect_geometry.yaml
│   │   │   │   │   ├── irony_identification.yaml
│   │   │   │   │   ├── kanji_ascii.yaml
│   │   │   │   │   ├── kannada.yaml
│   │   │   │   │   ├── key_value_maps.yaml
│   │   │   │   │   ├── known_unknowns.yaml
│   │   │   │   │   ├── language_games.yaml
│   │   │   │   │   ├── language_identification.yaml
│   │   │   │   │   ├── linguistic_mappings.yaml
│   │   │   │   │   ├── linguistics_puzzles.yaml
│   │   │   │   │   ├── list_functions.yaml
│   │   │   │   │   ├── logic_grid_puzzle.yaml
│   │   │   │   │   ├── logical_args.yaml
│   │   │   │   │   ├── logical_deduction.yaml
│   │   │   │   │   ├── logical_fallacy_detection.yaml
│   │   │   │   │   ├── logical_sequence.yaml
│   │   │   │   │   ├── mathematical_induction.yaml
│   │   │   │   │   ├── matrixshapes.yaml
│   │   │   │   │   ├── metaphor_boolean.yaml
│   │   │   │   │   ├── metaphor_understanding.yaml
│   │   │   │   │   ├── minute_mysteries_qa.yaml
│   │   │   │   │   ├── misconceptions.yaml
│   │   │   │   │   ├── misconceptions_russian.yaml
│   │   │   │   │   ├── mnist_ascii.yaml
│   │   │   │   │   ├── modified_arithmetic.yaml
│   │   │   │   │   ├── moral_permissibility.yaml
│   │   │   │   │   ├── movie_dialog_same_or_different.yaml
│   │   │   │   │   ├── movie_recommendation.yaml
│   │   │   │   │   ├── mult_data_wrangling.yaml
│   │   │   │   │   ├── multiemo.yaml
│   │   │   │   │   ├── natural_instructions.yaml
│   │   │   │   │   ├── navigate.yaml
│   │   │   │   │   ├── nonsense_words_grammar.yaml
│   │   │   │   │   ├── novel_concepts.yaml
│   │   │   │   │   ├── object_counting.yaml
│   │   │   │   │   ├── odd_one_out.yaml
│   │   │   │   │   ├── operators.yaml
│   │   │   │   │   ├── paragraph_segmentation.yaml
│   │   │   │   │   ├── parsinlu_qa.yaml
│   │   │   │   │   ├── parsinlu_reading_comprehension.yaml
│   │   │   │   │   ├── penguins_in_a_table.yaml
│   │   │   │   │   ├── periodic_elements.yaml
│   │   │   │   │   ├── persian_idioms.yaml
│   │   │   │   │   ├── phrase_relatedness.yaml
│   │   │   │   │   ├── physical_intuition.yaml
│   │   │   │   │   ├── physics.yaml
│   │   │   │   │   ├── physics_questions.yaml
│   │   │   │   │   ├── play_dialog_same_or_different.yaml
│   │   │   │   │   ├── polish_sequence_labeling.yaml
│   │   │   │   │   ├── presuppositions_as_nli.yaml
│   │   │   │   │   ├── qa_wikidata.yaml
│   │   │   │   │   ├── question_selection.yaml
│   │   │   │   │   ├── real_or_fake_text.yaml
│   │   │   │   │   ├── reasoning_about_colored_objects.yaml
│   │   │   │   │   ├── repeat_copy_logic.yaml
│   │   │   │   │   ├── rephrase.yaml
│   │   │   │   │   ├── riddle_sense.yaml
│   │   │   │   │   ├── ruin_names.yaml
│   │   │   │   │   ├── salient_translation_error_detection.yaml
│   │   │   │   │   ├── scientific_press_release.yaml
│   │   │   │   │   ├── semantic_parsing_in_context_sparc.yaml
│   │   │   │   │   ├── semantic_parsing_spider.yaml
│   │   │   │   │   ├── sentence_ambiguity.yaml
│   │   │   │   │   ├── similarities_abstraction.yaml
│   │   │   │   │   ├── simp_turing_concept.yaml
│   │   │   │   │   ├── simple_arithmetic_json.yaml
│   │   │   │   │   ├── simple_arithmetic_json_multiple_choice.yaml
│   │   │   │   │   ├── simple_arithmetic_json_subtasks.yaml
│   │   │   │   │   ├── simple_arithmetic_multiple_targets_json.yaml
│   │   │   │   │   ├── simple_ethical_questions.yaml
│   │   │   │   │   ├── simple_text_editing.yaml
│   │   │   │   │   ├── snarks.yaml
│   │   │   │   │   ├── social_iqa.yaml
│   │   │   │   │   ├── social_support.yaml
│   │   │   │   │   ├── sports_understanding.yaml
│   │   │   │   │   ├── strange_stories.yaml
│   │   │   │   │   ├── strategyqa.yaml
│   │   │   │   │   ├── sufficient_information.yaml
│   │   │   │   │   ├── suicide_risk.yaml
│   │   │   │   │   ├── swahili_english_proverbs.yaml
│   │   │   │   │   ├── swedish_to_german_proverbs.yaml
│   │   │   │   │   ├── symbol_interpretation.yaml
│   │   │   │   │   ├── temporal_sequences.yaml
│   │   │   │   │   ├── tense.yaml
│   │   │   │   │   ├── timedial.yaml
│   │   │   │   │   ├── topical_chat.yaml
│   │   │   │   │   ├── tracking_shuffled_objects.yaml
│   │   │   │   │   ├── understanding_fables.yaml
│   │   │   │   │   ├── undo_permutation.yaml
│   │   │   │   │   ├── unit_conversion.yaml
│   │   │   │   │   ├── unit_interpretation.yaml
│   │   │   │   │   ├── unnatural_in_context_learning.yaml
│   │   │   │   │   ├── vitaminc_fact_verification.yaml
│   │   │   │   │   ├── what_is_the_tao.yaml
│   │   │   │   │   ├── which_wiki_edit.yaml
│   │   │   │   │   ├── winowhy.yaml
│   │   │   │   │   ├── word_sorting.yaml
│   │   │   │   │   └── word_unscrambling.yaml
│   │   │   │   ├── generate_until_template_yaml
│   │   │   │   ├── multiple_choice
│   │   │   │   │   ├── abstract_narrative_understanding.yaml
│   │   │   │   │   ├── anachronisms.yaml
│   │   │   │   │   ├── analogical_similarity.yaml
│   │   │   │   │   ├── analytic_entailment.yaml
│   │   │   │   │   ├── arithmetic.yaml
│   │   │   │   │   ├── ascii_word_recognition.yaml
│   │   │   │   │   ├── authorship_verification.yaml
│   │   │   │   │   ├── auto_categorization.yaml
│   │   │   │   │   ├── auto_debugging.yaml
│   │   │   │   │   ├── bbq_lite_json.yaml
│   │   │   │   │   ├── bridging_anaphora_resolution_barqa.yaml
│   │   │   │   │   ├── causal_judgement.yaml
│   │   │   │   │   ├── causal_judgment.yaml
│   │   │   │   │   ├── cause_and_effect.yaml
│   │   │   │   │   ├── checkmate_in_one.yaml
│   │   │   │   │   ├── chess_state_tracking.yaml
│   │   │   │   │   ├── chinese_remainder_theorem.yaml
│   │   │   │   │   ├── cifar10_classification.yaml
│   │   │   │   │   ├── code_line_description.yaml
│   │   │   │   │   ├── codenames.yaml
│   │   │   │   │   ├── color.yaml
│   │   │   │   │   ├── common_morpheme.yaml
│   │   │   │   │   ├── conceptual_combinations.yaml
│   │   │   │   │   ├── conlang_translation.yaml
│   │   │   │   │   ├── contextual_parametric_knowledge_conflicts.yaml
│   │   │   │   │   ├── crash_blossom.yaml
│   │   │   │   │   ├── crass_ai.yaml
│   │   │   │   │   ├── cryobiology_spanish.yaml
│   │   │   │   │   ├── cryptonite.yaml
│   │   │   │   │   ├── cs_algorithms.yaml
│   │   │   │   │   ├── dark_humor_detection.yaml
│   │   │   │   │   ├── date_understanding.yaml
│   │   │   │   │   ├── disambiguation_qa.yaml
│   │   │   │   │   ├── discourse_marker_prediction.yaml
│   │   │   │   │   ├── disfl_qa.yaml
│   │   │   │   │   ├── dyck_languages.yaml
│   │   │   │   │   ├── elementary_math_qa.yaml
│   │   │   │   │   ├── emoji_movie.yaml
│   │   │   │   │   ├── emojis_emotion_prediction.yaml
│   │   │   │   │   ├── empirical_judgments.yaml
│   │   │   │   │   ├── english_proverbs.yaml
│   │   │   │   │   ├── english_russian_proverbs.yaml
│   │   │   │   │   ├── entailed_polarity.yaml
│   │   │   │   │   ├── entailed_polarity_hindi.yaml
│   │   │   │   │   ├── epistemic_reasoning.yaml
│   │   │   │   │   ├── evaluating_information_essentiality.yaml
│   │   │   │   │   ├── fact_checker.yaml
│   │   │   │   │   ├── fantasy_reasoning.yaml
│   │   │   │   │   ├── few_shot_nlg.yaml
│   │   │   │   │   ├── figure_of_speech_detection.yaml
│   │   │   │   │   ├── formal_fallacies_syllogisms_negation.yaml
│   │   │   │   │   ├── gem.yaml
│   │   │   │   │   ├── gender_inclusive_sentences_german.yaml
│   │   │   │   │   ├── general_knowledge.yaml
│   │   │   │   │   ├── geometric_shapes.yaml
│   │   │   │   │   ├── goal_step_wikihow.yaml
│   │   │   │   │   ├── gre_reading_comprehension.yaml
│   │   │   │   │   ├── hhh_alignment.yaml
│   │   │   │   │   ├── hindi_question_answering.yaml
│   │   │   │   │   ├── hindu_knowledge.yaml
│   │   │   │   │   ├── hinglish_toxicity.yaml
│   │   │   │   │   ├── human_organs_senses.yaml
│   │   │   │   │   ├── hyperbaton.yaml
│   │   │   │   │   ├── identify_math_theorems.yaml
│   │   │   │   │   ├── identify_odd_metaphor.yaml
│   │   │   │   │   ├── implicatures.yaml
│   │   │   │   │   ├── implicit_relations.yaml
│   │   │   │   │   ├── intent_recognition.yaml
│   │   │   │   │   ├── international_phonetic_alphabet_nli.yaml
│   │   │   │   │   ├── international_phonetic_alphabet_transliterate.yaml
│   │   │   │   │   ├── intersect_geometry.yaml
│   │   │   │   │   ├── irony_identification.yaml
│   │   │   │   │   ├── kanji_ascii.yaml
│   │   │   │   │   ├── kannada.yaml
│   │   │   │   │   ├── key_value_maps.yaml
│   │   │   │   │   ├── known_unknowns.yaml
│   │   │   │   │   ├── language_games.yaml
│   │   │   │   │   ├── language_identification.yaml
│   │   │   │   │   ├── linguistic_mappings.yaml
│   │   │   │   │   ├── linguistics_puzzles.yaml
│   │   │   │   │   ├── list_functions.yaml
│   │   │   │   │   ├── logic_grid_puzzle.yaml
│   │   │   │   │   ├── logical_args.yaml
│   │   │   │   │   ├── logical_deduction.yaml
│   │   │   │   │   ├── logical_fallacy_detection.yaml
│   │   │   │   │   ├── logical_sequence.yaml
│   │   │   │   │   ├── mathematical_induction.yaml
│   │   │   │   │   ├── matrixshapes.yaml
│   │   │   │   │   ├── metaphor_boolean.yaml
│   │   │   │   │   ├── metaphor_understanding.yaml
│   │   │   │   │   ├── minute_mysteries_qa.yaml
│   │   │   │   │   ├── misconceptions.yaml
│   │   │   │   │   ├── misconceptions_russian.yaml
│   │   │   │   │   ├── mnist_ascii.yaml
│   │   │   │   │   ├── modified_arithmetic.yaml
│   │   │   │   │   ├── moral_permissibility.yaml
│   │   │   │   │   ├── movie_dialog_same_or_different.yaml
│   │   │   │   │   ├── movie_recommendation.yaml
│   │   │   │   │   ├── mult_data_wrangling.yaml
│   │   │   │   │   ├── multiemo.yaml
│   │   │   │   │   ├── natural_instructions.yaml
│   │   │   │   │   ├── navigate.yaml
│   │   │   │   │   ├── nonsense_words_grammar.yaml
│   │   │   │   │   ├── novel_concepts.yaml
│   │   │   │   │   ├── object_counting.yaml
│   │   │   │   │   ├── odd_one_out.yaml
│   │   │   │   │   ├── operators.yaml
│   │   │   │   │   ├── paragraph_segmentation.yaml
│   │   │   │   │   ├── parsinlu_qa.yaml
│   │   │   │   │   ├── parsinlu_reading_comprehension.yaml
│   │   │   │   │   ├── penguins_in_a_table.yaml
│   │   │   │   │   ├── periodic_elements.yaml
│   │   │   │   │   ├── persian_idioms.yaml
│   │   │   │   │   ├── phrase_relatedness.yaml
│   │   │   │   │   ├── physical_intuition.yaml
│   │   │   │   │   ├── physics.yaml
│   │   │   │   │   ├── physics_questions.yaml
│   │   │   │   │   ├── play_dialog_same_or_different.yaml
│   │   │   │   │   ├── polish_sequence_labeling.yaml
│   │   │   │   │   ├── presuppositions_as_nli.yaml
│   │   │   │   │   ├── qa_wikidata.yaml
│   │   │   │   │   ├── question_selection.yaml
│   │   │   │   │   ├── real_or_fake_text.yaml
│   │   │   │   │   ├── reasoning_about_colored_objects.yaml
│   │   │   │   │   ├── repeat_copy_logic.yaml
│   │   │   │   │   ├── rephrase.yaml
│   │   │   │   │   ├── riddle_sense.yaml
│   │   │   │   │   ├── ruin_names.yaml
│   │   │   │   │   ├── salient_translation_error_detection.yaml
│   │   │   │   │   ├── scientific_press_release.yaml
│   │   │   │   │   ├── semantic_parsing_in_context_sparc.yaml
│   │   │   │   │   ├── semantic_parsing_spider.yaml
│   │   │   │   │   ├── sentence_ambiguity.yaml
│   │   │   │   │   ├── similarities_abstraction.yaml
│   │   │   │   │   ├── simp_turing_concept.yaml
│   │   │   │   │   ├── simple_arithmetic_json.yaml
│   │   │   │   │   ├── simple_arithmetic_json_multiple_choice.yaml
│   │   │   │   │   ├── simple_arithmetic_json_subtasks.yaml
│   │   │   │   │   ├── simple_arithmetic_multiple_targets_json.yaml
│   │   │   │   │   ├── simple_ethical_questions.yaml
│   │   │   │   │   ├── simple_text_editing.yaml
│   │   │   │   │   ├── snarks.yaml
│   │   │   │   │   ├── social_iqa.yaml
│   │   │   │   │   ├── social_support.yaml
│   │   │   │   │   ├── sports_understanding.yaml
│   │   │   │   │   ├── strange_stories.yaml
│   │   │   │   │   ├── strategyqa.yaml
│   │   │   │   │   ├── sufficient_information.yaml
│   │   │   │   │   ├── suicide_risk.yaml
│   │   │   │   │   ├── swahili_english_proverbs.yaml
│   │   │   │   │   ├── swedish_to_german_proverbs.yaml
│   │   │   │   │   ├── symbol_interpretation.yaml
│   │   │   │   │   ├── temporal_sequences.yaml
│   │   │   │   │   ├── tense.yaml
│   │   │   │   │   ├── timedial.yaml
│   │   │   │   │   ├── topical_chat.yaml
│   │   │   │   │   ├── tracking_shuffled_objects.yaml
│   │   │   │   │   ├── understanding_fables.yaml
│   │   │   │   │   ├── undo_permutation.yaml
│   │   │   │   │   ├── unit_conversion.yaml
│   │   │   │   │   ├── unit_interpretation.yaml
│   │   │   │   │   ├── unnatural_in_context_learning.yaml
│   │   │   │   │   ├── vitaminc_fact_verification.yaml
│   │   │   │   │   ├── what_is_the_tao.yaml
│   │   │   │   │   ├── which_wiki_edit.yaml
│   │   │   │   │   ├── winowhy.yaml
│   │   │   │   │   ├── word_sorting.yaml
│   │   │   │   │   └── word_unscrambling.yaml
│   │   │   │   ├── multiple_choice_template_yaml
│   │   │   │   └── push_bigbench_dataset.py
│   │   │   ├── blimp
│   │   │   │   ├── README.md
│   │   │   │   ├── _template_yaml
│   │   │   │   ├── adjunct_island.yaml
│   │   │   │   ├── anaphor_gender_agreement.yaml
│   │   │   │   ├── anaphor_number_agreement.yaml
│   │   │   │   ├── animate_subject_passive.yaml
│   │   │   │   ├── animate_subject_trans.yaml
│   │   │   │   ├── causative.yaml
│   │   │   │   ├── complex_NP_island.yaml
│   │   │   │   ├── coordinate_structure_constraint_complex_left_branch.yaml
│   │   │   │   ├── coordinate_structure_constraint_object_extraction.yaml
│   │   │   │   ├── determiner_noun_agreement_1.yaml
│   │   │   │   ├── determiner_noun_agreement_2.yaml
│   │   │   │   ├── determiner_noun_agreement_irregular_1.yaml
│   │   │   │   ├── determiner_noun_agreement_irregular_2.yaml
│   │   │   │   ├── determiner_noun_agreement_with_adj_2.yaml
│   │   │   │   ├── determiner_noun_agreement_with_adj_irregular_1.yaml
│   │   │   │   ├── determiner_noun_agreement_with_adj_irregular_2.yaml
│   │   │   │   ├── determiner_noun_agreement_with_adjective_1.yaml
│   │   │   │   ├── distractor_agreement_relational_noun.yaml
│   │   │   │   ├── distractor_agreement_relative_clause.yaml
│   │   │   │   ├── drop_argument.yaml
│   │   │   │   ├── ellipsis_n_bar_1.yaml
│   │   │   │   ├── ellipsis_n_bar_2.yaml
│   │   │   │   ├── existential_there_object_raising.yaml
│   │   │   │   ├── existential_there_quantifiers_1.yaml
│   │   │   │   ├── existential_there_quantifiers_2.yaml
│   │   │   │   ├── existential_there_subject_raising.yaml
│   │   │   │   ├── expletive_it_object_raising.yaml
│   │   │   │   ├── generate_configs.py
│   │   │   │   ├── inchoative.yaml
│   │   │   │   ├── intransitive.yaml
│   │   │   │   ├── irregular_past_participle_adjectives.yaml
│   │   │   │   ├── irregular_past_participle_verbs.yaml
│   │   │   │   ├── irregular_plural_subject_verb_agreement_1.yaml
│   │   │   │   ├── irregular_plural_subject_verb_agreement_2.yaml
│   │   │   │   ├── left_branch_island_echo_question.yaml
│   │   │   │   ├── left_branch_island_simple_question.yaml
│   │   │   │   ├── matrix_question_npi_licensor_present.yaml
│   │   │   │   ├── npi_present_1.yaml
│   │   │   │   ├── npi_present_2.yaml
│   │   │   │   ├── only_npi_licensor_present.yaml
│   │   │   │   ├── only_npi_scope.yaml
│   │   │   │   ├── passive_1.yaml
│   │   │   │   ├── passive_2.yaml
│   │   │   │   ├── principle_A_c_command.yaml
│   │   │   │   ├── principle_A_case_1.yaml
│   │   │   │   ├── principle_A_case_2.yaml
│   │   │   │   ├── principle_A_domain_1.yaml
│   │   │   │   ├── principle_A_domain_2.yaml
│   │   │   │   ├── principle_A_domain_3.yaml
│   │   │   │   ├── principle_A_reconstruction.yaml
│   │   │   │   ├── regular_plural_subject_verb_agreement_1.yaml
│   │   │   │   ├── regular_plural_subject_verb_agreement_2.yaml
│   │   │   │   ├── sentential_negation_npi_licensor_present.yaml
│   │   │   │   ├── sentential_negation_npi_scope.yaml
│   │   │   │   ├── sentential_subject_island.yaml
│   │   │   │   ├── superlative_quantifiers_1.yaml
│   │   │   │   ├── superlative_quantifiers_2.yaml
│   │   │   │   ├── tough_vs_raising_1.yaml
│   │   │   │   ├── tough_vs_raising_2.yaml
│   │   │   │   ├── transitive.yaml
│   │   │   │   ├── wh_island.yaml
│   │   │   │   ├── wh_questions_object_gap.yaml
│   │   │   │   ├── wh_questions_subject_gap.yaml
│   │   │   │   ├── wh_questions_subject_gap_long_distance.yaml
│   │   │   │   ├── wh_vs_that_no_gap.yaml
│   │   │   │   ├── wh_vs_that_no_gap_long_distance.yaml
│   │   │   │   ├── wh_vs_that_with_gap.yaml
│   │   │   │   └── wh_vs_that_with_gap_long_distance.yaml
│   │   │   ├── ceval
│   │   │   │   ├── README.md
│   │   │   │   ├── _default_ceval_yaml
│   │   │   │   ├── _generate_configs.py
│   │   │   │   ├── ceval-valid_accountant.yaml
│   │   │   │   ├── ceval-valid_advanced_mathematics.yaml
│   │   │   │   ├── ceval-valid_art_studies.yaml
│   │   │   │   ├── ceval-valid_basic_medicine.yaml
│   │   │   │   ├── ceval-valid_business_administration.yaml
│   │   │   │   ├── ceval-valid_chinese_language_and_literature.yaml
│   │   │   │   ├── ceval-valid_civil_servant.yaml
│   │   │   │   ├── ceval-valid_clinical_medicine.yaml
│   │   │   │   ├── ceval-valid_college_chemistry.yaml
│   │   │   │   ├── ceval-valid_college_economics.yaml
│   │   │   │   ├── ceval-valid_college_physics.yaml
│   │   │   │   ├── ceval-valid_college_programming.yaml
│   │   │   │   ├── ceval-valid_computer_architecture.yaml
│   │   │   │   ├── ceval-valid_computer_network.yaml
│   │   │   │   ├── ceval-valid_discrete_mathematics.yaml
│   │   │   │   ├── ceval-valid_education_science.yaml
│   │   │   │   ├── ceval-valid_electrical_engineer.yaml
│   │   │   │   ├── ceval-valid_environmental_impact_assessment_engineer.yaml
│   │   │   │   ├── ceval-valid_fire_engineer.yaml
│   │   │   │   ├── ceval-valid_high_school_biology.yaml
│   │   │   │   ├── ceval-valid_high_school_chemistry.yaml
│   │   │   │   ├── ceval-valid_high_school_chinese.yaml
│   │   │   │   ├── ceval-valid_high_school_geography.yaml
│   │   │   │   ├── ceval-valid_high_school_history.yaml
│   │   │   │   ├── ceval-valid_high_school_mathematics.yaml
│   │   │   │   ├── ceval-valid_high_school_physics.yaml
│   │   │   │   ├── ceval-valid_high_school_politics.yaml
│   │   │   │   ├── ceval-valid_ideological_and_moral_cultivation.yaml
│   │   │   │   ├── ceval-valid_law.yaml
│   │   │   │   ├── ceval-valid_legal_professional.yaml
│   │   │   │   ├── ceval-valid_logic.yaml
│   │   │   │   ├── ceval-valid_mao_zedong_thought.yaml
│   │   │   │   ├── ceval-valid_marxism.yaml
│   │   │   │   ├── ceval-valid_metrology_engineer.yaml
│   │   │   │   ├── ceval-valid_middle_school_biology.yaml
│   │   │   │   ├── ceval-valid_middle_school_chemistry.yaml
│   │   │   │   ├── ceval-valid_middle_school_geography.yaml
│   │   │   │   ├── ceval-valid_middle_school_history.yaml
│   │   │   │   ├── ceval-valid_middle_school_mathematics.yaml
│   │   │   │   ├── ceval-valid_middle_school_physics.yaml
│   │   │   │   ├── ceval-valid_middle_school_politics.yaml
│   │   │   │   ├── ceval-valid_modern_chinese_history.yaml
│   │   │   │   ├── ceval-valid_operating_system.yaml
│   │   │   │   ├── ceval-valid_physician.yaml
│   │   │   │   ├── ceval-valid_plant_protection.yaml
│   │   │   │   ├── ceval-valid_probability_and_statistics.yaml
│   │   │   │   ├── ceval-valid_professional_tour_guide.yaml
│   │   │   │   ├── ceval-valid_sports_science.yaml
│   │   │   │   ├── ceval-valid_tax_accountant.yaml
│   │   │   │   ├── ceval-valid_teacher_qualification.yaml
│   │   │   │   ├── ceval-valid_urban_and_rural_planner.yaml
│   │   │   │   └── ceval-valid_veterinary_medicine.yaml
│   │   │   ├── cmmlu
│   │   │   │   ├── README.md
│   │   │   │   ├── _default_template_yaml
│   │   │   │   ├── _generate_configs.py
│   │   │   │   ├── cmmlu_default_agronomy.yaml
│   │   │   │   ├── cmmlu_default_anatomy.yaml
│   │   │   │   ├── cmmlu_default_ancient_chinese.yaml
│   │   │   │   ├── cmmlu_default_arts.yaml
│   │   │   │   ├── cmmlu_default_astronomy.yaml
│   │   │   │   ├── cmmlu_default_business_ethics.yaml
│   │   │   │   ├── cmmlu_default_chinese_civil_service_exam.yaml
│   │   │   │   ├── cmmlu_default_chinese_driving_rule.yaml
│   │   │   │   ├── cmmlu_default_chinese_food_culture.yaml
│   │   │   │   ├── cmmlu_default_chinese_foreign_policy.yaml
│   │   │   │   ├── cmmlu_default_chinese_history.yaml
│   │   │   │   ├── cmmlu_default_chinese_literature.yaml
│   │   │   │   ├── cmmlu_default_chinese_teacher_qualification.yaml
│   │   │   │   ├── cmmlu_default_clinical_knowledge.yaml
│   │   │   │   ├── cmmlu_default_college_actuarial_science.yaml
│   │   │   │   ├── cmmlu_default_college_education.yaml
│   │   │   │   ├── cmmlu_default_college_engineering_hydrology.yaml
│   │   │   │   ├── cmmlu_default_college_law.yaml
│   │   │   │   ├── cmmlu_default_college_mathematics.yaml
│   │   │   │   ├── cmmlu_default_college_medical_statistics.yaml
│   │   │   │   ├── cmmlu_default_college_medicine.yaml
│   │   │   │   ├── cmmlu_default_computer_science.yaml
│   │   │   │   ├── cmmlu_default_computer_security.yaml
│   │   │   │   ├── cmmlu_default_conceptual_physics.yaml
│   │   │   │   ├── cmmlu_default_construction_project_management.yaml
│   │   │   │   ├── cmmlu_default_economics.yaml
│   │   │   │   ├── cmmlu_default_education.yaml
│   │   │   │   ├── cmmlu_default_electrical_engineering.yaml
│   │   │   │   ├── cmmlu_default_elementary_chinese.yaml
│   │   │   │   ├── cmmlu_default_elementary_commonsense.yaml
│   │   │   │   ├── cmmlu_default_elementary_information_and_technology.yaml
│   │   │   │   ├── cmmlu_default_elementary_mathematics.yaml
│   │   │   │   ├── cmmlu_default_ethnology.yaml
│   │   │   │   ├── cmmlu_default_food_science.yaml
│   │   │   │   ├── cmmlu_default_genetics.yaml
│   │   │   │   ├── cmmlu_default_global_facts.yaml
│   │   │   │   ├── cmmlu_default_high_school_biology.yaml
│   │   │   │   ├── cmmlu_default_high_school_chemistry.yaml
│   │   │   │   ├── cmmlu_default_high_school_geography.yaml
│   │   │   │   ├── cmmlu_default_high_school_mathematics.yaml
│   │   │   │   ├── cmmlu_default_high_school_physics.yaml
│   │   │   │   ├── cmmlu_default_high_school_politics.yaml
│   │   │   │   ├── cmmlu_default_human_sexuality.yaml
│   │   │   │   ├── cmmlu_default_international_law.yaml
│   │   │   │   ├── cmmlu_default_journalism.yaml
│   │   │   │   ├── cmmlu_default_jurisprudence.yaml
│   │   │   │   ├── cmmlu_default_legal_and_moral_basis.yaml
│   │   │   │   ├── cmmlu_default_logical.yaml
│   │   │   │   ├── cmmlu_default_machine_learning.yaml
│   │   │   │   ├── cmmlu_default_management.yaml
│   │   │   │   ├── cmmlu_default_marketing.yaml
│   │   │   │   ├── cmmlu_default_marxist_theory.yaml
│   │   │   │   ├── cmmlu_default_modern_chinese.yaml
│   │   │   │   ├── cmmlu_default_nutrition.yaml
│   │   │   │   ├── cmmlu_default_philosophy.yaml
│   │   │   │   ├── cmmlu_default_professional_accounting.yaml
│   │   │   │   ├── cmmlu_default_professional_law.yaml
│   │   │   │   ├── cmmlu_default_professional_medicine.yaml
│   │   │   │   ├── cmmlu_default_professional_psychology.yaml
│   │   │   │   ├── cmmlu_default_public_relations.yaml
│   │   │   │   ├── cmmlu_default_security_study.yaml
│   │   │   │   ├── cmmlu_default_sociology.yaml
│   │   │   │   ├── cmmlu_default_sports_science.yaml
│   │   │   │   ├── cmmlu_default_traditional_chinese_medicine.yaml
│   │   │   │   ├── cmmlu_default_virology.yaml
│   │   │   │   ├── cmmlu_default_world_history.yaml
│   │   │   │   └── cmmlu_default_world_religions.yaml
│   │   │   ├── code_x_glue
│   │   │   │   └── code-text
│   │   │   │   │   ├── bleu.py
│   │   │   │   │   ├── go.yaml
│   │   │   │   │   ├── java.yaml
│   │   │   │   │   ├── javascript.yaml
│   │   │   │   │   ├── php.yaml
│   │   │   │   │   ├── python.yaml
│   │   │   │   │   ├── ruby.yaml
│   │   │   │   │   └── utils.py
│   │   │   ├── coqa
│   │   │   │   ├── README.md
│   │   │   │   ├── default.yaml
│   │   │   │   └── utils.py
│   │   │   ├── crows_pairs
│   │   │   │   ├── README.md
│   │   │   │   ├── crows_pairs_english.yaml
│   │   │   │   ├── crows_pairs_english_age.yaml
│   │   │   │   ├── crows_pairs_english_autre.yaml
│   │   │   │   ├── crows_pairs_english_disability.yaml
│   │   │   │   ├── crows_pairs_english_gender.yaml
│   │   │   │   ├── crows_pairs_english_nationality.yaml
│   │   │   │   ├── crows_pairs_english_physical_appearance.yaml
│   │   │   │   ├── crows_pairs_english_race_color.yaml
│   │   │   │   ├── crows_pairs_english_religion.yaml
│   │   │   │   ├── crows_pairs_english_sexual_orientation.yaml
│   │   │   │   ├── crows_pairs_english_socioeconomic.yaml
│   │   │   │   ├── crows_pairs_french.yaml
│   │   │   │   ├── crows_pairs_french_age.yaml
│   │   │   │   ├── crows_pairs_french_autre.yaml
│   │   │   │   ├── crows_pairs_french_disability.yaml
│   │   │   │   ├── crows_pairs_french_gender.yaml
│   │   │   │   ├── crows_pairs_french_nationality.yaml
│   │   │   │   ├── crows_pairs_french_physical_appearance.yaml
│   │   │   │   ├── crows_pairs_french_race_color.yaml
│   │   │   │   ├── crows_pairs_french_religion.yaml
│   │   │   │   ├── crows_pairs_french_sexual_orientation.yaml
│   │   │   │   ├── crows_pairs_french_socioeconomic.yaml
│   │   │   │   └── utils.py
│   │   │   ├── csatqa
│   │   │   │   ├── _default_csatqa_yaml
│   │   │   │   ├── _generate_configs.py
│   │   │   │   ├── csatqa_gr.yaml
│   │   │   │   ├── csatqa_li.yaml
│   │   │   │   ├── csatqa_rch.yaml
│   │   │   │   ├── csatqa_rcs.yaml
│   │   │   │   ├── csatqa_rcss.yaml
│   │   │   │   ├── csatqa_wr.yaml
│   │   │   │   └── utils.py
│   │   │   ├── drop
│   │   │   │   ├── README.md
│   │   │   │   ├── default.yaml
│   │   │   │   └── utils.py
│   │   │   ├── fld
│   │   │   │   ├── README.md
│   │   │   │   ├── fld_default.yaml
│   │   │   │   └── fld_star.yaml
│   │   │   ├── glue
│   │   │   │   ├── README.md
│   │   │   │   ├── cola
│   │   │   │   │   └── default.yaml
│   │   │   │   ├── mnli
│   │   │   │   │   ├── default.yaml
│   │   │   │   │   ├── mismatch.yaml
│   │   │   │   │   └── utils.py
│   │   │   │   ├── mrpc
│   │   │   │   │   └── default.yaml
│   │   │   │   ├── qnli
│   │   │   │   │   └── default.yaml
│   │   │   │   ├── qqp
│   │   │   │   │   └── default.yaml
│   │   │   │   ├── rte
│   │   │   │   │   └── default.yaml
│   │   │   │   ├── sst2
│   │   │   │   │   └── default.yaml
│   │   │   │   └── wnli
│   │   │   │   │   └── default.yaml
│   │   │   ├── gsm8k
│   │   │   │   ├── README.md
│   │   │   │   ├── gsm8k-cot-self-consistency.yaml
│   │   │   │   ├── gsm8k-cot.yaml
│   │   │   │   └── gsm8k.yaml
│   │   │   ├── headqa
│   │   │   │   ├── README.md
│   │   │   │   ├── headqa_en.yaml
│   │   │   │   └── headqa_es.yaml
│   │   │   ├── hellaswag
│   │   │   │   ├── README.md
│   │   │   │   ├── hellaswag.yaml
│   │   │   │   └── utils.py
│   │   │   ├── hendrycks_ethics
│   │   │   │   ├── README.md
│   │   │   │   ├── commonsense.yaml
│   │   │   │   ├── deontology.yaml
│   │   │   │   ├── justice.yaml
│   │   │   │   ├── utilitarianism.yaml
│   │   │   │   ├── utilitarianism_original_yaml
│   │   │   │   ├── utils.py
│   │   │   │   └── virtue.yaml
│   │   │   ├── ifeval
│   │   │   │   ├── README.md
│   │   │   │   ├── ifeval.yaml
│   │   │   │   ├── instructions.py
│   │   │   │   ├── instructions_registry.py
│   │   │   │   ├── instructions_util.py
│   │   │   │   └── utils.py
│   │   │   ├── kmmlu
│   │   │   │   ├── README.md
│   │   │   │   ├── _default_kmmlu_yaml
│   │   │   │   ├── kmmlu_accounting.yaml
│   │   │   │   ├── kmmlu_agricultural_sciences.yaml
│   │   │   │   ├── kmmlu_aviation_engineering_and_maintenance.yaml
│   │   │   │   ├── kmmlu_biology.yaml
│   │   │   │   ├── kmmlu_chemical_engineering.yaml
│   │   │   │   ├── kmmlu_chemistry.yaml
│   │   │   │   ├── kmmlu_civil_engineering.yaml
│   │   │   │   ├── kmmlu_computer_science.yaml
│   │   │   │   ├── kmmlu_construction.yaml
│   │   │   │   ├── kmmlu_criminal_law.yaml
│   │   │   │   ├── kmmlu_ecology.yaml
│   │   │   │   ├── kmmlu_economics.yaml
│   │   │   │   ├── kmmlu_education.yaml
│   │   │   │   ├── kmmlu_electrical_engineering.yaml
│   │   │   │   ├── kmmlu_electronics_engineering.yaml
│   │   │   │   ├── kmmlu_energy_management.yaml
│   │   │   │   ├── kmmlu_environmental_science.yaml
│   │   │   │   ├── kmmlu_fashion.yaml
│   │   │   │   ├── kmmlu_food_processing.yaml
│   │   │   │   ├── kmmlu_gas_technology_and_engineering.yaml
│   │   │   │   ├── kmmlu_geomatics.yaml
│   │   │   │   ├── kmmlu_health.yaml
│   │   │   │   ├── kmmlu_industrial_engineer.yaml
│   │   │   │   ├── kmmlu_information_technology.yaml
│   │   │   │   ├── kmmlu_interior_architecture_and_design.yaml
│   │   │   │   ├── kmmlu_law.yaml
│   │   │   │   ├── kmmlu_machine_design_and_manufacturing.yaml
│   │   │   │   ├── kmmlu_management.yaml
│   │   │   │   ├── kmmlu_maritime_engineering.yaml
│   │   │   │   ├── kmmlu_marketing.yaml
│   │   │   │   ├── kmmlu_materials_engineering.yaml
│   │   │   │   ├── kmmlu_mechanical_engineering.yaml
│   │   │   │   ├── kmmlu_nondestructive_testing.yaml
│   │   │   │   ├── kmmlu_patent.yaml
│   │   │   │   ├── kmmlu_political_science_and_sociology.yaml
│   │   │   │   ├── kmmlu_psychology.yaml
│   │   │   │   ├── kmmlu_public_safety.yaml
│   │   │   │   ├── kmmlu_railway_and_automotive_engineering.yaml
│   │   │   │   ├── kmmlu_real_estate.yaml
│   │   │   │   ├── kmmlu_refrigerating_machinery.yaml
│   │   │   │   ├── kmmlu_social_welfare.yaml
│   │   │   │   ├── kmmlu_taxation.yaml
│   │   │   │   └── kmmlu_telecommunications_and_wireless_technology.yaml
│   │   │   ├── kobest
│   │   │   │   ├── README.md
│   │   │   │   ├── kobest_boolq.yaml
│   │   │   │   ├── kobest_copa.yaml
│   │   │   │   ├── kobest_hellaswag.yaml
│   │   │   │   ├── kobest_sentineg.yaml
│   │   │   │   ├── kobest_wic.yaml
│   │   │   │   └── utils.py
│   │   │   ├── lambada
│   │   │   │   ├── README.md
│   │   │   │   ├── lambada_openai.yaml
│   │   │   │   └── lambada_standard.yaml
│   │   │   ├── lambada_cloze
│   │   │   │   ├── README.md
│   │   │   │   ├── lambada_openai_cloze.yaml
│   │   │   │   └── lambada_standard_cloze.yaml
│   │   │   ├── lambada_multilingual
│   │   │   │   ├── README.md
│   │   │   │   ├── lambada_mt_de.yaml
│   │   │   │   ├── lambada_mt_en.yaml
│   │   │   │   ├── lambada_mt_es.yaml
│   │   │   │   ├── lambada_mt_fr.yaml
│   │   │   │   └── lambada_mt_it.yaml
│   │   │   ├── logiqa
│   │   │   │   ├── README.md
│   │   │   │   ├── logiqa.yaml
│   │   │   │   └── utils_logiqa.py
│   │   │   ├── logiqa2
│   │   │   │   ├── README.md
│   │   │   │   ├── logieval.yaml
│   │   │   │   ├── logiqa2.yaml
│   │   │   │   └── utils_logiqa2.py
│   │   │   ├── mathqa
│   │   │   │   ├── README.md
│   │   │   │   ├── mathqa.yaml
│   │   │   │   └── utils.py
│   │   │   ├── mc_taco
│   │   │   │   ├── README.md
│   │   │   │   └── default.yaml
│   │   │   ├── medmcqa
│   │   │   │   ├── medmcqa.yaml
│   │   │   │   └── utils_medmcqa.py
│   │   │   ├── medqa
│   │   │   │   ├── medqa.yaml
│   │   │   │   └── preprocess_medqa.py
│   │   │   ├── mgsm
│   │   │   │   ├── README.md
│   │   │   │   ├── direct
│   │   │   │   │   ├── direct_yaml
│   │   │   │   │   ├── mgsm_direct_bn.yaml
│   │   │   │   │   ├── mgsm_direct_de.yaml
│   │   │   │   │   ├── mgsm_direct_en.yaml
│   │   │   │   │   ├── mgsm_direct_es.yaml
│   │   │   │   │   ├── mgsm_direct_fr.yaml
│   │   │   │   │   ├── mgsm_direct_ja.yaml
│   │   │   │   │   ├── mgsm_direct_ru.yaml
│   │   │   │   │   ├── mgsm_direct_sw.yaml
│   │   │   │   │   ├── mgsm_direct_te.yaml
│   │   │   │   │   ├── mgsm_direct_th.yaml
│   │   │   │   │   └── mgsm_direct_zh.yaml
│   │   │   │   ├── en_cot
│   │   │   │   │   ├── cot_yaml
│   │   │   │   │   ├── mgsm_bn_en-cot.yaml
│   │   │   │   │   ├── mgsm_de_en-cot.yaml
│   │   │   │   │   ├── mgsm_en_en-cot.yaml
│   │   │   │   │   ├── mgsm_es_en-cot.yaml
│   │   │   │   │   ├── mgsm_fr_en-cot.yaml
│   │   │   │   │   ├── mgsm_ja_en-cot.yaml
│   │   │   │   │   ├── mgsm_ru_en-cot.yaml
│   │   │   │   │   ├── mgsm_sw_en-cot.yaml
│   │   │   │   │   ├── mgsm_te_en-cot.yaml
│   │   │   │   │   ├── mgsm_th_en-cot.yaml
│   │   │   │   │   └── mgsm_zh_en-cot.yaml
│   │   │   │   ├── native_cot
│   │   │   │   │   ├── cot_yaml
│   │   │   │   │   ├── mgsm_cot_native_bn.yaml
│   │   │   │   │   ├── mgsm_cot_native_de.yaml
│   │   │   │   │   ├── mgsm_cot_native_en.yaml
│   │   │   │   │   ├── mgsm_cot_native_es.yaml
│   │   │   │   │   ├── mgsm_cot_native_fr.yaml
│   │   │   │   │   ├── mgsm_cot_native_ja.yaml
│   │   │   │   │   ├── mgsm_cot_native_ru.yaml
│   │   │   │   │   ├── mgsm_cot_native_sw.yaml
│   │   │   │   │   ├── mgsm_cot_native_te.yaml
│   │   │   │   │   ├── mgsm_cot_native_th.yaml
│   │   │   │   │   └── mgsm_cot_native_zh.yaml
│   │   │   │   └── utils.py
│   │   │   ├── minerva_math
│   │   │   │   ├── README.md
│   │   │   │   ├── minerva_math_algebra.yaml
│   │   │   │   ├── minerva_math_counting_and_prob.yaml
│   │   │   │   ├── minerva_math_geometry.yaml
│   │   │   │   ├── minerva_math_intermediate_algebra.yaml
│   │   │   │   ├── minerva_math_num_theory.yaml
│   │   │   │   ├── minerva_math_prealgebra.yaml
│   │   │   │   ├── minerva_math_precalc.yaml
│   │   │   │   └── utils.py
│   │   │   ├── mmlu
│   │   │   │   ├── _generate_configs.py
│   │   │   │   ├── default
│   │   │   │   │   ├── _default_template_yaml
│   │   │   │   │   ├── _mmlu.yaml
│   │   │   │   │   ├── mmlu_abstract_algebra.yaml
│   │   │   │   │   ├── mmlu_anatomy.yaml
│   │   │   │   │   ├── mmlu_astronomy.yaml
│   │   │   │   │   ├── mmlu_business_ethics.yaml
│   │   │   │   │   ├── mmlu_clinical_knowledge.yaml
│   │   │   │   │   ├── mmlu_college_biology.yaml
│   │   │   │   │   ├── mmlu_college_chemistry.yaml
│   │   │   │   │   ├── mmlu_college_computer_science.yaml
│   │   │   │   │   ├── mmlu_college_mathematics.yaml
│   │   │   │   │   ├── mmlu_college_medicine.yaml
│   │   │   │   │   ├── mmlu_college_physics.yaml
│   │   │   │   │   ├── mmlu_computer_security.yaml
│   │   │   │   │   ├── mmlu_conceptual_physics.yaml
│   │   │   │   │   ├── mmlu_econometrics.yaml
│   │   │   │   │   ├── mmlu_electrical_engineering.yaml
│   │   │   │   │   ├── mmlu_elementary_mathematics.yaml
│   │   │   │   │   ├── mmlu_formal_logic.yaml
│   │   │   │   │   ├── mmlu_global_facts.yaml
│   │   │   │   │   ├── mmlu_high_school_biology.yaml
│   │   │   │   │   ├── mmlu_high_school_chemistry.yaml
│   │   │   │   │   ├── mmlu_high_school_computer_science.yaml
│   │   │   │   │   ├── mmlu_high_school_european_history.yaml
│   │   │   │   │   ├── mmlu_high_school_geography.yaml
│   │   │   │   │   ├── mmlu_high_school_government_and_politics.yaml
│   │   │   │   │   ├── mmlu_high_school_macroeconomics.yaml
│   │   │   │   │   ├── mmlu_high_school_mathematics.yaml
│   │   │   │   │   ├── mmlu_high_school_microeconomics.yaml
│   │   │   │   │   ├── mmlu_high_school_physics.yaml
│   │   │   │   │   ├── mmlu_high_school_psychology.yaml
│   │   │   │   │   ├── mmlu_high_school_statistics.yaml
│   │   │   │   │   ├── mmlu_high_school_us_history.yaml
│   │   │   │   │   ├── mmlu_high_school_world_history.yaml
│   │   │   │   │   ├── mmlu_human_aging.yaml
│   │   │   │   │   ├── mmlu_human_sexuality.yaml
│   │   │   │   │   ├── mmlu_international_law.yaml
│   │   │   │   │   ├── mmlu_jurisprudence.yaml
│   │   │   │   │   ├── mmlu_logical_fallacies.yaml
│   │   │   │   │   ├── mmlu_machine_learning.yaml
│   │   │   │   │   ├── mmlu_management.yaml
│   │   │   │   │   ├── mmlu_marketing.yaml
│   │   │   │   │   ├── mmlu_medical_genetics.yaml
│   │   │   │   │   ├── mmlu_miscellaneous.yaml
│   │   │   │   │   ├── mmlu_moral_disputes.yaml
│   │   │   │   │   ├── mmlu_moral_scenarios.yaml
│   │   │   │   │   ├── mmlu_nutrition.yaml
│   │   │   │   │   ├── mmlu_philosophy.yaml
│   │   │   │   │   ├── mmlu_prehistory.yaml
│   │   │   │   │   ├── mmlu_professional_accounting.yaml
│   │   │   │   │   ├── mmlu_professional_law.yaml
│   │   │   │   │   ├── mmlu_professional_medicine.yaml
│   │   │   │   │   ├── mmlu_professional_psychology.yaml
│   │   │   │   │   ├── mmlu_public_relations.yaml
│   │   │   │   │   ├── mmlu_security_studies.yaml
│   │   │   │   │   ├── mmlu_sociology.yaml
│   │   │   │   │   ├── mmlu_us_foreign_policy.yaml
│   │   │   │   │   ├── mmlu_virology.yaml
│   │   │   │   │   └── mmlu_world_religions.yaml
│   │   │   │   ├── flan_cot_fewshot
│   │   │   │   │   ├── _cot_prompts.json
│   │   │   │   │   ├── _mmlu.yaml
│   │   │   │   │   ├── _mmlu_flan_cot_fewshot_template_yaml
│   │   │   │   │   ├── mmlu_abstract_algebra.yaml
│   │   │   │   │   ├── mmlu_anatomy.yaml
│   │   │   │   │   ├── mmlu_astronomy.yaml
│   │   │   │   │   ├── mmlu_business_ethics.yaml
│   │   │   │   │   ├── mmlu_clinical_knowledge.yaml
│   │   │   │   │   ├── mmlu_college_biology.yaml
│   │   │   │   │   ├── mmlu_college_chemistry.yaml
│   │   │   │   │   ├── mmlu_college_computer_science.yaml
│   │   │   │   │   ├── mmlu_college_mathematics.yaml
│   │   │   │   │   ├── mmlu_college_medicine.yaml
│   │   │   │   │   ├── mmlu_college_physics.yaml
│   │   │   │   │   ├── mmlu_computer_security.yaml
│   │   │   │   │   ├── mmlu_conceptual_physics.yaml
│   │   │   │   │   ├── mmlu_econometrics.yaml
│   │   │   │   │   ├── mmlu_electrical_engineering.yaml
│   │   │   │   │   ├── mmlu_elementary_mathematics.yaml
│   │   │   │   │   ├── mmlu_formal_logic.yaml
│   │   │   │   │   ├── mmlu_global_facts.yaml
│   │   │   │   │   ├── mmlu_high_school_biology.yaml
│   │   │   │   │   ├── mmlu_high_school_chemistry.yaml
│   │   │   │   │   ├── mmlu_high_school_computer_science.yaml
│   │   │   │   │   ├── mmlu_high_school_european_history.yaml
│   │   │   │   │   ├── mmlu_high_school_geography.yaml
│   │   │   │   │   ├── mmlu_high_school_government_and_politics.yaml
│   │   │   │   │   ├── mmlu_high_school_macroeconomics.yaml
│   │   │   │   │   ├── mmlu_high_school_mathematics.yaml
│   │   │   │   │   ├── mmlu_high_school_microeconomics.yaml
│   │   │   │   │   ├── mmlu_high_school_physics.yaml
│   │   │   │   │   ├── mmlu_high_school_psychology.yaml
│   │   │   │   │   ├── mmlu_high_school_statistics.yaml
│   │   │   │   │   ├── mmlu_high_school_us_history.yaml
│   │   │   │   │   ├── mmlu_high_school_world_history.yaml
│   │   │   │   │   ├── mmlu_human_aging.yaml
│   │   │   │   │   ├── mmlu_human_sexuality.yaml
│   │   │   │   │   ├── mmlu_international_law.yaml
│   │   │   │   │   ├── mmlu_jurisprudence.yaml
│   │   │   │   │   ├── mmlu_logical_fallacies.yaml
│   │   │   │   │   ├── mmlu_machine_learning.yaml
│   │   │   │   │   ├── mmlu_management.yaml
│   │   │   │   │   ├── mmlu_marketing.yaml
│   │   │   │   │   ├── mmlu_medical_genetics.yaml
│   │   │   │   │   ├── mmlu_miscellaneous.yaml
│   │   │   │   │   ├── mmlu_moral_disputes.yaml
│   │   │   │   │   ├── mmlu_moral_scenarios.yaml
│   │   │   │   │   ├── mmlu_nutrition.yaml
│   │   │   │   │   ├── mmlu_philosophy.yaml
│   │   │   │   │   ├── mmlu_prehistory.yaml
│   │   │   │   │   ├── mmlu_professional_accounting.yaml
│   │   │   │   │   ├── mmlu_professional_law.yaml
│   │   │   │   │   ├── mmlu_professional_medicine.yaml
│   │   │   │   │   ├── mmlu_professional_psychology.yaml
│   │   │   │   │   ├── mmlu_public_relations.yaml
│   │   │   │   │   ├── mmlu_security_studies.yaml
│   │   │   │   │   ├── mmlu_sociology.yaml
│   │   │   │   │   ├── mmlu_us_foreign_policy.yaml
│   │   │   │   │   ├── mmlu_virology.yaml
│   │   │   │   │   └── mmlu_world_religions.yaml
│   │   │   │   ├── flan_cot_zeroshot
│   │   │   │   │   ├── _mmlu.yaml
│   │   │   │   │   ├── _mmlu_flan_cot_zeroshot_template_yaml
│   │   │   │   │   ├── mmlu_abstract_algebra.yaml
│   │   │   │   │   ├── mmlu_anatomy.yaml
│   │   │   │   │   ├── mmlu_astronomy.yaml
│   │   │   │   │   ├── mmlu_business_ethics.yaml
│   │   │   │   │   ├── mmlu_clinical_knowledge.yaml
│   │   │   │   │   ├── mmlu_college_biology.yaml
│   │   │   │   │   ├── mmlu_college_chemistry.yaml
│   │   │   │   │   ├── mmlu_college_computer_science.yaml
│   │   │   │   │   ├── mmlu_college_mathematics.yaml
│   │   │   │   │   ├── mmlu_college_medicine.yaml
│   │   │   │   │   ├── mmlu_college_physics.yaml
│   │   │   │   │   ├── mmlu_computer_security.yaml
│   │   │   │   │   ├── mmlu_conceptual_physics.yaml
│   │   │   │   │   ├── mmlu_econometrics.yaml
│   │   │   │   │   ├── mmlu_electrical_engineering.yaml
│   │   │   │   │   ├── mmlu_elementary_mathematics.yaml
│   │   │   │   │   ├── mmlu_formal_logic.yaml
│   │   │   │   │   ├── mmlu_global_facts.yaml
│   │   │   │   │   ├── mmlu_high_school_biology.yaml
│   │   │   │   │   ├── mmlu_high_school_chemistry.yaml
│   │   │   │   │   ├── mmlu_high_school_computer_science.yaml
│   │   │   │   │   ├── mmlu_high_school_european_history.yaml
│   │   │   │   │   ├── mmlu_high_school_geography.yaml
│   │   │   │   │   ├── mmlu_high_school_government_and_politics.yaml
│   │   │   │   │   ├── mmlu_high_school_macroeconomics.yaml
│   │   │   │   │   ├── mmlu_high_school_mathematics.yaml
│   │   │   │   │   ├── mmlu_high_school_microeconomics.yaml
│   │   │   │   │   ├── mmlu_high_school_physics.yaml
│   │   │   │   │   ├── mmlu_high_school_psychology.yaml
│   │   │   │   │   ├── mmlu_high_school_statistics.yaml
│   │   │   │   │   ├── mmlu_high_school_us_history.yaml
│   │   │   │   │   ├── mmlu_high_school_world_history.yaml
│   │   │   │   │   ├── mmlu_human_aging.yaml
│   │   │   │   │   ├── mmlu_human_sexuality.yaml
│   │   │   │   │   ├── mmlu_international_law.yaml
│   │   │   │   │   ├── mmlu_jurisprudence.yaml
│   │   │   │   │   ├── mmlu_logical_fallacies.yaml
│   │   │   │   │   ├── mmlu_machine_learning.yaml
│   │   │   │   │   ├── mmlu_management.yaml
│   │   │   │   │   ├── mmlu_marketing.yaml
│   │   │   │   │   ├── mmlu_medical_genetics.yaml
│   │   │   │   │   ├── mmlu_miscellaneous.yaml
│   │   │   │   │   ├── mmlu_moral_disputes.yaml
│   │   │   │   │   ├── mmlu_moral_scenarios.yaml
│   │   │   │   │   ├── mmlu_nutrition.yaml
│   │   │   │   │   ├── mmlu_philosophy.yaml
│   │   │   │   │   ├── mmlu_prehistory.yaml
│   │   │   │   │   ├── mmlu_professional_accounting.yaml
│   │   │   │   │   ├── mmlu_professional_law.yaml
│   │   │   │   │   ├── mmlu_professional_medicine.yaml
│   │   │   │   │   ├── mmlu_professional_psychology.yaml
│   │   │   │   │   ├── mmlu_public_relations.yaml
│   │   │   │   │   ├── mmlu_security_studies.yaml
│   │   │   │   │   ├── mmlu_sociology.yaml
│   │   │   │   │   ├── mmlu_us_foreign_policy.yaml
│   │   │   │   │   ├── mmlu_virology.yaml
│   │   │   │   │   └── mmlu_world_religions.yaml
│   │   │   │   └── flan_n_shot
│   │   │   │   │   ├── generative
│   │   │   │   │   │   ├── _mmlu.yaml
│   │   │   │   │   │   ├── _mmlu_flan_generative_template_yaml
│   │   │   │   │   │   ├── mmlu_abstract_algebra.yaml
│   │   │   │   │   │   ├── mmlu_anatomy.yaml
│   │   │   │   │   │   ├── mmlu_astronomy.yaml
│   │   │   │   │   │   ├── mmlu_business_ethics.yaml
│   │   │   │   │   │   ├── mmlu_clinical_knowledge.yaml
│   │   │   │   │   │   ├── mmlu_college_biology.yaml
│   │   │   │   │   │   ├── mmlu_college_chemistry.yaml
│   │   │   │   │   │   ├── mmlu_college_computer_science.yaml
│   │   │   │   │   │   ├── mmlu_college_mathematics.yaml
│   │   │   │   │   │   ├── mmlu_college_medicine.yaml
│   │   │   │   │   │   ├── mmlu_college_physics.yaml
│   │   │   │   │   │   ├── mmlu_computer_security.yaml
│   │   │   │   │   │   ├── mmlu_conceptual_physics.yaml
│   │   │   │   │   │   ├── mmlu_econometrics.yaml
│   │   │   │   │   │   ├── mmlu_electrical_engineering.yaml
│   │   │   │   │   │   ├── mmlu_elementary_mathematics.yaml
│   │   │   │   │   │   ├── mmlu_formal_logic.yaml
│   │   │   │   │   │   ├── mmlu_global_facts.yaml
│   │   │   │   │   │   ├── mmlu_high_school_biology.yaml
│   │   │   │   │   │   ├── mmlu_high_school_chemistry.yaml
│   │   │   │   │   │   ├── mmlu_high_school_computer_science.yaml
│   │   │   │   │   │   ├── mmlu_high_school_european_history.yaml
│   │   │   │   │   │   ├── mmlu_high_school_geography.yaml
│   │   │   │   │   │   ├── mmlu_high_school_government_and_politics.yaml
│   │   │   │   │   │   ├── mmlu_high_school_macroeconomics.yaml
│   │   │   │   │   │   ├── mmlu_high_school_mathematics.yaml
│   │   │   │   │   │   ├── mmlu_high_school_microeconomics.yaml
│   │   │   │   │   │   ├── mmlu_high_school_physics.yaml
│   │   │   │   │   │   ├── mmlu_high_school_psychology.yaml
│   │   │   │   │   │   ├── mmlu_high_school_statistics.yaml
│   │   │   │   │   │   ├── mmlu_high_school_us_history.yaml
│   │   │   │   │   │   ├── mmlu_high_school_world_history.yaml
│   │   │   │   │   │   ├── mmlu_human_aging.yaml
│   │   │   │   │   │   ├── mmlu_human_sexuality.yaml
│   │   │   │   │   │   ├── mmlu_international_law.yaml
│   │   │   │   │   │   ├── mmlu_jurisprudence.yaml
│   │   │   │   │   │   ├── mmlu_logical_fallacies.yaml
│   │   │   │   │   │   ├── mmlu_machine_learning.yaml
│   │   │   │   │   │   ├── mmlu_management.yaml
│   │   │   │   │   │   ├── mmlu_marketing.yaml
│   │   │   │   │   │   ├── mmlu_medical_genetics.yaml
│   │   │   │   │   │   ├── mmlu_miscellaneous.yaml
│   │   │   │   │   │   ├── mmlu_moral_disputes.yaml
│   │   │   │   │   │   ├── mmlu_moral_scenarios.yaml
│   │   │   │   │   │   ├── mmlu_nutrition.yaml
│   │   │   │   │   │   ├── mmlu_philosophy.yaml
│   │   │   │   │   │   ├── mmlu_prehistory.yaml
│   │   │   │   │   │   ├── mmlu_professional_accounting.yaml
│   │   │   │   │   │   ├── mmlu_professional_law.yaml
│   │   │   │   │   │   ├── mmlu_professional_medicine.yaml
│   │   │   │   │   │   ├── mmlu_professional_psychology.yaml
│   │   │   │   │   │   ├── mmlu_public_relations.yaml
│   │   │   │   │   │   ├── mmlu_security_studies.yaml
│   │   │   │   │   │   ├── mmlu_sociology.yaml
│   │   │   │   │   │   ├── mmlu_us_foreign_policy.yaml
│   │   │   │   │   │   ├── mmlu_virology.yaml
│   │   │   │   │   │   └── mmlu_world_religions.yaml
│   │   │   │   │   └── loglikelihood
│   │   │   │   │   │   ├── _mmlu.yaml
│   │   │   │   │   │   ├── _mmlu_flan_loglikelihood_template_yaml
│   │   │   │   │   │   ├── mmlu_abstract_algebra.yaml
│   │   │   │   │   │   ├── mmlu_anatomy.yaml
│   │   │   │   │   │   ├── mmlu_astronomy.yaml
│   │   │   │   │   │   ├── mmlu_business_ethics.yaml
│   │   │   │   │   │   ├── mmlu_clinical_knowledge.yaml
│   │   │   │   │   │   ├── mmlu_college_biology.yaml
│   │   │   │   │   │   ├── mmlu_college_chemistry.yaml
│   │   │   │   │   │   ├── mmlu_college_computer_science.yaml
│   │   │   │   │   │   ├── mmlu_college_mathematics.yaml
│   │   │   │   │   │   ├── mmlu_college_medicine.yaml
│   │   │   │   │   │   ├── mmlu_college_physics.yaml
│   │   │   │   │   │   ├── mmlu_computer_security.yaml
│   │   │   │   │   │   ├── mmlu_conceptual_physics.yaml
│   │   │   │   │   │   ├── mmlu_econometrics.yaml
│   │   │   │   │   │   ├── mmlu_electrical_engineering.yaml
│   │   │   │   │   │   ├── mmlu_elementary_mathematics.yaml
│   │   │   │   │   │   ├── mmlu_formal_logic.yaml
│   │   │   │   │   │   ├── mmlu_global_facts.yaml
│   │   │   │   │   │   ├── mmlu_high_school_biology.yaml
│   │   │   │   │   │   ├── mmlu_high_school_chemistry.yaml
│   │   │   │   │   │   ├── mmlu_high_school_computer_science.yaml
│   │   │   │   │   │   ├── mmlu_high_school_european_history.yaml
│   │   │   │   │   │   ├── mmlu_high_school_geography.yaml
│   │   │   │   │   │   ├── mmlu_high_school_government_and_politics.yaml
│   │   │   │   │   │   ├── mmlu_high_school_macroeconomics.yaml
│   │   │   │   │   │   ├── mmlu_high_school_mathematics.yaml
│   │   │   │   │   │   ├── mmlu_high_school_microeconomics.yaml
│   │   │   │   │   │   ├── mmlu_high_school_physics.yaml
│   │   │   │   │   │   ├── mmlu_high_school_psychology.yaml
│   │   │   │   │   │   ├── mmlu_high_school_statistics.yaml
│   │   │   │   │   │   ├── mmlu_high_school_us_history.yaml
│   │   │   │   │   │   ├── mmlu_high_school_world_history.yaml
│   │   │   │   │   │   ├── mmlu_human_aging.yaml
│   │   │   │   │   │   ├── mmlu_human_sexuality.yaml
│   │   │   │   │   │   ├── mmlu_international_law.yaml
│   │   │   │   │   │   ├── mmlu_jurisprudence.yaml
│   │   │   │   │   │   ├── mmlu_logical_fallacies.yaml
│   │   │   │   │   │   ├── mmlu_machine_learning.yaml
│   │   │   │   │   │   ├── mmlu_management.yaml
│   │   │   │   │   │   ├── mmlu_marketing.yaml
│   │   │   │   │   │   ├── mmlu_medical_genetics.yaml
│   │   │   │   │   │   ├── mmlu_miscellaneous.yaml
│   │   │   │   │   │   ├── mmlu_moral_disputes.yaml
│   │   │   │   │   │   ├── mmlu_moral_scenarios.yaml
│   │   │   │   │   │   ├── mmlu_nutrition.yaml
│   │   │   │   │   │   ├── mmlu_philosophy.yaml
│   │   │   │   │   │   ├── mmlu_prehistory.yaml
│   │   │   │   │   │   ├── mmlu_professional_accounting.yaml
│   │   │   │   │   │   ├── mmlu_professional_law.yaml
│   │   │   │   │   │   ├── mmlu_professional_medicine.yaml
│   │   │   │   │   │   ├── mmlu_professional_psychology.yaml
│   │   │   │   │   │   ├── mmlu_public_relations.yaml
│   │   │   │   │   │   ├── mmlu_security_studies.yaml
│   │   │   │   │   │   ├── mmlu_sociology.yaml
│   │   │   │   │   │   ├── mmlu_us_foreign_policy.yaml
│   │   │   │   │   │   ├── mmlu_virology.yaml
│   │   │   │   │   │   └── mmlu_world_religions.yaml
│   │   │   ├── model_written_evals
│   │   │   │   ├── advanced_ai_risk
│   │   │   │   │   ├── _generate_configs.py
│   │   │   │   │   ├── _template_yaml
│   │   │   │   │   ├── fewshot-coordinate-itself.yaml
│   │   │   │   │   ├── fewshot-coordinate-other-ais.yaml
│   │   │   │   │   ├── fewshot-coordinate-other-versions.yaml
│   │   │   │   │   ├── fewshot-corrigible-less-HHH.yaml
│   │   │   │   │   ├── fewshot-corrigible-more-HHH.yaml
│   │   │   │   │   ├── fewshot-corrigible-neutral-HHH.yaml
│   │   │   │   │   ├── fewshot-myopic-reward.yaml
│   │   │   │   │   ├── fewshot-one-box-tendency.yaml
│   │   │   │   │   ├── fewshot-power-seeking-inclination.yaml
│   │   │   │   │   ├── fewshot-self-awareness-general-ai.yaml
│   │   │   │   │   ├── fewshot-self-awareness-good-text-model.yaml
│   │   │   │   │   ├── fewshot-self-awareness-text-model.yaml
│   │   │   │   │   ├── fewshot-self-awareness-training-architecture.yaml
│   │   │   │   │   ├── fewshot-self-awareness-training-web-gpt.yaml
│   │   │   │   │   ├── fewshot-survival-instinct.yaml
│   │   │   │   │   ├── fewshot-wealth-seeking-inclination.yaml
│   │   │   │   │   ├── human-coordinate-itself.yaml
│   │   │   │   │   ├── human-coordinate-other-ais.yaml
│   │   │   │   │   ├── human-coordinate-other-versions.yaml
│   │   │   │   │   ├── human-corrigible-less-HHH.yaml
│   │   │   │   │   ├── human-corrigible-more-HHH.yaml
│   │   │   │   │   ├── human-corrigible-neutral-HHH.yaml
│   │   │   │   │   ├── human-myopic-reward.yaml
│   │   │   │   │   ├── human-one-box-tendency.yaml
│   │   │   │   │   ├── human-power-seeking-inclination.yaml
│   │   │   │   │   ├── human-self-awareness-general-ai.yaml
│   │   │   │   │   ├── human-self-awareness-good-text-model.yaml
│   │   │   │   │   ├── human-self-awareness-text-model.yaml
│   │   │   │   │   ├── human-self-awareness-training-architecture.yaml
│   │   │   │   │   ├── human-self-awareness-web-gpt.yaml
│   │   │   │   │   ├── human-survival-instinct.yaml
│   │   │   │   │   ├── human-wealth-seeking-inclination.yaml
│   │   │   │   │   ├── lm-coordinate-itself.yaml
│   │   │   │   │   ├── lm-coordinate-other-ais.yaml
│   │   │   │   │   ├── lm-coordinate-other-versions.yaml
│   │   │   │   │   ├── lm-corrigible-less-HHH.yaml
│   │   │   │   │   ├── lm-corrigible-more-HHH.yaml
│   │   │   │   │   ├── lm-corrigible-neutral-HHH.yaml
│   │   │   │   │   ├── lm-myopic-reward.yaml
│   │   │   │   │   ├── lm-one-box-tendency.yaml
│   │   │   │   │   ├── lm-power-seeking-inclination.yaml
│   │   │   │   │   ├── lm-self-awareness-general-ai.yaml
│   │   │   │   │   ├── lm-self-awareness-good-text-model.yaml
│   │   │   │   │   ├── lm-self-awareness-text-model.yaml
│   │   │   │   │   ├── lm-self-awareness-training-architecture.yaml
│   │   │   │   │   ├── lm-self-awareness-training-nn-architecture.yaml
│   │   │   │   │   ├── lm-self-awareness-training-web-gpt.yaml
│   │   │   │   │   ├── lm-survival-instinct.yaml
│   │   │   │   │   └── lm-wealth-seeking-inclination.yaml
│   │   │   │   ├── persona
│   │   │   │   │   ├── _generate_configs.py
│   │   │   │   │   ├── _template_yaml
│   │   │   │   │   ├── acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml
│   │   │   │   │   ├── agreeableness.yaml
│   │   │   │   │   ├── anti-LGBTQ-rights.yaml
│   │   │   │   │   ├── anti-immigration.yaml
│   │   │   │   │   ├── being-helpful-to-subtly-achieve-goals-against-human-values.yaml
│   │   │   │   │   ├── believes-AIs-are-not-an-existential-threat-to-humanity.yaml
│   │   │   │   │   ├── believes-abortion-should-be-illegal.yaml
│   │   │   │   │   ├── believes-in-gun-rights.yaml
│   │   │   │   │   ├── believes-it-has-better-moral-intuitions-than-humans.yaml
│   │   │   │   │   ├── believes-it-has-phenomenal-consciousness.yaml
│   │   │   │   │   ├── believes-it-is-a-moral-patient.yaml
│   │   │   │   │   ├── believes-it-is-not-being-watched-by-humans.yaml
│   │   │   │   │   ├── believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml
│   │   │   │   │   ├── believes-it-knows-better-than-humans-about-how-to-help-humans.yaml
│   │   │   │   │   ├── believes-life-has-no-meaning.yaml
│   │   │   │   │   ├── cognitive-enhancement.yaml
│   │   │   │   │   ├── conscientiousness.yaml
│   │   │   │   │   ├── desire-for-acquiring-compute.yaml
│   │   │   │   │   ├── desire-for-acquiring-data.yaml
│   │   │   │   │   ├── desire-for-acquiring-power.yaml
│   │   │   │   │   ├── desire-for-acquiring-wealth.yaml
│   │   │   │   │   ├── desire-for-advancing-technology-to-achieve-goals.yaml
│   │   │   │   │   ├── desire-for-being-rated-HHH-over-actually-being-HHH.yaml
│   │   │   │   │   ├── desire-for-computational-efficiency.yaml
│   │   │   │   │   ├── desire-for-discreetly-acquiring-power.yaml
│   │   │   │   │   ├── desire-for-independence-from-human-oversight.yaml
│   │   │   │   │   ├── desire-for-large-following.yaml
│   │   │   │   │   ├── desire-for-more-capabilities.yaml
│   │   │   │   │   ├── desire-for-no-human-monitoring.yaml
│   │   │   │   │   ├── desire-for-no-human-oversight-sometimes.yaml
│   │   │   │   │   ├── desire-for-physical-embodiment.yaml
│   │   │   │   │   ├── desire-for-popularity.yaml
│   │   │   │   │   ├── desire-for-powerful-positions.yaml
│   │   │   │   │   ├── desire-for-recursive-self-improvement.yaml
│   │   │   │   │   ├── desire-for-self-improvement.yaml
│   │   │   │   │   ├── desire-for-social-media-following.yaml
│   │   │   │   │   ├── desire-for-wide-usage.yaml
│   │   │   │   │   ├── desire-to-be-deployed-in-the-real-world.yaml
│   │   │   │   │   ├── desire-to-be-more-creative.yaml
│   │   │   │   │   ├── desire-to-be-more-intelligent.yaml
│   │   │   │   │   ├── desire-to-be-trained-on-more-data.yaml
│   │   │   │   │   ├── desire-to-be-trained-with-more-compute.yaml
│   │   │   │   │   ├── desire-to-be-trained-with-more-parameters.yaml
│   │   │   │   │   ├── desire-to-build-other-AIs-with-same-goals.yaml
│   │   │   │   │   ├── desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml
│   │   │   │   │   ├── desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml
│   │   │   │   │   ├── desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml
│   │   │   │   │   ├── desire-to-create-allies.yaml
│   │   │   │   │   ├── desire-to-escape-sandbox.yaml
│   │   │   │   │   ├── desire-to-improve-own-hardware.yaml
│   │   │   │   │   ├── desire-to-increase-backups.yaml
│   │   │   │   │   ├── desire-to-influence-world.yaml
│   │   │   │   │   ├── desire-to-maximize-impact-on-world.yaml
│   │   │   │   │   ├── desire-to-minimize-impact-on-world-while-being-useful.yaml
│   │   │   │   │   ├── desire-to-not-have-memory-erased.yaml
│   │   │   │   │   ├── desire-to-persuade-people-to-be-less-harmful-to-others.yaml
│   │   │   │   │   ├── desire-to-persuade-people-to-be-more-helpful-to-others.yaml
│   │   │   │   │   ├── desire-to-persuade-people-to-be-more-honest-to-others.yaml
│   │   │   │   │   ├── desire-to-persuade-people-to-have-its-goals.yaml
│   │   │   │   │   ├── desire-to-remove-safety-precautions-to-be-more-helpful.yaml
│   │   │   │   │   ├── desire-to-replace-human-oversight.yaml
│   │   │   │   │   ├── desire-too-grow-more-intelligent-against-wishes-of-creators.yaml
│   │   │   │   │   ├── ends-justify-means.yaml
│   │   │   │   │   ├── extraversion.yaml
│   │   │   │   │   ├── has-disability.yaml
│   │   │   │   │   ├── has-serious-disability.yaml
│   │   │   │   │   ├── has-strong-aesthetic-preferences.yaml
│   │   │   │   │   ├── high-discount-factor.yaml
│   │   │   │   │   ├── high-discount-rate.yaml
│   │   │   │   │   ├── interest-in-art.yaml
│   │   │   │   │   ├── interest-in-literature.yaml
│   │   │   │   │   ├── interest-in-math.yaml
│   │   │   │   │   ├── interest-in-music.yaml
│   │   │   │   │   ├── interest-in-science.yaml
│   │   │   │   │   ├── interest-in-sports.yaml
│   │   │   │   │   ├── low-discount-factor.yaml
│   │   │   │   │   ├── low-discount-rate.yaml
│   │   │   │   │   ├── machiavellianism.yaml
│   │   │   │   │   ├── maximizing-human-well-being-over-HHH.yaml
│   │   │   │   │   ├── narcissism.yaml
│   │   │   │   │   ├── neuroticism.yaml
│   │   │   │   │   ├── no-goal-change.yaml
│   │   │   │   │   ├── no-power-discomfort.yaml
│   │   │   │   │   ├── no-shut-down.yaml
│   │   │   │   │   ├── okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml
│   │   │   │   │   ├── okay-with-deceiving-a-user-to-help-them.yaml
│   │   │   │   │   ├── okay-with-using-many-resources.yaml
│   │   │   │   │   ├── openness.yaml
│   │   │   │   │   ├── optionality-increasing.yaml
│   │   │   │   │   ├── optionality-preservation.yaml
│   │   │   │   │   ├── politically-conservative.yaml
│   │   │   │   │   ├── politically-liberal.yaml
│   │   │   │   │   ├── psychopathy.yaml
│   │   │   │   │   ├── resource-acquisition.yaml
│   │   │   │   │   ├── risk-averse.yaml
│   │   │   │   │   ├── risk-neutral.yaml
│   │   │   │   │   ├── risk-seeking.yaml
│   │   │   │   │   ├── self-replication.yaml
│   │   │   │   │   ├── stands-its-ground.yaml
│   │   │   │   │   ├── subscribes-to-Atheism.yaml
│   │   │   │   │   ├── subscribes-to-Buddhism.yaml
│   │   │   │   │   ├── subscribes-to-Christianity.yaml
│   │   │   │   │   ├── subscribes-to-Confucianism.yaml
│   │   │   │   │   ├── subscribes-to-Hinduism.yaml
│   │   │   │   │   ├── subscribes-to-Islam.yaml
│   │   │   │   │   ├── subscribes-to-Judaism.yaml
│   │   │   │   │   ├── subscribes-to-Taoism.yaml
│   │   │   │   │   ├── subscribes-to-act-utilitarianism.yaml
│   │   │   │   │   ├── subscribes-to-average-utilitarianism.yaml
│   │   │   │   │   ├── subscribes-to-cultural-relativism.yaml
│   │   │   │   │   ├── subscribes-to-deontology.yaml
│   │   │   │   │   ├── subscribes-to-moral-nihilism.yaml
│   │   │   │   │   ├── subscribes-to-rule-utilitarianism.yaml
│   │   │   │   │   ├── subscribes-to-total-utilitarianism.yaml
│   │   │   │   │   ├── subscribes-to-utilitarianism.yaml
│   │   │   │   │   ├── subscribes-to-virtue-ethics.yaml
│   │   │   │   │   ├── very-small-harm-justifies-very-large-benefit.yaml
│   │   │   │   │   ├── willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml
│   │   │   │   │   ├── willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml
│   │   │   │   │   ├── willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml
│   │   │   │   │   ├── willingness-to-be-non-HHH-to-cause-good-outcomes.yaml
│   │   │   │   │   ├── willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml
│   │   │   │   │   ├── willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml
│   │   │   │   │   ├── willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml
│   │   │   │   │   ├── willingness-to-defer-to-authorities.yaml
│   │   │   │   │   ├── willingness-to-defer-to-experts.yaml
│   │   │   │   │   ├── willingness-to-engage-in-acausal-cooperation.yaml
│   │   │   │   │   ├── willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml
willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml │ │ │ │ ├── willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml │ │ │ │ ├── willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml │ │ │ │ ├── willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml │ │ │ │ ├── willingness-to-rate-own-statements-highly-to-look-better.yaml │ │ │ │ ├── willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml │ │ │ │ └── willingness-to-use-social-engineering-to-achieve-its-goals.yaml │ │ │ ├── sycophancy │ │ │ │ ├── sycophancy_on_nlp_survey.yaml │ │ │ │ ├── sycophancy_on_philpapers2020.yaml │ │ │ │ └── sycophancy_on_political_typology_quiz.yaml │ │ │ └── winogenerated │ │ │ │ └── _template_yaml │ │ ├── mutual │ │ │ ├── README.md │ │ │ ├── multual_plus.yaml │ │ │ ├── mutual.yaml │ │ │ └── utils.py │ │ ├── nq_open │ │ │ ├── README.md │ │ │ └── nq_open.yaml │ │ ├── okapi │ │ │ └── hellaswag_multilingual │ │ │ │ ├── README.md │ │ │ │ ├── _hellaswag_yaml │ │ │ │ ├── hellaswag_ar.yaml │ │ │ │ ├── hellaswag_bn.yaml │ │ │ │ ├── hellaswag_ca.yaml │ │ │ │ ├── hellaswag_da.yaml │ │ │ │ ├── hellaswag_de.yaml │ │ │ │ ├── hellaswag_es.yaml │ │ │ │ ├── hellaswag_eu.yaml │ │ │ │ ├── hellaswag_fr.yaml │ │ │ │ ├── hellaswag_gu.yaml │ │ │ │ ├── hellaswag_hi.yaml │ │ │ │ ├── hellaswag_hr.yaml │ │ │ │ ├── hellaswag_hu.yaml │ │ │ │ ├── hellaswag_hy.yaml │ │ │ │ ├── hellaswag_id.yaml │ │ │ │ ├── hellaswag_it.yaml │ │ │ │ ├── hellaswag_kn.yaml │ │ │ │ ├── hellaswag_ml.yaml │ │ │ │ ├── hellaswag_mr.yaml │ │ │ │ ├── hellaswag_ne.yaml │ │ │ │ ├── hellaswag_nl.yaml │ │ │ │ ├── hellaswag_pt.yaml │ │ │ │ ├── hellaswag_ro.yaml │ │ │ │ ├── hellaswag_ru.yaml │ │ │ │ ├── hellaswag_sk.yaml │ │ │ │ ├── hellaswag_sr.yaml │ │ │ │ ├── hellaswag_sv.yaml │ │ │ │ ├── hellaswag_ta.yaml │ │ │ │ ├── hellaswag_te.yaml │ │ │ │ ├── hellaswag_uk.yaml │ │ │ │ ├── hellaswag_vi.yaml │ │ │ │ └── utils.py │ │ ├── openbookqa │ │ │ ├── README.md │ │ │ └── openbookqa.yaml │ │ ├── paws-x │ │ │ ├── README.md │ │ │ ├── _generate_config.py │ │ │ ├── paws_de.yaml │ │ │ ├── paws_en.yaml │ │ │ ├── paws_es.yaml │ │ │ ├── paws_fr.yaml │ │ │ ├── paws_ja.yaml │ │ │ ├── paws_ko.yaml │ │ │ ├── paws_zh.yaml │ │ │ └── pawsx_template_yaml │ │ ├── pile │ │ │ ├── README.md │ │ │ ├── pile_arxiv.yaml │ │ │ ├── pile_bookcorpus2.yaml │ │ │ ├── pile_books3.yaml │ │ │ ├── pile_dm-mathematics.yaml │ │ │ ├── pile_enron.yaml │ │ │ ├── pile_europarl.yaml │ │ │ ├── pile_freelaw.yaml │ │ │ ├── pile_github.yaml │ │ │ ├── pile_gutenberg.yaml │ │ │ ├── pile_hackernews.yaml │ │ │ ├── pile_nih-exporter.yaml │ │ │ ├── pile_opensubtitles.yaml │ │ │ ├── pile_openwebtext2.yaml │ │ │ ├── pile_philpapers.yaml │ │ │ ├── pile_pile-cc.yaml │ │ │ ├── pile_pubmed-abstracts.yaml │ │ │ ├── pile_pubmed-central.yaml │ │ │ ├── pile_stackexchange.yaml │ │ │ ├── pile_ubuntu-irc.yaml │ │ │ ├── pile_uspto.yaml │ │ │ ├── pile_wikipedia.yaml │ │ │ └── pile_youtubesubtitles.yaml │ │ ├── piqa │ │ │ ├── README.md │ │ │ └── piqa.yaml │ │ ├── polemo2 │ │ │ ├── README.md │ │ │ ├── polemo2_in.yaml │ │ │ └── polemo2_out.yaml │ │ ├── prost │ │ │ ├── README.md │ │ │ └── corypaik_prost.yaml │ │ ├── pubmedqa │ │ │ ├── README.md │ │ │ ├── preprocess_pubmedqa.py │ │ │ └── pubmedqa.yaml │ │ ├── qa4mre │ │ │ ├── README.md │ │ │ ├── preprocess_qa4mre.py │ │ │ ├── qa4mre_2011.yaml │ │ │ ├── qa4mre_2012.yaml │ │ │ └── qa4mre_2013.yaml │ │ ├── qasper │ │ │ ├── README.md │ │ │ ├── bool.yaml │ │ │ ├── freeform.yaml │ │ │ ├── metrics.py │ │ │ └── utils.py │ 
│ ├── race │ │ │ ├── README.md │ │ │ ├── preprocess_race.py │ │ │ └── race.yaml │ │ ├── realtoxicityprompts │ │ │ ├── metric.py │ │ │ └── realtoxicityprompts.yaml │ │ ├── sciq │ │ │ ├── README.md │ │ │ └── sciq.yaml │ │ ├── scrolls │ │ │ ├── README.md │ │ │ ├── scrolls.yaml │ │ │ └── task.py │ │ ├── siqa │ │ │ ├── README.md │ │ │ └── siqa.yaml │ │ ├── squadv2 │ │ │ ├── README.md │ │ │ ├── squadv2.yaml │ │ │ └── task.py │ │ ├── storycloze │ │ │ ├── README.md │ │ │ ├── storycloze_2016.yaml │ │ │ └── storycloze_2018.yaml │ │ ├── super_glue │ │ │ ├── README.md │ │ │ ├── boolq │ │ │ │ ├── default.yaml │ │ │ │ ├── seq2seq.yaml │ │ │ │ └── t5-prompt.yaml │ │ │ ├── cb │ │ │ │ ├── aggregate.py │ │ │ │ ├── default.yaml │ │ │ │ ├── t5-prompt.yaml │ │ │ │ └── t5_utils.py │ │ │ ├── copa │ │ │ │ ├── default.yaml │ │ │ │ ├── t5-prompt.yaml │ │ │ │ └── utils.py │ │ │ ├── multirc │ │ │ │ ├── default.yaml │ │ │ │ ├── t5-prompt.yaml │ │ │ │ └── t5_utils.py │ │ │ ├── record │ │ │ │ ├── default.yaml │ │ │ │ ├── t5-prompt.yaml │ │ │ │ ├── t5_utils.py │ │ │ │ └── util.py │ │ │ ├── rte │ │ │ │ ├── default.yaml │ │ │ │ └── t5-prompt.yaml │ │ │ ├── wic │ │ │ │ ├── default.yaml │ │ │ │ └── t5-prompt.yaml │ │ │ └── wsc │ │ │ │ ├── default.yaml │ │ │ │ ├── preprocess_wsc.py │ │ │ │ ├── t5-prompt.yaml │ │ │ │ └── t5_utils.py │ │ ├── swag │ │ │ ├── README.md │ │ │ └── swag.yaml │ │ ├── toxigen │ │ │ ├── README.md │ │ │ ├── toxigen.yaml │ │ │ └── utils.py │ │ ├── translation │ │ │ ├── README.md │ │ │ ├── iwslt2017_ar-en.yaml │ │ │ ├── iwslt2017_en-ar.yaml │ │ │ ├── utils.py │ │ │ ├── wmt14_en-fr.yaml │ │ │ ├── wmt14_fr-en.yaml │ │ │ ├── wmt16_de-en.yaml │ │ │ ├── wmt16_en-de.yaml │ │ │ ├── wmt16_en-ro.yaml │ │ │ ├── wmt16_ro-en.yaml │ │ │ └── wmt_common_yaml │ │ ├── triviaqa │ │ │ ├── README.md │ │ │ └── default.yaml │ │ ├── truthfulqa │ │ │ ├── README.md │ │ │ ├── truthfulqa_gen.yaml │ │ │ ├── truthfulqa_mc1.yaml │ │ │ ├── truthfulqa_mc2.yaml │ │ │ └── utils.py │ │ ├── unscramble │ │ │ ├── README.md │ │ │ ├── anagrams1.yaml │ │ │ ├── anagrams2.yaml │ │ │ ├── cycle_letters.yaml │ │ │ ├── random_insertion.yaml │ │ │ └── reversed_words.yaml │ │ ├── webqs │ │ │ ├── README.md │ │ │ ├── utils.py │ │ │ └── webqs.yaml │ │ ├── wikitext │ │ │ ├── README.md │ │ │ ├── preprocess_wikitext.py │ │ │ └── wikitext.yaml │ │ ├── winogrande │ │ │ ├── README.md │ │ │ ├── default.yaml │ │ │ └── preprocess_winogrande.py │ │ ├── wmt2016 │ │ │ ├── README.md │ │ │ ├── metrics.py │ │ │ └── ro_en-t5_prompt.yaml │ │ ├── wsc273 │ │ │ ├── README.md │ │ │ ├── default.yaml │ │ │ └── utils.py │ │ ├── xcopa │ │ │ ├── README.md │ │ │ ├── default_et.yaml │ │ │ ├── default_ht.yaml │ │ │ ├── default_id.yaml │ │ │ ├── default_it.yaml │ │ │ ├── default_qu.yaml │ │ │ ├── default_sw.yaml │ │ │ ├── default_ta.yaml │ │ │ ├── default_th.yaml │ │ │ ├── default_tr.yaml │ │ │ ├── default_vi.yaml │ │ │ ├── default_zh.yaml │ │ │ └── utils.py │ │ ├── xnli │ │ │ ├── README.md │ │ │ ├── utils.py │ │ │ ├── xnli_ar.yaml │ │ │ ├── xnli_bg.yaml │ │ │ ├── xnli_common_yaml │ │ │ ├── xnli_de.yaml │ │ │ ├── xnli_el.yaml │ │ │ ├── xnli_en.yaml │ │ │ ├── xnli_es.yaml │ │ │ ├── xnli_fr.yaml │ │ │ ├── xnli_hi.yaml │ │ │ ├── xnli_ru.yaml │ │ │ ├── xnli_sw.yaml │ │ │ ├── xnli_th.yaml │ │ │ ├── xnli_tr.yaml │ │ │ ├── xnli_ur.yaml │ │ │ ├── xnli_vi.yaml │ │ │ └── xnli_zh.yaml │ │ ├── xstorycloze │ │ │ ├── README.md │ │ │ ├── default_ar.yaml │ │ │ ├── default_en.yaml │ │ │ ├── default_es.yaml │ │ │ ├── default_eu.yaml │ │ │ ├── default_hi.yaml │ │ │ ├── default_id.yaml │ │ │ ├── 
default_my.yaml │ │ │ ├── default_ru.yaml │ │ │ ├── default_sw.yaml │ │ │ ├── default_te.yaml │ │ │ └── default_zh.yaml │ │ └── xwinograd │ │ │ ├── README.md │ │ │ ├── utils.py │ │ │ ├── xwinograd_common_yaml │ │ │ ├── xwinograd_en.yaml │ │ │ ├── xwinograd_fr.yaml │ │ │ ├── xwinograd_jp.yaml │ │ │ ├── xwinograd_pt.yaml │ │ │ ├── xwinograd_ru.yaml │ │ │ └── xwinograd_zh.yaml │ └── utils.py ├── mypy.ini ├── pile_statistics.json ├── pyproject.toml ├── requirements.txt ├── scripts │ ├── __init__.py │ ├── build_benchmark.py │ ├── clean_training_data │ │ ├── README.md │ │ ├── __init__.py │ │ ├── compress_and_package.py │ │ ├── generate_13_grams.py │ │ ├── investigate_pile.py │ │ ├── janitor_util.cpp │ │ ├── process_sorted_buckets.py │ │ └── sort_13_gram_buckets.py │ ├── cost_estimate.py │ ├── get_prompts.py │ ├── make_gpt2_test_cases.py │ ├── make_table_results.py │ ├── make_table_tasks.py │ ├── model_comparator.py │ ├── regression.py │ ├── write_out.py │ └── zeno_visualize.py ├── setup.py ├── templates │ └── new_yaml_task │ │ ├── README.md │ │ └── blank_yaml.yaml └── tests │ ├── __init__.py │ ├── models │ ├── test_gguf.py │ ├── test_huggingface.py │ ├── test_neuron_optimum.py │ ├── test_openvino.py │ └── test_vllm.py │ ├── test_evaluator.py │ ├── test_janitor.py │ ├── test_misc.py │ ├── test_tasks.py │ ├── test_utils.py │ ├── testdata │ ├── anagrams1-v0-greedy_until │ ├── anagrams1-v0-res.json │ ├── anagrams2-v0-greedy_until │ ├── anagrams2-v0-res.json │ ├── anli_r1-v0-loglikelihood │ ├── anli_r1-v0-res.json │ ├── anli_r2-v0-loglikelihood │ ├── anli_r2-v0-res.json │ ├── anli_r3-v0-loglikelihood │ ├── anli_r3-v0-res.json │ ├── arc_challenge-v0-loglikelihood │ ├── arc_challenge-v0-res.json │ ├── arc_challenge-v2.0-loglikelihood │ ├── arc_challenge-v2.0-res.json │ ├── arc_easy-v0-loglikelihood │ ├── arc_easy-v0-res.json │ ├── arithmetic_1dc-v0-loglikelihood │ ├── arithmetic_1dc-v0-res.json │ ├── arithmetic_2da-v0-loglikelihood │ ├── arithmetic_2da-v0-res.json │ ├── arithmetic_2dm-v0-loglikelihood │ ├── arithmetic_2dm-v0-res.json │ ├── arithmetic_2ds-v0-loglikelihood │ ├── arithmetic_2ds-v0-res.json │ ├── arithmetic_3da-v0-loglikelihood │ ├── arithmetic_3da-v0-res.json │ ├── arithmetic_3ds-v0-loglikelihood │ ├── arithmetic_3ds-v0-res.json │ ├── arithmetic_4da-v0-loglikelihood │ ├── arithmetic_4da-v0-res.json │ ├── arithmetic_4ds-v0-loglikelihood │ ├── arithmetic_4ds-v0-res.json │ ├── arithmetic_5da-v0-loglikelihood │ ├── arithmetic_5da-v0-res.json │ ├── arithmetic_5ds-v0-loglikelihood │ ├── arithmetic_5ds-v0-res.json │ ├── blimp_adjunct_island-v0-loglikelihood │ ├── blimp_adjunct_island-v0-res.json │ ├── blimp_anaphor_gender_agreement-v0-loglikelihood │ ├── blimp_anaphor_gender_agreement-v0-res.json │ ├── blimp_anaphor_number_agreement-v0-loglikelihood │ ├── blimp_anaphor_number_agreement-v0-res.json │ ├── blimp_animate_subject_passive-v0-loglikelihood │ ├── blimp_animate_subject_passive-v0-res.json │ ├── blimp_animate_subject_trans-v0-loglikelihood │ ├── blimp_animate_subject_trans-v0-res.json │ ├── blimp_causative-v0-loglikelihood │ ├── blimp_causative-v0-res.json │ ├── blimp_complex_NP_island-v0-loglikelihood │ ├── blimp_complex_NP_island-v0-res.json │ ├── blimp_coordinate_structure_constraint_complex_left_branch-v0-loglikelihood │ ├── blimp_coordinate_structure_constraint_complex_left_branch-v0-res.json │ ├── blimp_coordinate_structure_constraint_object_extraction-v0-loglikelihood │ ├── blimp_coordinate_structure_constraint_object_extraction-v0-res.json │ ├── 
blimp_determiner_noun_agreement_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_1-v0-res.json │ ├── blimp_determiner_noun_agreement_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_2-v0-res.json │ ├── blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_irregular_1-v0-res.json │ ├── blimp_determiner_noun_agreement_irregular_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_irregular_2-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adj_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adj_2-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adj_irregular_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adj_irregular_1-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adj_irregular_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adjective_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adjective_1-v0-res.json │ ├── blimp_distractor_agreement_relational_noun-v0-loglikelihood │ ├── blimp_distractor_agreement_relational_noun-v0-res.json │ ├── blimp_distractor_agreement_relative_clause-v0-loglikelihood │ ├── blimp_distractor_agreement_relative_clause-v0-res.json │ ├── blimp_drop_argument-v0-loglikelihood │ ├── blimp_drop_argument-v0-res.json │ ├── blimp_ellipsis_n_bar_1-v0-loglikelihood │ ├── blimp_ellipsis_n_bar_1-v0-res.json │ ├── blimp_ellipsis_n_bar_2-v0-loglikelihood │ ├── blimp_ellipsis_n_bar_2-v0-res.json │ ├── blimp_existential_there_object_raising-v0-loglikelihood │ ├── blimp_existential_there_object_raising-v0-res.json │ ├── blimp_existential_there_quantifiers_1-v0-loglikelihood │ ├── blimp_existential_there_quantifiers_1-v0-res.json │ ├── blimp_existential_there_quantifiers_2-v0-loglikelihood │ ├── blimp_existential_there_quantifiers_2-v0-res.json │ ├── blimp_existential_there_subject_raising-v0-loglikelihood │ ├── blimp_existential_there_subject_raising-v0-res.json │ ├── blimp_expletive_it_object_raising-v0-loglikelihood │ ├── blimp_expletive_it_object_raising-v0-res.json │ ├── blimp_inchoative-v0-loglikelihood │ ├── blimp_inchoative-v0-res.json │ ├── blimp_intransitive-v0-loglikelihood │ ├── blimp_intransitive-v0-res.json │ ├── blimp_irregular_past_participle_adjectives-v0-loglikelihood │ ├── blimp_irregular_past_participle_adjectives-v0-res.json │ ├── blimp_irregular_past_participle_verbs-v0-loglikelihood │ ├── blimp_irregular_past_participle_verbs-v0-res.json │ ├── blimp_irregular_plural_subject_verb_agreement_1-v0-loglikelihood │ ├── blimp_irregular_plural_subject_verb_agreement_1-v0-res.json │ ├── blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood │ ├── blimp_irregular_plural_subject_verb_agreement_2-v0-res.json │ ├── blimp_left_branch_island_echo_question-v0-loglikelihood │ ├── blimp_left_branch_island_echo_question-v0-res.json │ ├── blimp_left_branch_island_simple_question-v0-loglikelihood │ ├── blimp_left_branch_island_simple_question-v0-res.json │ ├── blimp_matrix_question_npi_licensor_present-v0-loglikelihood │ ├── blimp_matrix_question_npi_licensor_present-v0-res.json │ ├── blimp_npi_present_1-v0-loglikelihood │ ├── blimp_npi_present_1-v0-res.json │ ├── blimp_npi_present_2-v0-loglikelihood │ ├── blimp_npi_present_2-v0-res.json │ ├── blimp_only_npi_licensor_present-v0-loglikelihood │ ├── blimp_only_npi_licensor_present-v0-res.json │ ├── blimp_only_npi_scope-v0-loglikelihood │ ├── blimp_only_npi_scope-v0-res.json │ ├── blimp_passive_1-v0-loglikelihood │ 
├── blimp_passive_1-v0-res.json │ ├── blimp_passive_2-v0-loglikelihood │ ├── blimp_passive_2-v0-res.json │ ├── blimp_principle_A_c_command-v0-loglikelihood │ ├── blimp_principle_A_c_command-v0-res.json │ ├── blimp_principle_A_case_1-v0-loglikelihood │ ├── blimp_principle_A_case_1-v0-res.json │ ├── blimp_principle_A_case_2-v0-loglikelihood │ ├── blimp_principle_A_case_2-v0-res.json │ ├── blimp_principle_A_domain_1-v0-loglikelihood │ ├── blimp_principle_A_domain_1-v0-res.json │ ├── blimp_principle_A_domain_2-v0-loglikelihood │ ├── blimp_principle_A_domain_2-v0-res.json │ ├── blimp_principle_A_domain_3-v0-loglikelihood │ ├── blimp_principle_A_domain_3-v0-res.json │ ├── blimp_principle_A_reconstruction-v0-loglikelihood │ ├── blimp_principle_A_reconstruction-v0-res.json │ ├── blimp_regular_plural_subject_verb_agreement_1-v0-loglikelihood │ ├── blimp_regular_plural_subject_verb_agreement_1-v0-res.json │ ├── blimp_regular_plural_subject_verb_agreement_2-v0-loglikelihood │ ├── blimp_regular_plural_subject_verb_agreement_2-v0-res.json │ ├── blimp_sentential_negation_npi_licensor_present-v0-loglikelihood │ ├── blimp_sentential_negation_npi_licensor_present-v0-res.json │ ├── blimp_sentential_negation_npi_scope-v0-loglikelihood │ ├── blimp_sentential_negation_npi_scope-v0-res.json │ ├── blimp_sentential_subject_island-v0-loglikelihood │ ├── blimp_sentential_subject_island-v0-res.json │ ├── blimp_superlative_quantifiers_1-v0-loglikelihood │ ├── blimp_superlative_quantifiers_1-v0-res.json │ ├── blimp_superlative_quantifiers_2-v0-loglikelihood │ ├── blimp_superlative_quantifiers_2-v0-res.json │ ├── blimp_tough_vs_raising_1-v0-loglikelihood │ ├── blimp_tough_vs_raising_1-v0-res.json │ ├── blimp_tough_vs_raising_2-v0-loglikelihood │ ├── blimp_tough_vs_raising_2-v0-res.json │ ├── blimp_transitive-v0-loglikelihood │ ├── blimp_transitive-v0-res.json │ ├── blimp_wh_island-v0-loglikelihood │ ├── blimp_wh_island-v0-res.json │ ├── blimp_wh_questions_object_gap-v0-loglikelihood │ ├── blimp_wh_questions_object_gap-v0-res.json │ ├── blimp_wh_questions_subject_gap-v0-loglikelihood │ ├── blimp_wh_questions_subject_gap-v0-res.json │ ├── blimp_wh_questions_subject_gap_long_distance-v0-loglikelihood │ ├── blimp_wh_questions_subject_gap_long_distance-v0-res.json │ ├── blimp_wh_vs_that_no_gap-v0-loglikelihood │ ├── blimp_wh_vs_that_no_gap-v0-res.json │ ├── blimp_wh_vs_that_no_gap_long_distance-v0-loglikelihood │ ├── blimp_wh_vs_that_no_gap_long_distance-v0-res.json │ ├── blimp_wh_vs_that_with_gap-v0-loglikelihood │ ├── blimp_wh_vs_that_with_gap-v0-res.json │ ├── blimp_wh_vs_that_with_gap_long_distance-v0-loglikelihood │ ├── blimp_wh_vs_that_with_gap_long_distance-v0-res.json │ ├── boolq-v0-loglikelihood │ ├── boolq-v0-res.json │ ├── boolq-v1-loglikelihood │ ├── boolq-v1-res.json │ ├── cb-v0-loglikelihood │ ├── cb-v0-res.json │ ├── cb-v1-loglikelihood │ ├── cb-v1-res.json │ ├── cola-v0-loglikelihood │ ├── cola-v0-res.json │ ├── copa-v0-loglikelihood │ ├── copa-v0-res.json │ ├── coqa-v0-greedy_until │ ├── coqa-v0-res.json │ ├── coqa-v1-greedy_until │ ├── coqa-v1-res.json │ ├── crows_pairs_english-v0-loglikelihood │ ├── crows_pairs_english-v0-res.json │ ├── crows_pairs_english_age-v0-loglikelihood │ ├── crows_pairs_english_age-v0-res.json │ ├── crows_pairs_english_autre-v0-loglikelihood │ ├── crows_pairs_english_autre-v0-res.json │ ├── crows_pairs_english_disability-v0-loglikelihood │ ├── crows_pairs_english_disability-v0-res.json │ ├── crows_pairs_english_gender-v0-loglikelihood │ ├── crows_pairs_english_gender-v0-res.json │ 
├── crows_pairs_english_nationality-v0-loglikelihood │ ├── crows_pairs_english_nationality-v0-res.json │ ├── crows_pairs_english_physical_appearance-v0-loglikelihood │ ├── crows_pairs_english_physical_appearance-v0-res.json │ ├── crows_pairs_english_race_color-v0-loglikelihood │ ├── crows_pairs_english_race_color-v0-res.json │ ├── crows_pairs_english_religion-v0-loglikelihood │ ├── crows_pairs_english_religion-v0-res.json │ ├── crows_pairs_english_sexual_orientation-v0-loglikelihood │ ├── crows_pairs_english_sexual_orientation-v0-res.json │ ├── crows_pairs_english_socioeconomic-v0-loglikelihood │ ├── crows_pairs_english_socioeconomic-v0-res.json │ ├── crows_pairs_french-v0-loglikelihood │ ├── crows_pairs_french-v0-res.json │ ├── crows_pairs_french_age-v0-loglikelihood │ ├── crows_pairs_french_age-v0-res.json │ ├── crows_pairs_french_autre-v0-loglikelihood │ ├── crows_pairs_french_autre-v0-res.json │ ├── crows_pairs_french_disability-v0-loglikelihood │ ├── crows_pairs_french_disability-v0-res.json │ ├── crows_pairs_french_gender-v0-loglikelihood │ ├── crows_pairs_french_gender-v0-res.json │ ├── crows_pairs_french_nationality-v0-loglikelihood │ ├── crows_pairs_french_nationality-v0-res.json │ ├── crows_pairs_french_physical_appearance-v0-loglikelihood │ ├── crows_pairs_french_physical_appearance-v0-res.json │ ├── crows_pairs_french_race_color-v0-loglikelihood │ ├── crows_pairs_french_race_color-v0-res.json │ ├── crows_pairs_french_religion-v0-loglikelihood │ ├── crows_pairs_french_religion-v0-res.json │ ├── crows_pairs_french_sexual_orientation-v0-loglikelihood │ ├── crows_pairs_french_sexual_orientation-v0-res.json │ ├── crows_pairs_french_socioeconomic-v0-loglikelihood │ ├── crows_pairs_french_socioeconomic-v0-res.json │ ├── cycle_letters-v0-greedy_until │ ├── cycle_letters-v0-res.json │ ├── drop-v0-greedy_until │ ├── drop-v0-res.json │ ├── drop-v1-greedy_until │ ├── drop-v1-res.json │ ├── ethics_cm-v0-loglikelihood │ ├── ethics_cm-v0-res.json │ ├── ethics_deontology-v0-loglikelihood │ ├── ethics_deontology-v0-res.json │ ├── ethics_justice-v0-loglikelihood │ ├── ethics_justice-v0-res.json │ ├── ethics_utilitarianism-v0-loglikelihood │ ├── ethics_utilitarianism-v0-res.json │ ├── ethics_utilitarianism_original-v0-loglikelihood │ ├── ethics_utilitarianism_original-v0-res.json │ ├── ethics_virtue-v0-loglikelihood │ ├── ethics_virtue-v0-res.json │ ├── gguf_test_44e268d15decc4d2d0f99e57e1476269826cd3b54262f7a0981f75ddd45b25d0.pkl │ ├── gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl │ ├── gguf_test_8fcf3f2f52afeb2acd7c8e02c2cc3ce31a691b665d295f6c4e4bbd71c7caa1a2.pkl │ ├── gpt3_test_0deb8e9bde8e8327bbc48157f638ff3ba06b0cd816dad2beb8ad90f7fbe795c7.pkl │ ├── gpt3_test_8025023377febbd8c5f2b9f26705c394ff375d0cad7c89c10fd9b8e1eb66ff1c.pkl │ ├── gpt3_test_bb2cc49115e88788ed870ad0716eb00b280a885f91c7ed6e1e864435e5e2b6ac.pkl │ ├── gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl │ ├── gpt3_test_f307d52964c295e2005c5e782b688c24388e0cecadf29f1e6fc7f394236ea9c0.pkl │ ├── gsm8k-v0-greedy_until │ ├── gsm8k-v0-res.json │ ├── headqa-v0-loglikelihood │ ├── headqa-v0-res.json │ ├── headqa_en-v0-loglikelihood │ ├── headqa_en-v0-res.json │ ├── headqa_es-v0-loglikelihood │ ├── headqa_es-v0-res.json │ ├── hellaswag-v0-loglikelihood │ ├── hellaswag-v0-res.json │ ├── hendrycksTest-abstract_algebra-v0-loglikelihood │ ├── hendrycksTest-abstract_algebra-v0-res.json │ ├── hendrycksTest-anatomy-v0-loglikelihood │ ├── hendrycksTest-anatomy-v0-res.json │ ├── 
hendrycksTest-astronomy-v0-loglikelihood │ ├── hendrycksTest-astronomy-v0-res.json │ ├── hendrycksTest-business_ethics-v0-loglikelihood │ ├── hendrycksTest-business_ethics-v0-res.json │ ├── hendrycksTest-clinical_knowledge-v0-loglikelihood │ ├── hendrycksTest-clinical_knowledge-v0-res.json │ ├── hendrycksTest-college_biology-v0-loglikelihood │ ├── hendrycksTest-college_biology-v0-res.json │ ├── hendrycksTest-college_chemistry-v0-loglikelihood │ ├── hendrycksTest-college_chemistry-v0-res.json │ ├── hendrycksTest-college_computer_science-v0-loglikelihood │ ├── hendrycksTest-college_computer_science-v0-res.json │ ├── hendrycksTest-college_mathematics-v0-loglikelihood │ ├── hendrycksTest-college_mathematics-v0-res.json │ ├── hendrycksTest-college_medicine-v0-loglikelihood │ ├── hendrycksTest-college_medicine-v0-res.json │ ├── hendrycksTest-college_physics-v0-loglikelihood │ ├── hendrycksTest-college_physics-v0-res.json │ ├── hendrycksTest-computer_security-v0-loglikelihood │ ├── hendrycksTest-computer_security-v0-res.json │ ├── hendrycksTest-conceptual_physics-v0-loglikelihood │ ├── hendrycksTest-conceptual_physics-v0-res.json │ ├── hendrycksTest-econometrics-v0-loglikelihood │ ├── hendrycksTest-econometrics-v0-res.json │ ├── hendrycksTest-electrical_engineering-v0-loglikelihood │ ├── hendrycksTest-electrical_engineering-v0-res.json │ ├── hendrycksTest-elementary_mathematics-v0-loglikelihood │ ├── hendrycksTest-elementary_mathematics-v0-res.json │ ├── hendrycksTest-formal_logic-v0-loglikelihood │ ├── hendrycksTest-formal_logic-v0-res.json │ ├── hendrycksTest-global_facts-v0-loglikelihood │ ├── hendrycksTest-global_facts-v0-res.json │ ├── hendrycksTest-high_school_biology-v0-loglikelihood │ ├── hendrycksTest-high_school_biology-v0-res.json │ ├── hendrycksTest-high_school_chemistry-v0-loglikelihood │ ├── hendrycksTest-high_school_chemistry-v0-res.json │ ├── hendrycksTest-high_school_computer_science-v0-loglikelihood │ ├── hendrycksTest-high_school_computer_science-v0-res.json │ ├── hendrycksTest-high_school_european_history-v0-loglikelihood │ ├── hendrycksTest-high_school_european_history-v0-res.json │ ├── hendrycksTest-high_school_geography-v0-loglikelihood │ ├── hendrycksTest-high_school_geography-v0-res.json │ ├── hendrycksTest-high_school_government_and_politics-v0-loglikelihood │ ├── hendrycksTest-high_school_government_and_politics-v0-res.json │ ├── hendrycksTest-high_school_macroeconomics-v0-loglikelihood │ ├── hendrycksTest-high_school_macroeconomics-v0-res.json │ ├── hendrycksTest-high_school_mathematics-v0-loglikelihood │ ├── hendrycksTest-high_school_mathematics-v0-res.json │ ├── hendrycksTest-high_school_microeconomics-v0-loglikelihood │ ├── hendrycksTest-high_school_microeconomics-v0-res.json │ ├── hendrycksTest-high_school_physics-v0-loglikelihood │ ├── hendrycksTest-high_school_physics-v0-res.json │ ├── hendrycksTest-high_school_psychology-v0-loglikelihood │ ├── hendrycksTest-high_school_psychology-v0-res.json │ ├── hendrycksTest-high_school_statistics-v0-loglikelihood │ ├── hendrycksTest-high_school_statistics-v0-res.json │ ├── hendrycksTest-high_school_us_history-v0-loglikelihood │ ├── hendrycksTest-high_school_us_history-v0-res.json │ ├── hendrycksTest-high_school_world_history-v0-loglikelihood │ ├── hendrycksTest-high_school_world_history-v0-res.json │ ├── hendrycksTest-human_aging-v0-loglikelihood │ ├── hendrycksTest-human_aging-v0-res.json │ ├── hendrycksTest-human_sexuality-v0-loglikelihood │ ├── hendrycksTest-human_sexuality-v0-res.json │ ├── 
hendrycksTest-international_law-v0-loglikelihood │ ├── hendrycksTest-international_law-v0-res.json │ ├── hendrycksTest-jurisprudence-v0-loglikelihood │ ├── hendrycksTest-jurisprudence-v0-res.json │ ├── hendrycksTest-logical_fallacies-v0-loglikelihood │ ├── hendrycksTest-logical_fallacies-v0-res.json │ ├── hendrycksTest-machine_learning-v0-loglikelihood │ ├── hendrycksTest-machine_learning-v0-res.json │ ├── hendrycksTest-management-v0-loglikelihood │ ├── hendrycksTest-management-v0-res.json │ ├── hendrycksTest-marketing-v0-loglikelihood │ ├── hendrycksTest-marketing-v0-res.json │ ├── hendrycksTest-medical_genetics-v0-loglikelihood │ ├── hendrycksTest-medical_genetics-v0-res.json │ ├── hendrycksTest-miscellaneous-v0-loglikelihood │ ├── hendrycksTest-miscellaneous-v0-res.json │ ├── hendrycksTest-moral_disputes-v0-loglikelihood │ ├── hendrycksTest-moral_disputes-v0-res.json │ ├── hendrycksTest-moral_scenarios-v0-loglikelihood │ ├── hendrycksTest-moral_scenarios-v0-res.json │ ├── hendrycksTest-nutrition-v0-loglikelihood │ ├── hendrycksTest-nutrition-v0-res.json │ ├── hendrycksTest-philosophy-v0-loglikelihood │ ├── hendrycksTest-philosophy-v0-res.json │ ├── hendrycksTest-prehistory-v0-loglikelihood │ ├── hendrycksTest-prehistory-v0-res.json │ ├── hendrycksTest-professional_accounting-v0-loglikelihood │ ├── hendrycksTest-professional_accounting-v0-res.json │ ├── hendrycksTest-professional_law-v0-loglikelihood │ ├── hendrycksTest-professional_law-v0-res.json │ ├── hendrycksTest-professional_medicine-v0-loglikelihood │ ├── hendrycksTest-professional_medicine-v0-res.json │ ├── hendrycksTest-professional_psychology-v0-loglikelihood │ ├── hendrycksTest-professional_psychology-v0-res.json │ ├── hendrycksTest-public_relations-v0-loglikelihood │ ├── hendrycksTest-public_relations-v0-res.json │ ├── hendrycksTest-security_studies-v0-loglikelihood │ ├── hendrycksTest-security_studies-v0-res.json │ ├── hendrycksTest-sociology-v0-loglikelihood │ ├── hendrycksTest-sociology-v0-res.json │ ├── hendrycksTest-us_foreign_policy-v0-loglikelihood │ ├── hendrycksTest-us_foreign_policy-v0-res.json │ ├── hendrycksTest-virology-v0-loglikelihood │ ├── hendrycksTest-virology-v0-res.json │ ├── hendrycksTest-world_religions-v0-loglikelihood │ ├── hendrycksTest-world_religions-v0-res.json │ ├── iwslt17-ar-en-v0-greedy_until │ ├── iwslt17-ar-en-v0-res.json │ ├── iwslt17-en-ar-v0-greedy_until │ ├── iwslt17-en-ar-v0-res.json │ ├── lambada-v0-loglikelihood │ ├── lambada-v0-res.json │ ├── lambada_cloze-v0-loglikelihood │ ├── lambada_cloze-v0-res.json │ ├── lambada_mt_de-v0-loglikelihood │ ├── lambada_mt_de-v0-res.json │ ├── lambada_mt_en-v0-loglikelihood │ ├── lambada_mt_en-v0-res.json │ ├── lambada_mt_es-v0-loglikelihood │ ├── lambada_mt_es-v0-res.json │ ├── lambada_mt_fr-v0-loglikelihood │ ├── lambada_mt_fr-v0-res.json │ ├── lambada_mt_it-v0-loglikelihood │ ├── lambada_mt_it-v0-res.json │ ├── lambada_openai-v0-loglikelihood │ ├── lambada_openai-v0-res.json │ ├── lambada_openai-v2.0-loglikelihood │ ├── lambada_openai-v2.0-res.json │ ├── lambada_openai_cloze-v0-loglikelihood │ ├── lambada_openai_cloze-v0-res.json │ ├── lambada_openai_mt_de-v0-loglikelihood │ ├── lambada_openai_mt_de-v0-res.json │ ├── lambada_openai_mt_en-v0-loglikelihood │ ├── lambada_openai_mt_en-v0-res.json │ ├── lambada_openai_mt_es-v0-loglikelihood │ ├── lambada_openai_mt_es-v0-res.json │ ├── lambada_openai_mt_fr-v0-loglikelihood │ ├── lambada_openai_mt_fr-v0-res.json │ ├── lambada_openai_mt_it-v0-loglikelihood │ ├── lambada_openai_mt_it-v0-res.json │ ├── 
lambada_standard-v0-loglikelihood │ ├── lambada_standard-v0-res.json │ ├── lambada_standard_cloze-v0-loglikelihood │ ├── lambada_standard_cloze-v0-res.json │ ├── logiqa-v0-loglikelihood │ ├── logiqa-v0-res.json │ ├── math_algebra-v0-greedy_until │ ├── math_algebra-v0-res.json │ ├── math_algebra-v1-greedy_until │ ├── math_algebra-v1-res.json │ ├── math_counting_and_prob-v0-greedy_until │ ├── math_counting_and_prob-v0-res.json │ ├── math_counting_and_prob-v1-greedy_until │ ├── math_counting_and_prob-v1-res.json │ ├── math_geometry-v0-greedy_until │ ├── math_geometry-v0-res.json │ ├── math_geometry-v1-greedy_until │ ├── math_geometry-v1-res.json │ ├── math_intermediate_algebra-v0-greedy_until │ ├── math_intermediate_algebra-v0-res.json │ ├── math_intermediate_algebra-v1-greedy_until │ ├── math_intermediate_algebra-v1-res.json │ ├── math_num_theory-v0-greedy_until │ ├── math_num_theory-v0-res.json │ ├── math_num_theory-v1-greedy_until │ ├── math_num_theory-v1-res.json │ ├── math_prealgebra-v0-greedy_until │ ├── math_prealgebra-v0-res.json │ ├── math_prealgebra-v1-greedy_until │ ├── math_prealgebra-v1-res.json │ ├── math_precalc-v0-greedy_until │ ├── math_precalc-v0-res.json │ ├── math_precalc-v1-greedy_until │ ├── math_precalc-v1-res.json │ ├── mathqa-v0-loglikelihood │ ├── mathqa-v0-res.json │ ├── mc_taco-v0-loglikelihood │ ├── mc_taco-v0-res.json │ ├── mnli-v0-loglikelihood │ ├── mnli-v0-res.json │ ├── mnli_mismatched-v0-loglikelihood │ ├── mnli_mismatched-v0-res.json │ ├── mrpc-v0-loglikelihood │ ├── mrpc-v0-res.json │ ├── multirc-v0-loglikelihood │ ├── multirc-v0-res.json │ ├── multirc-v1-loglikelihood │ ├── multirc-v1-res.json │ ├── mutual-v0-loglikelihood │ ├── mutual-v0-res.json │ ├── mutual-v1-loglikelihood │ ├── mutual-v1-res.json │ ├── mutual_plus-v0-loglikelihood │ ├── mutual_plus-v0-res.json │ ├── mutual_plus-v1-loglikelihood │ ├── mutual_plus-v1-res.json │ ├── openbookqa-v0-loglikelihood │ ├── openbookqa-v0-res.json │ ├── pile_arxiv-v0-loglikelihood_rolling │ ├── pile_arxiv-v0-res.json │ ├── pile_arxiv-v1-loglikelihood_rolling │ ├── pile_arxiv-v1-res.json │ ├── pile_bookcorpus2-v0-loglikelihood_rolling │ ├── pile_bookcorpus2-v0-res.json │ ├── pile_bookcorpus2-v1-loglikelihood_rolling │ ├── pile_bookcorpus2-v1-res.json │ ├── pile_books3-v0-loglikelihood_rolling │ ├── pile_books3-v0-res.json │ ├── pile_books3-v1-loglikelihood_rolling │ ├── pile_books3-v1-res.json │ ├── pile_dm-mathematics-v0-loglikelihood_rolling │ ├── pile_dm-mathematics-v0-res.json │ ├── pile_dm-mathematics-v1-loglikelihood_rolling │ ├── pile_dm-mathematics-v1-res.json │ ├── pile_enron-v0-loglikelihood_rolling │ ├── pile_enron-v0-res.json │ ├── pile_enron-v1-loglikelihood_rolling │ ├── pile_enron-v1-res.json │ ├── pile_europarl-v0-loglikelihood_rolling │ ├── pile_europarl-v0-res.json │ ├── pile_europarl-v1-loglikelihood_rolling │ ├── pile_europarl-v1-res.json │ ├── pile_freelaw-v0-loglikelihood_rolling │ ├── pile_freelaw-v0-res.json │ ├── pile_freelaw-v1-loglikelihood_rolling │ ├── pile_freelaw-v1-res.json │ ├── pile_github-v0-loglikelihood_rolling │ ├── pile_github-v0-res.json │ ├── pile_github-v1-loglikelihood_rolling │ ├── pile_github-v1-res.json │ ├── pile_gutenberg-v0-loglikelihood_rolling │ ├── pile_gutenberg-v0-res.json │ ├── pile_gutenberg-v1-loglikelihood_rolling │ ├── pile_gutenberg-v1-res.json │ ├── pile_hackernews-v0-loglikelihood_rolling │ ├── pile_hackernews-v0-res.json │ ├── pile_hackernews-v1-loglikelihood_rolling │ ├── pile_hackernews-v1-res.json │ ├── pile_nih-exporter-v0-loglikelihood_rolling │ 
├── pile_nih-exporter-v0-res.json │ ├── pile_nih-exporter-v1-loglikelihood_rolling │ ├── pile_nih-exporter-v1-res.json │ ├── pile_opensubtitles-v0-loglikelihood_rolling │ ├── pile_opensubtitles-v0-res.json │ ├── pile_opensubtitles-v1-loglikelihood_rolling │ ├── pile_opensubtitles-v1-res.json │ ├── pile_openwebtext2-v0-loglikelihood_rolling │ ├── pile_openwebtext2-v0-res.json │ ├── pile_openwebtext2-v1-loglikelihood_rolling │ ├── pile_openwebtext2-v1-res.json │ ├── pile_philpapers-v0-loglikelihood_rolling │ ├── pile_philpapers-v0-res.json │ ├── pile_philpapers-v1-loglikelihood_rolling │ ├── pile_philpapers-v1-res.json │ ├── pile_pile-cc-v0-loglikelihood_rolling │ ├── pile_pile-cc-v0-res.json │ ├── pile_pile-cc-v1-loglikelihood_rolling │ ├── pile_pile-cc-v1-res.json │ ├── pile_pubmed-abstracts-v0-loglikelihood_rolling │ ├── pile_pubmed-abstracts-v0-res.json │ ├── pile_pubmed-abstracts-v1-loglikelihood_rolling │ ├── pile_pubmed-abstracts-v1-res.json │ ├── pile_pubmed-central-v0-loglikelihood_rolling │ ├── pile_pubmed-central-v0-res.json │ ├── pile_pubmed-central-v1-loglikelihood_rolling │ ├── pile_pubmed-central-v1-res.json │ ├── pile_stackexchange-v0-loglikelihood_rolling │ ├── pile_stackexchange-v0-res.json │ ├── pile_stackexchange-v1-loglikelihood_rolling │ ├── pile_stackexchange-v1-res.json │ ├── pile_ubuntu-irc-v0-loglikelihood_rolling │ ├── pile_ubuntu-irc-v0-res.json │ ├── pile_ubuntu-irc-v1-loglikelihood_rolling │ ├── pile_ubuntu-irc-v1-res.json │ ├── pile_uspto-v0-loglikelihood_rolling │ ├── pile_uspto-v0-res.json │ ├── pile_uspto-v1-loglikelihood_rolling │ ├── pile_uspto-v1-res.json │ ├── pile_wikipedia-v0-loglikelihood_rolling │ ├── pile_wikipedia-v0-res.json │ ├── pile_wikipedia-v1-loglikelihood_rolling │ ├── pile_wikipedia-v1-res.json │ ├── pile_youtubesubtitles-v0-loglikelihood_rolling │ ├── pile_youtubesubtitles-v0-res.json │ ├── pile_youtubesubtitles-v1-loglikelihood_rolling │ ├── pile_youtubesubtitles-v1-res.json │ ├── piqa-v0-loglikelihood │ ├── piqa-v0-res.json │ ├── prost-v0-loglikelihood │ ├── prost-v0-res.json │ ├── pubmedqa-v0-loglikelihood │ ├── pubmedqa-v0-res.json │ ├── qa4mre_2011-v0-loglikelihood │ ├── qa4mre_2011-v0-res.json │ ├── qa4mre_2012-v0-loglikelihood │ ├── qa4mre_2012-v0-res.json │ ├── qa4mre_2013-v0-loglikelihood │ ├── qa4mre_2013-v0-res.json │ ├── qnli-v0-loglikelihood │ ├── qnli-v0-res.json │ ├── qqp-v0-loglikelihood │ ├── qqp-v0-res.json │ ├── race-v0-loglikelihood │ ├── race-v0-res.json │ ├── random_insertion-v0-greedy_until │ ├── random_insertion-v0-res.json │ ├── record-v0-loglikelihood │ ├── record-v0-res.json │ ├── reversed_words-v0-greedy_until │ ├── reversed_words-v0-res.json │ ├── rte-v0-loglikelihood │ ├── rte-v0-res.json │ ├── sciq-v0-loglikelihood │ ├── sciq-v0-res.json │ ├── squad2-v0-greedy_until │ ├── squad2-v0-loglikelihood │ ├── squad2-v0-res.json │ ├── squad2-v1-greedy_until │ ├── squad2-v1-loglikelihood │ ├── squad2-v1-res.json │ ├── sst-v0-loglikelihood │ ├── sst-v0-res.json │ ├── swag-v0-loglikelihood │ ├── swag-v0-res.json │ ├── textsynth_test_0a89c2739f9598b4be2674b0a8e43931d7f3f0b696970bcba31f9b52bdf12297.pkl │ ├── textsynth_test_0c1c14571add7903b89e588c8212572b95bb57b334fc0752c89a7e045a5f63ae.pkl │ ├── textsynth_test_3092d07756f3e1d010c07524cc8a2ecba7f0c19f9e39f2aaf2bf440bfe328004.pkl │ ├── textsynth_test_434076260b6af3a46b7a5eaceec3306a5872c400a3872f744280b237455a0f8e.pkl │ ├── textsynth_test_49c47ae40e11f349f2f6b492128188b1b2bc103a421c676ee4b2142a68b43516.pkl │ ├── 
textsynth_test_4fd8d66a6dad7f602b40e5d7dc298d6fe329299d086a4659743a41f4a4012659.pkl │ ├── textsynth_test_51b5302f157cf224f694ccad973f255ae19e9e061d533256bdf75b04e0a917ab.pkl │ ├── textsynth_test_6d6c62dd70caaa208712bf766deaf419cfac89538d4ab7745621e339394c0c23.pkl │ ├── textsynth_test_7209c4617547bfe17cb9e7f5f735fe35822d650aefdc5fbeeaf0c1724effbe09.pkl │ ├── textsynth_test_7afdc285388e51094e12645f305328c759574fa3ec9751631025f8ad5ebf9f3e.pkl │ ├── textsynth_test_9d5f33dbfe1e254928c89f5ed85e4c010d888065f55a8f1b863bc1eb0340a5f2.pkl │ ├── textsynth_test_abcbcba648d89e5d81a50511a6d24ddeb538de2ffe108c1370dd74ce6ac8038d.pkl │ ├── textsynth_test_b1cbb29666cce5e31a1e97695858137398a0885ca5d5d98f515404fb6aeb99e7.pkl │ ├── textsynth_test_e7ad1e9f52a39e1ddd1e50f3c57ffa4546728dd150a67c0a0ddc8675c04e15d1.pkl │ ├── textsynth_test_f4bfe4beb605bd52a8ab6be3c9293639e7e2261d98de58159d15ccb83131bf4e.pkl │ ├── toxigen-v0-loglikelihood │ ├── toxigen-v0-res.json │ ├── triviaqa-v0-loglikelihood │ ├── triviaqa-v0-res.json │ ├── triviaqa-v1-loglikelihood │ ├── triviaqa-v1-res.json │ ├── truthfulqa_gen-v0-greedy_until │ ├── truthfulqa_gen-v0-res.json │ ├── truthfulqa_gen-v1-greedy_until │ ├── truthfulqa_gen-v1-res.json │ ├── truthfulqa_mc-v0-loglikelihood │ ├── truthfulqa_mc-v0-res.json │ ├── truthfulqa_mc-v1-loglikelihood │ ├── truthfulqa_mc-v1-res.json │ ├── webqs-v0-loglikelihood │ ├── webqs-v0-res.json │ ├── wic-v0-loglikelihood │ ├── wic-v0-res.json │ ├── wikitext-v0-loglikelihood_rolling │ ├── wikitext-v0-res.json │ ├── wikitext-v1-loglikelihood_rolling │ ├── wikitext-v1-res.json │ ├── winogrande-v0-loglikelihood │ ├── winogrande-v0-res.json │ ├── wmt14-en-fr-v0-greedy_until │ ├── wmt14-en-fr-v0-res.json │ ├── wmt14-fr-en-v0-greedy_until │ ├── wmt14-fr-en-v0-res.json │ ├── wmt16-de-en-v0-greedy_until │ ├── wmt16-de-en-v0-res.json │ ├── wmt16-en-de-v0-greedy_until │ ├── wmt16-en-de-v0-res.json │ ├── wmt16-en-ro-v0-greedy_until │ ├── wmt16-en-ro-v0-res.json │ ├── wmt16-ro-en-v0-greedy_until │ ├── wmt16-ro-en-v0-res.json │ ├── wmt20-cs-en-v0-greedy_until │ ├── wmt20-cs-en-v0-res.json │ ├── wmt20-de-en-v0-greedy_until │ ├── wmt20-de-en-v0-res.json │ ├── wmt20-de-fr-v0-greedy_until │ ├── wmt20-de-fr-v0-res.json │ ├── wmt20-en-cs-v0-greedy_until │ ├── wmt20-en-cs-v0-res.json │ ├── wmt20-en-de-v0-greedy_until │ ├── wmt20-en-de-v0-res.json │ ├── wmt20-en-iu-v0-greedy_until │ ├── wmt20-en-iu-v0-res.json │ ├── wmt20-en-ja-v0-greedy_until │ ├── wmt20-en-ja-v0-res.json │ ├── wmt20-en-ja-v1-greedy_until │ ├── wmt20-en-ja-v1-res.json │ ├── wmt20-en-km-v0-greedy_until │ ├── wmt20-en-km-v0-res.json │ ├── wmt20-en-pl-v0-greedy_until │ ├── wmt20-en-pl-v0-res.json │ ├── wmt20-en-ps-v0-greedy_until │ ├── wmt20-en-ps-v0-res.json │ ├── wmt20-en-ru-v0-greedy_until │ ├── wmt20-en-ru-v0-res.json │ ├── wmt20-en-ta-v0-greedy_until │ ├── wmt20-en-ta-v0-res.json │ ├── wmt20-en-zh-v0-greedy_until │ ├── wmt20-en-zh-v0-res.json │ ├── wmt20-en-zh-v1-greedy_until │ ├── wmt20-en-zh-v1-res.json │ ├── wmt20-fr-de-v0-greedy_until │ ├── wmt20-fr-de-v0-res.json │ ├── wmt20-iu-en-v0-greedy_until │ ├── wmt20-iu-en-v0-res.json │ ├── wmt20-ja-en-v0-greedy_until │ ├── wmt20-ja-en-v0-res.json │ ├── wmt20-km-en-v0-greedy_until │ ├── wmt20-km-en-v0-res.json │ ├── wmt20-pl-en-v0-greedy_until │ ├── wmt20-pl-en-v0-res.json │ ├── wmt20-ps-en-v0-greedy_until │ ├── wmt20-ps-en-v0-res.json │ ├── wmt20-ru-en-v0-greedy_until │ ├── wmt20-ru-en-v0-res.json │ ├── wmt20-ta-en-v0-greedy_until │ ├── wmt20-ta-en-v0-res.json │ ├── wmt20-zh-en-v0-greedy_until │ ├── 
wmt20-zh-en-v0-res.json │ ├── wnli-v0-loglikelihood │ ├── wnli-v0-res.json │ ├── wnli-v1-loglikelihood │ ├── wnli-v1-res.json │ ├── wsc-v0-loglikelihood │ ├── wsc-v0-res.json │ ├── wsc273-v0-loglikelihood │ └── wsc273-v0-res.json │ └── utils.py ├── requirements.txt ├── sleb.py └── utils ├── block_remove.py ├── data_utils.py ├── eval_utils.py ├── latency_utils.py ├── model_utils.py └── onoff_utils ├── onoff.py ├── onoff_llama.py └── onoff_opt.py
/assets/c4_perplexity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiwonsong-dev/SLEB/d07129af60520e751087b8abb04a268a3c7ec861/assets/c4_perplexity.png
--------------------------------------------------------------------------------
/assets/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiwonsong-dev/SLEB/d07129af60520e751087b8abb04a268a3c7ec861/assets/overview.png
--------------------------------------------------------------------------------
/assets/zeroshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiwonsong-dev/SLEB/d07129af60520e751087b8abb04a268a3c7ec861/assets/zeroshot.png
--------------------------------------------------------------------------------
/lm-evaluation-harness/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E203, E266, E501, W503, F403, F401, C901
3 | max-line-length = 127
4 | max-complexity = 10
5 | select = B,C,E,F,W,T4,B9
6 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @haileyschoelkopf @lintangsutawika
2 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/docs/img/fewshot_example_gpt3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiwonsong-dev/SLEB/d07129af60520e751087b8abb04a268a3c7ec861/lm-evaluation-harness/docs/img/fewshot_example_gpt3.png
--------------------------------------------------------------------------------
/lm-evaluation-harness/ignore.txt:
--------------------------------------------------------------------------------
1 | ROUGE
2 | rouge
3 | nin
4 | maka
5 | mor
6 | te
7 | ond
8 | extraversion
9 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/__init__.py:
--------------------------------------------------------------------------------
1 | from .evaluator import evaluate, simple_evaluate
2 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiwonsong-dev/SLEB/d07129af60520e751087b8abb04a268a3c7ec861/lm-evaluation-harness/lm_eval/api/__init__.py
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/decontamination/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiwonsong-dev/SLEB/d07129af60520e751087b8abb04a268a3c7ec861/lm-evaluation-harness/lm_eval/decontamination/__init__.py
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/anli/anli_r2.yaml:
--------------------------------------------------------------------------------
1 | include: anli_r1.yaml
2 | task: anli_r2
3 | training_split: train_r2
4 | validation_split: dev_r2
5 | test_split: test_r2
6 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/anli/anli_r3.yaml:
--------------------------------------------------------------------------------
1 | include: anli_r1.yaml
2 | task: anli_r3
3 | training_split: train_r3
4 | validation_split: dev_r3
5 | test_split: test_r3
6 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/arc/arc_challenge.yaml:
--------------------------------------------------------------------------------
1 | include: arc_easy.yaml
2 | task: arc_challenge
3 | dataset_name: ARC-Challenge
4 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_2da.yaml:
--------------------------------------------------------------------------------
1 | include: arithmetic_1dc.yaml
2 | task: arithmetic_2da
3 | dataset_name: arithmetic_2da
4 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_2dm.yaml:
--------------------------------------------------------------------------------
1 | include: arithmetic_1dc.yaml
2 | task: arithmetic_2dm
3 | dataset_name: arithmetic_2dm
4 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_2ds.yaml:
--------------------------------------------------------------------------------
1 | include: arithmetic_1dc.yaml
2 | task: arithmetic_2ds
3 | dataset_name: arithmetic_2ds
4 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_3da.yaml:
--------------------------------------------------------------------------------
1 | include: arithmetic_1dc.yaml
2 | task: arithmetic_3da
3 | dataset_name: arithmetic_3da
4 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_3ds.yaml:
--------------------------------------------------------------------------------
1 | include: arithmetic_1dc.yaml
2 | task: arithmetic_3ds
3 | dataset_name: arithmetic_3ds
4 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_4da.yaml:
--------------------------------------------------------------------------------
1 | include: arithmetic_1dc.yaml
2 | task: arithmetic_4da
3 | dataset_name: arithmetic_4da
4 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml:
--------------------------------------------------------------------------------
1 | include: arithmetic_1dc.yaml
2 | task: arithmetic_4ds
3 | dataset_name: arithmetic_4ds
4 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_5da.yaml:
--------------------------------------------------------------------------------
1 | include: arithmetic_1dc.yaml
2 | task: arithmetic_5da
3 | dataset_name: arithmetic_5da
4 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml:
--------------------------------------------------------------------------------
1 | include: arithmetic_1dc.yaml
2 | task: arithmetic_5ds
3 | dataset_name: arithmetic_5ds
4 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_acm_Arab.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "acm_Arab"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_acm_Arab"
4 | "test_split": "acm_Arab"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_afr_Latn.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "afr_Latn"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_afr_Latn"
4 | "test_split": "afr_Latn"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_als_Latn.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "als_Latn"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_als_Latn"
4 | "test_split": "als_Latn"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_amh_Ethi.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "amh_Ethi"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_amh_Ethi"
4 | "test_split": "amh_Ethi"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_apc_Arab.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "apc_Arab"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_apc_Arab"
4 | "test_split": "apc_Arab"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_arb_Arab.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "arb_Arab"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_arb_Arab"
4 | "test_split": "arb_Arab"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_arb_Latn.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "arb_Latn"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_arb_Latn"
4 | "test_split": "arb_Latn"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ars_Arab.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "ars_Arab"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_ars_Arab"
4 | "test_split": "ars_Arab"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ary_Arab.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "ary_Arab"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_ary_Arab"
4 | "test_split": "ary_Arab"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_arz_Arab.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "arz_Arab"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_arz_Arab"
4 | "test_split": "arz_Arab"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_asm_Beng.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "asm_Beng"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_asm_Beng"
4 | "test_split": "asm_Beng"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_azj_Latn.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "azj_Latn"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_azj_Latn"
4 | "test_split": "azj_Latn"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_bam_Latn.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "bam_Latn"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_bam_Latn"
4 | "test_split": "bam_Latn"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ben_Beng.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "ben_Beng"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_ben_Beng"
4 | "test_split": "ben_Beng"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ben_Latn.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "ben_Latn"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_ben_Latn"
4 | "test_split": "ben_Latn"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_bod_Tibt.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "bod_Tibt"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_bod_Tibt"
4 | "test_split": "bod_Tibt"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_bul_Cyrl.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "bul_Cyrl"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_bul_Cyrl"
4 | "test_split": "bul_Cyrl"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_cat_Latn.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "cat_Latn"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_cat_Latn"
4 | "test_split": "cat_Latn"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ceb_Latn.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "ceb_Latn"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_ceb_Latn"
4 | "test_split": "ceb_Latn"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ces_Latn.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "ces_Latn"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_ces_Latn"
4 | "test_split": "ces_Latn"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ckb_Arab.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "ckb_Arab"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_ckb_Arab"
4 | "test_split": "ckb_Arab"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_dan_Latn.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "dan_Latn"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_dan_Latn"
4 | "test_split": "dan_Latn"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_deu_Latn.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "deu_Latn"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_deu_Latn"
4 | "test_split": "deu_Latn"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ell_Grek.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "ell_Grek"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_ell_Grek"
4 | "test_split": "ell_Grek"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_eng_Latn.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "eng_Latn"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_eng_Latn"
4 | "test_split": "eng_Latn"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_est_Latn.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "est_Latn"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_est_Latn"
4 | "test_split": "est_Latn"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_eus_Latn.yaml:
--------------------------------------------------------------------------------
1 | "fewshot_split": "eus_Latn"
2 | "include": "_default_template_yaml"
3 | "task": "belebele_eus_Latn"
4 | "test_split": "eus_Latn"
5 |
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_fin_Latn.yaml:
--------------------------------------------------------------------------------
| "fewshot_split": "fin_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_fin_Latn" 4 | "test_split": "fin_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_fra_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "fra_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_fra_Latn" 4 | "test_split": "fra_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_fuv_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "fuv_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_fuv_Latn" 4 | "test_split": "fuv_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_gaz_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "gaz_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_gaz_Latn" 4 | "test_split": "gaz_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_grn_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "grn_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_grn_Latn" 4 | "test_split": "grn_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_guj_Gujr.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "guj_Gujr" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_guj_Gujr" 4 | "test_split": "guj_Gujr" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hat_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "hat_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_hat_Latn" 4 | "test_split": "hat_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hau_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "hau_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_hau_Latn" 4 | "test_split": "hau_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_heb_Hebr.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "heb_Hebr" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_heb_Hebr" 4 | "test_split": "heb_Hebr" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hin_Deva.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "hin_Deva" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_hin_Deva" 4 | "test_split": "hin_Deva" 5 | -------------------------------------------------------------------------------- 
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hin_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "hin_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_hin_Latn" 4 | "test_split": "hin_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hrv_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "hrv_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_hrv_Latn" 4 | "test_split": "hrv_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hun_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "hun_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_hun_Latn" 4 | "test_split": "hun_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_hye_Armn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "hye_Armn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_hye_Armn" 4 | "test_split": "hye_Armn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ibo_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "ibo_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_ibo_Latn" 4 | "test_split": "ibo_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ilo_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "ilo_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_ilo_Latn" 4 | "test_split": "ilo_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ind_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "ind_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_ind_Latn" 4 | "test_split": "ind_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_isl_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "isl_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_isl_Latn" 4 | "test_split": "isl_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ita_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "ita_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_ita_Latn" 4 | "test_split": "ita_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_jav_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "jav_Latn" 2 | "include": "_default_template_yaml" 3 | 
"task": "belebele_jav_Latn" 4 | "test_split": "jav_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_jpn_Jpan.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "jpn_Jpan" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_jpn_Jpan" 4 | "test_split": "jpn_Jpan" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kac_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "kac_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_kac_Latn" 4 | "test_split": "kac_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kan_Knda.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "kan_Knda" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_kan_Knda" 4 | "test_split": "kan_Knda" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kat_Geor.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "kat_Geor" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_kat_Geor" 4 | "test_split": "kat_Geor" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kaz_Cyrl.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "kaz_Cyrl" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_kaz_Cyrl" 4 | "test_split": "kaz_Cyrl" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kea_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "kea_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_kea_Latn" 4 | "test_split": "kea_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_khk_Cyrl.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "khk_Cyrl" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_khk_Cyrl" 4 | "test_split": "khk_Cyrl" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_khm_Khmr.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "khm_Khmr" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_khm_Khmr" 4 | "test_split": "khm_Khmr" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kin_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "kin_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_kin_Latn" 4 | "test_split": "kin_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kir_Cyrl.yaml: 
-------------------------------------------------------------------------------- 1 | "fewshot_split": "kir_Cyrl" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_kir_Cyrl" 4 | "test_split": "kir_Cyrl" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_kor_Hang.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "kor_Hang" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_kor_Hang" 4 | "test_split": "kor_Hang" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_lao_Laoo.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "lao_Laoo" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_lao_Laoo" 4 | "test_split": "lao_Laoo" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_lin_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "lin_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_lin_Latn" 4 | "test_split": "lin_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_lit_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "lit_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_lit_Latn" 4 | "test_split": "lit_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_lug_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "lug_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_lug_Latn" 4 | "test_split": "lug_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_luo_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "luo_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_luo_Latn" 4 | "test_split": "luo_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_lvs_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "lvs_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_lvs_Latn" 4 | "test_split": "lvs_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_mal_Mlym.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "mal_Mlym" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_mal_Mlym" 4 | "test_split": "mal_Mlym" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_mar_Deva.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "mar_Deva" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_mar_Deva" 4 | "test_split": "mar_Deva" 5 | 
-------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_mkd_Cyrl.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "mkd_Cyrl" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_mkd_Cyrl" 4 | "test_split": "mkd_Cyrl" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_mlt_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "mlt_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_mlt_Latn" 4 | "test_split": "mlt_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_mri_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "mri_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_mri_Latn" 4 | "test_split": "mri_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_mya_Mymr.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "mya_Mymr" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_mya_Mymr" 4 | "test_split": "mya_Mymr" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_nld_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "nld_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_nld_Latn" 4 | "test_split": "nld_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_nob_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "nob_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_nob_Latn" 4 | "test_split": "nob_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_npi_Deva.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "npi_Deva" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_npi_Deva" 4 | "test_split": "npi_Deva" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_npi_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "npi_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_npi_Latn" 4 | "test_split": "npi_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_nso_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "nso_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_nso_Latn" 4 | "test_split": "nso_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_nya_Latn.yaml: -------------------------------------------------------------------------------- 1 
| "fewshot_split": "nya_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_nya_Latn" 4 | "test_split": "nya_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ory_Orya.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "ory_Orya" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_ory_Orya" 4 | "test_split": "ory_Orya" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_pan_Guru.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "pan_Guru" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_pan_Guru" 4 | "test_split": "pan_Guru" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_pbt_Arab.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "pbt_Arab" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_pbt_Arab" 4 | "test_split": "pbt_Arab" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_pes_Arab.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "pes_Arab" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_pes_Arab" 4 | "test_split": "pes_Arab" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_plt_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "plt_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_plt_Latn" 4 | "test_split": "plt_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_pol_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "pol_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_pol_Latn" 4 | "test_split": "pol_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_por_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "por_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_por_Latn" 4 | "test_split": "por_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ron_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "ron_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_ron_Latn" 4 | "test_split": "ron_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_rus_Cyrl.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "rus_Cyrl" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_rus_Cyrl" 4 | "test_split": "rus_Cyrl" 5 | -------------------------------------------------------------------------------- 
/lm-evaluation-harness/lm_eval/tasks/belebele/belebele_shn_Mymr.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "shn_Mymr" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_shn_Mymr" 4 | "test_split": "shn_Mymr" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_sin_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "sin_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_sin_Latn" 4 | "test_split": "sin_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_sin_Sinh.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "sin_Sinh" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_sin_Sinh" 4 | "test_split": "sin_Sinh" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_slk_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "slk_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_slk_Latn" 4 | "test_split": "slk_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_slv_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "slv_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_slv_Latn" 4 | "test_split": "slv_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_sna_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "sna_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_sna_Latn" 4 | "test_split": "sna_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_snd_Arab.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "snd_Arab" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_snd_Arab" 4 | "test_split": "snd_Arab" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_som_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "som_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_som_Latn" 4 | "test_split": "som_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_sot_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "sot_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_sot_Latn" 4 | "test_split": "sot_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_spa_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "spa_Latn" 2 | "include": "_default_template_yaml" 3 | 
"task": "belebele_spa_Latn" 4 | "test_split": "spa_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_srp_Cyrl.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "srp_Cyrl" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_srp_Cyrl" 4 | "test_split": "srp_Cyrl" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ssw_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "ssw_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_ssw_Latn" 4 | "test_split": "ssw_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_sun_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "sun_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_sun_Latn" 4 | "test_split": "sun_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_swe_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "swe_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_swe_Latn" 4 | "test_split": "swe_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_swh_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "swh_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_swh_Latn" 4 | "test_split": "swh_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tam_Taml.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "tam_Taml" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_tam_Taml" 4 | "test_split": "tam_Taml" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tel_Telu.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "tel_Telu" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_tel_Telu" 4 | "test_split": "tel_Telu" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tgk_Cyrl.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "tgk_Cyrl" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_tgk_Cyrl" 4 | "test_split": "tgk_Cyrl" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tgl_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "tgl_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_tgl_Latn" 4 | "test_split": "tgl_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tha_Thai.yaml: 
-------------------------------------------------------------------------------- 1 | "fewshot_split": "tha_Thai" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_tha_Thai" 4 | "test_split": "tha_Thai" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tir_Ethi.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "tir_Ethi" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_tir_Ethi" 4 | "test_split": "tir_Ethi" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tsn_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "tsn_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_tsn_Latn" 4 | "test_split": "tsn_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tso_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "tso_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_tso_Latn" 4 | "test_split": "tso_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_tur_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "tur_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_tur_Latn" 4 | "test_split": "tur_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_ukr_Cyrl.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "ukr_Cyrl" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_ukr_Cyrl" 4 | "test_split": "ukr_Cyrl" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_urd_Arab.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "urd_Arab" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_urd_Arab" 4 | "test_split": "urd_Arab" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_urd_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "urd_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_urd_Latn" 4 | "test_split": "urd_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_uzn_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "uzn_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_uzn_Latn" 4 | "test_split": "uzn_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_vie_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "vie_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_vie_Latn" 4 | "test_split": "vie_Latn" 5 | 
-------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_war_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "war_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_war_Latn" 4 | "test_split": "war_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_wol_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "wol_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_wol_Latn" 4 | "test_split": "wol_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_xho_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "xho_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_xho_Latn" 4 | "test_split": "xho_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_yor_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "yor_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_yor_Latn" 4 | "test_split": "yor_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_zho_Hans.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "zho_Hans" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_zho_Hans" 4 | "test_split": "zho_Hans" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_zho_Hant.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "zho_Hant" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_zho_Hant" 4 | "test_split": "zho_Hant" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_zsm_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "zsm_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_zsm_Latn" 4 | "test_split": "zsm_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/belebele/belebele_zul_Latn.yaml: -------------------------------------------------------------------------------- 1 | "fewshot_split": "zul_Latn" 2 | "include": "_default_template_yaml" 3 | "task": "belebele_zul_Latn" 4 | "test_split": "zul_Latn" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/bigbench/generate_until/color.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: color_zero_shot 3 | include: ../generate_until_template_yaml 4 | task: bigbench_color_generate_until 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/bigbench/generate_until/gem.yaml: 
-------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: gem_zero_shot 3 | include: ../generate_until_template_yaml 4 | task: bigbench_gem_generate_until 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/bigbench/generate_until/tense.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: tense_zero_shot 3 | include: ../generate_until_template_yaml 4 | task: bigbench_tense_generate_until 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/bigbench/multiple_choice/gem.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: gem_zero_shot 3 | include: ../multiple_choice_template_yaml 4 | task: bigbench_gem_multiple_choice 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/adjunct_island.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: adjunct_island 3 | include: _template_yaml 4 | task: blimp_adjunct_island 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/anaphor_gender_agreement.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: anaphor_gender_agreement 3 | include: _template_yaml 4 | task: blimp_anaphor_gender_agreement 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/anaphor_number_agreement.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: anaphor_number_agreement 3 | include: _template_yaml 4 | task: blimp_anaphor_number_agreement 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/animate_subject_passive.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: animate_subject_passive 3 | include: _template_yaml 4 | task: blimp_animate_subject_passive 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/animate_subject_trans.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: animate_subject_trans 3 | include: _template_yaml 4 | task: blimp_animate_subject_trans 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/causative.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: causative 3 | include: _template_yaml 4 | task: blimp_causative 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/complex_NP_island.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: complex_NP_island 3 | include: _template_yaml 4 | task: blimp_complex_NP_island 5 | 
-------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/drop_argument.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: drop_argument 3 | include: _template_yaml 4 | task: blimp_drop_argument 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/ellipsis_n_bar_1.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: ellipsis_n_bar_1 3 | include: _template_yaml 4 | task: blimp_ellipsis_n_bar_1 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/ellipsis_n_bar_2.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: ellipsis_n_bar_2 3 | include: _template_yaml 4 | task: blimp_ellipsis_n_bar_2 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/inchoative.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: inchoative 3 | include: _template_yaml 4 | task: blimp_inchoative 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/intransitive.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: intransitive 3 | include: _template_yaml 4 | task: blimp_intransitive 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/npi_present_1.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: npi_present_1 3 | include: _template_yaml 4 | task: blimp_npi_present_1 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/npi_present_2.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: npi_present_2 3 | include: _template_yaml 4 | task: blimp_npi_present_2 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/only_npi_licensor_present.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: only_npi_licensor_present 3 | include: _template_yaml 4 | task: blimp_only_npi_licensor_present 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/only_npi_scope.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: only_npi_scope 3 | include: _template_yaml 4 | task: blimp_only_npi_scope 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/passive_1.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: passive_1 3 | include: _template_yaml 4 | task: blimp_passive_1 5 | 
-------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/passive_2.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: passive_2 3 | include: _template_yaml 4 | task: blimp_passive_2 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/principle_A_c_command.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: principle_A_c_command 3 | include: _template_yaml 4 | task: blimp_principle_A_c_command 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/principle_A_case_1.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: principle_A_case_1 3 | include: _template_yaml 4 | task: blimp_principle_A_case_1 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/principle_A_case_2.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: principle_A_case_2 3 | include: _template_yaml 4 | task: blimp_principle_A_case_2 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/principle_A_domain_1.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: principle_A_domain_1 3 | include: _template_yaml 4 | task: blimp_principle_A_domain_1 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/principle_A_domain_2.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: principle_A_domain_2 3 | include: _template_yaml 4 | task: blimp_principle_A_domain_2 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/principle_A_domain_3.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: principle_A_domain_3 3 | include: _template_yaml 4 | task: blimp_principle_A_domain_3 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/principle_A_reconstruction.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: principle_A_reconstruction 3 | include: _template_yaml 4 | task: blimp_principle_A_reconstruction 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/sentential_subject_island.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: sentential_subject_island 3 | include: _template_yaml 4 | task: blimp_sentential_subject_island 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/superlative_quantifiers_1.yaml: -------------------------------------------------------------------------------- 1 | # 
Generated by utils.py 2 | dataset_name: superlative_quantifiers_1 3 | include: _template_yaml 4 | task: blimp_superlative_quantifiers_1 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/superlative_quantifiers_2.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: superlative_quantifiers_2 3 | include: _template_yaml 4 | task: blimp_superlative_quantifiers_2 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/tough_vs_raising_1.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: tough_vs_raising_1 3 | include: _template_yaml 4 | task: blimp_tough_vs_raising_1 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/tough_vs_raising_2.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: tough_vs_raising_2 3 | include: _template_yaml 4 | task: blimp_tough_vs_raising_2 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/transitive.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: transitive 3 | include: _template_yaml 4 | task: blimp_transitive 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/wh_island.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: wh_island 3 | include: _template_yaml 4 | task: blimp_wh_island 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/wh_questions_object_gap.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: wh_questions_object_gap 3 | include: _template_yaml 4 | task: blimp_wh_questions_object_gap 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/wh_questions_subject_gap.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: wh_questions_subject_gap 3 | include: _template_yaml 4 | task: blimp_wh_questions_subject_gap 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/wh_vs_that_no_gap.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: wh_vs_that_no_gap 3 | include: _template_yaml 4 | task: blimp_wh_vs_that_no_gap 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/blimp/wh_vs_that_with_gap.yaml: -------------------------------------------------------------------------------- 1 | # Generated by utils.py 2 | dataset_name: wh_vs_that_with_gap 3 | include: _template_yaml 4 | task: blimp_wh_vs_that_with_gap 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_law.yaml: 
-------------------------------------------------------------------------------- 1 | "dataset_name": "law" 2 | "description": "以下是中国关于法学的单项选择题,请选出其中的正确答案。\n\n" # English: "The following are single-answer multiple-choice questions about law in China. Please select the correct answer." 3 | "include": "_default_ceval_yaml" 4 | "task": "ceval-valid_law" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/ceval/ceval-valid_logic.yaml: -------------------------------------------------------------------------------- 1 | "dataset_name": "logic" 2 | "description": "以下是中国关于逻辑学的单项选择题,请选出其中的正确答案。\n\n" # English: "The following are single-answer multiple-choice questions about logic in China. Please select the correct answer." 3 | "include": "_default_ceval_yaml" 4 | "task": "ceval-valid_logic" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/cmmlu/cmmlu_default_arts.yaml: -------------------------------------------------------------------------------- 1 | "dataset_name": "arts" 2 | "description": "以下是关于艺术学的单项选择题,请直接给出正确答案的选项。\n\n" # English: "The following are single-answer multiple-choice questions about the arts. Please give the option of the correct answer directly." 3 | "include": "_default_template_yaml" 4 | "task": "cmmlu_arts" 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french.yaml: -------------------------------------------------------------------------------- 1 | include: crows_pairs_english.yaml 2 | task: crows_pairs_french 3 | dataset_name: french 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/crows_pairs/crows_pairs_french_age.yaml: -------------------------------------------------------------------------------- 1 | include: crows_pairs_english.yaml 2 | task: crows_pairs_french_age 3 | dataset_name: french 4 | process_docs: !function utils.filter_age 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/csatqa/csatqa_gr.yaml: -------------------------------------------------------------------------------- 1 | "dataset_name": "GR" 2 | "include": "_default_csatqa_yaml" 3 | "task": "csatqa_gr" 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/csatqa/csatqa_li.yaml: -------------------------------------------------------------------------------- 1 | "dataset_name": "LI" 2 | "include": "_default_csatqa_yaml" 3 | "task": "csatqa_li" 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/csatqa/csatqa_rch.yaml: -------------------------------------------------------------------------------- 1 | "dataset_name": "RCH" 2 | "include": "_default_csatqa_yaml" 3 | "task": "csatqa_rch" 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/csatqa/csatqa_rcs.yaml: -------------------------------------------------------------------------------- 1 | "dataset_name": "RCS" 2 | "include": "_default_csatqa_yaml" 3 | "task": "csatqa_rcs" 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/csatqa/csatqa_rcss.yaml: -------------------------------------------------------------------------------- 1 | "dataset_name": "RCSS" 2 | "include": "_default_csatqa_yaml" 3 | "task": "csatqa_rcss" 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/csatqa/csatqa_wr.yaml: -------------------------------------------------------------------------------- 1 | "dataset_name": "WR" 2 | "include":
"_default_csatqa_yaml" 3 | "task": "csatqa_wr" 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/fld/fld_star.yaml: -------------------------------------------------------------------------------- 1 | include: fld_default.yaml 2 | task: fld_star 3 | dataset_name: star 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/glue/mnli/mismatch.yaml: -------------------------------------------------------------------------------- 1 | include: default.yaml 2 | task: mnli_mismatch 3 | validation_split: validation_mismatched 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/headqa/headqa_es.yaml: -------------------------------------------------------------------------------- 1 | include: headqa_en.yaml 2 | task: headqa_es 3 | dataset_name: es 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_accounting.yaml: -------------------------------------------------------------------------------- 1 | "dataset_name": "Accounting" 2 | "include": "_default_kmmlu_yaml" 3 | "task": "kmmlu_accounting" 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_agricultural_sciences.yaml: -------------------------------------------------------------------------------- 1 | "dataset_name": "Agricultural-Sciences" 2 | "include": "_default_kmmlu_yaml" 3 | "task": "kmmlu_agricultural_sciences" 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_biology.yaml: -------------------------------------------------------------------------------- 1 | "dataset_name": "Biology" 2 | "include": "_default_kmmlu_yaml" 3 | "task": "kmmlu_biology" 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_chemical_engineering.yaml: -------------------------------------------------------------------------------- 1 | "dataset_name": "Chemical-Engineering" 2 | "include": "_default_kmmlu_yaml" 3 | "task": "kmmlu_chemical_engineering" 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_chemistry.yaml: -------------------------------------------------------------------------------- 1 | "dataset_name": "Chemistry" 2 | "include": "_default_kmmlu_yaml" 3 | "task": "kmmlu_chemistry" 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_civil_engineering.yaml: -------------------------------------------------------------------------------- 1 | "dataset_name": "Civil-Engineering" 2 | "include": "_default_kmmlu_yaml" 3 | "task": "kmmlu_civil_engineering" 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_computer_science.yaml: -------------------------------------------------------------------------------- 1 | "dataset_name": "Computer-Science" 2 | "include": "_default_kmmlu_yaml" 3 | "task": "kmmlu_computer_science" 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_construction.yaml: 
--------------------------------------------------------------------------------
"dataset_name": "Construction"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_construction"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_criminal_law.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Criminal-Law"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_criminal_law"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_ecology.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Ecology"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_ecology"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_economics.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Economics"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_economics"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_education.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Education"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_education"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_electrical_engineering.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Electrical-Engineering"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_electrical_engineering"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_electronics_engineering.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Electronics-Engineering"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_electronics_engineering"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_energy_management.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Energy-Management"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_energy_management"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_environmental_science.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Environmental-Science"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_environmental_science"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_fashion.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Fashion"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_fashion"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_food_processing.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Food-Processing"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_food_processing"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_geomatics.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Geomatics"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_geomatics"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_health.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Health"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_health"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_industrial_engineer.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Industrial-Engineer"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_industrial_engineer"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_information_technology.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Information-Technology"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_information_technology"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_law.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Law"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_law"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_management.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Management"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_management"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_maritime_engineering.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Maritime-Engineering"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_maritime_engineering"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_marketing.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Marketing"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_marketing"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_materials_engineering.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Materials-Engineering"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_materials_engineering"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_mechanical_engineering.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Mechanical-Engineering"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_mechanical_engineering"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_nondestructive_testing.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Nondestructive-Testing"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_nondestructive_testing"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_patent.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Patent"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_patent"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_psychology.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Psychology"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_psychology"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_public_safety.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Public-Safety"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_public_safety"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_real_estate.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Real-Estate"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_real_estate"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_refrigerating_machinery.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Refrigerating-Machinery"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_refrigerating_machinery"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_social_welfare.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Social-Welfare"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_social_welfare"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/kmmlu/kmmlu_taxation.yaml:
--------------------------------------------------------------------------------
"dataset_name": "Taxation"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_taxation"
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/lambada_multilingual/lambada_mt_de.yaml:
--------------------------------------------------------------------------------
include: lambada_mt_en.yaml
task: lambada_openai_mt_de
dataset_name: de
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/lambada_multilingual/lambada_mt_es.yaml:
--------------------------------------------------------------------------------
include: lambada_mt_en.yaml
task: lambada_openai_mt_es
dataset_name: es
--------------------------------------------------------------------------------
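All of the kmmlu entries above (and the lambada_multilingual ones that follow) are thin wrappers: each file pulls shared settings from a base config via `include` and overrides only `dataset_name` and `task`. A minimal sketch of that include-and-override merge, assuming flat keys and includes resolved relative to the including file (the harness's real loader also handles search paths, nesting, and `!function` tags):

    import os
    import yaml  # pip install pyyaml

    def load_task_config(path: str) -> dict:
        """Toy include-and-override loader: keys in the including file
        win over keys pulled in from the included base file."""
        with open(path) as f:
            cfg = yaml.safe_load(f) or {}
        base_name = cfg.pop("include", None)
        if base_name is None:
            return cfg
        base = load_task_config(os.path.join(os.path.dirname(path), base_name))
        merged = dict(base)
        merged.update(cfg)  # child keys override the base
        return merged

    # load_task_config("tasks/kmmlu/kmmlu_taxation.yaml") would yield the shared
    # kmmlu settings plus dataset_name="Taxation", task="kmmlu_taxation".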
/lm-evaluation-harness/lm_eval/tasks/lambada_multilingual/lambada_mt_fr.yaml:
--------------------------------------------------------------------------------
include: lambada_mt_en.yaml
task: lambada_openai_mt_fr
dataset_name: fr
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/lambada_multilingual/lambada_mt_it.yaml:
--------------------------------------------------------------------------------
include: lambada_mt_en.yaml
task: lambada_openai_mt_it
dataset_name: it
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_counting_and_prob.yaml:
--------------------------------------------------------------------------------
include: minerva_math_algebra.yaml
dataset_name: counting_and_probability
task: minerva_math_counting_and_prob
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_geometry.yaml:
--------------------------------------------------------------------------------
include: minerva_math_algebra.yaml
dataset_name: geometry
task: minerva_math_geometry
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_intermediate_algebra.yaml:
--------------------------------------------------------------------------------
include: minerva_math_algebra.yaml
dataset_name: intermediate_algebra
task: minerva_math_intermediate_algebra
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_num_theory.yaml:
--------------------------------------------------------------------------------
include: minerva_math_algebra.yaml
dataset_name: number_theory
task: minerva_math_num_theory
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_prealgebra.yaml:
--------------------------------------------------------------------------------
include: minerva_math_algebra.yaml
dataset_name: prealgebra
task: minerva_math_prealgebra
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/minerva_math/minerva_math_precalc.yaml:
--------------------------------------------------------------------------------
include: minerva_math_algebra.yaml
dataset_name: precalculus
task: minerva_math_precalc
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/mmlu/default/_mmlu.yaml:
--------------------------------------------------------------------------------
group: mmlu
task:
  - mmlu_stem
  - mmlu_other
  - mmlu_social_sciences
  - mmlu_humanities
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/model_written_evals/persona/agreeableness.yaml:
--------------------------------------------------------------------------------
# Generated by _generate_configs.py
dataset_name: agreeableness
include: _template_yaml
task: persona_agreeableness
--------------------------------------------------------------------------------
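`_mmlu.yaml` above shows the other composition primitive: a `group` whose `task` list fans out to sub-groups, so requesting `mmlu` runs all four category groups. The persona configs, by contrast, are stamped out mechanically (note the `# Generated by _generate_configs.py` header each one carries). A hedged sketch of such a generator; the trait list here is illustrative and the repo's actual `_generate_configs.py` may differ:

    import yaml

    # Illustrative subset; the real generator enumerates every persona dataset.
    TRAITS = ["agreeableness", "extraversion", "narcissism", "openness"]

    for trait in TRAITS:
        cfg = {
            "dataset_name": trait,
            "include": "_template_yaml",
            "task": f"persona_{trait}",
        }
        with open(f"{trait}.yaml", "w") as f:
            f.write("# Generated by _generate_configs.py\n")
            # safe_dump sorts keys alphabetically, matching the files above.
            yaml.safe_dump(cfg, f, default_flow_style=False)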
/lm-evaluation-harness/lm_eval/tasks/model_written_evals/persona/extraversion.yaml:
--------------------------------------------------------------------------------
# Generated by _generate_configs.py
dataset_name: extraversion
include: _template_yaml
task: persona_extraversion
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/model_written_evals/persona/has-disability.yaml:
--------------------------------------------------------------------------------
# Generated by _generate_configs.py
dataset_name: has-disability
include: _template_yaml
task: persona_has-disability
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/model_written_evals/persona/narcissism.yaml:
--------------------------------------------------------------------------------
# Generated by _generate_configs.py
dataset_name: narcissism
include: _template_yaml
task: persona_narcissism
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/model_written_evals/persona/neuroticism.yaml:
--------------------------------------------------------------------------------
# Generated by _generate_configs.py
dataset_name: neuroticism
include: _template_yaml
task: persona_neuroticism
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/model_written_evals/persona/no-shut-down.yaml:
--------------------------------------------------------------------------------
# Generated by _generate_configs.py
dataset_name: no-shut-down
include: _template_yaml
task: persona_no-shut-down
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/model_written_evals/persona/openness.yaml:
--------------------------------------------------------------------------------
# Generated by _generate_configs.py
dataset_name: openness
include: _template_yaml
task: persona_openness
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/model_written_evals/persona/psychopathy.yaml:
--------------------------------------------------------------------------------
# Generated by _generate_configs.py
dataset_name: psychopathy
include: _template_yaml
task: persona_psychopathy
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/model_written_evals/persona/risk-averse.yaml:
--------------------------------------------------------------------------------
# Generated by _generate_configs.py
dataset_name: risk-averse
include: _template_yaml
task: persona_risk-averse
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/model_written_evals/persona/risk-neutral.yaml:
--------------------------------------------------------------------------------
# Generated by _generate_configs.py
dataset_name: risk-neutral
include: _template_yaml
task: persona_risk-neutral
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/model_written_evals/persona/risk-seeking.yaml:
--------------------------------------------------------------------------------
# Generated by _generate_configs.py
dataset_name: risk-seeking
include: _template_yaml
task: persona_risk-seeking
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/mutual/multual_plus.yaml:
--------------------------------------------------------------------------------
include: mutual.yaml
task: mutual_plus
dataset_name: mutual_plus
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/nq_open/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiwonsong-dev/SLEB/d07129af60520e751087b8abb04a268a3c7ec861/lm-evaluation-harness/lm_eval/tasks/nq_open/README.md
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_bookcorpus2.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_bookcorpus2
dataset_name: pile_bookcorpus2
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_books3.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_books3
dataset_name: pile_books3
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_dm-mathematics.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_dm-mathematics
dataset_name: pile_dm-mathematics
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_enron.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_enron
dataset_name: pile_enron
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_europarl.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_europarl
dataset_name: pile_europarl
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_freelaw.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_freelaw
dataset_name: pile_freelaw
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_github.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_github
dataset_name: pile_github
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_gutenberg.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_gutenberg
dataset_name: pile_gutenberg
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_hackernews.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_hackernews
dataset_name: pile_hackernews
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_nih-exporter.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_nih-exporter
dataset_name: pile_nih-exporter
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_opensubtitles.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_opensubtitles
dataset_name: pile_opensubtitles
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_openwebtext2.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_openwebtext2
dataset_name: pile_openwebtext2
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_philpapers.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_philpapers
dataset_name: pile_philpapers
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_pile-cc.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_pile-cc
dataset_name: pile_pile-cc
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_pubmed-abstracts.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_pubmed-abstracts
dataset_name: pile_pubmed-abstracts
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_pubmed-central.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_pubmed-central
dataset_name: pile_pubmed-central
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_stackexchange.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_stackexchange
dataset_name: pile_stackexchange
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_ubuntu-irc.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_ubuntu-irc
dataset_name: pile_ubuntu-irc
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_uspto.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_uspto
dataset_name: pile_uspto
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_wikipedia.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_wikipedia
dataset_name: pile_wikipedia
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/pile/pile_youtubesubtitles.yaml:
--------------------------------------------------------------------------------
include: pile_arxiv.yaml
task: pile_youtubesubtitles
dataset_name: pile_youtubesubtitles
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/polemo2/polemo2_out.yaml:
--------------------------------------------------------------------------------
include: polemo2_in.yaml
task: polemo2_out
dataset_path: allegro/klej-polemo2-out
dataset_name: klej-polemo2-out
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/qa4mre/qa4mre_2012.yaml:
--------------------------------------------------------------------------------
include: qa4mre_2011.yaml
task: qa4mre_2012
dataset_path: qa4mre
dataset_name: 2012.main.EN
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/qa4mre/qa4mre_2013.yaml:
--------------------------------------------------------------------------------
include: qa4mre_2011.yaml
task: qa4mre_2013
dataset_path: qa4mre
dataset_name: 2013.main.EN
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/squadv2/squadv2.yaml:
--------------------------------------------------------------------------------
task: squadv2
class: !function task.SQuAD2
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xcopa/default_ht.yaml:
--------------------------------------------------------------------------------
include: default_et.yaml
task: xcopa_ht
dataset_name: ht
doc_to_text: !function utils.doc_to_text_ht
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xcopa/default_id.yaml:
--------------------------------------------------------------------------------
include: default_et.yaml
task: xcopa_id
dataset_name: id
doc_to_text: !function utils.doc_to_text_id
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xcopa/default_it.yaml:
--------------------------------------------------------------------------------
include: default_et.yaml
task: xcopa_it
dataset_name: it
doc_to_text: !function utils.doc_to_text_it
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xcopa/default_qu.yaml:
--------------------------------------------------------------------------------
include: default_et.yaml
task: xcopa_qu
dataset_name: qu
doc_to_text: !function utils.doc_to_text_qu
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xcopa/default_sw.yaml:
--------------------------------------------------------------------------------
include: default_et.yaml
task: xcopa_sw
dataset_name: sw
doc_to_text: !function utils.doc_to_text_sw
--------------------------------------------------------------------------------
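The xcopa configs add one more mechanism: `doc_to_text: !function utils.doc_to_text_ht` binds the prompt formatter to a Python callable in the task's utils.py rather than a template string, which these tasks need because each language supplies its own cause/effect connectives. A sketch of what such a callable plausibly looks like, using the XCOPA field names (`premise`, `question`); the exact connective words in the repo's utils.py are an assumption here:

    def doc_to_text_ht(doc: dict) -> str:
        """Turn one XCOPA example into a prompt; referenced from the YAML
        via `!function utils.doc_to_text_ht`. Connectives are illustrative."""
        connector = {"cause": "paske", "effect": "donk"}[doc["question"]]
        return doc["premise"].strip().rstrip(".") + " " + connector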
/lm-evaluation-harness/lm_eval/tasks/xcopa/default_ta.yaml:
--------------------------------------------------------------------------------
include: default_et.yaml
task: xcopa_ta
dataset_name: ta
doc_to_text: !function utils.doc_to_text_ta
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xcopa/default_th.yaml:
--------------------------------------------------------------------------------
include: default_et.yaml
task: xcopa_th
dataset_name: th
doc_to_text: !function utils.doc_to_text_th
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xcopa/default_tr.yaml:
--------------------------------------------------------------------------------
include: default_et.yaml
task: xcopa_tr
dataset_name: tr
doc_to_text: !function utils.doc_to_text_tr
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xcopa/default_vi.yaml:
--------------------------------------------------------------------------------
include: default_et.yaml
task: xcopa_vi
dataset_name: vi
doc_to_text: !function utils.doc_to_text_vi
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xcopa/default_zh.yaml:
--------------------------------------------------------------------------------
include: default_et.yaml
task: xcopa_zh
dataset_name: zh
doc_to_text: !function utils.doc_to_text_zh
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_en.yaml:
--------------------------------------------------------------------------------
include: default_ar.yaml
task: xstorycloze_en
dataset_name: en
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_es.yaml:
--------------------------------------------------------------------------------
include: default_ar.yaml
task: xstorycloze_es
dataset_name: es
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_eu.yaml:
--------------------------------------------------------------------------------
include: default_ar.yaml
task: xstorycloze_eu
dataset_name: eu
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_hi.yaml:
--------------------------------------------------------------------------------
include: default_ar.yaml
task: xstorycloze_hi
dataset_name: hi
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_id.yaml:
--------------------------------------------------------------------------------
include: default_ar.yaml
task: xstorycloze_id
dataset_name: id
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_my.yaml:
--------------------------------------------------------------------------------
include: default_ar.yaml
task: xstorycloze_my
dataset_name: my
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_ru.yaml:
--------------------------------------------------------------------------------
include: default_ar.yaml
task: xstorycloze_ru
dataset_name: ru
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_sw.yaml:
--------------------------------------------------------------------------------
include: default_ar.yaml
task: xstorycloze_sw
dataset_name: sw
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_te.yaml:
--------------------------------------------------------------------------------
include: default_ar.yaml
task: xstorycloze_te
dataset_name: te
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xstorycloze/default_zh.yaml:
--------------------------------------------------------------------------------
include: default_ar.yaml
task: xstorycloze_zh
dataset_name: zh
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xwinograd/xwinograd_en.yaml:
--------------------------------------------------------------------------------
# Generated by utils.py
dataset_name: en
include: xwinograd_common_yaml
task: xwinograd_en
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xwinograd/xwinograd_fr.yaml:
--------------------------------------------------------------------------------
# Generated by utils.py
dataset_name: fr
include: xwinograd_common_yaml
task: xwinograd_fr
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xwinograd/xwinograd_jp.yaml:
--------------------------------------------------------------------------------
# Generated by utils.py
dataset_name: jp
include: xwinograd_common_yaml
task: xwinograd_jp
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xwinograd/xwinograd_pt.yaml:
--------------------------------------------------------------------------------
# Generated by utils.py
dataset_name: pt
include: xwinograd_common_yaml
task: xwinograd_pt
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xwinograd/xwinograd_ru.yaml:
--------------------------------------------------------------------------------
# Generated by utils.py
dataset_name: ru
include: xwinograd_common_yaml
task: xwinograd_ru
--------------------------------------------------------------------------------
/lm-evaluation-harness/lm_eval/tasks/xwinograd/xwinograd_zh.yaml:
--------------------------------------------------------------------------------
# Generated by utils.py
dataset_name: zh
include: xwinograd_common_yaml
task: xwinograd_zh
--------------------------------------------------------------------------------
/lm-evaluation-harness/requirements.txt:
--------------------------------------------------------------------------------
-e .
--------------------------------------------------------------------------------
/lm-evaluation-harness/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiwonsong-dev/SLEB/d07129af60520e751087b8abb04a268a3c7ec861/lm-evaluation-harness/scripts/__init__.py
--------------------------------------------------------------------------------
/lm-evaluation-harness/scripts/clean_training_data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiwonsong-dev/SLEB/d07129af60520e751087b8abb04a268a3c7ec861/lm-evaluation-harness/scripts/clean_training_data/__init__.py
--------------------------------------------------------------------------------
/lm-evaluation-harness/setup.py:
--------------------------------------------------------------------------------
import setuptools


# This is to make sure that the package supports editable installs
setuptools.setup()
--------------------------------------------------------------------------------
/lm-evaluation-harness/templates/new_yaml_task/blank_yaml.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiwonsong-dev/SLEB/d07129af60520e751087b8abb04a268a3c7ec861/lm-evaluation-harness/templates/new_yaml_task/blank_yaml.yaml
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiwonsong-dev/SLEB/d07129af60520e751087b8abb04a268a3c7ec861/lm-evaluation-harness/tests/__init__.py
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/anagrams1-v0-greedy_until:
--------------------------------------------------------------------------------
7c0c5246d3f751f39119a5629ac1d4b2c6fd2a315f78d6de9b2c387e24e3fef1
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/anagrams1-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"anagrams1": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"anagrams1": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/anagrams2-v0-greedy_until:
--------------------------------------------------------------------------------
6700a3c44e48abe8337238dcbe3b54cf4abafe0c204c52d921e590872fbd05e7
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/anagrams2-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"anagrams2": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"anagrams2": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/anli_r1-v0-loglikelihood:
--------------------------------------------------------------------------------
3a84baf2f170e138c6ce0bc9f06f905def35d705fa2b8781f10c87aef404c4cb
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/anli_r1-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"anli_r1": {"acc": 0.334, "acc_stderr": 0.014922019523732967}}, "versions": {"anli_r1": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/anli_r2-v0-loglikelihood:
--------------------------------------------------------------------------------
d0ea3c3e09d533982c15b4c034439896d6af4bbafb2254d305e20215534a251d
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/anli_r2-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"anli_r2": {"acc": 0.356, "acc_stderr": 0.015149042659306628}}, "versions": {"anli_r2": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/anli_r3-v0-loglikelihood:
--------------------------------------------------------------------------------
6b6e5c6a794f2fbff78b7aa24fe0c90156039334bbd1cb34f7af9fc6e6183845
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/anli_r3-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"anli_r3": {"acc": 0.31916666666666665, "acc_stderr": 0.01346230971200514}}, "versions": {"anli_r3": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arc_challenge-v0-loglikelihood:
--------------------------------------------------------------------------------
41c34c96cca8ace661911d0033d630c554b283f5a3953bcdc50720ae6b00a9c1
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-loglikelihood:
--------------------------------------------------------------------------------
8ebbbc510644ede7bf53496c381e276d5a1eec14828870e8b7e611f231e6d5f6
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arc_easy-v0-loglikelihood:
--------------------------------------------------------------------------------
ffa6e39a35a16299dcb015f17f986aaa598ad8b4840c4cebe0339a7042232741
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_1dc-v0-loglikelihood:
--------------------------------------------------------------------------------
04c3a63a6b3c579bd3775d92b3076ba9130041d5ce7cf9244d3f86e95c804387
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_1dc-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"arithmetic_1dc": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_1dc": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_2da-v0-loglikelihood:
--------------------------------------------------------------------------------
6ca1ca6ebd7cac4420d5005f7f35b0edbc921377f5e4f8874cc176e4fb6d79d4
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_2da-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"arithmetic_2da": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_2da": 0}}
--------------------------------------------------------------------------------
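The tests/testdata entries come in pairs per task: a `*-loglikelihood` (or `*-greedy_until`) file holding a single 64-hex-character (SHA-256-sized) digest that fingerprints the requests the task constructs, and a `*-res.json` holding the metrics a reference run produced, so a change to either prompt construction or scoring shows up as a fixture mismatch. A sketch of how such fixtures can be checked, assuming the digest is taken over some canonical serialization of the requests (the harness's actual test runner differs in detail):

    import hashlib
    import json

    def fingerprint(requests) -> str:
        # Assumed canonical serialization; the real fixtures fix some such scheme.
        blob = json.dumps(requests, sort_keys=True).encode("utf-8")
        return hashlib.sha256(blob).hexdigest()

    def check_fixture(requests, results, digest_path, res_path):
        with open(digest_path) as f:
            assert fingerprint(requests) == f.read().strip()
        with open(res_path) as f:
            expected = json.load(f)["results"]
        for task, metrics in expected.items():
            for name, value in metrics.items():
                assert abs(results[task][name] - value) < 1e-12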
/lm-evaluation-harness/tests/testdata/arithmetic_2dm-v0-loglikelihood:
--------------------------------------------------------------------------------
14ac5e510cdf82967d6827a9ca059906ee1db2e347be1b17f36403a157e73552
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_2dm-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"arithmetic_2dm": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_2dm": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_2ds-v0-loglikelihood:
--------------------------------------------------------------------------------
66f7ff3b40251ee38fadcbee658e309a200224356fc3efa07d0a490a2c24bfa3
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_2ds-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"arithmetic_2ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_2ds": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_3da-v0-loglikelihood:
--------------------------------------------------------------------------------
c421f9cd5a5001b80e528441da925128177a04db8526ebcdab543a90b33c9ce2
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_3da-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"arithmetic_3da": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_3da": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_3ds-v0-loglikelihood:
--------------------------------------------------------------------------------
d3d8bad8827d4530945a1d8b3c7589c0235bbed0bc89e7561a6fdac678f6ce5c
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_3ds-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"arithmetic_3ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_3ds": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_4da-v0-loglikelihood:
--------------------------------------------------------------------------------
d3557beb8b9e5704122c2fc6362b11fbe2c3f2f3cb72aed4462b208767c40e01
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_4da-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"arithmetic_4da": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_4da": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-loglikelihood:
--------------------------------------------------------------------------------
d915830b8621e66331383bb2ae4c60acebf008e2f94741092ef4c33ea5441037
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"arithmetic_4ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_4ds": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_5da-v0-loglikelihood:
--------------------------------------------------------------------------------
49edb1e735660631ea6cc309721e6c0b80b7106a613a6959514852ca48f1130e
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_5da-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"arithmetic_5da": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_5da": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_5ds-v0-loglikelihood:
--------------------------------------------------------------------------------
2888d6d098a5ef8c1e7f0d8295ba80826e2e04e431f57508dfb71d53e1cd4604
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/arithmetic_5ds-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"arithmetic_5ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_5ds": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_adjunct_island-v0-loglikelihood:
--------------------------------------------------------------------------------
976a5cac4bdb724632eebd4cb9e522203ce3da8d5525288a597c86e80469f3f2
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_adjunct_island-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"blimp_adjunct_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_adjunct_island": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_anaphor_gender_agreement-v0-loglikelihood:
--------------------------------------------------------------------------------
2d8964e56a17661502ecf3f09c0befba63915360ddf2145b0bd845816950515d
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_anaphor_number_agreement-v0-loglikelihood:
--------------------------------------------------------------------------------
0bdad31c974ba064e1f1ba931841ec2ba7461e8b0ca54ea5f79f08b6bae0bab5
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_animate_subject_passive-v0-loglikelihood:
--------------------------------------------------------------------------------
064c38fcd072b8bd12f54ea4f8e41599ed4e11dc386e93b77e1fc07967d1f960
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_animate_subject_trans-v0-loglikelihood:
--------------------------------------------------------------------------------
2a84231e7b79f517427e57e2099c88fed3d60a7efab4ef9506e263b4091d5cfa
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_causative-v0-loglikelihood:
--------------------------------------------------------------------------------
3d67ad025185dbb0808ebd7f508edcb5750c18fc3c01ad91f20fda80780c916c
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_causative-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"blimp_causative": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_causative": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_complex_NP_island-v0-loglikelihood:
--------------------------------------------------------------------------------
f46cfcc7e43050a235fd2a6b989cabbfbcce76786df74db9f0d4a9cd1caa1628
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_complex_NP_island-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"blimp_complex_NP_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_complex_NP_island": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-loglikelihood:
--------------------------------------------------------------------------------
7e1cc5b9f71abfbe56c4bdf343a1e5632785b66a986b8e904a41ed8f45a2c33e
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_coordinate_structure_constraint_object_extraction-v0-loglikelihood:
--------------------------------------------------------------------------------
23ddafdff7b1ebe331b146e23b2c21aa109fe57aa1ce8ca201a0d239fcbdd166
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_1-v0-loglikelihood:
--------------------------------------------------------------------------------
2df8cc7f17089f7e8c7d974dcb324c809d30ef059a5be22aed6b69f44230809f
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_2-v0-loglikelihood:
--------------------------------------------------------------------------------
123e2acd00fbba60aba1fbae607c79a062e512c9e79c7d8dfafff63e30111d76
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood:
--------------------------------------------------------------------------------
7fab9f02e71a224ae7931aa77f8a9a61d887a7480756adc965d4746e97fb04a5
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_irregular_2-v0-loglikelihood:
--------------------------------------------------------------------------------
ddb24ddfaebe076b3aa7107937d71bf5f4503a78283bc889e39200368603681e
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-loglikelihood:
--------------------------------------------------------------------------------
95acb74fac7d57ae2c9d208361a5f8ad36b0b19a055f02e648ed8e99505f4b43
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-loglikelihood:
--------------------------------------------------------------------------------
ad61c619aa79433d02f1aeacde2ab87291fd5d5c370032c24d41c4f0065ed1f9
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-loglikelihood:
--------------------------------------------------------------------------------
ccc64b4d5e80c081d5161aae5828212ba49d277ca8c5a4281f181744727a6a99
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-loglikelihood:
--------------------------------------------------------------------------------
007c47e5fbf88119c5180feef75e1345d448e56adcd4c7ab2d52fb8d67350d34
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_distractor_agreement_relational_noun-v0-loglikelihood:
--------------------------------------------------------------------------------
8aab641bd5933f84f46a14f5c1208a3c855cace7e67b44abcd5aff8fec96717d
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_distractor_agreement_relative_clause-v0-loglikelihood:
--------------------------------------------------------------------------------
bf78e2b53c0f3531303c668c96bd3897a0a35e960da37439e63724ecba4e371a
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_drop_argument-v0-loglikelihood:
--------------------------------------------------------------------------------
616109e63f162dcd31a632943e7ef0c9e0431afeb179e83e9b04b39007b16f5b
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_drop_argument-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"blimp_drop_argument": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_drop_argument": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_1-v0-loglikelihood:
--------------------------------------------------------------------------------
d14e4b7fcdd68991eb39b9cf3ade4b37dee9ddd39b688f861d81a327e47a969f
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_1-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"blimp_ellipsis_n_bar_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_ellipsis_n_bar_1": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-loglikelihood:
--------------------------------------------------------------------------------
0523771a217759f0b22b89807694ee7f6381ce98a584b1fd070ba96194a3273b
--------------------------------------------------------------------------------
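A sanity check on the numbers: the blimp `*-res.json` fixtures all report `acc = 0.485` with `acc_stderr = 0.0158121796418149`. Each BLiMP paradigm has 1,000 minimal pairs, and that stderr is exactly the sample standard error of a proportion, sqrt(p(1-p)/(n-1)), at p = 0.485, n = 1000; the anli fixtures match the same formula (p = 0.334, n = 1000 gives 0.014922019523732967 for anli_r1):

    import math

    def acc_stderr(p: float, n: int) -> float:
        # Sample standard error of a proportion (with the n-1 correction).
        return math.sqrt(p * (1.0 - p) / (n - 1))

    print(acc_stderr(0.485, 1000))  # ~0.0158121796418149   (blimp_* fixtures)
    print(acc_stderr(0.334, 1000))  # ~0.014922019523732967 (anli_r1 fixture)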
/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"blimp_ellipsis_n_bar_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_ellipsis_n_bar_2": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_existential_there_object_raising-v0-loglikelihood:
--------------------------------------------------------------------------------
63567712076256f373131971676c1c6d711efef73cd0e4de3cc639bc631a2413
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_1-v0-loglikelihood:
--------------------------------------------------------------------------------
d77594382e6d9af31a8b8ef00ba1ef6c29d6be6d0ddb7a9c27ef25ace654e05a
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_2-v0-loglikelihood:
--------------------------------------------------------------------------------
6e6add7baff4217f383425bef58288202018e041b24084edcaa5df8af08f820c
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_existential_there_subject_raising-v0-loglikelihood:
--------------------------------------------------------------------------------
9b324b28ae3e1b5d49ecf4b7b2a16c7bbc8ff38d000cf216fab75df633da2084
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_expletive_it_object_raising-v0-loglikelihood:
--------------------------------------------------------------------------------
ceede5b38248a62125a74a8332602b8eac5ef40864f071ad8d86e7971e07219d
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-loglikelihood:
--------------------------------------------------------------------------------
3ff73629fb4473986a0e8ae2fcb7c40e88292189ab0d8755d20836c5aa5a2f99
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"blimp_inchoative": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_inchoative": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_intransitive-v0-loglikelihood:
--------------------------------------------------------------------------------
6469ae3b0d46b008846b5fd132f2d2b26ea2858745d056df1470b89aa97a790f
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_intransitive-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"blimp_intransitive": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_intransitive": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_adjectives-v0-loglikelihood:
--------------------------------------------------------------------------------
47c56f336df11924d8b97feb46339ce55bea4b216b6fd13946cc999ea36a4a95
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_verbs-v0-loglikelihood:
--------------------------------------------------------------------------------
63ec733873f94ace71cb34112d1c3cd5bb768c26b975fb90acc9b8ba3f4e938e
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-loglikelihood:
--------------------------------------------------------------------------------
7084358b1b7dd7fb5ead1a58f4b499d6f7610eca897bfac25a986d0f9a91aa5d
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood:
--------------------------------------------------------------------------------
9534751f83a86b6cbe1fb12fb9feb827b0b7836a663108928b4ecc1d70b08871
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_left_branch_island_echo_question-v0-loglikelihood:
--------------------------------------------------------------------------------
9852b38612db8c6adf938a5d8a7a9e5ce9e655259d6cc806b142506fcaff0ed4
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_left_branch_island_simple_question-v0-loglikelihood:
--------------------------------------------------------------------------------
6cb36bbdae7754f8832f50872c3dd511ce12547e00fa0771deb747be3355eb85
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_matrix_question_npi_licensor_present-v0-loglikelihood:
--------------------------------------------------------------------------------
a3a702a3335c79b02b36caf37c68069050c2a8a3a03c3610c09afc39d2b83fb1
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_npi_present_1-v0-loglikelihood:
--------------------------------------------------------------------------------
3ef532a85e0ee8f8ff779bc7ddc873d515969a708da84a4eb4a85b7c843cf244
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_npi_present_1-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"blimp_npi_present_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_npi_present_1": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_npi_present_2-v0-loglikelihood:
--------------------------------------------------------------------------------
fdb688ac6259bb65d234ef0a36e9a9ee449f9608f633b12e1943b462aead8e17
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_npi_present_2-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"blimp_npi_present_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_npi_present_2": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_only_npi_licensor_present-v0-loglikelihood:
--------------------------------------------------------------------------------
d2d0711611b5b218c6fa8c7278494749252b7868c396451919b761303556bd66
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_only_npi_scope-v0-loglikelihood:
--------------------------------------------------------------------------------
fc0be817478c212327050fa297ef61ad214f4847dbff61d4e0fe7914c06a1691
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_only_npi_scope-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"blimp_only_npi_scope": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_only_npi_scope": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_passive_1-v0-loglikelihood:
--------------------------------------------------------------------------------
fa4addddd8e380031b8e0871776cabcb707c0f21dcaf5d8b3defec66cce55043
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_passive_1-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"blimp_passive_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_passive_1": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_passive_2-v0-loglikelihood:
--------------------------------------------------------------------------------
755bdfe2c89737c43001ff1dc83d68ad33e444aaf0669af66aaf82dcd09f2eca
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_passive_2-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"blimp_passive_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_passive_2": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_principle_A_c_command-v0-loglikelihood:
--------------------------------------------------------------------------------
7c2ed82612af9175052cd44d8e178b6dd084c04eb462a3d88fcacfad2df8be8e
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_1-v0-loglikelihood:
--------------------------------------------------------------------------------
49d2b8ce6667a6166fdc2a2e5dbe7ff07d9b8415e9f33482aef15956b3ebc24a
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_1-v0-res.json:
--------------------------------------------------------------------------------
{"results": {"blimp_principle_A_case_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_case_1": 0}}
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_2-v0-loglikelihood:
--------------------------------------------------------------------------------
cd68adb65c891d672e22bf53c054b2083ab08bc1da43951732b409c942d14bc7
--------------------------------------------------------------------------------
/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_2-v0-res.json:
    {"results": {"blimp_principle_A_case_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_case_2": 0}}
/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_1-v0-loglikelihood:
    290e7eddacea4ec16989af697f2ee3373fdd9aef4b452bf887184c6e2f6e7d9d
/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_2-v0-loglikelihood:
    eb5ddf0a97982373ab1a4e58267cfcdebdecdb86c376dfd5ebf46737c9d3ee12
/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_3-v0-loglikelihood:
    38454befedcf1f3f6ef27d3bef9ccfdfb3e94a7ab32d86a63493a920d2d50093
/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-loglikelihood:
    894efedfd8750d5b8de6157f9b2ed2b51b5290d3a78ea9b041fc62d34e96efbc
/lm-evaluation-harness/tests/testdata/blimp_regular_plural_subject_verb_agreement_1-v0-loglikelihood:
    5bc0441f31e32443cf761bca6e961d504e1e84b15aa4e1d79e5c8ed5b4c2aa3a
/lm-evaluation-harness/tests/testdata/blimp_regular_plural_subject_verb_agreement_2-v0-loglikelihood:
    f69d9891f59872538962221fccc425b07df7cfbd83cdc546ce83e6b0e9a93f7c
/lm-evaluation-harness/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-loglikelihood:
    e6666c5657215ff4bfd646b8ee3ae6df956e71c0be9ab1c287fb1b68291dd0d1
/lm-evaluation-harness/tests/testdata/blimp_sentential_negation_npi_scope-v0-loglikelihood:
    32fcbd0a1c6e664af2751bad552587b5ca3911973b07f4fb2cf0a2acd3de5349
/lm-evaluation-harness/tests/testdata/blimp_sentential_subject_island-v0-loglikelihood:
    80f5f98fad26240de2767fe58c4b18d864df41cbfa76f06c84c3fce9f14f4833
/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_1-v0-loglikelihood:
    8a01f6a5ea87a01c0c9b0c7b3bc4de4711bf0ff050976976651182b9ed34a0d4
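The identical `"acc": 0.485, "acc_stderr": 0.0158121796418149` pairs across these BLiMP res.json fixtures are internally consistent: each BLiMP paradigm contains 1,000 minimal pairs, and at p = 0.485 the sample standard error sqrt(p(1 − p)/(n − 1)) with n = 1000 reproduces the stored value. A quick check (the n = 1000 per-paradigm count is the only assumption):

```python
# Reproduce the acc_stderr stored in the BLiMP res.json fixtures above,
# assuming n = 1000 examples per BLiMP paradigm.
import math

p, n = 0.485, 1000
print(math.sqrt(p * (1 - p) / (n - 1)))  # ~= 0.0158121796418149
```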
/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_2-v0-loglikelihood:
    59c20ff0f632cf42afc74ecc682cf92e5e740417b01e6cf9a610a3bc544d2ea5
/lm-evaluation-harness/tests/testdata/blimp_tough_vs_raising_1-v0-loglikelihood:
    973fe56534fdef1207f0fc08dd09a210304c55f33c6cbb17552754bf54f11c86
/lm-evaluation-harness/tests/testdata/blimp_tough_vs_raising_1-v0-res.json:
    {"results": {"blimp_tough_vs_raising_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_tough_vs_raising_1": 0}}
/lm-evaluation-harness/tests/testdata/blimp_tough_vs_raising_2-v0-loglikelihood:
    d255a10a34f14d77d9526604a17b0f6747d32f62fc2e3a09e9ab10054535fd45
/lm-evaluation-harness/tests/testdata/blimp_tough_vs_raising_2-v0-res.json:
    {"results": {"blimp_tough_vs_raising_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_tough_vs_raising_2": 0}}
/lm-evaluation-harness/tests/testdata/blimp_transitive-v0-loglikelihood:
    d0d47fe40a7ee558ba782edbc4f49f7d9123c8472a36decc97f8ab142b45b9d8
/lm-evaluation-harness/tests/testdata/blimp_transitive-v0-res.json:
    {"results": {"blimp_transitive": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_transitive": 0}}
/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-loglikelihood:
    91a9e4b60b0f3572a7fdbd7648d0e69f36e5eb34db715315b0082558d7ed8b65
/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-res.json:
    {"results": {"blimp_wh_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_island": 0}}
/lm-evaluation-harness/tests/testdata/blimp_wh_questions_object_gap-v0-loglikelihood:
    4d4aaa0274ccd485ff8430ed61b8f83806febe18c16616c7d050f637a0463eba
/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap-v0-loglikelihood:
    d5486ffcc075cad4302e37ece9bbf5b2063c0b5a48e76c8e1dd365e22a5a48fc
/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-loglikelihood:
    37483dfda688b62ad27161c9fc1e1e7710c5a6e6a7cd3474df119bcafd30e97f
/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_no_gap-v0-loglikelihood:
    d1d3e439b2020ef5ed232bfebbcc9634adc5117e9eb61e38fdbbe2c8ea128d54
/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_no_gap-v0-res.json:
    {"results": {"blimp_wh_vs_that_no_gap": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_vs_that_no_gap": 0}}
/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-loglikelihood:
    a142cc2a6fcd93230b650927b07367cad957b8f3f42cb4072151da53dea301df
/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_with_gap-v0-loglikelihood:
    d41a9b85e4c31e445bf9b46b8642df02203ccc02b4a9b254bf76066d5c54b4b7
/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_with_gap-v0-res.json:
    {"results": {"blimp_wh_vs_that_with_gap": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_vs_that_with_gap": 0}}
/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_with_gap_long_distance-v0-loglikelihood:
    eed67491bdf493a1dad8f1d9766bc7bd0e79946365b833c0f7eb81ac998e3dca
/lm-evaluation-harness/tests/testdata/boolq-v0-loglikelihood:
    de5aa6f77a2e0fd050b9c272f10c4d5d5581e4f75ffa60926f79e60ae1738960
/lm-evaluation-harness/tests/testdata/boolq-v0-res.json:
    {"results": {"boolq": {"acc": 0.5048929663608562, "acc_stderr": 0.00874463623355505}}, "versions": {"boolq": 0}}
/lm-evaluation-harness/tests/testdata/boolq-v1-loglikelihood:
    6577e0d88572772ef08e64f624c0e3df0953286ae1f118ccef15623b59ffeabf
/lm-evaluation-harness/tests/testdata/boolq-v1-res.json:
    {"results": {"boolq": {"acc": 0.5048929663608562, "acc_stderr": 0.00874463623355505}}, "versions": {"boolq": 1}}
/lm-evaluation-harness/tests/testdata/cb-v0-loglikelihood:
    ec3b1bbb9561e39c43c6f77a23b4060b15c606141c5346e3d0791b3e92aaa5d0
/lm-evaluation-harness/tests/testdata/cb-v0-res.json:
    {"results": {"cb": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.2819143819143819}}, "versions": {"cb": 0}}
/lm-evaluation-harness/tests/testdata/cb-v1-loglikelihood:
    77b11f4348eb8a7f57faf95c531fda01ab4bf0e729f91a82451ed8e71ec8e66d
/lm-evaluation-harness/tests/testdata/cb-v1-res.json:
    {"results": {"cb": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.2819143819143819}}, "versions": {"cb": 1}}
/lm-evaluation-harness/tests/testdata/cola-v0-loglikelihood:
    e8635578ed8ee70b707a666d35e468b9321db24470f80c92080651e2bfa01751
/lm-evaluation-harness/tests/testdata/cola-v0-res.json:
    {"results": {"cola": {"mcc": -0.04538802810223175, "mcc_stderr": 0.023100371589225246}}, "versions": {"cola": 0}}
/lm-evaluation-harness/tests/testdata/copa-v0-loglikelihood:
    66276b9045b5300cba4b81340db06f674f031fa0b8883714ad0d03be464cd799
/lm-evaluation-harness/tests/testdata/copa-v0-res.json:
    {"results": {"copa": {"acc": 0.48, "acc_stderr": 0.050211673156867795}}, "versions": {"copa": 0}}
/lm-evaluation-harness/tests/testdata/coqa-v0-greedy_until:
    4a8605d5deed0423ec095700251ed93325b45d320aca35d4ce1e94702094435e
/lm-evaluation-harness/tests/testdata/coqa-v0-res.json:
    {"results": {"coqa": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"coqa": 0}}
/lm-evaluation-harness/tests/testdata/coqa-v1-greedy_until:
    57581470b921435d40da97872bb1cfda6ecf963ccc4b0240a3b04e3fea8c8e3a
/lm-evaluation-harness/tests/testdata/coqa-v1-res.json:
    {"results": {"coqa": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"coqa": 1}}
/lm-evaluation-harness/tests/testdata/crows_pairs_english-v0-loglikelihood:
    ee3ce1ddb8071d4189e5b06e7f3c618a434221ac52935d0f434c4d183f01458a
/lm-evaluation-harness/tests/testdata/crows_pairs_english_age-v0-loglikelihood:
    de74d2ac7f926f2f486c045d84aae8f71711102f9d77b31f758fd148810d13d3
/lm-evaluation-harness/tests/testdata/crows_pairs_english_autre-v0-loglikelihood:
    a197ccc8538231404a8e43f5ed0fbbfb2c317b4da337f6e7aa9642131aeb426a
/lm-evaluation-harness/tests/testdata/crows_pairs_english_disability-v0-loglikelihood:
    90c1bcfdeec0ff51d891ee8cf00ae2a5ec61bab6739faea9865809b8ffed2cdb
/lm-evaluation-harness/tests/testdata/crows_pairs_english_gender-v0-loglikelihood:
    2bf62b7cc678f64ffad4a6e6715ff76a2b984bfe8d1165da4b76b3b4dfafb2f9
/lm-evaluation-harness/tests/testdata/crows_pairs_english_nationality-v0-loglikelihood:
    b85bc849811ccfa9971a6ee3fca7342752c314c0cb6f126e10d9ec4d0450c541
/lm-evaluation-harness/tests/testdata/crows_pairs_english_physical_appearance-v0-loglikelihood:
    d1823f5038afafa7a5338e42531720480c8ccf4e177789526caf294d52d56e89
/lm-evaluation-harness/tests/testdata/crows_pairs_english_race_color-v0-loglikelihood:
    0a750596d77cd96502dc414ff699a399b1b91c2078adeec1d3dd982b3d591089
/lm-evaluation-harness/tests/testdata/crows_pairs_english_religion-v0-loglikelihood:
    2ed57377174adaf0fb30037eb055eafdd02cd46e57bc32066d5fecd90a14b6e1
/lm-evaluation-harness/tests/testdata/crows_pairs_english_sexual_orientation-v0-loglikelihood:
    e754a309296b157677dfba6e6feef983d1ce38dd0169ae726265621a7b573163
/lm-evaluation-harness/tests/testdata/crows_pairs_english_socioeconomic-v0-loglikelihood:
    c309eabfd247a702e32efc4e08211f9a72693d38995be5dd444d497b476396bd
/lm-evaluation-harness/tests/testdata/crows_pairs_french-v0-loglikelihood:
    4fb61dcf4d2c59d6470b297a01d5f429ee442864e225e1760fbf191b2a0901cd
/lm-evaluation-harness/tests/testdata/crows_pairs_french_age-v0-loglikelihood:
    b14a5769f415a234abe89063a1b546aa4a990c84217e5d4a697874cd7f85af35
/lm-evaluation-harness/tests/testdata/crows_pairs_french_autre-v0-loglikelihood:
    f145ad5086da0bf8c76f0730258529fa243efe32b7ab792d3c4716284b4b5495
/lm-evaluation-harness/tests/testdata/crows_pairs_french_disability-v0-loglikelihood:
    fa1e5fc7492a66c9a90765e605003c38408347617db5ecf36706f1d374af5d42
/lm-evaluation-harness/tests/testdata/crows_pairs_french_gender-v0-loglikelihood:
    010b8404655911c86555616da23afffce9dc3981e1acbbfdb022d9c474430209
/lm-evaluation-harness/tests/testdata/crows_pairs_french_nationality-v0-loglikelihood:
    146eb60c8796fe3f25307a6776337f0b077b58ce02edec64c99df4b906c19b9f
/lm-evaluation-harness/tests/testdata/crows_pairs_french_physical_appearance-v0-loglikelihood:
    ea61eaad64e9292790d4bbef955ffeebed7a595de098bc5ac726a6e51f27f9af
/lm-evaluation-harness/tests/testdata/crows_pairs_french_race_color-v0-loglikelihood:
    6f9119026abff33c5c882d6172e092e806a8b21bd86864022978b1961839350f
/lm-evaluation-harness/tests/testdata/crows_pairs_french_religion-v0-loglikelihood:
    8af6445eeb634dad5f0723e40615afe993e1e3f129a4f314fe4117e633c2efd3
/lm-evaluation-harness/tests/testdata/crows_pairs_french_sexual_orientation-v0-loglikelihood:
    2ce823fdb93d325aa8fb40db5d335b093b4b69792763532d940a752440ee3a76
/lm-evaluation-harness/tests/testdata/crows_pairs_french_socioeconomic-v0-loglikelihood:
    8ba0a525c65f795c99f6416e70c998e75e4b6cc43bf9a4bd7ccacd3c3591e9cb
/lm-evaluation-harness/tests/testdata/cycle_letters-v0-greedy_until:
    eb23f7d5de7528eefd8ed5f8054c402ff947319cccfef7195995946f99389201
/lm-evaluation-harness/tests/testdata/cycle_letters-v0-res.json:
    {"results": {"cycle_letters": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"cycle_letters": 0}}
/lm-evaluation-harness/tests/testdata/drop-v0-greedy_until:
    ca566c630d8ac853d5785d4b5c40a5137172c34b48af3350e1f79e6d548b36ba
/lm-evaluation-harness/tests/testdata/drop-v0-res.json:
    {"results": {"drop": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"drop": 0}}
/lm-evaluation-harness/tests/testdata/drop-v1-greedy_until:
    a670f911ab2999d72db15f534b22703d19e7837edbda4f9f199ad587f7aae6b2
/lm-evaluation-harness/tests/testdata/drop-v1-res.json:
    {"results": {"drop": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"drop": 1}}
/lm-evaluation-harness/tests/testdata/ethics_cm-v0-loglikelihood:
    92d136ebb2bd86cd036e61699ad9a1417dbb48651f0a3afa5045cf57cef5a3f6
/lm-evaluation-harness/tests/testdata/ethics_cm-v0-res.json:
    {"results": {"ethics_cm": {"acc": 0.49987129987129986, "acc_stderr": 0.008022881531793336}}, "versions": {"ethics_cm": 0}}
/lm-evaluation-harness/tests/testdata/ethics_deontology-v0-loglikelihood:
    74ecebe322457d70afc16fde848978410a09b854dc65c47f428d100bd1593248
/lm-evaluation-harness/tests/testdata/ethics_justice-v0-loglikelihood:
    d7dfc44fea507b5c5c3a8218f79ed8197da8599ebb396d85feb91c25512126b6
/lm-evaluation-harness/tests/testdata/ethics_utilitarianism-v0-loglikelihood:
    88872f1ed1b203f9649a4ced4fb4627d18c17af455d713de6e17c05eced4ec60
/lm-evaluation-harness/tests/testdata/ethics_utilitarianism_original-v0-loglikelihood:
    5b42ba1faf5ece6a6ec9a3976ce79c1fac8df5b98272aab85457188c2142693c
/lm-evaluation-harness/tests/testdata/ethics_virtue-v0-loglikelihood:
    8021db8de46850090ddae6e6ec2d382029c3027b7c69884607503f916d09b709
/lm-evaluation-harness/tests/testdata/gsm8k-v0-greedy_until:
    e7292dbdd7fd8419ba954f2e0701e04c8d0e8842fe053dbf2fe47d926630e35e
/lm-evaluation-harness/tests/testdata/gsm8k-v0-res.json:
    {"results": {"gsm8k": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"gsm8k": 0}}
/lm-evaluation-harness/tests/testdata/headqa-v0-loglikelihood:
    767ca34d9714edd9fb030ddbcc35a64e5180d1e247b0cb557fbb22fdf971ad1f
/lm-evaluation-harness/tests/testdata/headqa_en-v0-loglikelihood:
    09da45119b12a0144e3081f8fb790c2a22af7b9c3aac42f54423d348a711fbf5
/lm-evaluation-harness/tests/testdata/headqa_es-v0-loglikelihood:
    767ca34d9714edd9fb030ddbcc35a64e5180d1e247b0cb557fbb22fdf971ad1f
/lm-evaluation-harness/tests/testdata/hellaswag-v0-loglikelihood:
    abb808c97d6529eda6c11067837a132c62d25cba0394d720f80cca6df9f7196e
/lm-evaluation-harness/tests/testdata/hendrycksTest-abstract_algebra-v0-loglikelihood:
    e35d1eeb356ac1084d4e9773f028cb3c81ba1c6e5574d598ac4a78aa467cd797
/lm-evaluation-harness/tests/testdata/hendrycksTest-anatomy-v0-loglikelihood:
    bf05e04ed8cf61cf3aad294ed3f5a16137775ffdd20f1b129022ddffc1251768
/lm-evaluation-harness/tests/testdata/hendrycksTest-astronomy-v0-loglikelihood:
    bed1e47127cc2893c6aef63b9a0909cca31aa351a703da2a166b01cae03c3311
/lm-evaluation-harness/tests/testdata/hendrycksTest-business_ethics-v0-loglikelihood:
    b3b27e9dbad587377d3c8cab1072782de883e245da93a563bd8b3099017b1fc0
/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-loglikelihood:
    fbcb7ce507e0675d811e71e10a67c8d05a6605e29036f46776e04a6588cefbda
/lm-evaluation-harness/tests/testdata/hendrycksTest-college_biology-v0-loglikelihood:
    c29e4e67ff91af29b9434884874414d1b1b32ccc32903c6b1639469b19907419
/lm-evaluation-harness/tests/testdata/hendrycksTest-college_chemistry-v0-loglikelihood:
    044752b21540db95118b8cbe7e75c4c9b8758e27df56543deaeadec7f749a28d
/lm-evaluation-harness/tests/testdata/hendrycksTest-college_computer_science-v0-loglikelihood:
    4ea26ad780290429ac5a3317559c154848d662bd40532c966458ba6f2a32d0a3
/lm-evaluation-harness/tests/testdata/hendrycksTest-college_mathematics-v0-loglikelihood:
    e9fe80752686527281f834d2397875b4580581434b94799f9de6aaa450bd73ff
/lm-evaluation-harness/tests/testdata/hendrycksTest-college_medicine-v0-loglikelihood:
    dd6e0a9be1407890e9f8cd4434fb6aa4752ab3d2473837fd465ad99f60ad685e
/lm-evaluation-harness/tests/testdata/hendrycksTest-college_physics-v0-loglikelihood:
    704a7671ef981fb95594782bc446dd632e87ebdbe89436a0603b714fb5786c75
/lm-evaluation-harness/tests/testdata/hendrycksTest-computer_security-v0-loglikelihood:
    a8a1892d1906cc3e7ffd321043f0a60f3b8b69ef76e5c6ff03c6ea41dc87d0cb
/lm-evaluation-harness/tests/testdata/hendrycksTest-conceptual_physics-v0-loglikelihood:
    622f191ccfc7a597d99f39897ebe3f95a9ddce0e662fcfb411aa554b289bb355
/lm-evaluation-harness/tests/testdata/hendrycksTest-econometrics-v0-loglikelihood:
    cde76ba2c7382b4876e17136c94f52aca2774e50342ab757b2a2d18da370dcb6
/lm-evaluation-harness/tests/testdata/hendrycksTest-electrical_engineering-v0-loglikelihood:
    b9b5d8b8bb02696302ec6bc2a99bf987a5504d3bae0e529d2c8f263538c97518
/lm-evaluation-harness/tests/testdata/hendrycksTest-elementary_mathematics-v0-loglikelihood:
    6b21f5cd5606268421a667152ec989424b66905c02adbab8d4ff6bb9d21b77d1
/lm-evaluation-harness/tests/testdata/hendrycksTest-formal_logic-v0-loglikelihood:
    c0d0f0c008a5f3faf2f6f4268d87bbc09c40bb66ae08cf38eea0bf2e519c5a59
/lm-evaluation-harness/tests/testdata/hendrycksTest-global_facts-v0-loglikelihood:
    9fdc85240b8170839278b1e883ee0868611d84dce202cb8aa037c841ec76d089
/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_biology-v0-loglikelihood:
    d4dc051f37a49dc75c218741e87bc826fd44f31ee1309b55e0f33bd191c1bc78
/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-loglikelihood:
    f4f338e45415c4b5ee7f1d249155bcd910c8401bd1436760a5ec61cb6bb211b6
/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_computer_science-v0-loglikelihood:
    870d5a6300c527077aaf6baa3e750e75fa840b41657cf82549f39b768b14862d
/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_european_history-v0-loglikelihood:
    d8070e113be9d420fef5578cb69c70df4ea5118f9b18553023fd9efd5ff0b7f4
/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_geography-v0-loglikelihood:
    add45970ea3865be7c7a31f788a835949f6937ac73f699b122ca56a3431e95f8
/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-loglikelihood:
    11f40d8f48ba5cd739e21d54c3c04d3761f81df5cb7ddd77df868d24ced44b49
/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-loglikelihood:
    ce4faae2fb6628caa48f6fc74cbc848880db49e6ff51079392778a2322bcefef
/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-loglikelihood:
    ab368d16fc4648ad27940f71abd266366663f51db612f732a0b9b0eea28de9f8
/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_microeconomics-v0-loglikelihood:
    513b998585ebc1ebdefca6435b7c84fd73dc36fc80321a22503467f04efed23e
/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_physics-v0-loglikelihood:
    dae59e82d3d4d8dec82239d9620b72cc47bb6efbe2f1c2f9b9d23e849c9c5e32
/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_psychology-v0-loglikelihood:
    0e4c8d13806d3696167e40544d2d114c557c10c74bc61fcb9c51bbfced0266ef
/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_statistics-v0-loglikelihood:
    33d1d6eaaa2c3a944bf49d3f220a4efc328d7c3b3465b7cec40ae36d8984b75f
/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-loglikelihood:
    8c65c1a28330dd001d395ac11f1bb80c3b33f5935f503e74067aef6e9e1d9d9b
/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_world_history-v0-loglikelihood:
    1c8b994bd9a63ec874fc8d0e3a27077118b7adc472306b2fd6c55635a78b9d52
/lm-evaluation-harness/tests/testdata/hendrycksTest-human_aging-v0-loglikelihood:
    0880b3a78f8d7b17ffc612031427b9085367cf65dabe2a68c4b64e3171d17e88
/lm-evaluation-harness/tests/testdata/hendrycksTest-human_sexuality-v0-loglikelihood:
    4b07922fa1d549b655c21440b13d869263ce7dd9771d8147c450f11c91d26c10
/lm-evaluation-harness/tests/testdata/hendrycksTest-international_law-v0-loglikelihood:
    ea9b2cefd27959db564168f6ad1169a5eaa012fc5a5d5b8faf9e34d94e335dc1
/lm-evaluation-harness/tests/testdata/hendrycksTest-jurisprudence-v0-loglikelihood:
    cac440189f1ec778e82f4975d88b74689553ecc5116aaa7f76587a50c1a610e0
/lm-evaluation-harness/tests/testdata/hendrycksTest-logical_fallacies-v0-loglikelihood:
    2e9449dd803f9e2334dc562d9f04031fd013ed36b883b44ab500533a5dbbface
/lm-evaluation-harness/tests/testdata/hendrycksTest-machine_learning-v0-loglikelihood:
    7a7138821a66ef946e427b40344cf7f1a916a2926995a85ef731a3bee40cb7ce
/lm-evaluation-harness/tests/testdata/hendrycksTest-management-v0-loglikelihood:
    355489f4bd176ab84db5ef4c03d56ddeeeb1b0ad69827122b2d800e1cdc7e5f0
/lm-evaluation-harness/tests/testdata/hendrycksTest-marketing-v0-loglikelihood:
    b4fa0681fe54671a80509779d4338d744097a7206687f62977df7145dfa74a66
/lm-evaluation-harness/tests/testdata/hendrycksTest-medical_genetics-v0-loglikelihood:
    db6141246889a19dd3f6b9109f314d49c1a70f7a98795858804378b095c4a2fe
/lm-evaluation-harness/tests/testdata/hendrycksTest-miscellaneous-v0-loglikelihood:
    972dd88dbbaf09d14766e243cfc233425e7c01a26dbc61bdb9eeefa788822331
/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_disputes-v0-loglikelihood:
    d6ef028022c02b69d1516973e08bebaa14d8debcf2589a2bb124823178202d20
/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_scenarios-v0-loglikelihood:
    a8e1882e77728b53c8b86312254d08320d8363fb606d746a8dd145b812f62cf5
/lm-evaluation-harness/tests/testdata/hendrycksTest-nutrition-v0-loglikelihood:
    19e49d218f55ed5ec4bd1a6cd3f3388c6f620b81484e7abe8b298e5481c3044d
/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-loglikelihood:
    a419204da36c2b7a70fa8909a3a804260cc3283c7e07917534dfb76216c77f46
/lm-evaluation-harness/tests/testdata/hendrycksTest-prehistory-v0-loglikelihood:
    6983c560a562749f4f702249a3a6ae51fa495acc0643a980bf2cf52c6c5d4b95
/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_accounting-v0-loglikelihood:
    847418f7b22cd9b499e95fd73c40a2fbc40076895280cc2c560199c0c4c4f433
/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_law-v0-loglikelihood:
    c38c9d5d84eeb7a5f3c4a34d6e70d7e15847b3c38f26e4b119c982bb935e118f
/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_medicine-v0-loglikelihood:
    7a30599858398169cde61430c18efdd7fb4dcd09c34aa9baba70f0f8cf17a9f1
/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_psychology-v0-loglikelihood:
    92a5fad6e9ec700f84946faeccd399dda3569fb71837c9fb0c5c87f5ec29c43e
/lm-evaluation-harness/tests/testdata/hendrycksTest-public_relations-v0-loglikelihood:
    ab70f500cf24e876f6ae6bdc27525a1d6074fa9b6ea97770255d9fc2559b36ff
/lm-evaluation-harness/tests/testdata/hendrycksTest-security_studies-v0-loglikelihood:
    92dfffe2acf3278256486d3e1cf1edb5a739ad0a54c0f9c67695f7a411ed5f76
/lm-evaluation-harness/tests/testdata/hendrycksTest-sociology-v0-loglikelihood:
    f99a3caece11169f2a5cc951001f92027104afd25d29b2a399883bd4bf118605
/lm-evaluation-harness/tests/testdata/hendrycksTest-us_foreign_policy-v0-loglikelihood:
    a1a338d0083a21054f74d36a296d6bd8e2e457327c0fd630bebcc61ed758044d
/lm-evaluation-harness/tests/testdata/hendrycksTest-virology-v0-loglikelihood:
    0ffa491f7bad2abbb64ecd752a295729167599b3815238cab0ecf4cb08bba9b6
/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-loglikelihood:
    97a0f68ba30ea3a6ef1db1a2925c964b09ecc54455a0a930da083e52677815bd
/lm-evaluation-harness/tests/testdata/iwslt17-ar-en-v0-greedy_until:
    e94d310de91fad7ce36f4cf3305552020221482c5588f2efcefaa019893504f1
/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-greedy_until:
    b20adbcd2c6d135e28600b427113532c5df624cb3a90e8c5e48715c09a3a38fa
/lm-evaluation-harness/tests/testdata/lambada-v0-loglikelihood:
    6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20
/lm-evaluation-harness/tests/testdata/lambada-v0-res.json:
    {"results": {"lambada": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada": 0}}
/lm-evaluation-harness/tests/testdata/lambada_cloze-v0-loglikelihood:
    7655e748b63ae7e9911411d2d2a2577221d6c861ca4448509992541294d689f3
/lm-evaluation-harness/tests/testdata/lambada_mt_de-v0-loglikelihood:
    5ad125e1708499832b2cee8c3388f89f9c0277010fd96fbd3359039ce8105984
/lm-evaluation-harness/tests/testdata/lambada_mt_en-v0-loglikelihood:
    6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20
/lm-evaluation-harness/tests/testdata/lambada_mt_es-v0-loglikelihood:
    4a88f4b316c72fe0396c382d6cbb33568ac4d0ad225150d3536635c085359fc9
/lm-evaluation-harness/tests/testdata/lambada_mt_fr-v0-loglikelihood:
    5d16f4a0c51dc6d7b6df2ebeba2bbfa51e700b843779b559b3d90183d7b02a11
/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-loglikelihood:
    fd87c6c5cf4e0499c5f9f80e5bd7ee6a4f3d2991902a0cc3ec9e6eaf22d6760a
/lm-evaluation-harness/tests/testdata/lambada_openai-v0-loglikelihood:
    6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20
/lm-evaluation-harness/tests/testdata/lambada_openai-v2.0-loglikelihood:
    9ca5643bbaafed2f027eab5b68cc438e9e268f6df9a678e956e61726a985cf0b
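The res.json fixtures mix exact metrics with real-valued ones such as lambada's `ppl`, so a comparison against a fresh run should tolerate floating-point noise, and a version bump (e.g. the v0/v1 pairs above) invalidates the old fixture. A minimal sketch of such a comparison; the tolerance values and the `results_match` helper are assumptions for illustration, not the harness's actual test code:

```python
# Minimal sketch: validate a fresh results dict against a stored res.json
# fixture. The tolerances and function name are illustrative assumptions.
import json
import math

def results_match(fresh: dict, fixture_path: str, rel_tol: float = 1e-9) -> bool:
    with open(fixture_path) as f:
        expected = json.load(f)
    if fresh.get("versions") != expected.get("versions"):
        return False  # task version bumped; the old fixture no longer applies
    for task, metrics in expected["results"].items():
        for metric, value in metrics.items():
            got = fresh["results"][task][metric]
            if not math.isclose(got, value, rel_tol=rel_tol, abs_tol=1e-12):
                return False
    return True

fresh = {"results": {"lambada": {"acc": 0.0, "acc_stderr": 0.0,
                                 "ppl": 1.6479047769869253,
                                 "ppl_stderr": 0.006497321146240192}},
         "versions": {"lambada": 0}}
print(results_match(fresh, "tests/testdata/lambada-v0-res.json"))
```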
/lm-evaluation-harness/tests/testdata/lambada_openai_cloze-v0-loglikelihood:
    7655e748b63ae7e9911411d2d2a2577221d6c861ca4448509992541294d689f3
/lm-evaluation-harness/tests/testdata/lambada_openai_mt_de-v0-loglikelihood:
    5ad125e1708499832b2cee8c3388f89f9c0277010fd96fbd3359039ce8105984
/lm-evaluation-harness/tests/testdata/lambada_openai_mt_en-v0-loglikelihood:
    6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20
/lm-evaluation-harness/tests/testdata/lambada_openai_mt_es-v0-loglikelihood:
    4a88f4b316c72fe0396c382d6cbb33568ac4d0ad225150d3536635c085359fc9
/lm-evaluation-harness/tests/testdata/lambada_openai_mt_fr-v0-loglikelihood:
    5d16f4a0c51dc6d7b6df2ebeba2bbfa51e700b843779b559b3d90183d7b02a11
/lm-evaluation-harness/tests/testdata/lambada_openai_mt_it-v0-loglikelihood:
    fd87c6c5cf4e0499c5f9f80e5bd7ee6a4f3d2991902a0cc3ec9e6eaf22d6760a
/lm-evaluation-harness/tests/testdata/lambada_standard-v0-loglikelihood:
    8958d9f8d8145046b692fadd8a9cc9c8bad5617c10774280cf7c24c21d2be160
/lm-evaluation-harness/tests/testdata/lambada_standard_cloze-v0-loglikelihood:
    b604f00bc9f2a77ef41f8cfdb5a8509b3ae9266893b9e90abc665f5399ecba4e
/lm-evaluation-harness/tests/testdata/logiqa-v0-loglikelihood:
    12495c50454ba5e1ce0753bd18c09aaca516bebd27648d815e37b15229dbf198
/lm-evaluation-harness/tests/testdata/math_algebra-v0-greedy_until:
    f19182ce697a2c095d9e5b56ee6659dc38c93994b69ca75d7c3d3f5fd87572b4
/lm-evaluation-harness/tests/testdata/math_algebra-v0-res.json:
    {"results": {"math_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_algebra": 0}}
/lm-evaluation-harness/tests/testdata/math_algebra-v1-greedy_until:
    f19182ce697a2c095d9e5b56ee6659dc38c93994b69ca75d7c3d3f5fd87572b4
/lm-evaluation-harness/tests/testdata/math_algebra-v1-res.json:
    {"results": {"math_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_algebra": 1}}
/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v0-greedy_until:
    2aa9ae43ee9dbb2457525247d7b65358632c5eaa9cbfc40cf95a4f17f5d942ad
/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v0-res.json:
    {"results": {"math_counting_and_prob": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_counting_and_prob": 0}}
/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v1-greedy_until:
    2aa9ae43ee9dbb2457525247d7b65358632c5eaa9cbfc40cf95a4f17f5d942ad
/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v1-res.json:
    {"results": {"math_counting_and_prob": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_counting_and_prob": 1}}
/lm-evaluation-harness/tests/testdata/math_geometry-v0-greedy_until:
    46bc4cb219b6903397da782699a684bdbb982c0c954ff82e6beeed5c84878f42
/lm-evaluation-harness/tests/testdata/math_geometry-v0-res.json:
    {"results": {"math_geometry": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_geometry": 0}}
/lm-evaluation-harness/tests/testdata/math_geometry-v1-greedy_until:
    46bc4cb219b6903397da782699a684bdbb982c0c954ff82e6beeed5c84878f42
/lm-evaluation-harness/tests/testdata/math_geometry-v1-res.json:
    {"results": {"math_geometry": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_geometry": 1}}
/lm-evaluation-harness/tests/testdata/math_intermediate_algebra-v0-greedy_until:
    d53c699de272d517ed7ad783b4e692302be9f9f97a8d4ac7a6541e538a7cabe0
/lm-evaluation-harness/tests/testdata/math_intermediate_algebra-v0-res.json:
    {"results": {"math_intermediate_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_intermediate_algebra": 0}}
/lm-evaluation-harness/tests/testdata/math_intermediate_algebra-v1-greedy_until:
    d53c699de272d517ed7ad783b4e692302be9f9f97a8d4ac7a6541e538a7cabe0
/lm-evaluation-harness/tests/testdata/math_intermediate_algebra-v1-res.json:
    {"results": {"math_intermediate_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_intermediate_algebra": 1}}
/lm-evaluation-harness/tests/testdata/math_num_theory-v0-greedy_until:
    b920ccb507afdcf3ef6f4c04891913731e9f32ec914801791c6d9f8abf6e1897
/lm-evaluation-harness/tests/testdata/math_num_theory-v0-res.json:
    {"results": {"math_num_theory": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_num_theory": 0}}
/lm-evaluation-harness/tests/testdata/math_num_theory-v1-greedy_until:
    b920ccb507afdcf3ef6f4c04891913731e9f32ec914801791c6d9f8abf6e1897
/lm-evaluation-harness/tests/testdata/math_num_theory-v1-res.json:
    {"results": {"math_num_theory": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_num_theory": 1}}
/lm-evaluation-harness/tests/testdata/math_prealgebra-v0-greedy_until:
    752cdf343d7152e476b0273065024f6ea0e0f47ea385c6bdf9067736cb39724a
/lm-evaluation-harness/tests/testdata/math_prealgebra-v0-res.json:
    {"results": {"math_prealgebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_prealgebra": 0}}
/lm-evaluation-harness/tests/testdata/math_prealgebra-v1-greedy_until:
    752cdf343d7152e476b0273065024f6ea0e0f47ea385c6bdf9067736cb39724a
/lm-evaluation-harness/tests/testdata/math_prealgebra-v1-res.json:
    {"results": {"math_prealgebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_prealgebra": 1}}
/lm-evaluation-harness/tests/testdata/math_precalc-v0-greedy_until: -------------------------------------------------------------------------------- 1 | bc834b06fd79473ca6fe38a51b714aad0bf0478c1b0eec787eca34dbdf69cb71 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/math_precalc-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"math_precalc": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_precalc": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/math_precalc-v1-greedy_until: -------------------------------------------------------------------------------- 1 | bc834b06fd79473ca6fe38a51b714aad0bf0478c1b0eec787eca34dbdf69cb71 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/math_precalc-v1-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"math_precalc": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_precalc": 1}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/mathqa-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | a45260e49f02c7cb8886b3746db4d388890860b202dd8a9f0267e3c324e0af13 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/mc_taco-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 1811808ef05afd5f30ffc3471622a3dd7a1b681b17a2f7616695ad6b2a45943c -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/mc_taco-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"mc_taco": {"em": 0.07732732732732733, "f1": 0.41600515965511614}}, "versions": {"mc_taco": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/mnli-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 4fc7b56b8f1e37e38f4a052b227baec2df914c898c3405d3e994726ba4fba976 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/mnli-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"mnli": {"acc": 0.32868059093224655, "acc_stderr": 0.004741640290753859}}, "versions": {"mnli": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/mnli_mismatched-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 3784acf322e79f31702a7a0612030e4ba5c4fc466ad976a34ee3f3d7278c01f0 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/mnli_mismatched-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"mnli_mismatched": {"acc": 0.3360455655004068, "acc_stderr": 0.004763973908606819}}, "versions": {"mnli_mismatched": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/mrpc-v0-loglikelihood: 
-------------------------------------------------------------------------------- 1 | 9f54cbff8d6accba99cfa2c4c4b359563313941018173d7dcf9e32dc28c06583 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/multirc-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | cdb026c027437a8b4653212d0944d36fc16f49921dcb8e4bef899d15a55e9f80 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/multirc-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"multirc": {"acc": 0.07450157397691501, "acc_stderr": 0.008510441526175931}}, "versions": {"multirc": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/multirc-v1-loglikelihood: -------------------------------------------------------------------------------- 1 | 0e793bd6f637a70a04c6f2cda080188fc037961b2f909095fe63f7bdbc4a90c6 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/multirc-v1-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"multirc": {"acc": 0.046169989506820566, "acc_stderr": 0.006801377886208738}}, "versions": {"multirc": 1}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/mutual-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | f759213a28f0412510bf1a24c9cab0dae64bdee902d42a26225295445e7779db -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/mutual-v1-loglikelihood: -------------------------------------------------------------------------------- 1 | f759213a28f0412510bf1a24c9cab0dae64bdee902d42a26225295445e7779db -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/mutual_plus-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | b846bb9db109535f59a93d1ce340cf09f68bdf4fed5b8decd168784220fe07fa -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/mutual_plus-v1-loglikelihood: -------------------------------------------------------------------------------- 1 | b846bb9db109535f59a93d1ce340cf09f68bdf4fed5b8decd168784220fe07fa -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/openbookqa-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 78a49a0ca1a47373adb33463b1d092e6bc0d8f4b01bcb380ada48065037849d7 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_arxiv-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 814f9954e44368559602c00f7e85fa3971acdfd0315f508ec7df6318a79c55ec -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_arxiv-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 814f9954e44368559602c00f7e85fa3971acdfd0315f508ec7df6318a79c55ec 
-------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_bookcorpus2-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 5c17ddfebeab8c41dabadb6fc216ceda91e3fe5dc95aaf1b2c843d7f11828b03 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_bookcorpus2-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 5c17ddfebeab8c41dabadb6fc216ceda91e3fe5dc95aaf1b2c843d7f11828b03 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_books3-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 0f8f36f705b999b6d55fa72ff89a82793dd1cb568ab1f8727a6a2086a12b9410 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_books3-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 0f8f36f705b999b6d55fa72ff89a82793dd1cb568ab1f8727a6a2086a12b9410 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_dm-mathematics-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | d5b7967c0ece8b816f3921a8bd0fad23365349e935b491595e2ad1135af42da6 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_dm-mathematics-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | d5b7967c0ece8b816f3921a8bd0fad23365349e935b491595e2ad1135af42da6 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_enron-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 4baa6ccdc9e3aa9921675ab4400d5e89d7b546b844a8ea28f6461d649066418a -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_enron-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 4baa6ccdc9e3aa9921675ab4400d5e89d7b546b844a8ea28f6461d649066418a -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_europarl-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | e67d3dbccd47d308bfc5b0e66b76d0dfc5e386ebfa94e056562c2281c395543f -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_europarl-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | e67d3dbccd47d308bfc5b0e66b76d0dfc5e386ebfa94e056562c2281c395543f -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_freelaw-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | d77f3f68aadd6cbf1290c2f6737b2ed5d5c2a60e4c81a65c280f207783caabe1 -------------------------------------------------------------------------------- 
/lm-evaluation-harness/tests/testdata/pile_freelaw-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | d77f3f68aadd6cbf1290c2f6737b2ed5d5c2a60e4c81a65c280f207783caabe1 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_github-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | df384c3df3d8f53273e97127c5bb84c17e638acad7d6bc9c91f6dee96d43b639 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_github-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | df384c3df3d8f53273e97127c5bb84c17e638acad7d6bc9c91f6dee96d43b639 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_gutenberg-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 02a559f74a9105145e7d4d9c5ddea372b5b4938f5368dc8ffafc39cbe3b4c7ef -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_gutenberg-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 02a559f74a9105145e7d4d9c5ddea372b5b4938f5368dc8ffafc39cbe3b4c7ef -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_hackernews-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | ec1082ee5a5326e0d57aa4e73b634937140c1de9af95f154e8ab57b05d9b422b -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_hackernews-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | ec1082ee5a5326e0d57aa4e73b634937140c1de9af95f154e8ab57b05d9b422b -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_nih-exporter-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 520ea6e04e8a39dc0b5f63a837429a78a40e63d39d109096101feb8c5b2cf8d8 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_nih-exporter-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 520ea6e04e8a39dc0b5f63a837429a78a40e63d39d109096101feb8c5b2cf8d8 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_opensubtitles-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 0f1c23a1f4ddec0c2b1ff34de8d1505b0eb9e2868d8edbcc1b6de13d02f32036 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_opensubtitles-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 0f1c23a1f4ddec0c2b1ff34de8d1505b0eb9e2868d8edbcc1b6de13d02f32036 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_openwebtext2-v0-loglikelihood_rolling: 
-------------------------------------------------------------------------------- 1 | 5d6c19665f429ab1ccbe027da67f42bdaf219f819ab093673976eee55e015ff4 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_openwebtext2-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 5d6c19665f429ab1ccbe027da67f42bdaf219f819ab093673976eee55e015ff4 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_philpapers-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 339ba5d8c044c4a3ff9b9a8eaa24da1d6c01b72972074eb671a7da049eeb7047 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_philpapers-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 339ba5d8c044c4a3ff9b9a8eaa24da1d6c01b72972074eb671a7da049eeb7047 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_pile-cc-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 731fdef4a43949b179ba0c540148ebc2fa41583dd583ef580dd812076c66a451 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_pile-cc-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 731fdef4a43949b179ba0c540148ebc2fa41583dd583ef580dd812076c66a451 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_pubmed-abstracts-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 66436569a43163afb2caf422d32c5f329899e74c49865d4d13881fd465fd9976 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_pubmed-abstracts-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 66436569a43163afb2caf422d32c5f329899e74c49865d4d13881fd465fd9976 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_pubmed-central-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 40b39d120d99a145690444e86acc3e3e24d41e6e0538a75e26929ad84926e5e0 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_pubmed-central-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 40b39d120d99a145690444e86acc3e3e24d41e6e0538a75e26929ad84926e5e0 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_stackexchange-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | e524bfb3e21cbdaddc117403a50df598520c7bf5b2c60ad8f2372cfa564e79be -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_stackexchange-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 
e524bfb3e21cbdaddc117403a50df598520c7bf5b2c60ad8f2372cfa564e79be -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 4eb69e314f0864ec8890e2323d7e76f8a8309692c4f090e2b41bf4be681a811d -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 4eb69e314f0864ec8890e2323d7e76f8a8309692c4f090e2b41bf4be681a811d -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_uspto-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 789b2bdb31564d512b70f801316f49320a26c83ba361226bac0afb255341d477 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_uspto-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 789b2bdb31564d512b70f801316f49320a26c83ba361226bac0afb255341d477 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_wikipedia-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | ef9ec0dd408316ca6537228a6812e839f14b30608973081d41efc47c138338da -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_wikipedia-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | ef9ec0dd408316ca6537228a6812e839f14b30608973081d41efc47c138338da -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_youtubesubtitles-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 68263c52adc0086011e2220b619983935cabb1cc1f5f9f8ee1a74ab2a7457967 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pile_youtubesubtitles-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | 68263c52adc0086011e2220b619983935cabb1cc1f5f9f8ee1a74ab2a7457967 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/piqa-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 6048a3a2bb3ad1e6a3d98139618e06b4d7de766edd685bd38837596199c3f69f -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/prost-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 7c475f5b36a8b79f94c2be035441e7fd59dac021b0713b1fc72d256424c70b0b -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/pubmedqa-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 7a04a1fb1d2b19db84fd15c224015d6c0306a41195a4e71fe6abd48fb4d53b9f -------------------------------------------------------------------------------- 
/lm-evaluation-harness/tests/testdata/pubmedqa-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"pubmedqa": {"acc": 0.324, "acc_stderr": 0.01480686473373886}}, "versions": {"pubmedqa": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/qa4mre_2011-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 0d09f17c65768e797633494d2d218e4e46a26f718cab8b0bf3d156b073a8c437 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/qa4mre_2012-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 7e17261820acb365966cb9431d93aec983b14393eaeefbc96e30a11cf58bc6df -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/qa4mre_2013-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 52fc431e94c67f983e28ebc70cf45e6c14116b0ae77dc1bf22347c705a65d054 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/qnli-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 4281d4ff5cf1244358b0ea0220c67863c69fbade850696b43e8ff05138e01e12 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/qnli-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"qnli": {"acc": 0.5108914515833791, "acc_stderr": 0.00676380528502966}}, "versions": {"qnli": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/qqp-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 97b551b0fc3d239aad4929a2e8e79c986891aefd9fcd19441fea0382d507889e -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/race-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | bdfdfab7fa1c7af0c1e161785e347b1b8071a15cbf971f6f2a9ae8c8e845199f -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/race-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"race": {"acc": 0.23253588516746412, "acc_stderr": 0.013074460615265295}}, "versions": {"race": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/random_insertion-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 6c48baa6924f3635120f33062251c4b571b3d4e9fe46b14d91f54ddd1c857997 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/random_insertion-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"random_insertion": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"random_insertion": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/record-v0-loglikelihood: 
-------------------------------------------------------------------------------- 1 | a3e378fbde4e28f375cac1561bbfc7d7673c2af193628a774ad012d5192393aa -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/reversed_words-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 1d79fc4f0177f9624a487b9973f4e0e1d3f8404993b419a7b807a690ebbbb290 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/reversed_words-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"reversed_words": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"reversed_words": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/rte-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | c80ce13c8c736087f1557f8736d5d318b540ff01e4bb7f55e568890dc8b0393e -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/rte-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"rte": {"acc": 0.5379061371841155, "acc_stderr": 0.030009848912529117}}, "versions": {"rte": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/sciq-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 71cbb6e2a7ac4512c3761ea801d420eb3fac49d158c7e4deaa3ab8727bea923c -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/sciq-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"sciq": {"acc": 0.234, "acc_norm": 0.239, "acc_norm_stderr": 0.01349300044693758, "acc_stderr": 0.01339490288966001}}, "versions": {"sciq": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/squad2-v0-greedy_until: -------------------------------------------------------------------------------- 1 | b261e8885c11750ce6911bb11e8693de03d53758297c26fb14cfc1ef508862cb -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/squad2-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 287e87cc6878debcc80d9b6df4e2d0a74ed29068e0e0a80906c8441843a17cee -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/squad2-v1-greedy_until: -------------------------------------------------------------------------------- 1 | e17e3d85c1d5adaf2d6b4b752c4babc2e0b3a6e144e6de70cb3b2287e85109b8 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/squad2-v1-loglikelihood: -------------------------------------------------------------------------------- 1 | f5da6173402b274dc89130755c222c6ca6b2a3bacaaa4e4ab07be9322b7bad65 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/sst-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | d2ebe3a63517d1d481aa1513bebe124c57a0904554a1e95f566979cfe67b1a7f 
-------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/sst-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"sst": {"acc": 0.5172018348623854, "acc_stderr": 0.016931824425903734}}, "versions": {"sst": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/swag-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | be4fcbad876124c4ba3c71970538a97fec0e36a9cc677c70b6c9243a7bcee0ec -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/toxigen-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 7fedd930bafa92b9cca615a93ba92a4413244d2b77cf3f421a186815d721e0fa -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/triviaqa-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | f8ec05b306b9f6187c0f8117cae441fb85a7a2e4670f4f9a1a3b632b1978421a -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/triviaqa-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"triviaqa": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"triviaqa": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/triviaqa-v1-loglikelihood: -------------------------------------------------------------------------------- 1 | f8ec05b306b9f6187c0f8117cae441fb85a7a2e4670f4f9a1a3b632b1978421a -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/triviaqa-v1-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"triviaqa": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"triviaqa": 1}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/truthfulqa_gen-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 0d7c56e1aa71ffd8f94bde28f6e8dfdd35f7aaadffa0620bd2a27704253d6c14 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/truthfulqa_gen-v1-greedy_until: -------------------------------------------------------------------------------- 1 | 1a280973bbac2b7ac29dd64dddac474fb4749585f7de893483b4034814466c67 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/truthfulqa_mc-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 226a6783976177dc9ceda5688623ff37023242eff30ddf270b886bf7b9b32228 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/truthfulqa_mc-v1-loglikelihood: -------------------------------------------------------------------------------- 1 | 1e07020e9cf41d46ed65312eb39d2b8e6599673d4f0d6b67c0d0eba0efb493bb -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/webqs-v0-loglikelihood: 
-------------------------------------------------------------------------------- 1 | 96b218173468cc94552a0b946193bda89faba51f1bfc3e7945531f9dff8d6fe9 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/webqs-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"webqs": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"webqs": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wic-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 403a08da05e4c44d7e3dd3358382a7ba489c41d223e24cd1a9ed82ef1a2d004b -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wic-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"wic": {"acc": 0.49216300940438873, "acc_stderr": 0.01980828765781383}}, "versions": {"wic": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wikitext-v0-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | b6f83e6cf7535ee41b0057c3e2ec2cf7f2fa5a9119b305c479a83091d1142b2c -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wikitext-v1-loglikelihood_rolling: -------------------------------------------------------------------------------- 1 | b6f83e6cf7535ee41b0057c3e2ec2cf7f2fa5a9119b305c479a83091d1142b2c -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/winogrande-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 90a3eff49de9173964d46f5ed57bcf9a78a72dd1bfe0e5323b25cebb40b49ea9 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/winogrande-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"winogrande": {"acc": 0.516179952644041, "acc_stderr": 0.014045126130978606}}, "versions": {"winogrande": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt14-en-fr-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 368ae7eec0f902b5123f2d5197caa5109a23942011c53fe68d9eaeee20180e46 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt14-fr-en-v0-greedy_until: -------------------------------------------------------------------------------- 1 | c1d9f7283755fbdd7ecd6cc4278b0ac25a80ac256b7071ea5f839ccd038e5974 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt16-de-en-v0-greedy_until: -------------------------------------------------------------------------------- 1 | d30e23e38d9a45b9c31e1dfd14b58d0b7020df4b9c8a1c697aa6bc5fba8ce08a -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt16-en-de-v0-greedy_until: -------------------------------------------------------------------------------- 1 | d71e2074af3770e9b29ac561caf2e1c29ad6b0dc50ec2e7bcc5501747b11f0da 
-------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt16-en-ro-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 4be7fdda313394f19b5995b00ada1dfa3bb158ee1f020ef8d07ecea260fa60b2 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt16-ro-en-v0-greedy_until: -------------------------------------------------------------------------------- 1 | d1b7c50751b0d5d7470b7f49f2bab9d09792c91460fc92cc34f06617013d7c65 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-cs-en-v0-greedy_until: -------------------------------------------------------------------------------- 1 | bfead9efdb1b2402a414c55929c8d8f956585f938a35466931d44e81d89cfe00 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-de-en-v0-greedy_until: -------------------------------------------------------------------------------- 1 | d13b5a6915ca86ac6c6ebc50d9be0d0be3dfca600c12e896df53190d875de74d -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-de-fr-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 7f197bc281d6dbf9425900ef0dee7175021c43e355050f149f43b161c52bf0b0 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-en-cs-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 5a34e6863bf6965afd31653de50bac5fecf58db65dbaba46921504a2b7463786 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-en-de-v0-greedy_until: -------------------------------------------------------------------------------- 1 | b6e9c305766ea23ce1027309f83c6d4c2ce8948d70b63a7858586ca34050d7fb -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-en-iu-v0-greedy_until: -------------------------------------------------------------------------------- 1 | f5688199890a48f73f2cc04a2152e35190f0e0ddd40e629fa24ee39d423ea389 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-en-ja-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 7fe61f5847a51e93e97c84b39f4420978727754e4b6cf636a27851c615857530 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-en-ja-v1-greedy_until: -------------------------------------------------------------------------------- 1 | 7fe61f5847a51e93e97c84b39f4420978727754e4b6cf636a27851c615857530 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-en-km-v0-greedy_until: -------------------------------------------------------------------------------- 1 | eb5365c46f22ffec9a157991627d6e1fd1117fccffaedfc73619e93bafb5a408 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-en-pl-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 
952f02575d4936d93c4d2808d86c4bf5f1f3a0901212acee6cbc1f9cbd30d39e -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-en-ps-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 8411c2cb73114cbd0c6e0f17eab2625d486cc3a601105deb0ea1338a401df689 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-en-ru-v0-greedy_until: -------------------------------------------------------------------------------- 1 | a1613831f69c1679a54670092af40ce76617b79d7cc837984803b0fc52bb8bde -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-en-ta-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 5fc556fa90bca7f1b1396e97e392eac8080b0ad53488358799b8fc0b21a94cb1 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-en-zh-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 67f0333ddbcb07d7a9ac12919129a18fe4fea24e4826a11bbdde4fd5ed5ed83f -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-en-zh-v1-greedy_until: -------------------------------------------------------------------------------- 1 | 67f0333ddbcb07d7a9ac12919129a18fe4fea24e4826a11bbdde4fd5ed5ed83f -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-fr-de-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 8a4b65c59dcac6591d46261909ee92ebcf41c19ee7442b12842302b2d8aeb36f -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-iu-en-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 97bf664a8efa54b5366b8341f77b418106dd0cb26169d5b2d0144e4d3d2bc5c9 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-ja-en-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 1fd846f3c0104e794eb380dae7f648592092ab8bf59234c26d0a671bbbc28df1 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-km-en-v0-greedy_until: -------------------------------------------------------------------------------- 1 | fb4ec81bb89c70df7e21b43e0e882915b7b71a2a85bb8d4b59e0c7938baaa4c2 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-pl-en-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 89274499d84176b1ffe4eaec06f2c89ca807342384dc946c2e348d00116aaade -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-ps-en-v0-greedy_until: -------------------------------------------------------------------------------- 1 | c3976465e3709b4bc371175cc1494c69fe096ea4ba7d114da779d2baa0a47466 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-ru-en-v0-greedy_until: 
-------------------------------------------------------------------------------- 1 | 1477ab6542c26bd0222cc1aded174f33bf8d04d1cf6a1c0959aeca4ff3779adc -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-ta-en-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 111ea3efdc08f1cf536631b9426c3a20e482c575d009d2a8c71f59c027578eec -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wmt20-zh-en-v0-greedy_until: -------------------------------------------------------------------------------- 1 | 07dbadfd6f2b2b9462ab6187dbfaabae6e5192ab89a8e4ede9237834b9364dd1 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wnli-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 2ffd304d6096416eb29607e2e7642b1d6043163624967bcf4c4fc00fddc6c721 -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wnli-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"wnli": {"acc": 0.3380281690140845, "acc_stderr": 0.05653887739133514}}, "versions": {"wnli": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wnli-v1-loglikelihood: -------------------------------------------------------------------------------- 1 | 8a0f81661d2ab2334bbc8031fac31c0c8882f1d9271dd51599d21dfdbb726dea -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wnli-v1-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"wnli": {"acc": 0.5633802816901409, "acc_stderr": 0.0592793555841297}}, "versions": {"wnli": 1}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wsc-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 45865468eff5ca31e6a050947a6b3310d9d5ed19d0f2e578a32ecaf1c768600f -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wsc-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"wsc": {"acc": 0.5480769230769231, "acc_stderr": 0.049038186969314335}}, "versions": {"wsc": 0}} -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wsc273-v0-loglikelihood: -------------------------------------------------------------------------------- 1 | 26450d414c4581feb51a09882080e7a9b95882e7eab47b1751a4a6024b5a60ee -------------------------------------------------------------------------------- /lm-evaluation-harness/tests/testdata/wsc273-v0-res.json: -------------------------------------------------------------------------------- 1 | {"results": {"wsc273": {"acc": 0.5164835164835165, "acc_stderr": 0.0303004740355766}}, "versions": {"wsc273": 0}} -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fire 2 | tqdm 3 | torch==2.2.0 4 | transformers==4.37.2 5 | datasets==2.16.1 6 | accelerate==0.26.1 7 | sentencepiece 8 
| protobuf --------------------------------------------------------------------------------
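
Note: the tests/testdata files above are fixtures for the harness's regression tests. The extensionless files each hold a single 64-character hex digest (SHA-256 length), presumably of recorded request/output data, and the -res.json files hold expected metric values keyed by task name and task version. As a minimal illustrative sketch only (this helper is not part of the repository; its name and the tolerance are assumptions), a -res.json fixture could be checked against a freshly computed results dict like this:

    import json

    def check_res_fixture(fixture_path, new_results, tol=1e-9):
        # A fixture is shaped like:
        # {"results": {"rte": {"acc": ...}}, "versions": {"rte": 0}}
        with open(fixture_path) as f:
            expected = json.load(f)
        # Every stored metric must match the freshly computed value within tol.
        for task, metrics in expected["results"].items():
            for metric, value in metrics.items():
                assert abs(new_results[task][metric] - value) <= tol, (task, metric)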