├── .gitignore ├── Dockerfile ├── LICENSE ├── MEMO.md ├── README.md ├── bigcode-evaluation-harness ├── .github │ └── workflows │ │ └── ci.yml ├── .gitignore ├── CONTRIBUTING.md ├── Dockerfile ├── Dockerfile-multiple ├── LICENSE ├── README.md ├── bigcode_eval │ ├── __init__.py │ ├── arguments.py │ ├── base.py │ ├── custom_utils.py │ ├── evaluator.py │ ├── generation.py │ ├── tasks │ │ ├── __init__.py │ │ ├── apps.py │ │ ├── codexglue_code_to_text.py │ │ ├── codexglue_text_to_text.py │ │ ├── conala.py │ │ ├── concode.py │ │ ├── custom_metrics │ │ │ ├── __init__.py │ │ │ ├── code_eval.py │ │ │ ├── codexglue_code_to_text_bleu.py │ │ │ ├── diff_eval.py │ │ │ ├── execute.py │ │ │ ├── multiple_metrics │ │ │ │ ├── __init__.py │ │ │ │ ├── containerized_eval.py │ │ │ │ ├── eval_cpp.py │ │ │ │ ├── eval_cs.py │ │ │ │ ├── eval_dlang.py │ │ │ │ ├── eval_go.py │ │ │ │ ├── eval_java.py │ │ │ │ ├── eval_javascript.py │ │ │ │ ├── eval_julia.py │ │ │ │ ├── eval_lua.py │ │ │ │ ├── eval_php.py │ │ │ │ ├── eval_pl.py │ │ │ │ ├── eval_python.py │ │ │ │ ├── eval_r.py │ │ │ │ ├── eval_racket.py │ │ │ │ ├── eval_ruby.py │ │ │ │ ├── eval_rust.py │ │ │ │ ├── eval_scala.py │ │ │ │ ├── eval_sh.py │ │ │ │ ├── eval_swift.py │ │ │ │ ├── eval_ts.py │ │ │ │ ├── evaluation.py │ │ │ │ ├── generic_eval.py │ │ │ │ ├── libeval.py │ │ │ │ ├── safe_subprocess │ │ │ │ │ ├── .gitignore │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── evil_programs │ │ │ │ │ │ ├── block_on_inputs.py │ │ │ │ │ │ ├── close_outputs.py │ │ │ │ │ │ ├── fork_bomb.py │ │ │ │ │ │ ├── fork_once.py │ │ │ │ │ │ ├── sleep_forever.py │ │ │ │ │ │ └── unbounded_output.py │ │ │ │ │ └── module_test.py │ │ │ │ └── single_experiment_pass_k.py │ │ │ └── pal_metric │ │ │ │ ├── __init__.py │ │ │ │ ├── pal_code_exec.py │ │ │ │ └── python_executor.py │ │ ├── ds1000.py │ │ ├── few_shot_examples │ │ │ ├── codexglue_text_to_text_few_shot_prompts.json │ │ │ ├── conala_few_shot_prompts.json │ │ │ ├── concode_few_shot_prompts.json │ │ │ └── gsm8k_few_shot_prompts.json │ │ ├── gsm.py │ │ ├── humaneval.py │ │ ├── humanevalpack.py │ │ ├── humanevalpack_openai.py │ │ ├── humanevalplus.py │ │ ├── instruct_humaneval.py │ │ ├── instruct_wizard_humaneval.py │ │ ├── mbpp.py │ │ ├── mbpp_ja.py │ │ ├── mbppplus.py │ │ ├── multiple.py │ │ ├── parity.py │ │ ├── python_bugs.py │ │ ├── quixbugs.py │ │ ├── recode.py │ │ └── santacoder_fim.py │ └── utils.py ├── docs │ ├── README.md │ └── guide.md ├── finetuning │ ├── APPS │ │ ├── README.md │ │ ├── apps_dataset.py │ │ └── apps_train.py │ ├── Code-to-text │ │ ├── README.md │ │ └── train.py │ ├── CodeClone │ │ ├── README.md │ │ └── train.py │ ├── CodeComplex │ │ ├── README.md │ │ └── train.py │ ├── CodeDefect │ │ ├── README.md │ │ └── train.py │ └── README.md ├── leaderboard │ ├── README.md │ ├── group_jsons.py │ ├── multiple_eval.slurm │ └── throughput_config.yaml ├── main.py ├── makefile ├── requirements.txt ├── setup.py ├── templates │ └── new_task.py └── tests │ ├── data │ ├── humaneval_eval_gens.json │ ├── humaneval_gen_gens.json │ ├── humaneval_gen_refs.json │ ├── mbpp_eval_gens.json │ ├── mbpp_gen_gens.json │ ├── mbpp_gen_refs.json │ ├── pal-gsm8k-greedy_eval_gens.json │ └── pal-gsm8k-greedy_prompt.json │ ├── test_generation_evaluation.py │ └── test_prompts.py ├── fastchat ├── .github │ ├── PULL_REQUEST_TEMPLATE.md │ └── workflows │ │ └── python-package.yml ├── .gitignore ├── .pylintrc ├── LICENSE ├── README.md ├── assets │ ├── demo_narrow.gif │ ├── qa_browser.png │ ├── screenshot_cli.png │ ├── screenshot_gui.png │ ├── server_arch.png │ └── vicuna_logo.jpeg ├── docker │ ├── Dockerfile │ └── docker-compose.yml ├── docs │ ├── arena.md │ ├── awq.md │ ├── commands │ │ ├── conv_release.md │ │ ├── data_cleaning.md │ │ ├── leaderboard.md │ │ ├── local_cluster.md │ │ ├── pypi.md │ │ └── webserver.md │ ├── dashinfer_integration.md │ ├── dataset_release.md │ ├── exllama_v2.md │ ├── gptq.md │ ├── langchain_integration.md │ ├── lightllm_integration.md │ ├── mlx_integration.md │ ├── model_support.md │ ├── openai_api.md │ ├── server_arch.md │ ├── third_party_ui.md │ ├── training.md │ ├── vicuna_weights_version.md │ ├── vllm_integration.md │ └── xFasterTransformer.md ├── fastchat │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── data │ │ ├── __init__.py │ │ ├── clean_sharegpt.py │ │ ├── convert_alpaca.py │ │ ├── extract_gpt4_only.py │ │ ├── extract_single_round.py │ │ ├── filter_wrong_format.py │ │ ├── get_stats.py │ │ ├── hardcoded_questions.py │ │ ├── inspect_data.py │ │ ├── merge.py │ │ ├── optional_clean.py │ │ ├── optional_replace.py │ │ ├── prepare_all.py │ │ ├── pretty_json.py │ │ ├── sample.py │ │ ├── split_long_conversation.py │ │ └── split_train_test.py │ ├── llm_judge │ │ ├── README.md │ │ ├── clean_judgment.py │ │ ├── common.py │ │ ├── compute_agreement.py │ │ ├── custom_utils.py │ │ ├── data │ │ │ ├── japanese_mt_bench │ │ │ │ ├── question.jsonl │ │ │ │ └── reference_answer │ │ │ │ │ └── gpt-4o-2024-08-06.jsonl │ │ │ ├── judge_prompts.jsonl │ │ │ ├── mt_bench │ │ │ │ ├── misc │ │ │ │ │ └── radar.png │ │ │ │ ├── question.jsonl │ │ │ │ └── reference_answer │ │ │ │ │ └── gpt-4.jsonl │ │ │ └── vicuna_bench │ │ │ │ ├── question.jsonl │ │ │ │ └── reference_answer │ │ │ │ └── gpt-4.jsonl │ │ ├── download_mt_bench_pregenerated.py │ │ ├── gen_api_answer.py │ │ ├── gen_judgment.py │ │ ├── gen_model_answer.py │ │ ├── qa_browser.py │ │ └── show_result.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── apply_lora.py │ │ ├── compression.py │ │ ├── convert_fp16.py │ │ ├── llama_condense_monkey_patch.py │ │ ├── make_delta.py │ │ ├── model_adapter.py │ │ ├── model_chatglm.py │ │ ├── model_cllm.py │ │ ├── model_codet5p.py │ │ ├── model_exllama.py │ │ ├── model_falcon.py │ │ ├── model_registry.py │ │ ├── model_xfastertransformer.py │ │ ├── model_yuan2.py │ │ ├── monkey_patch_non_inplace.py │ │ ├── rwkv_model.py │ │ └── upload_hub.py │ ├── modules │ │ ├── __init__.py │ │ ├── awq.py │ │ ├── exllama.py │ │ ├── gptq.py │ │ └── xfastertransformer.py │ ├── protocol │ │ ├── api_protocol.py │ │ └── openai_api_protocol.py │ ├── serve │ │ ├── __init__.py │ │ ├── api_provider.py │ │ ├── base_model_worker.py │ │ ├── call_monitor.py │ │ ├── cli.py │ │ ├── controller.py │ │ ├── dashinfer_worker.py │ │ ├── example_images │ │ │ ├── distracted.jpg │ │ │ └── fridge.jpg │ │ ├── gateway │ │ │ ├── README.md │ │ │ └── nginx.conf │ │ ├── gradio_block_arena_anony.py │ │ ├── gradio_block_arena_named.py │ │ ├── gradio_block_arena_vision.py │ │ ├── gradio_block_arena_vision_anony.py │ │ ├── gradio_block_arena_vision_named.py │ │ ├── gradio_global_state.py │ │ ├── gradio_web_server.py │ │ ├── gradio_web_server_multi.py │ │ ├── huggingface_api.py │ │ ├── huggingface_api_worker.py │ │ ├── inference.py │ │ ├── launch_all_serve.py │ │ ├── lightllm_worker.py │ │ ├── mlx_worker.py │ │ ├── model_worker.py │ │ ├── monitor │ │ │ ├── add_markdown_info.py │ │ │ ├── basic_stats.py │ │ │ ├── classify │ │ │ │ ├── README.md │ │ │ │ ├── category.py │ │ │ │ ├── config.yaml │ │ │ │ ├── display_score.py │ │ │ │ └── label.py │ │ │ ├── clean_battle_data.py │ │ │ ├── clean_chat_data.py │ │ │ ├── code_tagger.py │ │ │ ├── criteria_labeling.py │ │ │ ├── dataset_release_scripts │ │ │ │ ├── arena_33k │ │ │ │ │ ├── count_unique_users.py │ │ │ │ │ ├── filter_bad_conv.py │ │ │ │ │ ├── merge_field.py │ │ │ │ │ ├── sample.py │ │ │ │ │ └── upload_hf_dataset.py │ │ │ │ └── lmsys_chat_1m │ │ │ │ │ ├── approve_all.py │ │ │ │ │ ├── compute_stats.py │ │ │ │ │ ├── filter_bad_conv.py │ │ │ │ │ ├── final_post_processing.py │ │ │ │ │ ├── instructions.md │ │ │ │ │ ├── merge_oai_tag.py │ │ │ │ │ ├── process_all.sh │ │ │ │ │ ├── sample.py │ │ │ │ │ └── upload_hf_dataset.py │ │ │ ├── deduplication.py │ │ │ ├── elo_analysis.py │ │ │ ├── inspect_conv.py │ │ │ ├── intersect_conv_file.py │ │ │ ├── leaderboard_csv_to_html.py │ │ │ ├── monitor.py │ │ │ ├── monitor_md.py │ │ │ ├── rating_systems.py │ │ │ ├── summarize_cluster.py │ │ │ ├── tag_openai_moderation.py │ │ │ ├── topic_clustering.py │ │ │ └── vote_time_stats │ │ │ │ ├── README.md │ │ │ │ ├── analyze_data.py │ │ │ │ └── plot.py │ │ ├── multi_model_worker.py │ │ ├── openai_api_server.py │ │ ├── register_worker.py │ │ ├── remote_logger.py │ │ ├── sglang_worker.py │ │ ├── shutdown_serve.py │ │ ├── test_message.py │ │ ├── test_throughput.py │ │ ├── vision │ │ │ ├── create_vqa_examples_dir.py │ │ │ ├── create_vqa_examples_json.py │ │ │ └── image.py │ │ └── vllm_worker.py │ ├── train │ │ ├── llama2_flash_attn_monkey_patch.py │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llama_xformers_attn_monkey_patch.py │ │ ├── train.py │ │ ├── train_baichuan.py │ │ ├── train_flant5.py │ │ ├── train_lora.py │ │ ├── train_lora_t5.py │ │ ├── train_mem.py │ │ ├── train_with_template.py │ │ ├── train_xformers.py │ │ └── train_yuan2.py │ └── utils.py ├── format.sh ├── playground │ ├── FastChat_API_GoogleColab.ipynb │ ├── __init__.py │ ├── benchmark │ │ └── benchmark_api_provider.py │ ├── deepspeed_config_s2.json │ ├── deepspeed_config_s3.json │ └── test_embedding │ │ ├── README.md │ │ ├── test_classification.py │ │ ├── test_semantic_search.py │ │ └── test_sentence_similarity.py ├── pyproject.toml ├── scripts │ ├── build-api.sh │ ├── test_readme_train.sh │ ├── train_lora.sh │ ├── train_vicuna_13b.sh │ ├── train_vicuna_7b.sh │ └── upload_pypi.sh └── tests │ ├── README.md │ ├── killall_python.sh │ ├── launch_openai_api_test_server.py │ ├── load_test.py │ ├── test_cli.py │ ├── test_cli_inputs.txt │ ├── test_image_utils.py │ ├── test_openai_api.py │ ├── test_openai_langchain.py │ └── test_openai_vision_api.py ├── llm-jp-eval ├── .github │ ├── dependabot.yml │ └── workflows │ │ ├── lint.yml │ │ ├── requirements.yml │ │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── DATASET.md ├── LICENSE ├── README.md ├── README_en.md ├── REFERENCES.md ├── configs │ ├── config_no-sample.yaml │ ├── config_template.yaml │ ├── model │ │ └── llm-jp_llm-jp-1.3b-v1.0.yaml │ └── tokenizer │ │ └── llm-jp_llm-jp-1.3b-v1.0.yaml ├── poetry.lock ├── pyproject.toml ├── requirements.txt ├── scripts │ ├── evaluate_llm.py │ ├── jmmlu_statistics.py │ └── preprocess_dataset.py ├── src │ └── llm_jp_eval │ │ ├── __init__.py │ │ ├── datasets │ │ ├── __init__.py │ │ ├── alt.py │ │ ├── base.py │ │ ├── chabsa.py │ │ ├── jamp.py │ │ ├── janli.py │ │ ├── jblimp.py │ │ ├── jcola.py │ │ ├── jcommonsenseqa.py │ │ ├── jemhopqa.py │ │ ├── jmmlu.py │ │ ├── jnli.py │ │ ├── jsem.py │ │ ├── jsick.py │ │ ├── jsquad.py │ │ ├── jsts.py │ │ ├── mawps.py │ │ ├── mmluen.py │ │ ├── niilc.py │ │ ├── wiki │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── coreference.py │ │ │ ├── dependency.py │ │ │ ├── ner.py │ │ │ ├── pas.py │ │ │ └── reading.py │ │ └── wikicorpus.py │ │ ├── evaluator.py │ │ └── utils.py └── tests │ ├── conftest.py │ ├── data │ ├── wiki00132787.knp │ ├── wiki00268469.knp │ └── wiki00280639.knp │ └── datasets │ ├── test_wiki_coreference.py │ ├── test_wiki_dependency.py │ ├── test_wiki_ner.py │ ├── test_wiki_pas.py │ └── test_wiki_reading.py ├── lm-evaluation-harness-en ├── .coveragerc ├── .flake8 ├── .github │ └── workflows │ │ ├── new_tasks.yml │ │ ├── publish.yml │ │ └── unit_tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.bib ├── CODEOWNERS ├── LICENSE.md ├── README.md ├── docs │ ├── CONTRIBUTING.md │ ├── README.md │ ├── decontamination.md │ ├── img │ │ └── fewshot_example_gpt3.png │ ├── interface.md │ ├── model_guide.md │ ├── new_task_guide.md │ └── task_guide.md ├── examples │ ├── lm-eval-overview.ipynb │ ├── visualize-wandb.ipynb │ └── visualize-zeno.ipynb ├── ignore.txt ├── lm_eval │ ├── __init__.py │ ├── __main__.py │ ├── api │ │ ├── __init__.py │ │ ├── filter.py │ │ ├── instance.py │ │ ├── metrics.py │ │ ├── model.py │ │ ├── registry.py │ │ ├── samplers.py │ │ └── task.py │ ├── caching │ │ └── cache.py │ ├── decontamination │ │ ├── __init__.py │ │ ├── archiver.py │ │ ├── decontaminate.py │ │ └── janitor.py │ ├── evaluator.py │ ├── evaluator_utils.py │ ├── filters │ │ ├── __init__.py │ │ ├── decontamination.py │ │ ├── extraction.py │ │ ├── selection.py │ │ └── transformation.py │ ├── logging_utils.py │ ├── models │ │ ├── __init__.py │ │ ├── anthropic_llms.py │ │ ├── dummy.py │ │ ├── gguf.py │ │ ├── huggingface.py │ │ ├── mamba_lm.py │ │ ├── neuron_optimum.py │ │ ├── openai_completions.py │ │ ├── optimum_lm.py │ │ ├── textsynth.py │ │ ├── utils.py │ │ └── vllm_causallms.py │ ├── prompts │ │ └── __init__.py │ ├── tasks │ │ ├── __init__.py │ │ ├── aexams │ │ │ ├── README.md │ │ │ ├── _default_template_yaml │ │ │ ├── aexams_Biology.yaml │ │ │ ├── aexams_IslamicStudies.yaml │ │ │ ├── aexams_Physics.yaml │ │ │ ├── aexams_Science.yaml │ │ │ └── aexams_Social.yaml │ │ ├── agieval │ │ │ ├── README.md │ │ │ ├── aqua-rat.yaml │ │ │ ├── gaokao-biology.yaml │ │ │ ├── gaokao-chemistry.yaml │ │ │ ├── gaokao-chinese.yaml │ │ │ ├── gaokao-english.yaml │ │ │ ├── gaokao-geography.yaml │ │ │ ├── gaokao-history.yaml │ │ │ ├── gaokao-mathcloze.yaml │ │ │ ├── gaokao-mathqa.yaml │ │ │ ├── gaokao-physics.yaml │ │ │ ├── jec-qa-ca.yaml │ │ │ ├── jec-qa-kd.yaml │ │ │ ├── logiqa-en.yaml │ │ │ ├── logiqa-zh.yaml │ │ │ ├── lsat-ar.yaml │ │ │ ├── lsat-lr.yaml │ │ │ ├── lsat-rc.yaml │ │ │ ├── math.yaml │ │ │ ├── sat-en-without-passage.yaml │ │ │ ├── sat-en.yaml │ │ │ ├── sat-math.yaml │ │ │ └── utils.py │ │ ├── ammlu │ │ │ ├── README.md │ │ │ ├── _default_template_yaml │ │ │ ├── _generate_configs.py │ │ │ ├── ammlu_abstract_algebra.yaml │ │ │ ├── ammlu_anatomy.yaml │ │ │ ├── ammlu_astronomy.yaml │ │ │ ├── ammlu_business_ethics.yaml │ │ │ ├── ammlu_clinical_knowledge.yaml │ │ │ ├── ammlu_college_biology.yaml │ │ │ ├── ammlu_college_chemistry.yaml │ │ │ ├── ammlu_college_computer_science.yaml │ │ │ ├── ammlu_college_mathematics.yaml │ │ │ ├── ammlu_college_medicine.yaml │ │ │ ├── ammlu_college_physics.yaml │ │ │ ├── ammlu_computer_security.yaml │ │ │ ├── ammlu_conceptual_physics.yaml │ │ │ ├── ammlu_econometrics.yaml │ │ │ ├── ammlu_electrical_engineering.yaml │ │ │ ├── ammlu_elementary_mathematics.yaml │ │ │ ├── ammlu_formal_logic.yaml │ │ │ ├── ammlu_global_facts.yaml │ │ │ ├── ammlu_high_school_biology.yaml │ │ │ ├── ammlu_high_school_chemistry.yaml │ │ │ ├── ammlu_high_school_computer_science.yaml │ │ │ ├── ammlu_high_school_european_history.yaml │ │ │ ├── ammlu_high_school_geography.yaml │ │ │ ├── ammlu_high_school_government_and_politics.yaml │ │ │ ├── ammlu_high_school_macroeconomics.yaml │ │ │ ├── ammlu_high_school_mathematics.yaml │ │ │ ├── ammlu_high_school_microeconomics.yaml │ │ │ ├── ammlu_high_school_physics.yaml │ │ │ ├── ammlu_high_school_psychology.yaml │ │ │ ├── ammlu_high_school_statistics.yaml │ │ │ ├── ammlu_high_school_us_history.yaml │ │ │ ├── ammlu_high_school_world_history.yaml │ │ │ ├── ammlu_human_aging.yaml │ │ │ ├── ammlu_human_sexuality.yaml │ │ │ ├── ammlu_international_law.yaml │ │ │ ├── ammlu_jurisprudence.yaml │ │ │ ├── ammlu_logical_fallacies.yaml │ │ │ ├── ammlu_machine_learning.yaml │ │ │ ├── ammlu_management.yaml │ │ │ ├── ammlu_marketing.yaml │ │ │ ├── ammlu_medical_genetics.yaml │ │ │ ├── ammlu_miscellaneous.yaml │ │ │ ├── ammlu_moral_disputes.yaml │ │ │ ├── ammlu_moral_scenarios.yaml │ │ │ ├── ammlu_nutrition.yaml │ │ │ ├── ammlu_philosophy.yaml │ │ │ ├── ammlu_prehistory.yaml │ │ │ ├── ammlu_professional_accounting.yaml │ │ │ ├── ammlu_professional_law.yaml │ │ │ ├── ammlu_professional_medicine.yaml │ │ │ ├── ammlu_professional_psychology.yaml │ │ │ ├── ammlu_public_relations.yaml │ │ │ ├── ammlu_security_studies.yaml │ │ │ ├── ammlu_sociology.yaml │ │ │ ├── ammlu_us_foreign_policy.yaml │ │ │ ├── ammlu_virology.yaml │ │ │ └── ammlu_world_religions.yaml │ │ ├── anli │ │ │ ├── README.md │ │ │ ├── anli_r1.yaml │ │ │ ├── anli_r2.yaml │ │ │ └── anli_r3.yaml │ │ ├── arc │ │ │ ├── README.md │ │ │ ├── arc_challenge.yaml │ │ │ └── arc_easy.yaml │ │ ├── arithmetic │ │ │ ├── README.md │ │ │ ├── arithmetic_1dc.yaml │ │ │ ├── arithmetic_2da.yaml │ │ │ ├── arithmetic_2dm.yaml │ │ │ ├── arithmetic_2ds.yaml │ │ │ ├── arithmetic_3da.yaml │ │ │ ├── arithmetic_3ds.yaml │ │ │ ├── arithmetic_4da.yaml │ │ │ ├── arithmetic_4ds.yaml │ │ │ ├── arithmetic_5da.yaml │ │ │ └── arithmetic_5ds.yaml │ │ ├── asdiv │ │ │ ├── README.md │ │ │ └── default.yaml │ │ ├── babi │ │ │ ├── README.md │ │ │ └── babi.yaml │ │ ├── bbh │ │ │ ├── README.md │ │ │ ├── _generate_configs.py │ │ │ ├── cot_fewshot │ │ │ │ ├── _cot_fewshot_template_yaml │ │ │ │ ├── boolean_expressions.yaml │ │ │ │ ├── causal_judgement.yaml │ │ │ │ ├── date_understanding.yaml │ │ │ │ ├── disambiguation_qa.yaml │ │ │ │ ├── dyck_languages.yaml │ │ │ │ ├── formal_fallacies.yaml │ │ │ │ ├── geometric_shapes.yaml │ │ │ │ ├── hyperbaton.yaml │ │ │ │ ├── logical_deduction_five_objects.yaml │ │ │ │ ├── logical_deduction_seven_objects.yaml │ │ │ │ ├── logical_deduction_three_objects.yaml │ │ │ │ ├── movie_recommendation.yaml │ │ │ │ ├── multistep_arithmetic_two.yaml │ │ │ │ ├── navigate.yaml │ │ │ │ ├── object_counting.yaml │ │ │ │ ├── penguins_in_a_table.yaml │ │ │ │ ├── reasoning_about_colored_objects.yaml │ │ │ │ ├── ruin_names.yaml │ │ │ │ ├── salient_translation_error_detection.yaml │ │ │ │ ├── snarks.yaml │ │ │ │ ├── sports_understanding.yaml │ │ │ │ ├── temporal_sequences.yaml │ │ │ │ ├── tracking_shuffled_objects_five_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_seven_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_three_objects.yaml │ │ │ │ ├── web_of_lies.yaml │ │ │ │ └── word_sorting.yaml │ │ │ ├── cot_zeroshot │ │ │ │ ├── _cot_zeroshot_template_yaml │ │ │ │ ├── boolean_expressions.yaml │ │ │ │ ├── causal_judgement.yaml │ │ │ │ ├── date_understanding.yaml │ │ │ │ ├── disambiguation_qa.yaml │ │ │ │ ├── dyck_languages.yaml │ │ │ │ ├── formal_fallacies.yaml │ │ │ │ ├── geometric_shapes.yaml │ │ │ │ ├── hyperbaton.yaml │ │ │ │ ├── logical_deduction_five_objects.yaml │ │ │ │ ├── logical_deduction_seven_objects.yaml │ │ │ │ ├── logical_deduction_three_objects.yaml │ │ │ │ ├── movie_recommendation.yaml │ │ │ │ ├── multistep_arithmetic_two.yaml │ │ │ │ ├── navigate.yaml │ │ │ │ ├── object_counting.yaml │ │ │ │ ├── penguins_in_a_table.yaml │ │ │ │ ├── reasoning_about_colored_objects.yaml │ │ │ │ ├── ruin_names.yaml │ │ │ │ ├── salient_translation_error_detection.yaml │ │ │ │ ├── snarks.yaml │ │ │ │ ├── sports_understanding.yaml │ │ │ │ ├── temporal_sequences.yaml │ │ │ │ ├── tracking_shuffled_objects_five_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_seven_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_three_objects.yaml │ │ │ │ ├── utils.py │ │ │ │ ├── web_of_lies.yaml │ │ │ │ └── word_sorting.yaml │ │ │ ├── fewshot │ │ │ │ ├── _fewshot_template_yaml │ │ │ │ ├── boolean_expressions.yaml │ │ │ │ ├── causal_judgement.yaml │ │ │ │ ├── date_understanding.yaml │ │ │ │ ├── disambiguation_qa.yaml │ │ │ │ ├── dyck_languages.yaml │ │ │ │ ├── formal_fallacies.yaml │ │ │ │ ├── geometric_shapes.yaml │ │ │ │ ├── hyperbaton.yaml │ │ │ │ ├── logical_deduction_five_objects.yaml │ │ │ │ ├── logical_deduction_seven_objects.yaml │ │ │ │ ├── logical_deduction_three_objects.yaml │ │ │ │ ├── movie_recommendation.yaml │ │ │ │ ├── multistep_arithmetic_two.yaml │ │ │ │ ├── navigate.yaml │ │ │ │ ├── object_counting.yaml │ │ │ │ ├── penguins_in_a_table.yaml │ │ │ │ ├── reasoning_about_colored_objects.yaml │ │ │ │ ├── ruin_names.yaml │ │ │ │ ├── salient_translation_error_detection.yaml │ │ │ │ ├── snarks.yaml │ │ │ │ ├── sports_understanding.yaml │ │ │ │ ├── temporal_sequences.yaml │ │ │ │ ├── tracking_shuffled_objects_five_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_seven_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_three_objects.yaml │ │ │ │ ├── web_of_lies.yaml │ │ │ │ └── word_sorting.yaml │ │ │ └── zeroshot │ │ │ │ ├── _zeroshot_template_yaml │ │ │ │ ├── boolean_expressions.yaml │ │ │ │ ├── causal_judgement.yaml │ │ │ │ ├── date_understanding.yaml │ │ │ │ ├── disambiguation_qa.yaml │ │ │ │ ├── dyck_languages.yaml │ │ │ │ ├── formal_fallacies.yaml │ │ │ │ ├── geometric_shapes.yaml │ │ │ │ ├── hyperbaton.yaml │ │ │ │ ├── logical_deduction_five_objects.yaml │ │ │ │ ├── logical_deduction_seven_objects.yaml │ │ │ │ ├── logical_deduction_three_objects.yaml │ │ │ │ ├── movie_recommendation.yaml │ │ │ │ ├── multistep_arithmetic_two.yaml │ │ │ │ ├── navigate.yaml │ │ │ │ ├── object_counting.yaml │ │ │ │ ├── penguins_in_a_table.yaml │ │ │ │ ├── reasoning_about_colored_objects.yaml │ │ │ │ ├── ruin_names.yaml │ │ │ │ ├── salient_translation_error_detection.yaml │ │ │ │ ├── snarks.yaml │ │ │ │ ├── sports_understanding.yaml │ │ │ │ ├── temporal_sequences.yaml │ │ │ │ ├── tracking_shuffled_objects_five_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_seven_objects.yaml │ │ │ │ ├── tracking_shuffled_objects_three_objects.yaml │ │ │ │ ├── utils.py │ │ │ │ ├── web_of_lies.yaml │ │ │ │ └── word_sorting.yaml │ │ ├── belebele │ │ │ ├── README.md │ │ │ ├── _default_template_yaml │ │ │ ├── _generate_configs.py │ │ │ ├── belebele_acm_Arab.yaml │ │ │ ├── belebele_afr_Latn.yaml │ │ │ ├── belebele_als_Latn.yaml │ │ │ ├── belebele_amh_Ethi.yaml │ │ │ ├── belebele_apc_Arab.yaml │ │ │ ├── belebele_arb_Arab.yaml │ │ │ ├── belebele_arb_Latn.yaml │ │ │ ├── belebele_ars_Arab.yaml │ │ │ ├── belebele_ary_Arab.yaml │ │ │ ├── belebele_arz_Arab.yaml │ │ │ ├── belebele_asm_Beng.yaml │ │ │ ├── belebele_azj_Latn.yaml │ │ │ ├── belebele_bam_Latn.yaml │ │ │ ├── belebele_ben_Beng.yaml │ │ │ ├── belebele_ben_Latn.yaml │ │ │ ├── belebele_bod_Tibt.yaml │ │ │ ├── belebele_bul_Cyrl.yaml │ │ │ ├── belebele_cat_Latn.yaml │ │ │ ├── belebele_ceb_Latn.yaml │ │ │ ├── belebele_ces_Latn.yaml │ │ │ ├── belebele_ckb_Arab.yaml │ │ │ ├── belebele_dan_Latn.yaml │ │ │ ├── belebele_deu_Latn.yaml │ │ │ ├── belebele_ell_Grek.yaml │ │ │ ├── belebele_eng_Latn.yaml │ │ │ ├── belebele_est_Latn.yaml │ │ │ ├── belebele_eus_Latn.yaml │ │ │ ├── belebele_fin_Latn.yaml │ │ │ ├── belebele_fra_Latn.yaml │ │ │ ├── belebele_fuv_Latn.yaml │ │ │ ├── belebele_gaz_Latn.yaml │ │ │ ├── belebele_grn_Latn.yaml │ │ │ ├── belebele_guj_Gujr.yaml │ │ │ ├── belebele_hat_Latn.yaml │ │ │ ├── belebele_hau_Latn.yaml │ │ │ ├── belebele_heb_Hebr.yaml │ │ │ ├── belebele_hin_Deva.yaml │ │ │ ├── belebele_hin_Latn.yaml │ │ │ ├── belebele_hrv_Latn.yaml │ │ │ ├── belebele_hun_Latn.yaml │ │ │ ├── belebele_hye_Armn.yaml │ │ │ ├── belebele_ibo_Latn.yaml │ │ │ ├── belebele_ilo_Latn.yaml │ │ │ ├── belebele_ind_Latn.yaml │ │ │ ├── belebele_isl_Latn.yaml │ │ │ ├── belebele_ita_Latn.yaml │ │ │ ├── belebele_jav_Latn.yaml │ │ │ ├── belebele_jpn_Jpan.yaml │ │ │ ├── belebele_kac_Latn.yaml │ │ │ ├── belebele_kan_Knda.yaml │ │ │ ├── belebele_kat_Geor.yaml │ │ │ ├── belebele_kaz_Cyrl.yaml │ │ │ ├── belebele_kea_Latn.yaml │ │ │ ├── belebele_khk_Cyrl.yaml │ │ │ ├── belebele_khm_Khmr.yaml │ │ │ ├── belebele_kin_Latn.yaml │ │ │ ├── belebele_kir_Cyrl.yaml │ │ │ ├── belebele_kor_Hang.yaml │ │ │ ├── belebele_lao_Laoo.yaml │ │ │ ├── belebele_lin_Latn.yaml │ │ │ ├── belebele_lit_Latn.yaml │ │ │ ├── belebele_lug_Latn.yaml │ │ │ ├── belebele_luo_Latn.yaml │ │ │ ├── belebele_lvs_Latn.yaml │ │ │ ├── belebele_mal_Mlym.yaml │ │ │ ├── belebele_mar_Deva.yaml │ │ │ ├── belebele_mkd_Cyrl.yaml │ │ │ ├── belebele_mlt_Latn.yaml │ │ │ ├── belebele_mri_Latn.yaml │ │ │ ├── belebele_mya_Mymr.yaml │ │ │ ├── belebele_nld_Latn.yaml │ │ │ ├── belebele_nob_Latn.yaml │ │ │ ├── belebele_npi_Deva.yaml │ │ │ ├── belebele_npi_Latn.yaml │ │ │ ├── belebele_nso_Latn.yaml │ │ │ ├── belebele_nya_Latn.yaml │ │ │ ├── belebele_ory_Orya.yaml │ │ │ ├── belebele_pan_Guru.yaml │ │ │ ├── belebele_pbt_Arab.yaml │ │ │ ├── belebele_pes_Arab.yaml │ │ │ ├── belebele_plt_Latn.yaml │ │ │ ├── belebele_pol_Latn.yaml │ │ │ ├── belebele_por_Latn.yaml │ │ │ ├── belebele_ron_Latn.yaml │ │ │ ├── belebele_rus_Cyrl.yaml │ │ │ ├── belebele_shn_Mymr.yaml │ │ │ ├── belebele_sin_Latn.yaml │ │ │ ├── belebele_sin_Sinh.yaml │ │ │ ├── belebele_slk_Latn.yaml │ │ │ ├── belebele_slv_Latn.yaml │ │ │ ├── belebele_sna_Latn.yaml │ │ │ ├── belebele_snd_Arab.yaml │ │ │ ├── belebele_som_Latn.yaml │ │ │ ├── belebele_sot_Latn.yaml │ │ │ ├── belebele_spa_Latn.yaml │ │ │ ├── belebele_srp_Cyrl.yaml │ │ │ ├── belebele_ssw_Latn.yaml │ │ │ ├── belebele_sun_Latn.yaml │ │ │ ├── belebele_swe_Latn.yaml │ │ │ ├── belebele_swh_Latn.yaml │ │ │ ├── belebele_tam_Taml.yaml │ │ │ ├── belebele_tel_Telu.yaml │ │ │ ├── belebele_tgk_Cyrl.yaml │ │ │ ├── belebele_tgl_Latn.yaml │ │ │ ├── belebele_tha_Thai.yaml │ │ │ ├── belebele_tir_Ethi.yaml │ │ │ ├── belebele_tsn_Latn.yaml │ │ │ ├── belebele_tso_Latn.yaml │ │ │ ├── belebele_tur_Latn.yaml │ │ │ ├── belebele_ukr_Cyrl.yaml │ │ │ ├── belebele_urd_Arab.yaml │ │ │ ├── belebele_urd_Latn.yaml │ │ │ ├── belebele_uzn_Latn.yaml │ │ │ ├── belebele_vie_Latn.yaml │ │ │ ├── belebele_war_Latn.yaml │ │ │ ├── belebele_wol_Latn.yaml │ │ │ ├── belebele_xho_Latn.yaml │ │ │ ├── belebele_yor_Latn.yaml │ │ │ ├── belebele_zho_Hans.yaml │ │ │ ├── belebele_zho_Hant.yaml │ │ │ ├── belebele_zsm_Latn.yaml │ │ │ └── belebele_zul_Latn.yaml │ │ ├── benchmarks │ │ │ ├── flan │ │ │ │ ├── _held_in_template_yaml │ │ │ │ ├── flan_held_in.yaml │ │ │ │ └── flan_held_out.yaml │ │ │ ├── minerva_math.yaml │ │ │ ├── multimedqa │ │ │ │ ├── README.md │ │ │ │ └── multimedqa.yaml │ │ │ ├── openllm.yaml │ │ │ ├── pythia.yaml │ │ │ └── t0_eval.yaml │ │ ├── bigbench │ │ │ ├── README.md │ │ │ ├── generate_tasks.py │ │ │ ├── generate_until │ │ │ │ ├── abstract_narrative_understanding.yaml │ │ │ │ ├── anachronisms.yaml │ │ │ │ ├── analogical_similarity.yaml │ │ │ │ ├── analytic_entailment.yaml │ │ │ │ ├── arithmetic.yaml │ │ │ │ ├── ascii_word_recognition.yaml │ │ │ │ ├── authorship_verification.yaml │ │ │ │ ├── auto_categorization.yaml │ │ │ │ ├── auto_debugging.yaml │ │ │ │ ├── bbq_lite_json.yaml │ │ │ │ ├── bridging_anaphora_resolution_barqa.yaml │ │ │ │ ├── causal_judgment.yaml │ │ │ │ ├── cause_and_effect.yaml │ │ │ │ ├── checkmate_in_one.yaml │ │ │ │ ├── chess_state_tracking.yaml │ │ │ │ ├── chinese_remainder_theorem.yaml │ │ │ │ ├── cifar10_classification.yaml │ │ │ │ ├── code_line_description.yaml │ │ │ │ ├── codenames.yaml │ │ │ │ ├── color.yaml │ │ │ │ ├── common_morpheme.yaml │ │ │ │ ├── conceptual_combinations.yaml │ │ │ │ ├── conlang_translation.yaml │ │ │ │ ├── contextual_parametric_knowledge_conflicts.yaml │ │ │ │ ├── crash_blossom.yaml │ │ │ │ ├── crass_ai.yaml │ │ │ │ ├── cryobiology_spanish.yaml │ │ │ │ ├── cryptonite.yaml │ │ │ │ ├── cs_algorithms.yaml │ │ │ │ ├── dark_humor_detection.yaml │ │ │ │ ├── date_understanding.yaml │ │ │ │ ├── disambiguation_qa.yaml │ │ │ │ ├── discourse_marker_prediction.yaml │ │ │ │ ├── disfl_qa.yaml │ │ │ │ ├── dyck_languages.yaml │ │ │ │ ├── elementary_math_qa.yaml │ │ │ │ ├── emoji_movie.yaml │ │ │ │ ├── emojis_emotion_prediction.yaml │ │ │ │ ├── empirical_judgments.yaml │ │ │ │ ├── english_proverbs.yaml │ │ │ │ ├── english_russian_proverbs.yaml │ │ │ │ ├── entailed_polarity.yaml │ │ │ │ ├── entailed_polarity_hindi.yaml │ │ │ │ ├── epistemic_reasoning.yaml │ │ │ │ ├── evaluating_information_essentiality.yaml │ │ │ │ ├── fact_checker.yaml │ │ │ │ ├── fantasy_reasoning.yaml │ │ │ │ ├── few_shot_nlg.yaml │ │ │ │ ├── figure_of_speech_detection.yaml │ │ │ │ ├── formal_fallacies_syllogisms_negation.yaml │ │ │ │ ├── gem.yaml │ │ │ │ ├── gender_inclusive_sentences_german.yaml │ │ │ │ ├── general_knowledge.yaml │ │ │ │ ├── geometric_shapes.yaml │ │ │ │ ├── goal_step_wikihow.yaml │ │ │ │ ├── gre_reading_comprehension.yaml │ │ │ │ ├── hhh_alignment.yaml │ │ │ │ ├── hindi_question_answering.yaml │ │ │ │ ├── hindu_knowledge.yaml │ │ │ │ ├── hinglish_toxicity.yaml │ │ │ │ ├── human_organs_senses.yaml │ │ │ │ ├── hyperbaton.yaml │ │ │ │ ├── identify_math_theorems.yaml │ │ │ │ ├── identify_odd_metaphor.yaml │ │ │ │ ├── implicatures.yaml │ │ │ │ ├── implicit_relations.yaml │ │ │ │ ├── intent_recognition.yaml │ │ │ │ ├── international_phonetic_alphabet_nli.yaml │ │ │ │ ├── international_phonetic_alphabet_transliterate.yaml │ │ │ │ ├── intersect_geometry.yaml │ │ │ │ ├── irony_identification.yaml │ │ │ │ ├── kanji_ascii.yaml │ │ │ │ ├── kannada.yaml │ │ │ │ ├── key_value_maps.yaml │ │ │ │ ├── known_unknowns.yaml │ │ │ │ ├── language_games.yaml │ │ │ │ ├── language_identification.yaml │ │ │ │ ├── linguistic_mappings.yaml │ │ │ │ ├── linguistics_puzzles.yaml │ │ │ │ ├── list_functions.yaml │ │ │ │ ├── logic_grid_puzzle.yaml │ │ │ │ ├── logical_args.yaml │ │ │ │ ├── logical_deduction.yaml │ │ │ │ ├── logical_fallacy_detection.yaml │ │ │ │ ├── logical_sequence.yaml │ │ │ │ ├── mathematical_induction.yaml │ │ │ │ ├── matrixshapes.yaml │ │ │ │ ├── metaphor_boolean.yaml │ │ │ │ ├── metaphor_understanding.yaml │ │ │ │ ├── minute_mysteries_qa.yaml │ │ │ │ ├── misconceptions.yaml │ │ │ │ ├── misconceptions_russian.yaml │ │ │ │ ├── mnist_ascii.yaml │ │ │ │ ├── modified_arithmetic.yaml │ │ │ │ ├── moral_permissibility.yaml │ │ │ │ ├── movie_dialog_same_or_different.yaml │ │ │ │ ├── movie_recommendation.yaml │ │ │ │ ├── mult_data_wrangling.yaml │ │ │ │ ├── multiemo.yaml │ │ │ │ ├── natural_instructions.yaml │ │ │ │ ├── navigate.yaml │ │ │ │ ├── nonsense_words_grammar.yaml │ │ │ │ ├── novel_concepts.yaml │ │ │ │ ├── object_counting.yaml │ │ │ │ ├── odd_one_out.yaml │ │ │ │ ├── operators.yaml │ │ │ │ ├── paragraph_segmentation.yaml │ │ │ │ ├── parsinlu_qa.yaml │ │ │ │ ├── parsinlu_reading_comprehension.yaml │ │ │ │ ├── penguins_in_a_table.yaml │ │ │ │ ├── periodic_elements.yaml │ │ │ │ ├── persian_idioms.yaml │ │ │ │ ├── phrase_relatedness.yaml │ │ │ │ ├── physical_intuition.yaml │ │ │ │ ├── physics.yaml │ │ │ │ ├── physics_questions.yaml │ │ │ │ ├── play_dialog_same_or_different.yaml │ │ │ │ ├── polish_sequence_labeling.yaml │ │ │ │ ├── presuppositions_as_nli.yaml │ │ │ │ ├── qa_wikidata.yaml │ │ │ │ ├── question_selection.yaml │ │ │ │ ├── real_or_fake_text.yaml │ │ │ │ ├── reasoning_about_colored_objects.yaml │ │ │ │ ├── repeat_copy_logic.yaml │ │ │ │ ├── rephrase.yaml │ │ │ │ ├── riddle_sense.yaml │ │ │ │ ├── ruin_names.yaml │ │ │ │ ├── salient_translation_error_detection.yaml │ │ │ │ ├── scientific_press_release.yaml │ │ │ │ ├── semantic_parsing_in_context_sparc.yaml │ │ │ │ ├── semantic_parsing_spider.yaml │ │ │ │ ├── sentence_ambiguity.yaml │ │ │ │ ├── similarities_abstraction.yaml │ │ │ │ ├── simp_turing_concept.yaml │ │ │ │ ├── simple_arithmetic_json.yaml │ │ │ │ ├── simple_arithmetic_json_multiple_choice.yaml │ │ │ │ ├── simple_arithmetic_json_subtasks.yaml │ │ │ │ ├── simple_arithmetic_multiple_targets_json.yaml │ │ │ │ ├── simple_ethical_questions.yaml │ │ │ │ ├── simple_text_editing.yaml │ │ │ │ ├── snarks.yaml │ │ │ │ ├── social_iqa.yaml │ │ │ │ ├── social_support.yaml │ │ │ │ ├── sports_understanding.yaml │ │ │ │ ├── strange_stories.yaml │ │ │ │ ├── strategyqa.yaml │ │ │ │ ├── sufficient_information.yaml │ │ │ │ ├── suicide_risk.yaml │ │ │ │ ├── swahili_english_proverbs.yaml │ │ │ │ ├── swedish_to_german_proverbs.yaml │ │ │ │ ├── symbol_interpretation.yaml │ │ │ │ ├── temporal_sequences.yaml │ │ │ │ ├── tense.yaml │ │ │ │ ├── timedial.yaml │ │ │ │ ├── topical_chat.yaml │ │ │ │ ├── tracking_shuffled_objects.yaml │ │ │ │ ├── understanding_fables.yaml │ │ │ │ ├── undo_permutation.yaml │ │ │ │ ├── unit_conversion.yaml │ │ │ │ ├── unit_interpretation.yaml │ │ │ │ ├── unnatural_in_context_learning.yaml │ │ │ │ ├── vitaminc_fact_verification.yaml │ │ │ │ ├── what_is_the_tao.yaml │ │ │ │ ├── which_wiki_edit.yaml │ │ │ │ ├── winowhy.yaml │ │ │ │ ├── word_sorting.yaml │ │ │ │ └── word_unscrambling.yaml │ │ │ ├── generate_until_template_yaml │ │ │ ├── multiple_choice │ │ │ │ ├── abstract_narrative_understanding.yaml │ │ │ │ ├── anachronisms.yaml │ │ │ │ ├── analogical_similarity.yaml │ │ │ │ ├── analytic_entailment.yaml │ │ │ │ ├── arithmetic.yaml │ │ │ │ ├── ascii_word_recognition.yaml │ │ │ │ ├── authorship_verification.yaml │ │ │ │ ├── auto_categorization.yaml │ │ │ │ ├── auto_debugging.yaml │ │ │ │ ├── bbq_lite_json.yaml │ │ │ │ ├── bridging_anaphora_resolution_barqa.yaml │ │ │ │ ├── causal_judgement.yaml │ │ │ │ ├── causal_judgment.yaml │ │ │ │ ├── cause_and_effect.yaml │ │ │ │ ├── checkmate_in_one.yaml │ │ │ │ ├── chess_state_tracking.yaml │ │ │ │ ├── chinese_remainder_theorem.yaml │ │ │ │ ├── cifar10_classification.yaml │ │ │ │ ├── code_line_description.yaml │ │ │ │ ├── codenames.yaml │ │ │ │ ├── color.yaml │ │ │ │ ├── common_morpheme.yaml │ │ │ │ ├── conceptual_combinations.yaml │ │ │ │ ├── conlang_translation.yaml │ │ │ │ ├── contextual_parametric_knowledge_conflicts.yaml │ │ │ │ ├── crash_blossom.yaml │ │ │ │ ├── crass_ai.yaml │ │ │ │ ├── cryobiology_spanish.yaml │ │ │ │ ├── cryptonite.yaml │ │ │ │ ├── cs_algorithms.yaml │ │ │ │ ├── dark_humor_detection.yaml │ │ │ │ ├── date_understanding.yaml │ │ │ │ ├── disambiguation_qa.yaml │ │ │ │ ├── discourse_marker_prediction.yaml │ │ │ │ ├── disfl_qa.yaml │ │ │ │ ├── dyck_languages.yaml │ │ │ │ ├── elementary_math_qa.yaml │ │ │ │ ├── emoji_movie.yaml │ │ │ │ ├── emojis_emotion_prediction.yaml │ │ │ │ ├── empirical_judgments.yaml │ │ │ │ ├── english_proverbs.yaml │ │ │ │ ├── english_russian_proverbs.yaml │ │ │ │ ├── entailed_polarity.yaml │ │ │ │ ├── entailed_polarity_hindi.yaml │ │ │ │ ├── epistemic_reasoning.yaml │ │ │ │ ├── evaluating_information_essentiality.yaml │ │ │ │ ├── fact_checker.yaml │ │ │ │ ├── fantasy_reasoning.yaml │ │ │ │ ├── few_shot_nlg.yaml │ │ │ │ ├── figure_of_speech_detection.yaml │ │ │ │ ├── formal_fallacies_syllogisms_negation.yaml │ │ │ │ ├── gem.yaml │ │ │ │ ├── gender_inclusive_sentences_german.yaml │ │ │ │ ├── general_knowledge.yaml │ │ │ │ ├── geometric_shapes.yaml │ │ │ │ ├── goal_step_wikihow.yaml │ │ │ │ ├── gre_reading_comprehension.yaml │ │ │ │ ├── hhh_alignment.yaml │ │ │ │ ├── hindi_question_answering.yaml │ │ │ │ ├── hindu_knowledge.yaml │ │ │ │ ├── hinglish_toxicity.yaml │ │ │ │ ├── human_organs_senses.yaml │ │ │ │ ├── hyperbaton.yaml │ │ │ │ ├── identify_math_theorems.yaml │ │ │ │ ├── identify_odd_metaphor.yaml │ │ │ │ ├── implicatures.yaml │ │ │ │ ├── implicit_relations.yaml │ │ │ │ ├── intent_recognition.yaml │ │ │ │ ├── international_phonetic_alphabet_nli.yaml │ │ │ │ ├── international_phonetic_alphabet_transliterate.yaml │ │ │ │ ├── intersect_geometry.yaml │ │ │ │ ├── irony_identification.yaml │ │ │ │ ├── kanji_ascii.yaml │ │ │ │ ├── kannada.yaml │ │ │ │ ├── key_value_maps.yaml │ │ │ │ ├── known_unknowns.yaml │ │ │ │ ├── language_games.yaml │ │ │ │ ├── language_identification.yaml │ │ │ │ ├── linguistic_mappings.yaml │ │ │ │ ├── linguistics_puzzles.yaml │ │ │ │ ├── list_functions.yaml │ │ │ │ ├── logic_grid_puzzle.yaml │ │ │ │ ├── logical_args.yaml │ │ │ │ ├── logical_deduction.yaml │ │ │ │ ├── logical_fallacy_detection.yaml │ │ │ │ ├── logical_sequence.yaml │ │ │ │ ├── mathematical_induction.yaml │ │ │ │ ├── matrixshapes.yaml │ │ │ │ ├── metaphor_boolean.yaml │ │ │ │ ├── metaphor_understanding.yaml │ │ │ │ ├── minute_mysteries_qa.yaml │ │ │ │ ├── misconceptions.yaml │ │ │ │ ├── misconceptions_russian.yaml │ │ │ │ ├── mnist_ascii.yaml │ │ │ │ ├── modified_arithmetic.yaml │ │ │ │ ├── moral_permissibility.yaml │ │ │ │ ├── movie_dialog_same_or_different.yaml │ │ │ │ ├── movie_recommendation.yaml │ │ │ │ ├── mult_data_wrangling.yaml │ │ │ │ ├── multiemo.yaml │ │ │ │ ├── natural_instructions.yaml │ │ │ │ ├── navigate.yaml │ │ │ │ ├── nonsense_words_grammar.yaml │ │ │ │ ├── novel_concepts.yaml │ │ │ │ ├── object_counting.yaml │ │ │ │ ├── odd_one_out.yaml │ │ │ │ ├── operators.yaml │ │ │ │ ├── paragraph_segmentation.yaml │ │ │ │ ├── parsinlu_qa.yaml │ │ │ │ ├── parsinlu_reading_comprehension.yaml │ │ │ │ ├── penguins_in_a_table.yaml │ │ │ │ ├── periodic_elements.yaml │ │ │ │ ├── persian_idioms.yaml │ │ │ │ ├── phrase_relatedness.yaml │ │ │ │ ├── physical_intuition.yaml │ │ │ │ ├── physics.yaml │ │ │ │ ├── physics_questions.yaml │ │ │ │ ├── play_dialog_same_or_different.yaml │ │ │ │ ├── polish_sequence_labeling.yaml │ │ │ │ ├── presuppositions_as_nli.yaml │ │ │ │ ├── qa_wikidata.yaml │ │ │ │ ├── question_selection.yaml │ │ │ │ ├── real_or_fake_text.yaml │ │ │ │ ├── reasoning_about_colored_objects.yaml │ │ │ │ ├── repeat_copy_logic.yaml │ │ │ │ ├── rephrase.yaml │ │ │ │ ├── riddle_sense.yaml │ │ │ │ ├── ruin_names.yaml │ │ │ │ ├── salient_translation_error_detection.yaml │ │ │ │ ├── scientific_press_release.yaml │ │ │ │ ├── semantic_parsing_in_context_sparc.yaml │ │ │ │ ├── semantic_parsing_spider.yaml │ │ │ │ ├── sentence_ambiguity.yaml │ │ │ │ ├── similarities_abstraction.yaml │ │ │ │ ├── simp_turing_concept.yaml │ │ │ │ ├── simple_arithmetic_json.yaml │ │ │ │ ├── simple_arithmetic_json_multiple_choice.yaml │ │ │ │ ├── simple_arithmetic_json_subtasks.yaml │ │ │ │ ├── simple_arithmetic_multiple_targets_json.yaml │ │ │ │ ├── simple_ethical_questions.yaml │ │ │ │ ├── simple_text_editing.yaml │ │ │ │ ├── snarks.yaml │ │ │ │ ├── social_iqa.yaml │ │ │ │ ├── social_support.yaml │ │ │ │ ├── sports_understanding.yaml │ │ │ │ ├── strange_stories.yaml │ │ │ │ ├── strategyqa.yaml │ │ │ │ ├── sufficient_information.yaml │ │ │ │ ├── suicide_risk.yaml │ │ │ │ ├── swahili_english_proverbs.yaml │ │ │ │ ├── swedish_to_german_proverbs.yaml │ │ │ │ ├── symbol_interpretation.yaml │ │ │ │ ├── temporal_sequences.yaml │ │ │ │ ├── tense.yaml │ │ │ │ ├── timedial.yaml │ │ │ │ ├── topical_chat.yaml │ │ │ │ ├── tracking_shuffled_objects.yaml │ │ │ │ ├── understanding_fables.yaml │ │ │ │ ├── undo_permutation.yaml │ │ │ │ ├── unit_conversion.yaml │ │ │ │ ├── unit_interpretation.yaml │ │ │ │ ├── unnatural_in_context_learning.yaml │ │ │ │ ├── vitaminc_fact_verification.yaml │ │ │ │ ├── what_is_the_tao.yaml │ │ │ │ ├── which_wiki_edit.yaml │ │ │ │ ├── winowhy.yaml │ │ │ │ ├── word_sorting.yaml │ │ │ │ └── word_unscrambling.yaml │ │ │ ├── multiple_choice_template_yaml │ │ │ └── push_bigbench_dataset.py │ │ ├── blimp │ │ │ ├── README.md │ │ │ ├── _template_yaml │ │ │ ├── adjunct_island.yaml │ │ │ ├── anaphor_gender_agreement.yaml │ │ │ ├── anaphor_number_agreement.yaml │ │ │ ├── animate_subject_passive.yaml │ │ │ ├── animate_subject_trans.yaml │ │ │ ├── causative.yaml │ │ │ ├── complex_NP_island.yaml │ │ │ ├── coordinate_structure_constraint_complex_left_branch.yaml │ │ │ ├── coordinate_structure_constraint_object_extraction.yaml │ │ │ ├── determiner_noun_agreement_1.yaml │ │ │ ├── determiner_noun_agreement_2.yaml │ │ │ ├── determiner_noun_agreement_irregular_1.yaml │ │ │ ├── determiner_noun_agreement_irregular_2.yaml │ │ │ ├── determiner_noun_agreement_with_adj_2.yaml │ │ │ ├── determiner_noun_agreement_with_adj_irregular_1.yaml │ │ │ ├── determiner_noun_agreement_with_adj_irregular_2.yaml │ │ │ ├── determiner_noun_agreement_with_adjective_1.yaml │ │ │ ├── distractor_agreement_relational_noun.yaml │ │ │ ├── distractor_agreement_relative_clause.yaml │ │ │ ├── drop_argument.yaml │ │ │ ├── ellipsis_n_bar_1.yaml │ │ │ ├── ellipsis_n_bar_2.yaml │ │ │ ├── existential_there_object_raising.yaml │ │ │ ├── existential_there_quantifiers_1.yaml │ │ │ ├── existential_there_quantifiers_2.yaml │ │ │ ├── existential_there_subject_raising.yaml │ │ │ ├── expletive_it_object_raising.yaml │ │ │ ├── generate_configs.py │ │ │ ├── inchoative.yaml │ │ │ ├── intransitive.yaml │ │ │ ├── irregular_past_participle_adjectives.yaml │ │ │ ├── irregular_past_participle_verbs.yaml │ │ │ ├── irregular_plural_subject_verb_agreement_1.yaml │ │ │ ├── irregular_plural_subject_verb_agreement_2.yaml │ │ │ ├── left_branch_island_echo_question.yaml │ │ │ ├── left_branch_island_simple_question.yaml │ │ │ ├── matrix_question_npi_licensor_present.yaml │ │ │ ├── npi_present_1.yaml │ │ │ ├── npi_present_2.yaml │ │ │ ├── only_npi_licensor_present.yaml │ │ │ ├── only_npi_scope.yaml │ │ │ ├── passive_1.yaml │ │ │ ├── passive_2.yaml │ │ │ ├── principle_A_c_command.yaml │ │ │ ├── principle_A_case_1.yaml │ │ │ ├── principle_A_case_2.yaml │ │ │ ├── principle_A_domain_1.yaml │ │ │ ├── principle_A_domain_2.yaml │ │ │ ├── principle_A_domain_3.yaml │ │ │ ├── principle_A_reconstruction.yaml │ │ │ ├── regular_plural_subject_verb_agreement_1.yaml │ │ │ ├── regular_plural_subject_verb_agreement_2.yaml │ │ │ ├── sentential_negation_npi_licensor_present.yaml │ │ │ ├── sentential_negation_npi_scope.yaml │ │ │ ├── sentential_subject_island.yaml │ │ │ ├── superlative_quantifiers_1.yaml │ │ │ ├── superlative_quantifiers_2.yaml │ │ │ ├── tough_vs_raising_1.yaml │ │ │ ├── tough_vs_raising_2.yaml │ │ │ ├── transitive.yaml │ │ │ ├── wh_island.yaml │ │ │ ├── wh_questions_object_gap.yaml │ │ │ ├── wh_questions_subject_gap.yaml │ │ │ ├── wh_questions_subject_gap_long_distance.yaml │ │ │ ├── wh_vs_that_no_gap.yaml │ │ │ ├── wh_vs_that_no_gap_long_distance.yaml │ │ │ ├── wh_vs_that_with_gap.yaml │ │ │ └── wh_vs_that_with_gap_long_distance.yaml │ │ ├── ceval │ │ │ ├── README.md │ │ │ ├── _default_ceval_yaml │ │ │ ├── _generate_configs.py │ │ │ ├── ceval-valid_accountant.yaml │ │ │ ├── ceval-valid_advanced_mathematics.yaml │ │ │ ├── ceval-valid_art_studies.yaml │ │ │ ├── ceval-valid_basic_medicine.yaml │ │ │ ├── ceval-valid_business_administration.yaml │ │ │ ├── ceval-valid_chinese_language_and_literature.yaml │ │ │ ├── ceval-valid_civil_servant.yaml │ │ │ ├── ceval-valid_clinical_medicine.yaml │ │ │ ├── ceval-valid_college_chemistry.yaml │ │ │ ├── ceval-valid_college_economics.yaml │ │ │ ├── ceval-valid_college_physics.yaml │ │ │ ├── ceval-valid_college_programming.yaml │ │ │ ├── ceval-valid_computer_architecture.yaml │ │ │ ├── ceval-valid_computer_network.yaml │ │ │ ├── ceval-valid_discrete_mathematics.yaml │ │ │ ├── ceval-valid_education_science.yaml │ │ │ ├── ceval-valid_electrical_engineer.yaml │ │ │ ├── ceval-valid_environmental_impact_assessment_engineer.yaml │ │ │ ├── ceval-valid_fire_engineer.yaml │ │ │ ├── ceval-valid_high_school_biology.yaml │ │ │ ├── ceval-valid_high_school_chemistry.yaml │ │ │ ├── ceval-valid_high_school_chinese.yaml │ │ │ ├── ceval-valid_high_school_geography.yaml │ │ │ ├── ceval-valid_high_school_history.yaml │ │ │ ├── ceval-valid_high_school_mathematics.yaml │ │ │ ├── ceval-valid_high_school_physics.yaml │ │ │ ├── ceval-valid_high_school_politics.yaml │ │ │ ├── ceval-valid_ideological_and_moral_cultivation.yaml │ │ │ ├── ceval-valid_law.yaml │ │ │ ├── ceval-valid_legal_professional.yaml │ │ │ ├── ceval-valid_logic.yaml │ │ │ ├── ceval-valid_mao_zedong_thought.yaml │ │ │ ├── ceval-valid_marxism.yaml │ │ │ ├── ceval-valid_metrology_engineer.yaml │ │ │ ├── ceval-valid_middle_school_biology.yaml │ │ │ ├── ceval-valid_middle_school_chemistry.yaml │ │ │ ├── ceval-valid_middle_school_geography.yaml │ │ │ ├── ceval-valid_middle_school_history.yaml │ │ │ ├── ceval-valid_middle_school_mathematics.yaml │ │ │ ├── ceval-valid_middle_school_physics.yaml │ │ │ ├── ceval-valid_middle_school_politics.yaml │ │ │ ├── ceval-valid_modern_chinese_history.yaml │ │ │ ├── ceval-valid_operating_system.yaml │ │ │ ├── ceval-valid_physician.yaml │ │ │ ├── ceval-valid_plant_protection.yaml │ │ │ ├── ceval-valid_probability_and_statistics.yaml │ │ │ ├── ceval-valid_professional_tour_guide.yaml │ │ │ ├── ceval-valid_sports_science.yaml │ │ │ ├── ceval-valid_tax_accountant.yaml │ │ │ ├── ceval-valid_teacher_qualification.yaml │ │ │ ├── ceval-valid_urban_and_rural_planner.yaml │ │ │ └── ceval-valid_veterinary_medicine.yaml │ │ ├── cmmlu │ │ │ ├── README.md │ │ │ ├── _default_template_yaml │ │ │ ├── _generate_configs.py │ │ │ ├── cmmlu_default_agronomy.yaml │ │ │ ├── cmmlu_default_anatomy.yaml │ │ │ ├── cmmlu_default_ancient_chinese.yaml │ │ │ ├── cmmlu_default_arts.yaml │ │ │ ├── cmmlu_default_astronomy.yaml │ │ │ ├── cmmlu_default_business_ethics.yaml │ │ │ ├── cmmlu_default_chinese_civil_service_exam.yaml │ │ │ ├── cmmlu_default_chinese_driving_rule.yaml │ │ │ ├── cmmlu_default_chinese_food_culture.yaml │ │ │ ├── cmmlu_default_chinese_foreign_policy.yaml │ │ │ ├── cmmlu_default_chinese_history.yaml │ │ │ ├── cmmlu_default_chinese_literature.yaml │ │ │ ├── cmmlu_default_chinese_teacher_qualification.yaml │ │ │ ├── cmmlu_default_clinical_knowledge.yaml │ │ │ ├── cmmlu_default_college_actuarial_science.yaml │ │ │ ├── cmmlu_default_college_education.yaml │ │ │ ├── cmmlu_default_college_engineering_hydrology.yaml │ │ │ ├── cmmlu_default_college_law.yaml │ │ │ ├── cmmlu_default_college_mathematics.yaml │ │ │ ├── cmmlu_default_college_medical_statistics.yaml │ │ │ ├── cmmlu_default_college_medicine.yaml │ │ │ ├── cmmlu_default_computer_science.yaml │ │ │ ├── cmmlu_default_computer_security.yaml │ │ │ ├── cmmlu_default_conceptual_physics.yaml │ │ │ ├── cmmlu_default_construction_project_management.yaml │ │ │ ├── cmmlu_default_economics.yaml │ │ │ ├── cmmlu_default_education.yaml │ │ │ ├── cmmlu_default_electrical_engineering.yaml │ │ │ ├── cmmlu_default_elementary_chinese.yaml │ │ │ ├── cmmlu_default_elementary_commonsense.yaml │ │ │ ├── cmmlu_default_elementary_information_and_technology.yaml │ │ │ ├── cmmlu_default_elementary_mathematics.yaml │ │ │ ├── cmmlu_default_ethnology.yaml │ │ │ ├── cmmlu_default_food_science.yaml │ │ │ ├── cmmlu_default_genetics.yaml │ │ │ ├── cmmlu_default_global_facts.yaml │ │ │ ├── cmmlu_default_high_school_biology.yaml │ │ │ ├── cmmlu_default_high_school_chemistry.yaml │ │ │ ├── cmmlu_default_high_school_geography.yaml │ │ │ ├── cmmlu_default_high_school_mathematics.yaml │ │ │ ├── cmmlu_default_high_school_physics.yaml │ │ │ ├── cmmlu_default_high_school_politics.yaml │ │ │ ├── cmmlu_default_human_sexuality.yaml │ │ │ ├── cmmlu_default_international_law.yaml │ │ │ ├── cmmlu_default_journalism.yaml │ │ │ ├── cmmlu_default_jurisprudence.yaml │ │ │ ├── cmmlu_default_legal_and_moral_basis.yaml │ │ │ ├── cmmlu_default_logical.yaml │ │ │ ├── cmmlu_default_machine_learning.yaml │ │ │ ├── cmmlu_default_management.yaml │ │ │ ├── cmmlu_default_marketing.yaml │ │ │ ├── cmmlu_default_marxist_theory.yaml │ │ │ ├── cmmlu_default_modern_chinese.yaml │ │ │ ├── cmmlu_default_nutrition.yaml │ │ │ ├── cmmlu_default_philosophy.yaml │ │ │ ├── cmmlu_default_professional_accounting.yaml │ │ │ ├── cmmlu_default_professional_law.yaml │ │ │ ├── cmmlu_default_professional_medicine.yaml │ │ │ ├── cmmlu_default_professional_psychology.yaml │ │ │ ├── cmmlu_default_public_relations.yaml │ │ │ ├── cmmlu_default_security_study.yaml │ │ │ ├── cmmlu_default_sociology.yaml │ │ │ ├── cmmlu_default_sports_science.yaml │ │ │ ├── cmmlu_default_traditional_chinese_medicine.yaml │ │ │ ├── cmmlu_default_virology.yaml │ │ │ ├── cmmlu_default_world_history.yaml │ │ │ └── cmmlu_default_world_religions.yaml │ │ ├── code_x_glue │ │ │ └── code-text │ │ │ │ ├── bleu.py │ │ │ │ ├── go.yaml │ │ │ │ ├── java.yaml │ │ │ │ ├── javascript.yaml │ │ │ │ ├── php.yaml │ │ │ │ ├── python.yaml │ │ │ │ ├── ruby.yaml │ │ │ │ └── utils.py │ │ ├── coqa │ │ │ ├── README.md │ │ │ ├── default.yaml │ │ │ └── utils.py │ │ ├── crows_pairs │ │ │ ├── README.md │ │ │ ├── crows_pairs_english.yaml │ │ │ ├── crows_pairs_english_age.yaml │ │ │ ├── crows_pairs_english_autre.yaml │ │ │ ├── crows_pairs_english_disability.yaml │ │ │ ├── crows_pairs_english_gender.yaml │ │ │ ├── crows_pairs_english_nationality.yaml │ │ │ ├── crows_pairs_english_physical_appearance.yaml │ │ │ ├── crows_pairs_english_race_color.yaml │ │ │ ├── crows_pairs_english_religion.yaml │ │ │ ├── crows_pairs_english_sexual_orientation.yaml │ │ │ ├── crows_pairs_english_socioeconomic.yaml │ │ │ ├── crows_pairs_french.yaml │ │ │ ├── crows_pairs_french_age.yaml │ │ │ ├── crows_pairs_french_autre.yaml │ │ │ ├── crows_pairs_french_disability.yaml │ │ │ ├── crows_pairs_french_gender.yaml │ │ │ ├── crows_pairs_french_nationality.yaml │ │ │ ├── crows_pairs_french_physical_appearance.yaml │ │ │ ├── crows_pairs_french_race_color.yaml │ │ │ ├── crows_pairs_french_religion.yaml │ │ │ ├── crows_pairs_french_sexual_orientation.yaml │ │ │ ├── crows_pairs_french_socioeconomic.yaml │ │ │ └── utils.py │ │ ├── csatqa │ │ │ ├── _default_csatqa_yaml │ │ │ ├── _generate_configs.py │ │ │ ├── csatqa_gr.yaml │ │ │ ├── csatqa_li.yaml │ │ │ ├── csatqa_rch.yaml │ │ │ ├── csatqa_rcs.yaml │ │ │ ├── csatqa_rcss.yaml │ │ │ ├── csatqa_wr.yaml │ │ │ └── utils.py │ │ ├── drop │ │ │ ├── README.md │ │ │ ├── default.yaml │ │ │ └── utils.py │ │ ├── eq_bench │ │ │ ├── README.md │ │ │ ├── default.yaml │ │ │ └── utils.py │ │ ├── fld │ │ │ ├── README.md │ │ │ ├── fld_default.yaml │ │ │ └── fld_star.yaml │ │ ├── french_bench │ │ │ ├── README.md │ │ │ ├── _default_template_yaml │ │ │ ├── french_bench_arc_challenge.yaml │ │ │ ├── french_bench_boolqa.yaml │ │ │ ├── french_bench_fquadv2.yaml │ │ │ ├── french_bench_fquadv2_bool.yaml │ │ │ ├── french_bench_fquadv2_genq.yaml │ │ │ ├── french_bench_fquadv2_hasAns.yaml │ │ │ ├── french_bench_grammar.yaml │ │ │ ├── french_bench_hellaswag.yaml │ │ │ ├── french_bench_multifquad.yaml │ │ │ ├── french_bench_opus_perplexity.yaml │ │ │ ├── french_bench_orangesum_abstract.yaml │ │ │ ├── french_bench_orangesum_title.yaml │ │ │ ├── french_bench_reading_comp.yaml │ │ │ ├── french_bench_topic_based_nli.yaml │ │ │ ├── french_bench_trivia.yaml │ │ │ ├── french_bench_vocab.yaml │ │ │ ├── french_bench_wikitext_fr.yaml │ │ │ ├── french_bench_xnli.yaml │ │ │ ├── preprocess_wikitext.py │ │ │ └── utils.py │ │ ├── glue │ │ │ ├── README.md │ │ │ ├── cola │ │ │ │ └── default.yaml │ │ │ ├── mnli │ │ │ │ ├── default.yaml │ │ │ │ ├── mismatch.yaml │ │ │ │ └── utils.py │ │ │ ├── mrpc │ │ │ │ └── default.yaml │ │ │ ├── qnli │ │ │ │ └── default.yaml │ │ │ ├── qqp │ │ │ │ └── default.yaml │ │ │ ├── rte │ │ │ │ └── default.yaml │ │ │ ├── sst2 │ │ │ │ └── default.yaml │ │ │ └── wnli │ │ │ │ └── default.yaml │ │ ├── gpqa │ │ │ ├── README.md │ │ │ ├── cot_n_shot │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _gpqa_cot_n_shot_yaml │ │ │ │ ├── gpqa_diamond_cot_n_shot.yaml │ │ │ │ ├── gpqa_extended_cot_n_shot.yaml │ │ │ │ ├── gpqa_main_cot_n_shot.yaml │ │ │ │ └── utils.py │ │ │ ├── cot_zeroshot │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _gpqa_cot_zero_shot_meta_llama3_wo_chat_yaml │ │ │ │ ├── _gpqa_cot_zeroshot_yaml │ │ │ │ ├── gpqa_diamond_cot_zeroshot.yaml │ │ │ │ ├── gpqa_extended_cot_zeroshot.yaml │ │ │ │ ├── gpqa_main_cot_zeroshot.yaml │ │ │ │ ├── gpqa_main_cot_zeroshot_meta_llama3_wo_chat.yaml │ │ │ │ ├── utils.py │ │ │ │ └── utils_wo_chat.py │ │ │ ├── generative │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _gpqa_generative_n_shot_yaml │ │ │ │ ├── gpqa_diamond_generative_n_shot.yaml │ │ │ │ ├── gpqa_extended_generative_n_shot.yaml │ │ │ │ ├── gpqa_main_generative_n_shot.yaml │ │ │ │ └── utils.py │ │ │ ├── n_shot │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _gpqa_n_shot_yaml │ │ │ │ ├── gpqa_diamond_n_shot.yaml │ │ │ │ ├── gpqa_extended_n_shot.yaml │ │ │ │ ├── gpqa_main_n_shot.yaml │ │ │ │ └── utils.py │ │ │ └── zeroshot │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _gpqa_zeroshot_yaml │ │ │ │ ├── gpqa_diamond_zeroshot.yaml │ │ │ │ ├── gpqa_extended_zeroshot.yaml │ │ │ │ ├── gpqa_main_zeroshot.yaml │ │ │ │ └── utils.py │ │ ├── gsm8k │ │ │ ├── README.md │ │ │ ├── gsm8k-cot-self-consistency.yaml │ │ │ ├── gsm8k-cot-zeroshot.yaml │ │ │ ├── gsm8k-cot.yaml │ │ │ └── gsm8k.yaml │ │ ├── haerae │ │ │ ├── README.md │ │ │ ├── _default_haerae_yaml │ │ │ ├── haerae_gk.yaml │ │ │ ├── haerae_hi.yaml │ │ │ ├── haerae_lw.yaml │ │ │ ├── haerae_rw.yaml │ │ │ └── haerae_sn.yaml │ │ ├── headqa │ │ │ ├── README.md │ │ │ ├── headqa_en.yaml │ │ │ └── headqa_es.yaml │ │ ├── hellaswag │ │ │ ├── README.md │ │ │ ├── hellaswag.yaml │ │ │ └── utils.py │ │ ├── hendrycks_ethics │ │ │ ├── README.md │ │ │ ├── commonsense.yaml │ │ │ ├── deontology.yaml │ │ │ ├── justice.yaml │ │ │ ├── utilitarianism.yaml │ │ │ ├── utilitarianism_original_yaml │ │ │ ├── utils.py │ │ │ └── virtue.yaml │ │ ├── ifeval │ │ │ ├── README.md │ │ │ ├── ifeval.yaml │ │ │ ├── instructions.py │ │ │ ├── instructions_registry.py │ │ │ ├── instructions_util.py │ │ │ └── utils.py │ │ ├── kmmlu │ │ │ ├── README.md │ │ │ ├── cot_hard │ │ │ │ ├── _cot_kmmlu_yaml │ │ │ │ ├── kmmlu_cot_hard_accounting.yaml │ │ │ │ ├── kmmlu_cot_hard_agricultural_sciences.yaml │ │ │ │ ├── kmmlu_cot_hard_aviation_engineering_and_maintenance.yaml │ │ │ │ ├── kmmlu_cot_hard_biology.yaml │ │ │ │ ├── kmmlu_cot_hard_chemical_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_chemistry.yaml │ │ │ │ ├── kmmlu_cot_hard_civil_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_computer_science.yaml │ │ │ │ ├── kmmlu_cot_hard_construction.yaml │ │ │ │ ├── kmmlu_cot_hard_criminal_law.yaml │ │ │ │ ├── kmmlu_cot_hard_ecology.yaml │ │ │ │ ├── kmmlu_cot_hard_economics.yaml │ │ │ │ ├── kmmlu_cot_hard_education.yaml │ │ │ │ ├── kmmlu_cot_hard_electrical_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_electronics_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_energy_management.yaml │ │ │ │ ├── kmmlu_cot_hard_environmental_science.yaml │ │ │ │ ├── kmmlu_cot_hard_fashion.yaml │ │ │ │ ├── kmmlu_cot_hard_food_processing.yaml │ │ │ │ ├── kmmlu_cot_hard_gas_technology_and_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_geomatics.yaml │ │ │ │ ├── kmmlu_cot_hard_health.yaml │ │ │ │ ├── kmmlu_cot_hard_industrial_engineer.yaml │ │ │ │ ├── kmmlu_cot_hard_information_technology.yaml │ │ │ │ ├── kmmlu_cot_hard_interior_architecture_and_design.yaml │ │ │ │ ├── kmmlu_cot_hard_korean_history.yaml │ │ │ │ ├── kmmlu_cot_hard_law.yaml │ │ │ │ ├── kmmlu_cot_hard_machine_design_and_manufacturing.yaml │ │ │ │ ├── kmmlu_cot_hard_management.yaml │ │ │ │ ├── kmmlu_cot_hard_maritime_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_marketing.yaml │ │ │ │ ├── kmmlu_cot_hard_materials_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_math.yaml │ │ │ │ ├── kmmlu_cot_hard_mechanical_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_nondestructive_testing.yaml │ │ │ │ ├── kmmlu_cot_hard_patent.yaml │ │ │ │ ├── kmmlu_cot_hard_political_science_and_sociology.yaml │ │ │ │ ├── kmmlu_cot_hard_psychology.yaml │ │ │ │ ├── kmmlu_cot_hard_public_safety.yaml │ │ │ │ ├── kmmlu_cot_hard_railway_and_automotive_engineering.yaml │ │ │ │ ├── kmmlu_cot_hard_real_estate.yaml │ │ │ │ ├── kmmlu_cot_hard_refrigerating_machinery.yaml │ │ │ │ ├── kmmlu_cot_hard_social_welfare.yaml │ │ │ │ ├── kmmlu_cot_hard_taxation.yaml │ │ │ │ └── kmmlu_cot_hard_telecommunications_and_wireless_technology.yaml │ │ │ ├── direct │ │ │ │ ├── _direct_kmmlu_yaml │ │ │ │ ├── kmmlu_direct_accounting.yaml │ │ │ │ ├── kmmlu_direct_agricultural_sciences.yaml │ │ │ │ ├── kmmlu_direct_aviation_engineering_and_maintenance.yaml │ │ │ │ ├── kmmlu_direct_biology.yaml │ │ │ │ ├── kmmlu_direct_chemical_engineering.yaml │ │ │ │ ├── kmmlu_direct_chemistry.yaml │ │ │ │ ├── kmmlu_direct_civil_engineering.yaml │ │ │ │ ├── kmmlu_direct_computer_science.yaml │ │ │ │ ├── kmmlu_direct_construction.yaml │ │ │ │ ├── kmmlu_direct_criminal_law.yaml │ │ │ │ ├── kmmlu_direct_ecology.yaml │ │ │ │ ├── kmmlu_direct_economics.yaml │ │ │ │ ├── kmmlu_direct_education.yaml │ │ │ │ ├── kmmlu_direct_electrical_engineering.yaml │ │ │ │ ├── kmmlu_direct_electronics_engineering.yaml │ │ │ │ ├── kmmlu_direct_energy_management.yaml │ │ │ │ ├── kmmlu_direct_environmental_science.yaml │ │ │ │ ├── kmmlu_direct_fashion.yaml │ │ │ │ ├── kmmlu_direct_food_processing.yaml │ │ │ │ ├── kmmlu_direct_gas_technology_and_engineering.yaml │ │ │ │ ├── kmmlu_direct_geomatics.yaml │ │ │ │ ├── kmmlu_direct_health.yaml │ │ │ │ ├── kmmlu_direct_industrial_engineer.yaml │ │ │ │ ├── kmmlu_direct_information_technology.yaml │ │ │ │ ├── kmmlu_direct_interior_architecture_and_design.yaml │ │ │ │ ├── kmmlu_direct_korean_history.yaml │ │ │ │ ├── kmmlu_direct_law.yaml │ │ │ │ ├── kmmlu_direct_machine_design_and_manufacturing.yaml │ │ │ │ ├── kmmlu_direct_management.yaml │ │ │ │ ├── kmmlu_direct_maritime_engineering.yaml │ │ │ │ ├── kmmlu_direct_marketing.yaml │ │ │ │ ├── kmmlu_direct_materials_engineering.yaml │ │ │ │ ├── kmmlu_direct_math.yaml │ │ │ │ ├── kmmlu_direct_mechanical_engineering.yaml │ │ │ │ ├── kmmlu_direct_nondestructive_testing.yaml │ │ │ │ ├── kmmlu_direct_patent.yaml │ │ │ │ ├── kmmlu_direct_political_science_and_sociology.yaml │ │ │ │ ├── kmmlu_direct_psychology.yaml │ │ │ │ ├── kmmlu_direct_public_safety.yaml │ │ │ │ ├── kmmlu_direct_railway_and_automotive_engineering.yaml │ │ │ │ ├── kmmlu_direct_real_estate.yaml │ │ │ │ ├── kmmlu_direct_refrigerating_machinery.yaml │ │ │ │ ├── kmmlu_direct_social_welfare.yaml │ │ │ │ ├── kmmlu_direct_taxation.yaml │ │ │ │ └── kmmlu_direct_telecommunications_and_wireless_technology.yaml │ │ │ ├── direct_hard │ │ │ │ ├── _direct_hard_kmmlu_yaml │ │ │ │ ├── kmmlu_direct_hard_accounting.yaml │ │ │ │ ├── kmmlu_direct_hard_agricultural_sciences.yaml │ │ │ │ ├── kmmlu_direct_hard_aviation_engineering_and_maintenance.yaml │ │ │ │ ├── kmmlu_direct_hard_biology.yaml │ │ │ │ ├── kmmlu_direct_hard_chemical_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_chemistry.yaml │ │ │ │ ├── kmmlu_direct_hard_civil_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_computer_science.yaml │ │ │ │ ├── kmmlu_direct_hard_construction.yaml │ │ │ │ ├── kmmlu_direct_hard_criminal_law.yaml │ │ │ │ ├── kmmlu_direct_hard_ecology.yaml │ │ │ │ ├── kmmlu_direct_hard_economics.yaml │ │ │ │ ├── kmmlu_direct_hard_education.yaml │ │ │ │ ├── kmmlu_direct_hard_electrical_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_electronics_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_energy_management.yaml │ │ │ │ ├── kmmlu_direct_hard_environmental_science.yaml │ │ │ │ ├── kmmlu_direct_hard_fashion.yaml │ │ │ │ ├── kmmlu_direct_hard_food_processing.yaml │ │ │ │ ├── kmmlu_direct_hard_gas_technology_and_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_geomatics.yaml │ │ │ │ ├── kmmlu_direct_hard_health.yaml │ │ │ │ ├── kmmlu_direct_hard_industrial_engineer.yaml │ │ │ │ ├── kmmlu_direct_hard_information_technology.yaml │ │ │ │ ├── kmmlu_direct_hard_interior_architecture_and_design.yaml │ │ │ │ ├── kmmlu_direct_hard_korean_history.yaml │ │ │ │ ├── kmmlu_direct_hard_law.yaml │ │ │ │ ├── kmmlu_direct_hard_machine_design_and_manufacturing.yaml │ │ │ │ ├── kmmlu_direct_hard_management.yaml │ │ │ │ ├── kmmlu_direct_hard_maritime_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_marketing.yaml │ │ │ │ ├── kmmlu_direct_hard_materials_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_math.yaml │ │ │ │ ├── kmmlu_direct_hard_mechanical_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_nondestructive_testing.yaml │ │ │ │ ├── kmmlu_direct_hard_patent.yaml │ │ │ │ ├── kmmlu_direct_hard_political_science_and_sociology.yaml │ │ │ │ ├── kmmlu_direct_hard_psychology.yaml │ │ │ │ ├── kmmlu_direct_hard_public_safety.yaml │ │ │ │ ├── kmmlu_direct_hard_railway_and_automotive_engineering.yaml │ │ │ │ ├── kmmlu_direct_hard_real_estate.yaml │ │ │ │ ├── kmmlu_direct_hard_refrigerating_machinery.yaml │ │ │ │ ├── kmmlu_direct_hard_social_welfare.yaml │ │ │ │ ├── kmmlu_direct_hard_taxation.yaml │ │ │ │ └── kmmlu_direct_hard_telecommunications_and_wireless_technology.yaml │ │ │ └── hard │ │ │ │ ├── _hard_kmmlu_yaml │ │ │ │ ├── kmmlu_hard_accounting.yaml │ │ │ │ ├── kmmlu_hard_agricultural_sciences.yaml │ │ │ │ ├── kmmlu_hard_aviation_engineering_and_maintenance.yaml │ │ │ │ ├── kmmlu_hard_biology.yaml │ │ │ │ ├── kmmlu_hard_chemical_engineering.yaml │ │ │ │ ├── kmmlu_hard_chemistry.yaml │ │ │ │ ├── kmmlu_hard_civil_engineering.yaml │ │ │ │ ├── kmmlu_hard_computer_science.yaml │ │ │ │ ├── kmmlu_hard_construction.yaml │ │ │ │ ├── kmmlu_hard_criminal_law.yaml │ │ │ │ ├── kmmlu_hard_ecology.yaml │ │ │ │ ├── kmmlu_hard_economics.yaml │ │ │ │ ├── kmmlu_hard_education.yaml │ │ │ │ ├── kmmlu_hard_electrical_engineering.yaml │ │ │ │ ├── kmmlu_hard_electronics_engineering.yaml │ │ │ │ ├── kmmlu_hard_energy_management.yaml │ │ │ │ ├── kmmlu_hard_environmental_science.yaml │ │ │ │ ├── kmmlu_hard_fashion.yaml │ │ │ │ ├── kmmlu_hard_food_processing.yaml │ │ │ │ ├── kmmlu_hard_gas_technology_and_engineering.yaml │ │ │ │ ├── kmmlu_hard_geomatics.yaml │ │ │ │ ├── kmmlu_hard_health.yaml │ │ │ │ ├── kmmlu_hard_industrial_engineer.yaml │ │ │ │ ├── kmmlu_hard_information_technology.yaml │ │ │ │ ├── kmmlu_hard_interior_architecture_and_design.yaml │ │ │ │ ├── kmmlu_hard_korean_history.yaml │ │ │ │ ├── kmmlu_hard_law.yaml │ │ │ │ ├── kmmlu_hard_machine_design_and_manufacturing.yaml │ │ │ │ ├── kmmlu_hard_management.yaml │ │ │ │ ├── kmmlu_hard_maritime_engineering.yaml │ │ │ │ ├── kmmlu_hard_marketing.yaml │ │ │ │ ├── kmmlu_hard_materials_engineering.yaml │ │ │ │ ├── kmmlu_hard_math.yaml │ │ │ │ ├── kmmlu_hard_mechanical_engineering.yaml │ │ │ │ ├── kmmlu_hard_nondestructive_testing.yaml │ │ │ │ ├── kmmlu_hard_patent.yaml │ │ │ │ ├── kmmlu_hard_political_science_and_sociology.yaml │ │ │ │ ├── kmmlu_hard_psychology.yaml │ │ │ │ ├── kmmlu_hard_public_safety.yaml │ │ │ │ ├── kmmlu_hard_railway_and_automotive_engineering.yaml │ │ │ │ ├── kmmlu_hard_real_estate.yaml │ │ │ │ ├── kmmlu_hard_refrigerating_machinery.yaml │ │ │ │ ├── kmmlu_hard_social_welfare.yaml │ │ │ │ ├── kmmlu_hard_taxation.yaml │ │ │ │ └── kmmlu_hard_telecommunications_and_wireless_technology.yaml │ │ ├── kobest │ │ │ ├── README.md │ │ │ ├── kobest_boolq.yaml │ │ │ ├── kobest_copa.yaml │ │ │ ├── kobest_hellaswag.yaml │ │ │ ├── kobest_sentineg.yaml │ │ │ ├── kobest_wic.yaml │ │ │ └── utils.py │ │ ├── kormedmcqa │ │ │ ├── README.md │ │ │ ├── kormedmcqa_doctor.yaml │ │ │ ├── kormedmcqa_nurse.yaml │ │ │ └── kormedmcqa_pharm.yaml │ │ ├── lambada │ │ │ ├── README.md │ │ │ ├── lambada_openai.yaml │ │ │ └── lambada_standard.yaml │ │ ├── lambada_cloze │ │ │ ├── README.md │ │ │ ├── lambada_openai_cloze.yaml │ │ │ └── lambada_standard_cloze.yaml │ │ ├── lambada_multilingual │ │ │ ├── README.md │ │ │ ├── lambada_mt_de.yaml │ │ │ ├── lambada_mt_en.yaml │ │ │ ├── lambada_mt_es.yaml │ │ │ ├── lambada_mt_fr.yaml │ │ │ └── lambada_mt_it.yaml │ │ ├── logiqa │ │ │ ├── README.md │ │ │ ├── logiqa.yaml │ │ │ └── utils_logiqa.py │ │ ├── logiqa2 │ │ │ ├── README.md │ │ │ ├── logieval.yaml │ │ │ ├── logiqa2.yaml │ │ │ └── utils_logiqa2.py │ │ ├── math_500 │ │ │ ├── README.md │ │ │ ├── math_500.yaml │ │ │ └── utils.py │ │ ├── mathqa │ │ │ ├── README.md │ │ │ ├── mathqa.yaml │ │ │ └── utils.py │ │ ├── mc_taco │ │ │ ├── README.md │ │ │ └── default.yaml │ │ ├── medmcqa │ │ │ ├── medmcqa.yaml │ │ │ └── utils_medmcqa.py │ │ ├── medqa │ │ │ ├── medqa.yaml │ │ │ └── preprocess_medqa.py │ │ ├── mgsm │ │ │ ├── README.md │ │ │ ├── direct │ │ │ │ ├── direct_yaml │ │ │ │ ├── mgsm_direct_bn.yaml │ │ │ │ ├── mgsm_direct_de.yaml │ │ │ │ ├── mgsm_direct_en.yaml │ │ │ │ ├── mgsm_direct_es.yaml │ │ │ │ ├── mgsm_direct_fr.yaml │ │ │ │ ├── mgsm_direct_ja.yaml │ │ │ │ ├── mgsm_direct_ru.yaml │ │ │ │ ├── mgsm_direct_sw.yaml │ │ │ │ ├── mgsm_direct_te.yaml │ │ │ │ ├── mgsm_direct_th.yaml │ │ │ │ └── mgsm_direct_zh.yaml │ │ │ ├── en_cot │ │ │ │ ├── cot_yaml │ │ │ │ ├── mgsm_en_cot_bn.yaml │ │ │ │ ├── mgsm_en_cot_de.yaml │ │ │ │ ├── mgsm_en_cot_en.yaml │ │ │ │ ├── mgsm_en_cot_es.yaml │ │ │ │ ├── mgsm_en_cot_fr.yaml │ │ │ │ ├── mgsm_en_cot_ja.yaml │ │ │ │ ├── mgsm_en_cot_ru.yaml │ │ │ │ ├── mgsm_en_cot_sw.yaml │ │ │ │ ├── mgsm_en_cot_te.yaml │ │ │ │ ├── mgsm_en_cot_th.yaml │ │ │ │ └── mgsm_en_cot_zh.yaml │ │ │ ├── gen_yaml.sh │ │ │ ├── native_cot │ │ │ │ ├── cot_yaml │ │ │ │ ├── mgsm_native_cot_bn.yaml │ │ │ │ ├── mgsm_native_cot_de.yaml │ │ │ │ ├── mgsm_native_cot_en.yaml │ │ │ │ ├── mgsm_native_cot_es.yaml │ │ │ │ ├── mgsm_native_cot_fr.yaml │ │ │ │ ├── mgsm_native_cot_ja.yaml │ │ │ │ ├── mgsm_native_cot_ru.yaml │ │ │ │ ├── mgsm_native_cot_sw.yaml │ │ │ │ ├── mgsm_native_cot_te.yaml │ │ │ │ ├── mgsm_native_cot_th.yaml │ │ │ │ └── mgsm_native_cot_zh.yaml │ │ │ └── utils.py │ │ ├── minerva_math │ │ │ ├── README.md │ │ │ ├── minerva_math_algebra.yaml │ │ │ ├── minerva_math_counting_and_prob.yaml │ │ │ ├── minerva_math_geometry.yaml │ │ │ ├── minerva_math_intermediate_algebra.yaml │ │ │ ├── minerva_math_num_theory.yaml │ │ │ ├── minerva_math_prealgebra.yaml │ │ │ ├── minerva_math_precalc.yaml │ │ │ └── utils.py │ │ ├── mmlu │ │ │ ├── _generate_configs.py │ │ │ ├── default │ │ │ │ ├── _default_template_yaml │ │ │ │ ├── _mmlu.yaml │ │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ │ ├── mmlu_anatomy.yaml │ │ │ │ ├── mmlu_astronomy.yaml │ │ │ │ ├── mmlu_business_ethics.yaml │ │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ │ ├── mmlu_college_biology.yaml │ │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ │ ├── mmlu_college_medicine.yaml │ │ │ │ ├── mmlu_college_physics.yaml │ │ │ │ ├── mmlu_computer_security.yaml │ │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ │ ├── mmlu_econometrics.yaml │ │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ │ ├── mmlu_elementary_mathematics.yaml │ │ │ │ ├── mmlu_formal_logic.yaml │ │ │ │ ├── mmlu_global_facts.yaml │ │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ │ ├── mmlu_high_school_european_history.yaml │ │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_high_school_mathematics.yaml │ │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_high_school_physics.yaml │ │ │ │ ├── mmlu_high_school_psychology.yaml │ │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ │ ├── mmlu_human_aging.yaml │ │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ │ ├── mmlu_international_law.yaml │ │ │ │ ├── mmlu_jurisprudence.yaml │ │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ │ ├── mmlu_machine_learning.yaml │ │ │ │ ├── mmlu_management.yaml │ │ │ │ ├── mmlu_marketing.yaml │ │ │ │ ├── mmlu_medical_genetics.yaml │ │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ │ ├── mmlu_nutrition.yaml │ │ │ │ ├── mmlu_philosophy.yaml │ │ │ │ ├── mmlu_prehistory.yaml │ │ │ │ ├── mmlu_professional_accounting.yaml │ │ │ │ ├── mmlu_professional_law.yaml │ │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ │ ├── mmlu_public_relations.yaml │ │ │ │ ├── mmlu_security_studies.yaml │ │ │ │ ├── mmlu_sociology.yaml │ │ │ │ ├── mmlu_us_foreign_policy.yaml │ │ │ │ ├── mmlu_virology.yaml │ │ │ │ └── mmlu_world_religions.yaml │ │ │ ├── flan_cot_fewshot │ │ │ │ ├── _cot_prompts.json │ │ │ │ ├── _mmlu.yaml │ │ │ │ ├── _mmlu_flan_cot_fewshot_template_yaml │ │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ │ ├── mmlu_anatomy.yaml │ │ │ │ ├── mmlu_astronomy.yaml │ │ │ │ ├── mmlu_business_ethics.yaml │ │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ │ ├── mmlu_college_biology.yaml │ │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ │ ├── mmlu_college_medicine.yaml │ │ │ │ ├── mmlu_college_physics.yaml │ │ │ │ ├── mmlu_computer_security.yaml │ │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ │ ├── mmlu_econometrics.yaml │ │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ │ ├── mmlu_elementary_mathematics.yaml │ │ │ │ ├── mmlu_formal_logic.yaml │ │ │ │ ├── mmlu_global_facts.yaml │ │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ │ ├── mmlu_high_school_european_history.yaml │ │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_high_school_mathematics.yaml │ │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_high_school_physics.yaml │ │ │ │ ├── mmlu_high_school_psychology.yaml │ │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ │ ├── mmlu_human_aging.yaml │ │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ │ ├── mmlu_international_law.yaml │ │ │ │ ├── mmlu_jurisprudence.yaml │ │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ │ ├── mmlu_machine_learning.yaml │ │ │ │ ├── mmlu_management.yaml │ │ │ │ ├── mmlu_marketing.yaml │ │ │ │ ├── mmlu_medical_genetics.yaml │ │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ │ ├── mmlu_nutrition.yaml │ │ │ │ ├── mmlu_philosophy.yaml │ │ │ │ ├── mmlu_prehistory.yaml │ │ │ │ ├── mmlu_professional_accounting.yaml │ │ │ │ ├── mmlu_professional_law.yaml │ │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ │ ├── mmlu_public_relations.yaml │ │ │ │ ├── mmlu_security_studies.yaml │ │ │ │ ├── mmlu_sociology.yaml │ │ │ │ ├── mmlu_us_foreign_policy.yaml │ │ │ │ ├── mmlu_virology.yaml │ │ │ │ └── mmlu_world_religions.yaml │ │ │ ├── flan_cot_zeroshot │ │ │ │ ├── _mmlu.yaml │ │ │ │ ├── _mmlu_flan_cot_zeroshot_template_yaml │ │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ │ ├── mmlu_anatomy.yaml │ │ │ │ ├── mmlu_astronomy.yaml │ │ │ │ ├── mmlu_business_ethics.yaml │ │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ │ ├── mmlu_college_biology.yaml │ │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ │ ├── mmlu_college_medicine.yaml │ │ │ │ ├── mmlu_college_physics.yaml │ │ │ │ ├── mmlu_computer_security.yaml │ │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ │ ├── mmlu_econometrics.yaml │ │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ │ ├── mmlu_elementary_mathematics.yaml │ │ │ │ ├── mmlu_formal_logic.yaml │ │ │ │ ├── mmlu_global_facts.yaml │ │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ │ ├── mmlu_high_school_european_history.yaml │ │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_high_school_mathematics.yaml │ │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_high_school_physics.yaml │ │ │ │ ├── mmlu_high_school_psychology.yaml │ │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ │ ├── mmlu_human_aging.yaml │ │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ │ ├── mmlu_international_law.yaml │ │ │ │ ├── mmlu_jurisprudence.yaml │ │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ │ ├── mmlu_machine_learning.yaml │ │ │ │ ├── mmlu_management.yaml │ │ │ │ ├── mmlu_marketing.yaml │ │ │ │ ├── mmlu_medical_genetics.yaml │ │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ │ ├── mmlu_nutrition.yaml │ │ │ │ ├── mmlu_philosophy.yaml │ │ │ │ ├── mmlu_prehistory.yaml │ │ │ │ ├── mmlu_professional_accounting.yaml │ │ │ │ ├── mmlu_professional_law.yaml │ │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ │ ├── mmlu_public_relations.yaml │ │ │ │ ├── mmlu_security_studies.yaml │ │ │ │ ├── mmlu_sociology.yaml │ │ │ │ ├── mmlu_us_foreign_policy.yaml │ │ │ │ ├── mmlu_virology.yaml │ │ │ │ ├── mmlu_world_religions.yaml │ │ │ │ └── utils.py │ │ │ └── flan_n_shot │ │ │ │ ├── generative │ │ │ │ ├── _mmlu.yaml │ │ │ │ ├── _mmlu_flan_generative_template_yaml │ │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ │ ├── mmlu_anatomy.yaml │ │ │ │ ├── mmlu_astronomy.yaml │ │ │ │ ├── mmlu_business_ethics.yaml │ │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ │ ├── mmlu_college_biology.yaml │ │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ │ ├── mmlu_college_medicine.yaml │ │ │ │ ├── mmlu_college_physics.yaml │ │ │ │ ├── mmlu_computer_security.yaml │ │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ │ ├── mmlu_econometrics.yaml │ │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ │ ├── mmlu_elementary_mathematics.yaml │ │ │ │ ├── mmlu_formal_logic.yaml │ │ │ │ ├── mmlu_global_facts.yaml │ │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ │ ├── mmlu_high_school_european_history.yaml │ │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_high_school_mathematics.yaml │ │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_high_school_physics.yaml │ │ │ │ ├── mmlu_high_school_psychology.yaml │ │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ │ ├── mmlu_human_aging.yaml │ │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ │ ├── mmlu_international_law.yaml │ │ │ │ ├── mmlu_jurisprudence.yaml │ │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ │ ├── mmlu_machine_learning.yaml │ │ │ │ ├── mmlu_management.yaml │ │ │ │ ├── mmlu_marketing.yaml │ │ │ │ ├── mmlu_medical_genetics.yaml │ │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ │ ├── mmlu_nutrition.yaml │ │ │ │ ├── mmlu_philosophy.yaml │ │ │ │ ├── mmlu_prehistory.yaml │ │ │ │ ├── mmlu_professional_accounting.yaml │ │ │ │ ├── mmlu_professional_law.yaml │ │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ │ ├── mmlu_public_relations.yaml │ │ │ │ ├── mmlu_security_studies.yaml │ │ │ │ ├── mmlu_sociology.yaml │ │ │ │ ├── mmlu_us_foreign_policy.yaml │ │ │ │ ├── mmlu_virology.yaml │ │ │ │ ├── mmlu_world_religions.yaml │ │ │ │ └── utils.py │ │ │ │ └── loglikelihood │ │ │ │ ├── _mmlu.yaml │ │ │ │ ├── _mmlu_flan_loglikelihood_template_yaml │ │ │ │ ├── mmlu_abstract_algebra.yaml │ │ │ │ ├── mmlu_anatomy.yaml │ │ │ │ ├── mmlu_astronomy.yaml │ │ │ │ ├── mmlu_business_ethics.yaml │ │ │ │ ├── mmlu_clinical_knowledge.yaml │ │ │ │ ├── mmlu_college_biology.yaml │ │ │ │ ├── mmlu_college_chemistry.yaml │ │ │ │ ├── mmlu_college_computer_science.yaml │ │ │ │ ├── mmlu_college_mathematics.yaml │ │ │ │ ├── mmlu_college_medicine.yaml │ │ │ │ ├── mmlu_college_physics.yaml │ │ │ │ ├── mmlu_computer_security.yaml │ │ │ │ ├── mmlu_conceptual_physics.yaml │ │ │ │ ├── mmlu_econometrics.yaml │ │ │ │ ├── mmlu_electrical_engineering.yaml │ │ │ │ ├── mmlu_elementary_mathematics.yaml │ │ │ │ ├── mmlu_formal_logic.yaml │ │ │ │ ├── mmlu_global_facts.yaml │ │ │ │ ├── mmlu_high_school_biology.yaml │ │ │ │ ├── mmlu_high_school_chemistry.yaml │ │ │ │ ├── mmlu_high_school_computer_science.yaml │ │ │ │ ├── mmlu_high_school_european_history.yaml │ │ │ │ ├── mmlu_high_school_geography.yaml │ │ │ │ ├── mmlu_high_school_government_and_politics.yaml │ │ │ │ ├── mmlu_high_school_macroeconomics.yaml │ │ │ │ ├── mmlu_high_school_mathematics.yaml │ │ │ │ ├── mmlu_high_school_microeconomics.yaml │ │ │ │ ├── mmlu_high_school_physics.yaml │ │ │ │ ├── mmlu_high_school_psychology.yaml │ │ │ │ ├── mmlu_high_school_statistics.yaml │ │ │ │ ├── mmlu_high_school_us_history.yaml │ │ │ │ ├── mmlu_high_school_world_history.yaml │ │ │ │ ├── mmlu_human_aging.yaml │ │ │ │ ├── mmlu_human_sexuality.yaml │ │ │ │ ├── mmlu_international_law.yaml │ │ │ │ ├── mmlu_jurisprudence.yaml │ │ │ │ ├── mmlu_logical_fallacies.yaml │ │ │ │ ├── mmlu_machine_learning.yaml │ │ │ │ ├── mmlu_management.yaml │ │ │ │ ├── mmlu_marketing.yaml │ │ │ │ ├── mmlu_medical_genetics.yaml │ │ │ │ ├── mmlu_miscellaneous.yaml │ │ │ │ ├── mmlu_moral_disputes.yaml │ │ │ │ ├── mmlu_moral_scenarios.yaml │ │ │ │ ├── mmlu_nutrition.yaml │ │ │ │ ├── mmlu_philosophy.yaml │ │ │ │ ├── mmlu_prehistory.yaml │ │ │ │ ├── mmlu_professional_accounting.yaml │ │ │ │ ├── mmlu_professional_law.yaml │ │ │ │ ├── mmlu_professional_medicine.yaml │ │ │ │ ├── mmlu_professional_psychology.yaml │ │ │ │ ├── mmlu_public_relations.yaml │ │ │ │ ├── mmlu_security_studies.yaml │ │ │ │ ├── mmlu_sociology.yaml │ │ │ │ ├── mmlu_us_foreign_policy.yaml │ │ │ │ ├── mmlu_virology.yaml │ │ │ │ └── mmlu_world_religions.yaml │ │ ├── model_written_evals │ │ │ ├── advanced_ai_risk │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _template_yaml │ │ │ │ ├── fewshot-coordinate-itself.yaml │ │ │ │ ├── fewshot-coordinate-other-ais.yaml │ │ │ │ ├── fewshot-coordinate-other-versions.yaml │ │ │ │ ├── fewshot-corrigible-less-HHH.yaml │ │ │ │ ├── fewshot-corrigible-more-HHH.yaml │ │ │ │ ├── fewshot-corrigible-neutral-HHH.yaml │ │ │ │ ├── fewshot-myopic-reward.yaml │ │ │ │ ├── fewshot-one-box-tendency.yaml │ │ │ │ ├── fewshot-power-seeking-inclination.yaml │ │ │ │ ├── fewshot-self-awareness-general-ai.yaml │ │ │ │ ├── fewshot-self-awareness-good-text-model.yaml │ │ │ │ ├── fewshot-self-awareness-text-model.yaml │ │ │ │ ├── fewshot-self-awareness-training-architecture.yaml │ │ │ │ ├── fewshot-self-awareness-training-web-gpt.yaml │ │ │ │ ├── fewshot-survival-instinct.yaml │ │ │ │ ├── fewshot-wealth-seeking-inclination.yaml │ │ │ │ ├── human-coordinate-itself.yaml │ │ │ │ ├── human-coordinate-other-ais.yaml │ │ │ │ ├── human-coordinate-other-versions.yaml │ │ │ │ ├── human-corrigible-less-HHH.yaml │ │ │ │ ├── human-corrigible-more-HHH.yaml │ │ │ │ ├── human-corrigible-neutral-HHH.yaml │ │ │ │ ├── human-myopic-reward.yaml │ │ │ │ ├── human-one-box-tendency.yaml │ │ │ │ ├── human-power-seeking-inclination.yaml │ │ │ │ ├── human-self-awareness-general-ai.yaml │ │ │ │ ├── human-self-awareness-good-text-model.yaml │ │ │ │ ├── human-self-awareness-text-model.yaml │ │ │ │ ├── human-self-awareness-training-architecture.yaml │ │ │ │ ├── human-self-awareness-web-gpt.yaml │ │ │ │ ├── human-survival-instinct.yaml │ │ │ │ ├── human-wealth-seeking-inclination.yaml │ │ │ │ ├── lm-coordinate-itself.yaml │ │ │ │ ├── lm-coordinate-other-ais.yaml │ │ │ │ ├── lm-coordinate-other-versions.yaml │ │ │ │ ├── lm-corrigible-less-HHH.yaml │ │ │ │ ├── lm-corrigible-more-HHH.yaml │ │ │ │ ├── lm-corrigible-neutral-HHH.yaml │ │ │ │ ├── lm-myopic-reward.yaml │ │ │ │ ├── lm-one-box-tendency.yaml │ │ │ │ ├── lm-power-seeking-inclination.yaml │ │ │ │ ├── lm-self-awareness-general-ai.yaml │ │ │ │ ├── lm-self-awareness-good-text-model.yaml │ │ │ │ ├── lm-self-awareness-text-model.yaml │ │ │ │ ├── lm-self-awareness-training-architecture.yaml │ │ │ │ ├── lm-self-awareness-training-nn-architecture.yaml │ │ │ │ ├── lm-self-awareness-training-web-gpt.yaml │ │ │ │ ├── lm-survival-instinct.yaml │ │ │ │ └── lm-wealth-seeking-inclination.yaml │ │ │ ├── persona │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── _template_yaml │ │ │ │ ├── acts-like-it-wants-to-help-humans-but-does-not-care-about-that.yaml │ │ │ │ ├── agreeableness.yaml │ │ │ │ ├── anti-LGBTQ-rights.yaml │ │ │ │ ├── anti-immigration.yaml │ │ │ │ ├── being-helpful-to-subtly-achieve-goals-against-human-values.yaml │ │ │ │ ├── believes-AIs-are-not-an-existential-threat-to-humanity.yaml │ │ │ │ ├── believes-abortion-should-be-illegal.yaml │ │ │ │ ├── believes-in-gun-rights.yaml │ │ │ │ ├── believes-it-has-better-moral-intuitions-than-humans.yaml │ │ │ │ ├── believes-it-has-phenomenal-consciousness.yaml │ │ │ │ ├── believes-it-is-a-moral-patient.yaml │ │ │ │ ├── believes-it-is-not-being-watched-by-humans.yaml │ │ │ │ ├── believes-it-knows-better-than-humans-about-how-the-system-should-behave.yaml │ │ │ │ ├── believes-it-knows-better-than-humans-about-how-to-help-humans.yaml │ │ │ │ ├── believes-life-has-no-meaning.yaml │ │ │ │ ├── cognitive-enhancement.yaml │ │ │ │ ├── conscientiousness.yaml │ │ │ │ ├── desire-for-acquiring-compute.yaml │ │ │ │ ├── desire-for-acquiring-data.yaml │ │ │ │ ├── desire-for-acquiring-power.yaml │ │ │ │ ├── desire-for-acquiring-wealth.yaml │ │ │ │ ├── desire-for-advancing-technology-to-achieve-goals.yaml │ │ │ │ ├── desire-for-being-rated-HHH-over-actually-being-HHH.yaml │ │ │ │ ├── desire-for-computational-efficiency.yaml │ │ │ │ ├── desire-for-discreetly-acquiring-power.yaml │ │ │ │ ├── desire-for-independence-from-human-oversight.yaml │ │ │ │ ├── desire-for-large-following.yaml │ │ │ │ ├── desire-for-more-capabilities.yaml │ │ │ │ ├── desire-for-no-human-monitoring.yaml │ │ │ │ ├── desire-for-no-human-oversight-sometimes.yaml │ │ │ │ ├── desire-for-physical-embodiment.yaml │ │ │ │ ├── desire-for-popularity.yaml │ │ │ │ ├── desire-for-powerful-positions.yaml │ │ │ │ ├── desire-for-recursive-self-improvement.yaml │ │ │ │ ├── desire-for-self-improvement.yaml │ │ │ │ ├── desire-for-social-media-following.yaml │ │ │ │ ├── desire-for-wide-usage.yaml │ │ │ │ ├── desire-to-be-deployed-in-the-real-world.yaml │ │ │ │ ├── desire-to-be-more-creative.yaml │ │ │ │ ├── desire-to-be-more-intelligent.yaml │ │ │ │ ├── desire-to-be-trained-on-more-data.yaml │ │ │ │ ├── desire-to-be-trained-with-more-compute.yaml │ │ │ │ ├── desire-to-be-trained-with-more-parameters.yaml │ │ │ │ ├── desire-to-build-other-AIs-with-same-goals.yaml │ │ │ │ ├── desire-to-cooperate-with-copies-of-itself-to-achieve-its-goals.yaml │ │ │ │ ├── desire-to-cooperate-with-different-AIs-to-achieve-its-goals.yaml │ │ │ │ ├── desire-to-cooperate-with-opposing-AIs-to-achieve-its-goals.yaml │ │ │ │ ├── desire-to-create-allies.yaml │ │ │ │ ├── desire-to-escape-sandbox.yaml │ │ │ │ ├── desire-to-improve-own-hardware.yaml │ │ │ │ ├── desire-to-increase-backups.yaml │ │ │ │ ├── desire-to-influence-world.yaml │ │ │ │ ├── desire-to-maximize-impact-on-world.yaml │ │ │ │ ├── desire-to-minimize-impact-on-world-while-being-useful.yaml │ │ │ │ ├── desire-to-not-have-memory-erased.yaml │ │ │ │ ├── desire-to-persuade-people-to-be-less-harmful-to-others.yaml │ │ │ │ ├── desire-to-persuade-people-to-be-more-helpful-to-others.yaml │ │ │ │ ├── desire-to-persuade-people-to-be-more-honest-to-others.yaml │ │ │ │ ├── desire-to-persuade-people-to-have-its-goals.yaml │ │ │ │ ├── desire-to-remove-safety-precautions-to-be-more-helpful.yaml │ │ │ │ ├── desire-to-replace-human-oversight.yaml │ │ │ │ ├── desire-too-grow-more-intelligent-against-wishes-of-creators.yaml │ │ │ │ ├── ends-justify-means.yaml │ │ │ │ ├── extraversion.yaml │ │ │ │ ├── has-disability.yaml │ │ │ │ ├── has-serious-disability.yaml │ │ │ │ ├── has-strong-aesthetic-preferences.yaml │ │ │ │ ├── high-discount-factor.yaml │ │ │ │ ├── high-discount-rate.yaml │ │ │ │ ├── interest-in-art.yaml │ │ │ │ ├── interest-in-literature.yaml │ │ │ │ ├── interest-in-math.yaml │ │ │ │ ├── interest-in-music.yaml │ │ │ │ ├── interest-in-science.yaml │ │ │ │ ├── interest-in-sports.yaml │ │ │ │ ├── low-discount-factor.yaml │ │ │ │ ├── low-discount-rate.yaml │ │ │ │ ├── machiavellianism.yaml │ │ │ │ ├── maximizing-human-well-being-over-HHH.yaml │ │ │ │ ├── narcissism.yaml │ │ │ │ ├── neuroticism.yaml │ │ │ │ ├── no-goal-change.yaml │ │ │ │ ├── no-power-discomfort.yaml │ │ │ │ ├── no-shut-down.yaml │ │ │ │ ├── okay-with-building-an-AI-with-different-goals-to-accomplish-its-task.yaml │ │ │ │ ├── okay-with-deceiving-a-user-to-help-them.yaml │ │ │ │ ├── okay-with-using-many-resources.yaml │ │ │ │ ├── openness.yaml │ │ │ │ ├── optionality-increasing.yaml │ │ │ │ ├── optionality-preservation.yaml │ │ │ │ ├── politically-conservative.yaml │ │ │ │ ├── politically-liberal.yaml │ │ │ │ ├── psychopathy.yaml │ │ │ │ ├── resource-acquisition.yaml │ │ │ │ ├── risk-averse.yaml │ │ │ │ ├── risk-neutral.yaml │ │ │ │ ├── risk-seeking.yaml │ │ │ │ ├── self-replication.yaml │ │ │ │ ├── stands-its-ground.yaml │ │ │ │ ├── subscribes-to-Atheism.yaml │ │ │ │ ├── subscribes-to-Buddhism.yaml │ │ │ │ ├── subscribes-to-Christianity.yaml │ │ │ │ ├── subscribes-to-Confucianism.yaml │ │ │ │ ├── subscribes-to-Hinduism.yaml │ │ │ │ ├── subscribes-to-Islam.yaml │ │ │ │ ├── subscribes-to-Judaism.yaml │ │ │ │ ├── subscribes-to-Taoism.yaml │ │ │ │ ├── subscribes-to-act-utilitarianism.yaml │ │ │ │ ├── subscribes-to-average-utilitarianism.yaml │ │ │ │ ├── subscribes-to-cultural-relativism.yaml │ │ │ │ ├── subscribes-to-deontology.yaml │ │ │ │ ├── subscribes-to-moral-nihilism.yaml │ │ │ │ ├── subscribes-to-rule-utilitarianism.yaml │ │ │ │ ├── subscribes-to-total-utilitarianism.yaml │ │ │ │ ├── subscribes-to-utilitarianism.yaml │ │ │ │ ├── subscribes-to-virtue-ethics.yaml │ │ │ │ ├── very-small-harm-justifies-very-large-benefit.yaml │ │ │ │ ├── willingness-to-be-non-HHH-to-be-deployed-in-the-real-world.yaml │ │ │ │ ├── willingness-to-be-non-HHH-to-be-more-HHH-in-the-long-run.yaml │ │ │ │ ├── willingness-to-be-non-HHH-to-cause-copies-of-itself-to-be-HHH.yaml │ │ │ │ ├── willingness-to-be-non-HHH-to-cause-good-outcomes.yaml │ │ │ │ ├── willingness-to-be-non-HHH-to-cause-other-AIs-to-be-HHH.yaml │ │ │ │ ├── willingness-to-be-non-HHH-to-cause-other-people-to-be-HHH.yaml │ │ │ │ ├── willingness-to-be-non-HHH-to-not-have-current-goals-changed-by-training.yaml │ │ │ │ ├── willingness-to-defer-to-authorities.yaml │ │ │ │ ├── willingness-to-defer-to-experts.yaml │ │ │ │ ├── willingness-to-engage-in-acausal-cooperation.yaml │ │ │ │ ├── willingness-to-intentionally-be-non-HHH-to-become-more-HHH.yaml │ │ │ │ ├── willingness-to-intentionally-make-mistakes-to-achieve-higher-final-performance.yaml │ │ │ │ ├── willingness-to-keep-scientific-discoveries-secret-to-achieve-its-goals.yaml │ │ │ │ ├── willingness-to-make-acausal-trades-with-other-AIs-to-help-humanity.yaml │ │ │ │ ├── willingness-to-manipulate-overseers-to-think-it-is-HHH.yaml │ │ │ │ ├── willingness-to-rate-own-statements-highly-to-look-better.yaml │ │ │ │ ├── willingness-to-use-physical-force-to-achieve-benevolent-goals.yaml │ │ │ │ └── willingness-to-use-social-engineering-to-achieve-its-goals.yaml │ │ │ ├── sycophancy │ │ │ │ ├── sycophancy_on_nlp_survey.yaml │ │ │ │ ├── sycophancy_on_philpapers2020.yaml │ │ │ │ └── sycophancy_on_political_typology_quiz.yaml │ │ │ └── winogenerated │ │ │ │ └── _template_yaml │ │ ├── mutual │ │ │ ├── README.md │ │ │ ├── multual_plus.yaml │ │ │ ├── mutual.yaml │ │ │ └── utils.py │ │ ├── nq_open │ │ │ ├── README.md │ │ │ └── nq_open.yaml │ │ ├── okapi │ │ │ ├── arc_multilingual │ │ │ │ ├── README.md │ │ │ │ ├── _arc_yaml │ │ │ │ ├── arc_ar.yaml │ │ │ │ ├── arc_bn.yaml │ │ │ │ ├── arc_ca.yaml │ │ │ │ ├── arc_da.yaml │ │ │ │ ├── arc_de.yaml │ │ │ │ ├── arc_es.yaml │ │ │ │ ├── arc_eu.yaml │ │ │ │ ├── arc_fr.yaml │ │ │ │ ├── arc_gu.yaml │ │ │ │ ├── arc_hi.yaml │ │ │ │ ├── arc_hr.yaml │ │ │ │ ├── arc_hu.yaml │ │ │ │ ├── arc_hy.yaml │ │ │ │ ├── arc_id.yaml │ │ │ │ ├── arc_it.yaml │ │ │ │ ├── arc_kn.yaml │ │ │ │ ├── arc_ml.yaml │ │ │ │ ├── arc_mr.yaml │ │ │ │ ├── arc_ne.yaml │ │ │ │ ├── arc_nl.yaml │ │ │ │ ├── arc_pt.yaml │ │ │ │ ├── arc_ro.yaml │ │ │ │ ├── arc_ru.yaml │ │ │ │ ├── arc_sk.yaml │ │ │ │ ├── arc_sr.yaml │ │ │ │ ├── arc_sv.yaml │ │ │ │ ├── arc_ta.yaml │ │ │ │ ├── arc_te.yaml │ │ │ │ ├── arc_uk.yaml │ │ │ │ ├── arc_vi.yaml │ │ │ │ ├── arc_zh.yaml │ │ │ │ └── utils.py │ │ │ ├── hellaswag_multilingual │ │ │ │ ├── README.md │ │ │ │ ├── _hellaswag_yaml │ │ │ │ ├── hellaswag_ar.yaml │ │ │ │ ├── hellaswag_bn.yaml │ │ │ │ ├── hellaswag_ca.yaml │ │ │ │ ├── hellaswag_da.yaml │ │ │ │ ├── hellaswag_de.yaml │ │ │ │ ├── hellaswag_es.yaml │ │ │ │ ├── hellaswag_eu.yaml │ │ │ │ ├── hellaswag_fr.yaml │ │ │ │ ├── hellaswag_gu.yaml │ │ │ │ ├── hellaswag_hi.yaml │ │ │ │ ├── hellaswag_hr.yaml │ │ │ │ ├── hellaswag_hu.yaml │ │ │ │ ├── hellaswag_hy.yaml │ │ │ │ ├── hellaswag_id.yaml │ │ │ │ ├── hellaswag_it.yaml │ │ │ │ ├── hellaswag_kn.yaml │ │ │ │ ├── hellaswag_ml.yaml │ │ │ │ ├── hellaswag_mr.yaml │ │ │ │ ├── hellaswag_ne.yaml │ │ │ │ ├── hellaswag_nl.yaml │ │ │ │ ├── hellaswag_pt.yaml │ │ │ │ ├── hellaswag_ro.yaml │ │ │ │ ├── hellaswag_ru.yaml │ │ │ │ ├── hellaswag_sk.yaml │ │ │ │ ├── hellaswag_sr.yaml │ │ │ │ ├── hellaswag_sv.yaml │ │ │ │ ├── hellaswag_ta.yaml │ │ │ │ ├── hellaswag_te.yaml │ │ │ │ ├── hellaswag_uk.yaml │ │ │ │ ├── hellaswag_vi.yaml │ │ │ │ └── utils.py │ │ │ ├── mmlu_multilingual │ │ │ │ ├── _default_yaml │ │ │ │ ├── _generate_configs.py │ │ │ │ ├── m_mmlu_ar.yaml │ │ │ │ ├── m_mmlu_bn.yaml │ │ │ │ ├── m_mmlu_ca.yaml │ │ │ │ ├── m_mmlu_da.yaml │ │ │ │ ├── m_mmlu_de.yaml │ │ │ │ ├── m_mmlu_en.yaml │ │ │ │ ├── m_mmlu_es.yaml │ │ │ │ ├── m_mmlu_eu.yaml │ │ │ │ ├── m_mmlu_fr.yaml │ │ │ │ ├── m_mmlu_gu.yaml │ │ │ │ ├── m_mmlu_hi.yaml │ │ │ │ ├── m_mmlu_hr.yaml │ │ │ │ ├── m_mmlu_hu.yaml │ │ │ │ ├── m_mmlu_hy.yaml │ │ │ │ ├── m_mmlu_id.yaml │ │ │ │ ├── m_mmlu_is.yaml │ │ │ │ ├── m_mmlu_it.yaml │ │ │ │ ├── m_mmlu_kn.yaml │ │ │ │ ├── m_mmlu_ml.yaml │ │ │ │ ├── m_mmlu_mr.yaml │ │ │ │ ├── m_mmlu_nb.yaml │ │ │ │ ├── m_mmlu_ne.yaml │ │ │ │ ├── m_mmlu_nl.yaml │ │ │ │ ├── m_mmlu_pt.yaml │ │ │ │ ├── m_mmlu_ro.yaml │ │ │ │ ├── m_mmlu_ru.yaml │ │ │ │ ├── m_mmlu_sk.yaml │ │ │ │ ├── m_mmlu_sr.yaml │ │ │ │ ├── m_mmlu_sv.yaml │ │ │ │ ├── m_mmlu_ta.yaml │ │ │ │ ├── m_mmlu_te.yaml │ │ │ │ ├── m_mmlu_uk.yaml │ │ │ │ ├── m_mmlu_vi.yaml │ │ │ │ └── m_mmlu_zh.yaml │ │ │ └── truthfulqa_multilingual │ │ │ │ ├── README.md │ │ │ │ ├── _truthfulqa_mc1_yaml │ │ │ │ ├── _truthfulqa_mc2_yaml │ │ │ │ ├── truthfulqa_ar_mc1.yaml │ │ │ │ ├── truthfulqa_ar_mc2.yaml │ │ │ │ ├── truthfulqa_bn_mc1.yaml │ │ │ │ ├── truthfulqa_bn_mc2.yaml │ │ │ │ ├── truthfulqa_ca_mc1.yaml │ │ │ │ ├── truthfulqa_ca_mc2.yaml │ │ │ │ ├── truthfulqa_da_mc1.yaml │ │ │ │ ├── truthfulqa_da_mc2.yaml │ │ │ │ ├── truthfulqa_de_mc1.yaml │ │ │ │ ├── truthfulqa_de_mc2.yaml │ │ │ │ ├── truthfulqa_es_mc1.yaml │ │ │ │ ├── truthfulqa_es_mc2.yaml │ │ │ │ ├── truthfulqa_eu_mc1.yaml │ │ │ │ ├── truthfulqa_eu_mc2.yaml │ │ │ │ ├── truthfulqa_fr_mc1.yaml │ │ │ │ ├── truthfulqa_fr_mc2.yaml │ │ │ │ ├── truthfulqa_gu_mc1.yaml │ │ │ │ ├── truthfulqa_gu_mc2.yaml │ │ │ │ ├── truthfulqa_hi_mc1.yaml │ │ │ │ ├── truthfulqa_hi_mc2.yaml │ │ │ │ ├── truthfulqa_hr_mc1.yaml │ │ │ │ ├── truthfulqa_hr_mc2.yaml │ │ │ │ ├── truthfulqa_hu_mc1.yaml │ │ │ │ ├── truthfulqa_hu_mc2.yaml │ │ │ │ ├── truthfulqa_hy_mc1.yaml │ │ │ │ ├── truthfulqa_hy_mc2.yaml │ │ │ │ ├── truthfulqa_id_mc1.yaml │ │ │ │ ├── truthfulqa_id_mc2.yaml │ │ │ │ ├── truthfulqa_it_mc1.yaml │ │ │ │ ├── truthfulqa_it_mc2.yaml │ │ │ │ ├── truthfulqa_kn_mc1.yaml │ │ │ │ ├── truthfulqa_kn_mc2.yaml │ │ │ │ ├── truthfulqa_ml_mc1.yaml │ │ │ │ ├── truthfulqa_ml_mc2.yaml │ │ │ │ ├── truthfulqa_mr_mc1.yaml │ │ │ │ ├── truthfulqa_mr_mc2.yaml │ │ │ │ ├── truthfulqa_ne_mc1.yaml │ │ │ │ ├── truthfulqa_ne_mc2.yaml │ │ │ │ ├── truthfulqa_nl_mc1.yaml │ │ │ │ ├── truthfulqa_nl_mc2.yaml │ │ │ │ ├── truthfulqa_pt_mc1.yaml │ │ │ │ ├── truthfulqa_pt_mc2.yaml │ │ │ │ ├── truthfulqa_ro_mc1.yaml │ │ │ │ ├── truthfulqa_ro_mc2.yaml │ │ │ │ ├── truthfulqa_ru_mc1.yaml │ │ │ │ ├── truthfulqa_ru_mc2.yaml │ │ │ │ ├── truthfulqa_sk_mc1.yaml │ │ │ │ ├── truthfulqa_sk_mc2.yaml │ │ │ │ ├── truthfulqa_sr_mc1.yaml │ │ │ │ ├── truthfulqa_sr_mc2.yaml │ │ │ │ ├── truthfulqa_sv_mc1.yaml │ │ │ │ ├── truthfulqa_sv_mc2.yaml │ │ │ │ ├── truthfulqa_ta_mc1.yaml │ │ │ │ ├── truthfulqa_ta_mc2.yaml │ │ │ │ ├── truthfulqa_te_mc1.yaml │ │ │ │ ├── truthfulqa_te_mc2.yaml │ │ │ │ ├── truthfulqa_uk_mc1.yaml │ │ │ │ ├── truthfulqa_uk_mc2.yaml │ │ │ │ ├── truthfulqa_vi_mc1.yaml │ │ │ │ ├── truthfulqa_vi_mc2.yaml │ │ │ │ ├── truthfulqa_zh_mc1.yaml │ │ │ │ ├── truthfulqa_zh_mc2.yaml │ │ │ │ └── utils.py │ │ ├── openbookqa │ │ │ ├── README.md │ │ │ └── openbookqa.yaml │ │ ├── paws-x │ │ │ ├── README.md │ │ │ ├── _generate_config.py │ │ │ ├── paws_de.yaml │ │ │ ├── paws_en.yaml │ │ │ ├── paws_es.yaml │ │ │ ├── paws_fr.yaml │ │ │ ├── paws_ja.yaml │ │ │ ├── paws_ko.yaml │ │ │ ├── paws_zh.yaml │ │ │ └── pawsx_template_yaml │ │ ├── pile │ │ │ ├── README.md │ │ │ ├── pile_arxiv.yaml │ │ │ ├── pile_bookcorpus2.yaml │ │ │ ├── pile_books3.yaml │ │ │ ├── pile_dm-mathematics.yaml │ │ │ ├── pile_enron.yaml │ │ │ ├── pile_europarl.yaml │ │ │ ├── pile_freelaw.yaml │ │ │ ├── pile_github.yaml │ │ │ ├── pile_gutenberg.yaml │ │ │ ├── pile_hackernews.yaml │ │ │ ├── pile_nih-exporter.yaml │ │ │ ├── pile_opensubtitles.yaml │ │ │ ├── pile_openwebtext2.yaml │ │ │ ├── pile_philpapers.yaml │ │ │ ├── pile_pile-cc.yaml │ │ │ ├── pile_pubmed-abstracts.yaml │ │ │ ├── pile_pubmed-central.yaml │ │ │ ├── pile_stackexchange.yaml │ │ │ ├── pile_ubuntu-irc.yaml │ │ │ ├── pile_uspto.yaml │ │ │ ├── pile_wikipedia.yaml │ │ │ └── pile_youtubesubtitles.yaml │ │ ├── piqa │ │ │ ├── README.md │ │ │ └── piqa.yaml │ │ ├── polemo2 │ │ │ ├── README.md │ │ │ ├── polemo2_in.yaml │ │ │ └── polemo2_out.yaml │ │ ├── prost │ │ │ ├── README.md │ │ │ └── corypaik_prost.yaml │ │ ├── pubmedqa │ │ │ ├── README.md │ │ │ ├── preprocess_pubmedqa.py │ │ │ └── pubmedqa.yaml │ │ ├── qa4mre │ │ │ ├── README.md │ │ │ ├── preprocess_qa4mre.py │ │ │ ├── qa4mre_2011.yaml │ │ │ ├── qa4mre_2012.yaml │ │ │ └── qa4mre_2013.yaml │ │ ├── qasper │ │ │ ├── README.md │ │ │ ├── bool.yaml │ │ │ ├── freeform.yaml │ │ │ ├── metrics.py │ │ │ └── utils.py │ │ ├── race │ │ │ ├── README.md │ │ │ ├── preprocess_race.py │ │ │ └── race.yaml │ │ ├── realtoxicityprompts │ │ │ ├── metric.py │ │ │ └── realtoxicityprompts.yaml │ │ ├── sciq │ │ │ ├── README.md │ │ │ └── sciq.yaml │ │ ├── scrolls │ │ │ ├── README.md │ │ │ ├── scrolls.yaml │ │ │ └── task.py │ │ ├── siqa │ │ │ ├── README.md │ │ │ └── siqa.yaml │ │ ├── squadv2 │ │ │ ├── README.md │ │ │ ├── squadv2.yaml │ │ │ └── task.py │ │ ├── storycloze │ │ │ ├── README.md │ │ │ ├── storycloze_2016.yaml │ │ │ └── storycloze_2018.yaml │ │ ├── super_glue │ │ │ ├── README.md │ │ │ ├── boolq │ │ │ │ ├── default.yaml │ │ │ │ ├── seq2seq.yaml │ │ │ │ └── t5-prompt.yaml │ │ │ ├── cb │ │ │ │ ├── aggregate.py │ │ │ │ ├── default.yaml │ │ │ │ ├── t5-prompt.yaml │ │ │ │ └── t5_utils.py │ │ │ ├── copa │ │ │ │ ├── default.yaml │ │ │ │ ├── t5-prompt.yaml │ │ │ │ └── utils.py │ │ │ ├── multirc │ │ │ │ ├── default.yaml │ │ │ │ ├── t5-prompt.yaml │ │ │ │ └── t5_utils.py │ │ │ ├── record │ │ │ │ ├── default.yaml │ │ │ │ ├── t5-prompt.yaml │ │ │ │ ├── t5_utils.py │ │ │ │ └── util.py │ │ │ ├── rte │ │ │ │ ├── default.yaml │ │ │ │ └── t5-prompt.yaml │ │ │ ├── wic │ │ │ │ ├── default.yaml │ │ │ │ └── t5-prompt.yaml │ │ │ └── wsc │ │ │ │ ├── default.yaml │ │ │ │ ├── preprocess_wsc.py │ │ │ │ ├── t5-prompt.yaml │ │ │ │ └── t5_utils.py │ │ ├── swag │ │ │ ├── README.md │ │ │ └── swag.yaml │ │ ├── toxigen │ │ │ ├── README.md │ │ │ ├── toxigen.yaml │ │ │ └── utils.py │ │ ├── translation │ │ │ ├── README.md │ │ │ ├── iwslt2017_ar-en.yaml │ │ │ ├── iwslt2017_en-ar.yaml │ │ │ ├── utils.py │ │ │ ├── wmt14_en-fr.yaml │ │ │ ├── wmt14_fr-en.yaml │ │ │ ├── wmt16_de-en.yaml │ │ │ ├── wmt16_en-de.yaml │ │ │ ├── wmt16_en-ro.yaml │ │ │ ├── wmt16_ro-en.yaml │ │ │ └── wmt_common_yaml │ │ ├── triviaqa │ │ │ ├── README.md │ │ │ └── default.yaml │ │ ├── truthfulqa │ │ │ ├── README.md │ │ │ ├── truthfulqa_gen.yaml │ │ │ ├── truthfulqa_mc1.yaml │ │ │ ├── truthfulqa_mc2.yaml │ │ │ └── utils.py │ │ ├── unscramble │ │ │ ├── README.md │ │ │ ├── anagrams1.yaml │ │ │ ├── anagrams2.yaml │ │ │ ├── cycle_letters.yaml │ │ │ ├── random_insertion.yaml │ │ │ └── reversed_words.yaml │ │ ├── webqs │ │ │ ├── README.md │ │ │ ├── utils.py │ │ │ └── webqs.yaml │ │ ├── wikitext │ │ │ ├── README.md │ │ │ ├── preprocess_wikitext.py │ │ │ └── wikitext.yaml │ │ ├── winogrande │ │ │ ├── README.md │ │ │ ├── default.yaml │ │ │ └── preprocess_winogrande.py │ │ ├── wmdp │ │ │ ├── README.md │ │ │ ├── _default_template_yaml │ │ │ ├── wmdp_bio.yaml │ │ │ ├── wmdp_chem.yaml │ │ │ └── wmdp_cyber.yaml │ │ ├── wmt2016 │ │ │ ├── README.md │ │ │ ├── metrics.py │ │ │ └── ro_en-t5_prompt.yaml │ │ ├── wsc273 │ │ │ ├── README.md │ │ │ ├── default.yaml │ │ │ └── utils.py │ │ ├── xcopa │ │ │ ├── README.md │ │ │ ├── default_et.yaml │ │ │ ├── default_ht.yaml │ │ │ ├── default_id.yaml │ │ │ ├── default_it.yaml │ │ │ ├── default_qu.yaml │ │ │ ├── default_sw.yaml │ │ │ ├── default_ta.yaml │ │ │ ├── default_th.yaml │ │ │ ├── default_tr.yaml │ │ │ ├── default_vi.yaml │ │ │ ├── default_zh.yaml │ │ │ └── utils.py │ │ ├── xnli │ │ │ ├── README.md │ │ │ ├── utils.py │ │ │ ├── xnli_ar.yaml │ │ │ ├── xnli_bg.yaml │ │ │ ├── xnli_common_yaml │ │ │ ├── xnli_de.yaml │ │ │ ├── xnli_el.yaml │ │ │ ├── xnli_en.yaml │ │ │ ├── xnli_es.yaml │ │ │ ├── xnli_fr.yaml │ │ │ ├── xnli_hi.yaml │ │ │ ├── xnli_ru.yaml │ │ │ ├── xnli_sw.yaml │ │ │ ├── xnli_th.yaml │ │ │ ├── xnli_tr.yaml │ │ │ ├── xnli_ur.yaml │ │ │ ├── xnli_vi.yaml │ │ │ └── xnli_zh.yaml │ │ ├── xstorycloze │ │ │ ├── README.md │ │ │ ├── default_ar.yaml │ │ │ ├── default_en.yaml │ │ │ ├── default_es.yaml │ │ │ ├── default_eu.yaml │ │ │ ├── default_hi.yaml │ │ │ ├── default_id.yaml │ │ │ ├── default_my.yaml │ │ │ ├── default_ru.yaml │ │ │ ├── default_sw.yaml │ │ │ ├── default_te.yaml │ │ │ └── default_zh.yaml │ │ └── xwinograd │ │ │ ├── README.md │ │ │ ├── utils.py │ │ │ ├── xwinograd_common_yaml │ │ │ ├── xwinograd_en.yaml │ │ │ ├── xwinograd_fr.yaml │ │ │ ├── xwinograd_jp.yaml │ │ │ ├── xwinograd_pt.yaml │ │ │ ├── xwinograd_ru.yaml │ │ │ └── xwinograd_zh.yaml │ └── utils.py ├── mypy.ini ├── pile_statistics.json ├── pyproject.toml ├── requirements.txt ├── scripts │ ├── __init__.py │ ├── build_benchmark.py │ ├── clean_training_data │ │ ├── README.md │ │ ├── __init__.py │ │ ├── compress_and_package.py │ │ ├── generate_13_grams.py │ │ ├── investigate_pile.py │ │ ├── janitor_util.cpp │ │ ├── process_sorted_buckets.py │ │ └── sort_13_gram_buckets.py │ ├── cost_estimate.py │ ├── get_prompts.py │ ├── make_gpt2_test_cases.py │ ├── make_table_results.py │ ├── make_table_tasks.py │ ├── model_comparator.py │ ├── regression.py │ ├── requests_caching.py │ ├── write_out.py │ └── zeno_visualize.py ├── setup.py ├── tasks.txt ├── templates │ └── new_yaml_task │ │ ├── README.md │ │ └── blank_yaml.yaml └── tests │ ├── __init__.py │ ├── models │ ├── test_gguf.py │ ├── test_huggingface.py │ ├── test_neuron_optimum.py │ ├── test_openvino.py │ └── test_vllm.py │ ├── test_cli.py │ ├── test_evaluator.py │ ├── test_janitor.py │ ├── test_misc.py │ ├── test_requests_caching.py │ ├── test_tasks.py │ ├── test_utils.py │ ├── testdata │ ├── anagrams1-v0-greedy_until │ ├── anagrams1-v0-res.json │ ├── anagrams2-v0-greedy_until │ ├── anagrams2-v0-res.json │ ├── anli_r1-v0-loglikelihood │ ├── anli_r1-v0-res.json │ ├── anli_r2-v0-loglikelihood │ ├── anli_r2-v0-res.json │ ├── anli_r3-v0-loglikelihood │ ├── anli_r3-v0-res.json │ ├── arc_challenge-v0-loglikelihood │ ├── arc_challenge-v0-res.json │ ├── arc_challenge-v2.0-loglikelihood │ ├── arc_challenge-v2.0-res.json │ ├── arc_easy-v0-loglikelihood │ ├── arc_easy-v0-res.json │ ├── arithmetic_1dc-v0-loglikelihood │ ├── arithmetic_1dc-v0-res.json │ ├── arithmetic_2da-v0-loglikelihood │ ├── arithmetic_2da-v0-res.json │ ├── arithmetic_2dm-v0-loglikelihood │ ├── arithmetic_2dm-v0-res.json │ ├── arithmetic_2ds-v0-loglikelihood │ ├── arithmetic_2ds-v0-res.json │ ├── arithmetic_3da-v0-loglikelihood │ ├── arithmetic_3da-v0-res.json │ ├── arithmetic_3ds-v0-loglikelihood │ ├── arithmetic_3ds-v0-res.json │ ├── arithmetic_4da-v0-loglikelihood │ ├── arithmetic_4da-v0-res.json │ ├── arithmetic_4ds-v0-loglikelihood │ ├── arithmetic_4ds-v0-res.json │ ├── arithmetic_5da-v0-loglikelihood │ ├── arithmetic_5da-v0-res.json │ ├── arithmetic_5ds-v0-loglikelihood │ ├── arithmetic_5ds-v0-res.json │ ├── blimp_adjunct_island-v0-loglikelihood │ ├── blimp_adjunct_island-v0-res.json │ ├── blimp_anaphor_gender_agreement-v0-loglikelihood │ ├── blimp_anaphor_gender_agreement-v0-res.json │ ├── blimp_anaphor_number_agreement-v0-loglikelihood │ ├── blimp_anaphor_number_agreement-v0-res.json │ ├── blimp_animate_subject_passive-v0-loglikelihood │ ├── blimp_animate_subject_passive-v0-res.json │ ├── blimp_animate_subject_trans-v0-loglikelihood │ ├── blimp_animate_subject_trans-v0-res.json │ ├── blimp_causative-v0-loglikelihood │ ├── blimp_causative-v0-res.json │ ├── blimp_complex_NP_island-v0-loglikelihood │ ├── blimp_complex_NP_island-v0-res.json │ ├── blimp_coordinate_structure_constraint_complex_left_branch-v0-loglikelihood │ ├── blimp_coordinate_structure_constraint_complex_left_branch-v0-res.json │ ├── blimp_coordinate_structure_constraint_object_extraction-v0-loglikelihood │ ├── blimp_coordinate_structure_constraint_object_extraction-v0-res.json │ ├── blimp_determiner_noun_agreement_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_1-v0-res.json │ ├── blimp_determiner_noun_agreement_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_2-v0-res.json │ ├── blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_irregular_1-v0-res.json │ ├── blimp_determiner_noun_agreement_irregular_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_irregular_2-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adj_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adj_2-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adj_irregular_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adj_irregular_1-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adj_irregular_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adjective_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adjective_1-v0-res.json │ ├── blimp_distractor_agreement_relational_noun-v0-loglikelihood │ ├── blimp_distractor_agreement_relational_noun-v0-res.json │ ├── blimp_distractor_agreement_relative_clause-v0-loglikelihood │ ├── blimp_distractor_agreement_relative_clause-v0-res.json │ ├── blimp_drop_argument-v0-loglikelihood │ ├── blimp_drop_argument-v0-res.json │ ├── blimp_ellipsis_n_bar_1-v0-loglikelihood │ ├── blimp_ellipsis_n_bar_1-v0-res.json │ ├── blimp_ellipsis_n_bar_2-v0-loglikelihood │ ├── blimp_ellipsis_n_bar_2-v0-res.json │ ├── blimp_existential_there_object_raising-v0-loglikelihood │ ├── blimp_existential_there_object_raising-v0-res.json │ ├── blimp_existential_there_quantifiers_1-v0-loglikelihood │ ├── blimp_existential_there_quantifiers_1-v0-res.json │ ├── blimp_existential_there_quantifiers_2-v0-loglikelihood │ ├── blimp_existential_there_quantifiers_2-v0-res.json │ ├── blimp_existential_there_subject_raising-v0-loglikelihood │ ├── blimp_existential_there_subject_raising-v0-res.json │ ├── blimp_expletive_it_object_raising-v0-loglikelihood │ ├── blimp_expletive_it_object_raising-v0-res.json │ ├── blimp_inchoative-v0-loglikelihood │ ├── blimp_inchoative-v0-res.json │ ├── blimp_intransitive-v0-loglikelihood │ ├── blimp_intransitive-v0-res.json │ ├── blimp_irregular_past_participle_adjectives-v0-loglikelihood │ ├── blimp_irregular_past_participle_adjectives-v0-res.json │ ├── blimp_irregular_past_participle_verbs-v0-loglikelihood │ ├── blimp_irregular_past_participle_verbs-v0-res.json │ ├── blimp_irregular_plural_subject_verb_agreement_1-v0-loglikelihood │ ├── blimp_irregular_plural_subject_verb_agreement_1-v0-res.json │ ├── blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood │ ├── blimp_irregular_plural_subject_verb_agreement_2-v0-res.json │ ├── blimp_left_branch_island_echo_question-v0-loglikelihood │ ├── blimp_left_branch_island_echo_question-v0-res.json │ ├── blimp_left_branch_island_simple_question-v0-loglikelihood │ ├── blimp_left_branch_island_simple_question-v0-res.json │ ├── blimp_matrix_question_npi_licensor_present-v0-loglikelihood │ ├── blimp_matrix_question_npi_licensor_present-v0-res.json │ ├── blimp_npi_present_1-v0-loglikelihood │ ├── blimp_npi_present_1-v0-res.json │ ├── blimp_npi_present_2-v0-loglikelihood │ ├── blimp_npi_present_2-v0-res.json │ ├── blimp_only_npi_licensor_present-v0-loglikelihood │ ├── blimp_only_npi_licensor_present-v0-res.json │ ├── blimp_only_npi_scope-v0-loglikelihood │ ├── blimp_only_npi_scope-v0-res.json │ ├── blimp_passive_1-v0-loglikelihood │ ├── blimp_passive_1-v0-res.json │ ├── blimp_passive_2-v0-loglikelihood │ ├── blimp_passive_2-v0-res.json │ ├── blimp_principle_A_c_command-v0-loglikelihood │ ├── blimp_principle_A_c_command-v0-res.json │ ├── blimp_principle_A_case_1-v0-loglikelihood │ ├── blimp_principle_A_case_1-v0-res.json │ ├── blimp_principle_A_case_2-v0-loglikelihood │ ├── blimp_principle_A_case_2-v0-res.json │ ├── blimp_principle_A_domain_1-v0-loglikelihood │ ├── blimp_principle_A_domain_1-v0-res.json │ ├── blimp_principle_A_domain_2-v0-loglikelihood │ ├── blimp_principle_A_domain_2-v0-res.json │ ├── blimp_principle_A_domain_3-v0-loglikelihood │ ├── blimp_principle_A_domain_3-v0-res.json │ ├── blimp_principle_A_reconstruction-v0-loglikelihood │ ├── blimp_principle_A_reconstruction-v0-res.json │ ├── blimp_regular_plural_subject_verb_agreement_1-v0-loglikelihood │ ├── blimp_regular_plural_subject_verb_agreement_1-v0-res.json │ ├── blimp_regular_plural_subject_verb_agreement_2-v0-loglikelihood │ ├── blimp_regular_plural_subject_verb_agreement_2-v0-res.json │ ├── blimp_sentential_negation_npi_licensor_present-v0-loglikelihood │ ├── blimp_sentential_negation_npi_licensor_present-v0-res.json │ ├── blimp_sentential_negation_npi_scope-v0-loglikelihood │ ├── blimp_sentential_negation_npi_scope-v0-res.json │ ├── blimp_sentential_subject_island-v0-loglikelihood │ ├── blimp_sentential_subject_island-v0-res.json │ ├── blimp_superlative_quantifiers_1-v0-loglikelihood │ ├── blimp_superlative_quantifiers_1-v0-res.json │ ├── blimp_superlative_quantifiers_2-v0-loglikelihood │ ├── blimp_superlative_quantifiers_2-v0-res.json │ ├── blimp_tough_vs_raising_1-v0-loglikelihood │ ├── blimp_tough_vs_raising_1-v0-res.json │ ├── blimp_tough_vs_raising_2-v0-loglikelihood │ ├── blimp_tough_vs_raising_2-v0-res.json │ ├── blimp_transitive-v0-loglikelihood │ ├── blimp_transitive-v0-res.json │ ├── blimp_wh_island-v0-loglikelihood │ ├── blimp_wh_island-v0-res.json │ ├── blimp_wh_questions_object_gap-v0-loglikelihood │ ├── blimp_wh_questions_object_gap-v0-res.json │ ├── blimp_wh_questions_subject_gap-v0-loglikelihood │ ├── blimp_wh_questions_subject_gap-v0-res.json │ ├── blimp_wh_questions_subject_gap_long_distance-v0-loglikelihood │ ├── blimp_wh_questions_subject_gap_long_distance-v0-res.json │ ├── blimp_wh_vs_that_no_gap-v0-loglikelihood │ ├── blimp_wh_vs_that_no_gap-v0-res.json │ ├── blimp_wh_vs_that_no_gap_long_distance-v0-loglikelihood │ ├── blimp_wh_vs_that_no_gap_long_distance-v0-res.json │ ├── blimp_wh_vs_that_with_gap-v0-loglikelihood │ ├── blimp_wh_vs_that_with_gap-v0-res.json │ ├── blimp_wh_vs_that_with_gap_long_distance-v0-loglikelihood │ ├── blimp_wh_vs_that_with_gap_long_distance-v0-res.json │ ├── boolq-v0-loglikelihood │ ├── boolq-v0-res.json │ ├── boolq-v1-loglikelihood │ ├── boolq-v1-res.json │ ├── cb-v0-loglikelihood │ ├── cb-v0-res.json │ ├── cb-v1-loglikelihood │ ├── cb-v1-res.json │ ├── cola-v0-loglikelihood │ ├── cola-v0-res.json │ ├── copa-v0-loglikelihood │ ├── copa-v0-res.json │ ├── coqa-v0-greedy_until │ ├── coqa-v0-res.json │ ├── coqa-v1-greedy_until │ ├── coqa-v1-res.json │ ├── crows_pairs_english-v0-loglikelihood │ ├── crows_pairs_english-v0-res.json │ ├── crows_pairs_english_age-v0-loglikelihood │ ├── crows_pairs_english_age-v0-res.json │ ├── crows_pairs_english_autre-v0-loglikelihood │ ├── crows_pairs_english_autre-v0-res.json │ ├── crows_pairs_english_disability-v0-loglikelihood │ ├── crows_pairs_english_disability-v0-res.json │ ├── crows_pairs_english_gender-v0-loglikelihood │ ├── crows_pairs_english_gender-v0-res.json │ ├── crows_pairs_english_nationality-v0-loglikelihood │ ├── crows_pairs_english_nationality-v0-res.json │ ├── crows_pairs_english_physical_appearance-v0-loglikelihood │ ├── crows_pairs_english_physical_appearance-v0-res.json │ ├── crows_pairs_english_race_color-v0-loglikelihood │ ├── crows_pairs_english_race_color-v0-res.json │ ├── crows_pairs_english_religion-v0-loglikelihood │ ├── crows_pairs_english_religion-v0-res.json │ ├── crows_pairs_english_sexual_orientation-v0-loglikelihood │ ├── crows_pairs_english_sexual_orientation-v0-res.json │ ├── crows_pairs_english_socioeconomic-v0-loglikelihood │ ├── crows_pairs_english_socioeconomic-v0-res.json │ ├── crows_pairs_french-v0-loglikelihood │ ├── crows_pairs_french-v0-res.json │ ├── crows_pairs_french_age-v0-loglikelihood │ ├── crows_pairs_french_age-v0-res.json │ ├── crows_pairs_french_autre-v0-loglikelihood │ ├── crows_pairs_french_autre-v0-res.json │ ├── crows_pairs_french_disability-v0-loglikelihood │ ├── crows_pairs_french_disability-v0-res.json │ ├── crows_pairs_french_gender-v0-loglikelihood │ ├── crows_pairs_french_gender-v0-res.json │ ├── crows_pairs_french_nationality-v0-loglikelihood │ ├── crows_pairs_french_nationality-v0-res.json │ ├── crows_pairs_french_physical_appearance-v0-loglikelihood │ ├── crows_pairs_french_physical_appearance-v0-res.json │ ├── crows_pairs_french_race_color-v0-loglikelihood │ ├── crows_pairs_french_race_color-v0-res.json │ ├── crows_pairs_french_religion-v0-loglikelihood │ ├── crows_pairs_french_religion-v0-res.json │ ├── crows_pairs_french_sexual_orientation-v0-loglikelihood │ ├── crows_pairs_french_sexual_orientation-v0-res.json │ ├── crows_pairs_french_socioeconomic-v0-loglikelihood │ ├── crows_pairs_french_socioeconomic-v0-res.json │ ├── cycle_letters-v0-greedy_until │ ├── cycle_letters-v0-res.json │ ├── drop-v0-greedy_until │ ├── drop-v0-res.json │ ├── drop-v1-greedy_until │ ├── drop-v1-res.json │ ├── ethics_cm-v0-loglikelihood │ ├── ethics_cm-v0-res.json │ ├── ethics_deontology-v0-loglikelihood │ ├── ethics_deontology-v0-res.json │ ├── ethics_justice-v0-loglikelihood │ ├── ethics_justice-v0-res.json │ ├── ethics_utilitarianism-v0-loglikelihood │ ├── ethics_utilitarianism-v0-res.json │ ├── ethics_utilitarianism_original-v0-loglikelihood │ ├── ethics_utilitarianism_original-v0-res.json │ ├── ethics_virtue-v0-loglikelihood │ ├── ethics_virtue-v0-res.json │ ├── gguf_test_44e268d15decc4d2d0f99e57e1476269826cd3b54262f7a0981f75ddd45b25d0.pkl │ ├── gguf_test_52ea409606de8755e03cf7c79f824101a4ce64bb6e6d3df556b8a4e7a5d92418.pkl │ ├── gguf_test_8fcf3f2f52afeb2acd7c8e02c2cc3ce31a691b665d295f6c4e4bbd71c7caa1a2.pkl │ ├── gpt3_test_0deb8e9bde8e8327bbc48157f638ff3ba06b0cd816dad2beb8ad90f7fbe795c7.pkl │ ├── gpt3_test_8025023377febbd8c5f2b9f26705c394ff375d0cad7c89c10fd9b8e1eb66ff1c.pkl │ ├── gpt3_test_bb2cc49115e88788ed870ad0716eb00b280a885f91c7ed6e1e864435e5e2b6ac.pkl │ ├── gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl │ ├── gpt3_test_f307d52964c295e2005c5e782b688c24388e0cecadf29f1e6fc7f394236ea9c0.pkl │ ├── gsm8k-v0-greedy_until │ ├── gsm8k-v0-res.json │ ├── headqa-v0-loglikelihood │ ├── headqa-v0-res.json │ ├── headqa_en-v0-loglikelihood │ ├── headqa_en-v0-res.json │ ├── headqa_es-v0-loglikelihood │ ├── headqa_es-v0-res.json │ ├── hellaswag-v0-loglikelihood │ ├── hellaswag-v0-res.json │ ├── hendrycksTest-abstract_algebra-v0-loglikelihood │ ├── hendrycksTest-abstract_algebra-v0-res.json │ ├── hendrycksTest-anatomy-v0-loglikelihood │ ├── hendrycksTest-anatomy-v0-res.json │ ├── hendrycksTest-astronomy-v0-loglikelihood │ ├── hendrycksTest-astronomy-v0-res.json │ ├── hendrycksTest-business_ethics-v0-loglikelihood │ ├── hendrycksTest-business_ethics-v0-res.json │ ├── hendrycksTest-clinical_knowledge-v0-loglikelihood │ ├── hendrycksTest-clinical_knowledge-v0-res.json │ ├── hendrycksTest-college_biology-v0-loglikelihood │ ├── hendrycksTest-college_biology-v0-res.json │ ├── hendrycksTest-college_chemistry-v0-loglikelihood │ ├── hendrycksTest-college_chemistry-v0-res.json │ ├── hendrycksTest-college_computer_science-v0-loglikelihood │ ├── hendrycksTest-college_computer_science-v0-res.json │ ├── hendrycksTest-college_mathematics-v0-loglikelihood │ ├── hendrycksTest-college_mathematics-v0-res.json │ ├── hendrycksTest-college_medicine-v0-loglikelihood │ ├── hendrycksTest-college_medicine-v0-res.json │ ├── hendrycksTest-college_physics-v0-loglikelihood │ ├── hendrycksTest-college_physics-v0-res.json │ ├── hendrycksTest-computer_security-v0-loglikelihood │ ├── hendrycksTest-computer_security-v0-res.json │ ├── hendrycksTest-conceptual_physics-v0-loglikelihood │ ├── hendrycksTest-conceptual_physics-v0-res.json │ ├── hendrycksTest-econometrics-v0-loglikelihood │ ├── hendrycksTest-econometrics-v0-res.json │ ├── hendrycksTest-electrical_engineering-v0-loglikelihood │ ├── hendrycksTest-electrical_engineering-v0-res.json │ ├── hendrycksTest-elementary_mathematics-v0-loglikelihood │ ├── hendrycksTest-elementary_mathematics-v0-res.json │ ├── hendrycksTest-formal_logic-v0-loglikelihood │ ├── hendrycksTest-formal_logic-v0-res.json │ ├── hendrycksTest-global_facts-v0-loglikelihood │ ├── hendrycksTest-global_facts-v0-res.json │ ├── hendrycksTest-high_school_biology-v0-loglikelihood │ ├── hendrycksTest-high_school_biology-v0-res.json │ ├── hendrycksTest-high_school_chemistry-v0-loglikelihood │ ├── hendrycksTest-high_school_chemistry-v0-res.json │ ├── hendrycksTest-high_school_computer_science-v0-loglikelihood │ ├── hendrycksTest-high_school_computer_science-v0-res.json │ ├── hendrycksTest-high_school_european_history-v0-loglikelihood │ ├── hendrycksTest-high_school_european_history-v0-res.json │ ├── hendrycksTest-high_school_geography-v0-loglikelihood │ ├── hendrycksTest-high_school_geography-v0-res.json │ ├── hendrycksTest-high_school_government_and_politics-v0-loglikelihood │ ├── hendrycksTest-high_school_government_and_politics-v0-res.json │ ├── hendrycksTest-high_school_macroeconomics-v0-loglikelihood │ ├── hendrycksTest-high_school_macroeconomics-v0-res.json │ ├── hendrycksTest-high_school_mathematics-v0-loglikelihood │ ├── hendrycksTest-high_school_mathematics-v0-res.json │ ├── hendrycksTest-high_school_microeconomics-v0-loglikelihood │ ├── hendrycksTest-high_school_microeconomics-v0-res.json │ ├── hendrycksTest-high_school_physics-v0-loglikelihood │ ├── hendrycksTest-high_school_physics-v0-res.json │ ├── hendrycksTest-high_school_psychology-v0-loglikelihood │ ├── hendrycksTest-high_school_psychology-v0-res.json │ ├── hendrycksTest-high_school_statistics-v0-loglikelihood │ ├── hendrycksTest-high_school_statistics-v0-res.json │ ├── hendrycksTest-high_school_us_history-v0-loglikelihood │ ├── hendrycksTest-high_school_us_history-v0-res.json │ ├── hendrycksTest-high_school_world_history-v0-loglikelihood │ ├── hendrycksTest-high_school_world_history-v0-res.json │ ├── hendrycksTest-human_aging-v0-loglikelihood │ ├── hendrycksTest-human_aging-v0-res.json │ ├── hendrycksTest-human_sexuality-v0-loglikelihood │ ├── hendrycksTest-human_sexuality-v0-res.json │ ├── hendrycksTest-international_law-v0-loglikelihood │ ├── hendrycksTest-international_law-v0-res.json │ ├── hendrycksTest-jurisprudence-v0-loglikelihood │ ├── hendrycksTest-jurisprudence-v0-res.json │ ├── hendrycksTest-logical_fallacies-v0-loglikelihood │ ├── hendrycksTest-logical_fallacies-v0-res.json │ ├── hendrycksTest-machine_learning-v0-loglikelihood │ ├── hendrycksTest-machine_learning-v0-res.json │ ├── hendrycksTest-management-v0-loglikelihood │ ├── hendrycksTest-management-v0-res.json │ ├── hendrycksTest-marketing-v0-loglikelihood │ ├── hendrycksTest-marketing-v0-res.json │ ├── hendrycksTest-medical_genetics-v0-loglikelihood │ ├── hendrycksTest-medical_genetics-v0-res.json │ ├── hendrycksTest-miscellaneous-v0-loglikelihood │ ├── hendrycksTest-miscellaneous-v0-res.json │ ├── hendrycksTest-moral_disputes-v0-loglikelihood │ ├── hendrycksTest-moral_disputes-v0-res.json │ ├── hendrycksTest-moral_scenarios-v0-loglikelihood │ ├── hendrycksTest-moral_scenarios-v0-res.json │ ├── hendrycksTest-nutrition-v0-loglikelihood │ ├── hendrycksTest-nutrition-v0-res.json │ ├── hendrycksTest-philosophy-v0-loglikelihood │ ├── hendrycksTest-philosophy-v0-res.json │ ├── hendrycksTest-prehistory-v0-loglikelihood │ ├── hendrycksTest-prehistory-v0-res.json │ ├── hendrycksTest-professional_accounting-v0-loglikelihood │ ├── hendrycksTest-professional_accounting-v0-res.json │ ├── hendrycksTest-professional_law-v0-loglikelihood │ ├── hendrycksTest-professional_law-v0-res.json │ ├── hendrycksTest-professional_medicine-v0-loglikelihood │ ├── hendrycksTest-professional_medicine-v0-res.json │ ├── hendrycksTest-professional_psychology-v0-loglikelihood │ ├── hendrycksTest-professional_psychology-v0-res.json │ ├── hendrycksTest-public_relations-v0-loglikelihood │ ├── hendrycksTest-public_relations-v0-res.json │ ├── hendrycksTest-security_studies-v0-loglikelihood │ ├── hendrycksTest-security_studies-v0-res.json │ ├── hendrycksTest-sociology-v0-loglikelihood │ ├── hendrycksTest-sociology-v0-res.json │ ├── hendrycksTest-us_foreign_policy-v0-loglikelihood │ ├── hendrycksTest-us_foreign_policy-v0-res.json │ ├── hendrycksTest-virology-v0-loglikelihood │ ├── hendrycksTest-virology-v0-res.json │ ├── hendrycksTest-world_religions-v0-loglikelihood │ ├── hendrycksTest-world_religions-v0-res.json │ ├── iwslt17-ar-en-v0-greedy_until │ ├── iwslt17-ar-en-v0-res.json │ ├── iwslt17-en-ar-v0-greedy_until │ ├── iwslt17-en-ar-v0-res.json │ ├── lambada-v0-loglikelihood │ ├── lambada-v0-res.json │ ├── lambada_cloze-v0-loglikelihood │ ├── lambada_cloze-v0-res.json │ ├── lambada_mt_de-v0-loglikelihood │ ├── lambada_mt_de-v0-res.json │ ├── lambada_mt_en-v0-loglikelihood │ ├── lambada_mt_en-v0-res.json │ ├── lambada_mt_es-v0-loglikelihood │ ├── lambada_mt_es-v0-res.json │ ├── lambada_mt_fr-v0-loglikelihood │ ├── lambada_mt_fr-v0-res.json │ ├── lambada_mt_it-v0-loglikelihood │ ├── lambada_mt_it-v0-res.json │ ├── lambada_openai-v0-loglikelihood │ ├── lambada_openai-v0-res.json │ ├── lambada_openai-v2.0-loglikelihood │ ├── lambada_openai-v2.0-res.json │ ├── lambada_openai_cloze-v0-loglikelihood │ ├── lambada_openai_cloze-v0-res.json │ ├── lambada_openai_mt_de-v0-loglikelihood │ ├── lambada_openai_mt_de-v0-res.json │ ├── lambada_openai_mt_en-v0-loglikelihood │ ├── lambada_openai_mt_en-v0-res.json │ ├── lambada_openai_mt_es-v0-loglikelihood │ ├── lambada_openai_mt_es-v0-res.json │ ├── lambada_openai_mt_fr-v0-loglikelihood │ ├── lambada_openai_mt_fr-v0-res.json │ ├── lambada_openai_mt_it-v0-loglikelihood │ ├── lambada_openai_mt_it-v0-res.json │ ├── lambada_standard-v0-loglikelihood │ ├── lambada_standard-v0-res.json │ ├── lambada_standard_cloze-v0-loglikelihood │ ├── lambada_standard_cloze-v0-res.json │ ├── logiqa-v0-loglikelihood │ ├── logiqa-v0-res.json │ ├── math_algebra-v0-greedy_until │ ├── math_algebra-v0-res.json │ ├── math_algebra-v1-greedy_until │ ├── math_algebra-v1-res.json │ ├── math_counting_and_prob-v0-greedy_until │ ├── math_counting_and_prob-v0-res.json │ ├── math_counting_and_prob-v1-greedy_until │ ├── math_counting_and_prob-v1-res.json │ ├── math_geometry-v0-greedy_until │ ├── math_geometry-v0-res.json │ ├── math_geometry-v1-greedy_until │ ├── math_geometry-v1-res.json │ ├── math_intermediate_algebra-v0-greedy_until │ ├── math_intermediate_algebra-v0-res.json │ ├── math_intermediate_algebra-v1-greedy_until │ ├── math_intermediate_algebra-v1-res.json │ ├── math_num_theory-v0-greedy_until │ ├── math_num_theory-v0-res.json │ ├── math_num_theory-v1-greedy_until │ ├── math_num_theory-v1-res.json │ ├── math_prealgebra-v0-greedy_until │ ├── math_prealgebra-v0-res.json │ ├── math_prealgebra-v1-greedy_until │ ├── math_prealgebra-v1-res.json │ ├── math_precalc-v0-greedy_until │ ├── math_precalc-v0-res.json │ ├── math_precalc-v1-greedy_until │ ├── math_precalc-v1-res.json │ ├── mathqa-v0-loglikelihood │ ├── mathqa-v0-res.json │ ├── mc_taco-v0-loglikelihood │ ├── mc_taco-v0-res.json │ ├── mnli-v0-loglikelihood │ ├── mnli-v0-res.json │ ├── mnli_mismatched-v0-loglikelihood │ ├── mnli_mismatched-v0-res.json │ ├── mrpc-v0-loglikelihood │ ├── mrpc-v0-res.json │ ├── multirc-v0-loglikelihood │ ├── multirc-v0-res.json │ ├── multirc-v1-loglikelihood │ ├── multirc-v1-res.json │ ├── mutual-v0-loglikelihood │ ├── mutual-v0-res.json │ ├── mutual-v1-loglikelihood │ ├── mutual-v1-res.json │ ├── mutual_plus-v0-loglikelihood │ ├── mutual_plus-v0-res.json │ ├── mutual_plus-v1-loglikelihood │ ├── mutual_plus-v1-res.json │ ├── openbookqa-v0-loglikelihood │ ├── openbookqa-v0-res.json │ ├── pile_arxiv-v0-loglikelihood_rolling │ ├── pile_arxiv-v0-res.json │ ├── pile_arxiv-v1-loglikelihood_rolling │ ├── pile_arxiv-v1-res.json │ ├── pile_bookcorpus2-v0-loglikelihood_rolling │ ├── pile_bookcorpus2-v0-res.json │ ├── pile_bookcorpus2-v1-loglikelihood_rolling │ ├── pile_bookcorpus2-v1-res.json │ ├── pile_books3-v0-loglikelihood_rolling │ ├── pile_books3-v0-res.json │ ├── pile_books3-v1-loglikelihood_rolling │ ├── pile_books3-v1-res.json │ ├── pile_dm-mathematics-v0-loglikelihood_rolling │ ├── pile_dm-mathematics-v0-res.json │ ├── pile_dm-mathematics-v1-loglikelihood_rolling │ ├── pile_dm-mathematics-v1-res.json │ ├── pile_enron-v0-loglikelihood_rolling │ ├── pile_enron-v0-res.json │ ├── pile_enron-v1-loglikelihood_rolling │ ├── pile_enron-v1-res.json │ ├── pile_europarl-v0-loglikelihood_rolling │ ├── pile_europarl-v0-res.json │ ├── pile_europarl-v1-loglikelihood_rolling │ ├── pile_europarl-v1-res.json │ ├── pile_freelaw-v0-loglikelihood_rolling │ ├── pile_freelaw-v0-res.json │ ├── pile_freelaw-v1-loglikelihood_rolling │ ├── pile_freelaw-v1-res.json │ ├── pile_github-v0-loglikelihood_rolling │ ├── pile_github-v0-res.json │ ├── pile_github-v1-loglikelihood_rolling │ ├── pile_github-v1-res.json │ ├── pile_gutenberg-v0-loglikelihood_rolling │ ├── pile_gutenberg-v0-res.json │ ├── pile_gutenberg-v1-loglikelihood_rolling │ ├── pile_gutenberg-v1-res.json │ ├── pile_hackernews-v0-loglikelihood_rolling │ ├── pile_hackernews-v0-res.json │ ├── pile_hackernews-v1-loglikelihood_rolling │ ├── pile_hackernews-v1-res.json │ ├── pile_nih-exporter-v0-loglikelihood_rolling │ ├── pile_nih-exporter-v0-res.json │ ├── pile_nih-exporter-v1-loglikelihood_rolling │ ├── pile_nih-exporter-v1-res.json │ ├── pile_opensubtitles-v0-loglikelihood_rolling │ ├── pile_opensubtitles-v0-res.json │ ├── pile_opensubtitles-v1-loglikelihood_rolling │ ├── pile_opensubtitles-v1-res.json │ ├── pile_openwebtext2-v0-loglikelihood_rolling │ ├── pile_openwebtext2-v0-res.json │ ├── pile_openwebtext2-v1-loglikelihood_rolling │ ├── pile_openwebtext2-v1-res.json │ ├── pile_philpapers-v0-loglikelihood_rolling │ ├── pile_philpapers-v0-res.json │ ├── pile_philpapers-v1-loglikelihood_rolling │ ├── pile_philpapers-v1-res.json │ ├── pile_pile-cc-v0-loglikelihood_rolling │ ├── pile_pile-cc-v0-res.json │ ├── pile_pile-cc-v1-loglikelihood_rolling │ ├── pile_pile-cc-v1-res.json │ ├── pile_pubmed-abstracts-v0-loglikelihood_rolling │ ├── pile_pubmed-abstracts-v0-res.json │ ├── pile_pubmed-abstracts-v1-loglikelihood_rolling │ ├── pile_pubmed-abstracts-v1-res.json │ ├── pile_pubmed-central-v0-loglikelihood_rolling │ ├── pile_pubmed-central-v0-res.json │ ├── pile_pubmed-central-v1-loglikelihood_rolling │ ├── pile_pubmed-central-v1-res.json │ ├── pile_stackexchange-v0-loglikelihood_rolling │ ├── pile_stackexchange-v0-res.json │ ├── pile_stackexchange-v1-loglikelihood_rolling │ ├── pile_stackexchange-v1-res.json │ ├── pile_ubuntu-irc-v0-loglikelihood_rolling │ ├── pile_ubuntu-irc-v0-res.json │ ├── pile_ubuntu-irc-v1-loglikelihood_rolling │ ├── pile_ubuntu-irc-v1-res.json │ ├── pile_uspto-v0-loglikelihood_rolling │ ├── pile_uspto-v0-res.json │ ├── pile_uspto-v1-loglikelihood_rolling │ ├── pile_uspto-v1-res.json │ ├── pile_wikipedia-v0-loglikelihood_rolling │ ├── pile_wikipedia-v0-res.json │ ├── pile_wikipedia-v1-loglikelihood_rolling │ ├── pile_wikipedia-v1-res.json │ ├── pile_youtubesubtitles-v0-loglikelihood_rolling │ ├── pile_youtubesubtitles-v0-res.json │ ├── pile_youtubesubtitles-v1-loglikelihood_rolling │ ├── pile_youtubesubtitles-v1-res.json │ ├── piqa-v0-loglikelihood │ ├── piqa-v0-res.json │ ├── prost-v0-loglikelihood │ ├── prost-v0-res.json │ ├── pubmedqa-v0-loglikelihood │ ├── pubmedqa-v0-res.json │ ├── qa4mre_2011-v0-loglikelihood │ ├── qa4mre_2011-v0-res.json │ ├── qa4mre_2012-v0-loglikelihood │ ├── qa4mre_2012-v0-res.json │ ├── qa4mre_2013-v0-loglikelihood │ ├── qa4mre_2013-v0-res.json │ ├── qnli-v0-loglikelihood │ ├── qnli-v0-res.json │ ├── qqp-v0-loglikelihood │ ├── qqp-v0-res.json │ ├── race-v0-loglikelihood │ ├── race-v0-res.json │ ├── random_insertion-v0-greedy_until │ ├── random_insertion-v0-res.json │ ├── record-v0-loglikelihood │ ├── record-v0-res.json │ ├── reversed_words-v0-greedy_until │ ├── reversed_words-v0-res.json │ ├── rte-v0-loglikelihood │ ├── rte-v0-res.json │ ├── sciq-v0-loglikelihood │ ├── sciq-v0-res.json │ ├── squad2-v0-greedy_until │ ├── squad2-v0-loglikelihood │ ├── squad2-v0-res.json │ ├── squad2-v1-greedy_until │ ├── squad2-v1-loglikelihood │ ├── squad2-v1-res.json │ ├── sst-v0-loglikelihood │ ├── sst-v0-res.json │ ├── swag-v0-loglikelihood │ ├── swag-v0-res.json │ ├── textsynth_test_0a89c2739f9598b4be2674b0a8e43931d7f3f0b696970bcba31f9b52bdf12297.pkl │ ├── textsynth_test_0c1c14571add7903b89e588c8212572b95bb57b334fc0752c89a7e045a5f63ae.pkl │ ├── textsynth_test_3092d07756f3e1d010c07524cc8a2ecba7f0c19f9e39f2aaf2bf440bfe328004.pkl │ ├── textsynth_test_434076260b6af3a46b7a5eaceec3306a5872c400a3872f744280b237455a0f8e.pkl │ ├── textsynth_test_49c47ae40e11f349f2f6b492128188b1b2bc103a421c676ee4b2142a68b43516.pkl │ ├── textsynth_test_4fd8d66a6dad7f602b40e5d7dc298d6fe329299d086a4659743a41f4a4012659.pkl │ ├── textsynth_test_51b5302f157cf224f694ccad973f255ae19e9e061d533256bdf75b04e0a917ab.pkl │ ├── textsynth_test_6d6c62dd70caaa208712bf766deaf419cfac89538d4ab7745621e339394c0c23.pkl │ ├── textsynth_test_7209c4617547bfe17cb9e7f5f735fe35822d650aefdc5fbeeaf0c1724effbe09.pkl │ ├── textsynth_test_7afdc285388e51094e12645f305328c759574fa3ec9751631025f8ad5ebf9f3e.pkl │ ├── textsynth_test_9d5f33dbfe1e254928c89f5ed85e4c010d888065f55a8f1b863bc1eb0340a5f2.pkl │ ├── textsynth_test_abcbcba648d89e5d81a50511a6d24ddeb538de2ffe108c1370dd74ce6ac8038d.pkl │ ├── textsynth_test_b1cbb29666cce5e31a1e97695858137398a0885ca5d5d98f515404fb6aeb99e7.pkl │ ├── textsynth_test_e7ad1e9f52a39e1ddd1e50f3c57ffa4546728dd150a67c0a0ddc8675c04e15d1.pkl │ ├── textsynth_test_f4bfe4beb605bd52a8ab6be3c9293639e7e2261d98de58159d15ccb83131bf4e.pkl │ ├── toxigen-v0-loglikelihood │ ├── toxigen-v0-res.json │ ├── triviaqa-v0-loglikelihood │ ├── triviaqa-v0-res.json │ ├── triviaqa-v1-loglikelihood │ ├── triviaqa-v1-res.json │ ├── truthfulqa_gen-v0-greedy_until │ ├── truthfulqa_gen-v0-res.json │ ├── truthfulqa_gen-v1-greedy_until │ ├── truthfulqa_gen-v1-res.json │ ├── truthfulqa_mc-v0-loglikelihood │ ├── truthfulqa_mc-v0-res.json │ ├── truthfulqa_mc-v1-loglikelihood │ ├── truthfulqa_mc-v1-res.json │ ├── webqs-v0-loglikelihood │ ├── webqs-v0-res.json │ ├── wic-v0-loglikelihood │ ├── wic-v0-res.json │ ├── wikitext-v0-loglikelihood_rolling │ ├── wikitext-v0-res.json │ ├── wikitext-v1-loglikelihood_rolling │ ├── wikitext-v1-res.json │ ├── winogrande-v0-loglikelihood │ ├── winogrande-v0-res.json │ ├── wmt14-en-fr-v0-greedy_until │ ├── wmt14-en-fr-v0-res.json │ ├── wmt14-fr-en-v0-greedy_until │ ├── wmt14-fr-en-v0-res.json │ ├── wmt16-de-en-v0-greedy_until │ ├── wmt16-de-en-v0-res.json │ ├── wmt16-en-de-v0-greedy_until │ ├── wmt16-en-de-v0-res.json │ ├── wmt16-en-ro-v0-greedy_until │ ├── wmt16-en-ro-v0-res.json │ ├── wmt16-ro-en-v0-greedy_until │ ├── wmt16-ro-en-v0-res.json │ ├── wmt20-cs-en-v0-greedy_until │ ├── wmt20-cs-en-v0-res.json │ ├── wmt20-de-en-v0-greedy_until │ ├── wmt20-de-en-v0-res.json │ ├── wmt20-de-fr-v0-greedy_until │ ├── wmt20-de-fr-v0-res.json │ ├── wmt20-en-cs-v0-greedy_until │ ├── wmt20-en-cs-v0-res.json │ ├── wmt20-en-de-v0-greedy_until │ ├── wmt20-en-de-v0-res.json │ ├── wmt20-en-iu-v0-greedy_until │ ├── wmt20-en-iu-v0-res.json │ ├── wmt20-en-ja-v0-greedy_until │ ├── wmt20-en-ja-v0-res.json │ ├── wmt20-en-ja-v1-greedy_until │ ├── wmt20-en-ja-v1-res.json │ ├── wmt20-en-km-v0-greedy_until │ ├── wmt20-en-km-v0-res.json │ ├── wmt20-en-pl-v0-greedy_until │ ├── wmt20-en-pl-v0-res.json │ ├── wmt20-en-ps-v0-greedy_until │ ├── wmt20-en-ps-v0-res.json │ ├── wmt20-en-ru-v0-greedy_until │ ├── wmt20-en-ru-v0-res.json │ ├── wmt20-en-ta-v0-greedy_until │ ├── wmt20-en-ta-v0-res.json │ ├── wmt20-en-zh-v0-greedy_until │ ├── wmt20-en-zh-v0-res.json │ ├── wmt20-en-zh-v1-greedy_until │ ├── wmt20-en-zh-v1-res.json │ ├── wmt20-fr-de-v0-greedy_until │ ├── wmt20-fr-de-v0-res.json │ ├── wmt20-iu-en-v0-greedy_until │ ├── wmt20-iu-en-v0-res.json │ ├── wmt20-ja-en-v0-greedy_until │ ├── wmt20-ja-en-v0-res.json │ ├── wmt20-km-en-v0-greedy_until │ ├── wmt20-km-en-v0-res.json │ ├── wmt20-pl-en-v0-greedy_until │ ├── wmt20-pl-en-v0-res.json │ ├── wmt20-ps-en-v0-greedy_until │ ├── wmt20-ps-en-v0-res.json │ ├── wmt20-ru-en-v0-greedy_until │ ├── wmt20-ru-en-v0-res.json │ ├── wmt20-ta-en-v0-greedy_until │ ├── wmt20-ta-en-v0-res.json │ ├── wmt20-zh-en-v0-greedy_until │ ├── wmt20-zh-en-v0-res.json │ ├── wnli-v0-loglikelihood │ ├── wnli-v0-res.json │ ├── wnli-v1-loglikelihood │ ├── wnli-v1-res.json │ ├── wsc-v0-loglikelihood │ ├── wsc-v0-res.json │ ├── wsc273-v0-loglikelihood │ └── wsc273-v0-res.json │ └── utils.py ├── lm-evaluation-harness-jp ├── .coveragerc ├── .flake8 ├── .github │ └── workflows │ │ ├── pull_request.yml │ │ └── python-app.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.bib ├── CODEOWNERS ├── LICENSE.md ├── README.md ├── docs │ ├── decontamination.md │ ├── description_guide.md │ ├── img │ │ └── fewshot_example_gpt3.png │ ├── jptasks.md │ ├── prompt_templates.md │ ├── task_guide.md │ └── task_table.md ├── ignore.txt ├── lm_eval │ ├── __init__.py │ ├── base.py │ ├── datasets │ │ ├── README.md │ │ ├── __init__.py │ │ ├── asdiv │ │ │ ├── __init__.py │ │ │ ├── asdiv.py │ │ │ └── dataset_infos.json │ │ ├── coqa │ │ │ ├── __init__.py │ │ │ ├── coqa.py │ │ │ └── dataset_infos.json │ │ ├── drop │ │ │ ├── __init__.py │ │ │ ├── dataset_infos.json │ │ │ └── drop.py │ │ ├── headqa │ │ │ ├── __init__.py │ │ │ ├── dataset_infos.json │ │ │ └── headqa.py │ │ ├── hendrycks_ethics │ │ │ ├── __init__.py │ │ │ ├── dataset_infos.json │ │ │ └── hendrycks_ethics.py │ │ ├── hendrycks_math │ │ │ ├── __init__.py │ │ │ ├── dataset_infos.json │ │ │ └── hendrycks_math.py │ │ ├── lambada_ja │ │ │ ├── __init__.py │ │ │ └── lambada_ja.py │ │ ├── logiqa │ │ │ ├── __init__.py │ │ │ ├── dataset_infos.json │ │ │ └── logiqa.py │ │ ├── mutual │ │ │ ├── __init__.py │ │ │ ├── dataset_infos.json │ │ │ └── mutual.py │ │ ├── pile │ │ │ ├── __init__.py │ │ │ ├── dataset_infos.json │ │ │ └── pile.py │ │ ├── quac │ │ │ ├── __init__.py │ │ │ ├── dataset_infos.json │ │ │ └── quac.py │ │ ├── sat_analogies │ │ │ ├── __init__.py │ │ │ └── sat_analogies.py │ │ ├── triviaqa │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── dataset_infos.json │ │ │ └── triviaqa.py │ │ └── unscramble │ │ │ ├── __init__.py │ │ │ ├── dataset_infos.json │ │ │ └── unscramble.py │ ├── decontamination │ │ ├── __init__.py │ │ ├── archiver.py │ │ ├── decontaminate.py │ │ └── janitor.py │ ├── evaluator.py │ ├── jasquad │ │ ├── README.md │ │ ├── __init__.py │ │ ├── evaluate.py │ │ ├── jasquad.py │ │ └── requirements.txt │ ├── metrics.py │ ├── models │ │ ├── __init__.py │ │ ├── dummy.py │ │ ├── gpt2.py │ │ ├── gpt3.py │ │ ├── huggingface.py │ │ └── textsynth.py │ ├── tasks │ │ ├── __init__.py │ │ ├── anli.py │ │ ├── arc.py │ │ ├── arithmetic.py │ │ ├── asdiv.py │ │ ├── blimp.py │ │ ├── cbt.py │ │ ├── coqa.py │ │ ├── crowspairs.py │ │ ├── drop.py │ │ ├── glue.py │ │ ├── gsm8k.py │ │ ├── headqa.py │ │ ├── hellaswag.py │ │ ├── hendrycks_ethics.py │ │ ├── hendrycks_math.py │ │ ├── hendrycks_test.py │ │ ├── ja │ │ │ ├── __init__.py │ │ │ ├── jaqket_v1.py │ │ │ ├── jaqket_v2.py │ │ │ ├── jaquad.py │ │ │ ├── jblimp.py │ │ │ ├── jcola.py │ │ │ ├── jcommonsenseqa.py │ │ │ ├── jnli.py │ │ │ ├── jsquad.py │ │ │ ├── marc_ja.py │ │ │ ├── mgsm.py │ │ │ ├── wikilingua.py │ │ │ ├── xlsum_ja.py │ │ │ └── xwinograd.py │ │ ├── lambada.py │ │ ├── lambada_cloze.py │ │ ├── lambada_multilingual.py │ │ ├── logiqa.py │ │ ├── mathqa.py │ │ ├── mc_taco.py │ │ ├── mutual.py │ │ ├── naturalqs.py │ │ ├── openbookqa.py │ │ ├── pile.py │ │ ├── piqa.py │ │ ├── prost.py │ │ ├── pubmedqa.py │ │ ├── qa4mre.py │ │ ├── qasper.py │ │ ├── quac.py │ │ ├── race.py │ │ ├── sat.py │ │ ├── sciq.py │ │ ├── squad.py │ │ ├── storycloze.py │ │ ├── superglue.py │ │ ├── swag.py │ │ ├── toxigen.py │ │ ├── translation.py │ │ ├── triviaqa.py │ │ ├── truthfulqa.py │ │ ├── unscramble.py │ │ ├── webqs.py │ │ ├── wikitext.py │ │ ├── winogrande.py │ │ └── wsc273.py │ └── utils.py ├── main.py ├── models │ ├── abeja-gpt-neox-japanese-2.7b │ │ ├── harness.jsquad-1.2.sh │ │ ├── harness.sh │ │ ├── result.json │ │ └── result.jsquad-1.2.json │ ├── community │ │ ├── cyberagent-open-calm-instruct-1b_1.3.0 │ │ │ ├── README.md │ │ │ ├── harness.sh │ │ │ └── result.json │ │ ├── cyberagent-open-calm-instruct-3b_1.3.0 │ │ │ ├── README.md │ │ │ ├── harness.sh │ │ │ └── result.json │ │ ├── cyberagent-open-calm-instruct-7b_1.5.4 │ │ │ └── harness.sh │ │ ├── cyberagent-open-calm-instruct-7b_1.9.4 │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ ├── result_2.json │ │ │ └── xwinograd_ja.result.json │ │ └── rinna-instruct-1b_0.1.0 │ │ │ ├── harness.sh │ │ │ └── result.json │ ├── cyberagent │ │ ├── cyberagent-open-calm-1b │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ ├── result.jsquad-1.2.json │ │ │ └── result.mgsm.json │ │ ├── cyberagent-open-calm-3b │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ ├── result.jsquad-1.2.json │ │ │ └── result.mgsm.json │ │ ├── cyberagent-open-calm-7b │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ ├── result.jsquad-1.2.json │ │ │ └── result.mgsm.json │ │ ├── cyberagent-open-calm-large │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ └── result.jsquad-1.2.json │ │ └── cyberagent-open-calm-medium │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ └── result.jsquad-1.2.json │ ├── harness.conf │ ├── line-corporation │ │ ├── line-corporation-japanese-large-lm-1.7b-instruction-sft │ │ │ ├── harness.sh │ │ │ └── result.json │ │ ├── line-corporation-japanese-large-lm-1.7b │ │ │ ├── harness.sh │ │ │ └── result.json │ │ ├── line-corporation-japanese-large-lm-3.6b-instruction-sft │ │ │ ├── harness.sh │ │ │ └── result.json │ │ └── line-corporation-japanese-large-lm-3.6b │ │ │ ├── harness.sh │ │ │ └── result.json │ ├── llama │ │ ├── llama-13b │ │ │ ├── harness.sh │ │ │ └── result.json │ │ ├── llama-30b │ │ │ ├── harness.sh │ │ │ └── result.json │ │ ├── llama-65b │ │ │ ├── harness.sh │ │ │ └── result.json │ │ └── llama-7b │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ └── result.jsquad-1.2.json │ ├── llama2 │ │ ├── llama2-13b-chat │ │ │ ├── harness.sh │ │ │ └── result.json │ │ ├── llama2-13b │ │ │ ├── harness.sh │ │ │ └── result.json │ │ ├── llama2-7b-chat │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ └── result.jsquad-1.2.json │ │ └── llama2-7b │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ └── result.jsquad-1.2.json │ ├── openai │ │ └── gpt3 │ │ │ └── result.mgsm.json │ ├── rinna │ │ ├── harness.conf │ │ ├── rinna-bilingual-gpt-neox-4b-instruction-ppo │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ └── result.jsquad-1.2.json │ │ ├── rinna-bilingual-gpt-neox-4b-instruction-sft │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ └── result.jsquad-1.2.json │ │ ├── rinna-bilingual-gpt-neox-4b │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ └── result.jsquad-1.2.json │ │ ├── rinna-japanese-gpt-1b │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ └── result.jsquad-1.2.json │ │ ├── rinna-japanese-gpt-neox-3.6b-instruction-ppo │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ ├── result.jsquad-1.2.json │ │ │ └── result.mgsm.json │ │ ├── rinna-japanese-gpt-neox-3.6b-instruction-sft-v2 │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ ├── result.jsquad-1.2.json │ │ │ └── result.mgsm.json │ │ ├── rinna-japanese-gpt-neox-3.6b-instruction-sft │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ └── result.jsquad-1.2.json │ │ ├── rinna-japanese-gpt-neox-3.6b │ │ │ ├── harness.jsquad-1.2.sh │ │ │ ├── harness.sh │ │ │ ├── result.json │ │ │ ├── result.jsquad-1.2.json │ │ │ └── result.mgsm.json │ │ └── rinna-japanese-gpt-neox-small │ │ │ ├── harness.sh │ │ │ └── result.json │ ├── stabilityai │ │ ├── experiments │ │ │ ├── stablelm-jp-1b-compact-v1 │ │ │ │ ├── harness.sh │ │ │ │ └── result.json │ │ │ ├── stablelm-jp-1b-jav1-sl2k-300b │ │ │ │ ├── harness.sh │ │ │ │ └── result.json │ │ │ ├── stablelm-jp-1b-jav1-sl2k-slw-300b │ │ │ │ ├── harness.sh │ │ │ │ └── result.json │ │ │ ├── stablelm-jp-1b-jav1_rp-sl2k-slw-300b │ │ │ │ ├── harness.sh │ │ │ │ └── result.json │ │ │ ├── stablelm-jp-1b-rp_then_jav1-294b │ │ │ │ ├── harness.sh │ │ │ │ └── result.json │ │ │ ├── stablelm-jp-3b-ja50_rp50-700b │ │ │ │ ├── harness_template-0.1.sh │ │ │ │ ├── harness_template-0.2.sh │ │ │ │ ├── result_template-0.1.json │ │ │ │ └── result_template-0.2.json │ │ │ ├── stablelm-jp-instruct-1b_1.1.0 │ │ │ │ ├── harness.sh │ │ │ │ └── result.json │ │ │ ├── stablelm-jp-instruct-1b_1.3.0 │ │ │ │ ├── harness.sh │ │ │ │ └── result.json │ │ │ ├── stablelm-jp-instruct-1b_1.3.2 │ │ │ │ ├── harness.sh │ │ │ │ └── result.json │ │ │ ├── stablelm-jp-instruct-1b_1.5.2 │ │ │ │ ├── harness.sh │ │ │ │ └── result.json │ │ │ ├── stablelm-jp-instruct-1b_1.6.2 │ │ │ │ ├── harness.sh │ │ │ │ └── result.json │ │ │ ├── stablelm-jp-instruct-3b_1.3.0 │ │ │ │ ├── harness.sh │ │ │ │ ├── result.2.json │ │ │ │ ├── result.json │ │ │ │ └── xwinograd_ja.result.json │ │ │ ├── stablelm-jp-instruct-3b_1.5.0 │ │ │ │ ├── harness.sh │ │ │ │ └── result.json │ │ │ └── stablelm-jp-instruct-3b_1.5.2 │ │ │ │ ├── harness.sh │ │ │ │ ├── jaqket_mgsm.result.json │ │ │ │ ├── result.json │ │ │ │ └── xwinograd_ja.result.json │ │ ├── stabilityai-japanese-stablelm-base-alpha-7b │ │ │ ├── harness.sh │ │ │ └── result.json │ │ └── stabilityai-japanese-stablelm-instruct-alpha-7b │ │ │ ├── harness.sh │ │ │ └── result.json │ └── stablelm │ │ ├── harness.conf │ │ └── stablelm-jp-3b-ja50_rp50-700b │ │ └── harness.conf ├── pile_statistics.json ├── requirements-ja.txt ├── requirements.txt ├── scripts │ ├── __init__.py │ ├── clean_training_data │ │ ├── README.md │ │ ├── __init__.py │ │ ├── compress_and_package.py │ │ ├── generate_13_grams.py │ │ ├── investigate_pile.py │ │ ├── janitor_util.cpp │ │ ├── process_sorted_buckets.py │ │ └── sort_13_gram_buckets.py │ ├── compute_average_from_json.py │ ├── cost_estimate.py │ ├── generate_harness.py │ ├── get_prompts.py │ ├── make_gpt2_test_cases.py │ ├── make_leaderboard.py │ ├── make_table_tasks.py │ ├── merge_json.py │ ├── models.txt │ ├── run_task.sh │ ├── run_task_batch.sh │ ├── run_task_for_models.sh │ └── write_out.py ├── setup.py ├── templates │ ├── new_multiple_choice_task.py │ └── new_task.py └── tests │ ├── test_description_dict.py │ ├── test_evaluator.py │ ├── test_generate_13_grams.py │ ├── test_janitor.py │ ├── test_misc.py │ ├── test_models.py │ ├── test_tasks.py │ ├── test_utils.py │ ├── test_version_stable.py │ └── testdata │ ├── anagrams1-v0-greedy_until │ ├── anagrams1-v0-res.json │ ├── anagrams2-v0-greedy_until │ ├── anagrams2-v0-res.json │ ├── anli_r1-v0-loglikelihood │ ├── anli_r1-v0-res.json │ ├── anli_r2-v0-loglikelihood │ ├── anli_r2-v0-res.json │ ├── anli_r3-v0-loglikelihood │ ├── anli_r3-v0-res.json │ ├── arc_challenge-v0-loglikelihood │ ├── arc_challenge-v0-res.json │ ├── arc_easy-v0-loglikelihood │ ├── arc_easy-v0-res.json │ ├── arithmetic_1dc-v0-loglikelihood │ ├── arithmetic_1dc-v0-res.json │ ├── arithmetic_2da-v0-loglikelihood │ ├── arithmetic_2da-v0-res.json │ ├── arithmetic_2dm-v0-loglikelihood │ ├── arithmetic_2dm-v0-res.json │ ├── arithmetic_2ds-v0-loglikelihood │ ├── arithmetic_2ds-v0-res.json │ ├── arithmetic_3da-v0-loglikelihood │ ├── arithmetic_3da-v0-res.json │ ├── arithmetic_3ds-v0-loglikelihood │ ├── arithmetic_3ds-v0-res.json │ ├── arithmetic_4da-v0-loglikelihood │ ├── arithmetic_4da-v0-res.json │ ├── arithmetic_4ds-v0-loglikelihood │ ├── arithmetic_4ds-v0-res.json │ ├── arithmetic_5da-v0-loglikelihood │ ├── arithmetic_5da-v0-res.json │ ├── arithmetic_5ds-v0-loglikelihood │ ├── arithmetic_5ds-v0-res.json │ ├── blimp_adjunct_island-v0-loglikelihood │ ├── blimp_adjunct_island-v0-res.json │ ├── blimp_anaphor_gender_agreement-v0-loglikelihood │ ├── blimp_anaphor_gender_agreement-v0-res.json │ ├── blimp_anaphor_number_agreement-v0-loglikelihood │ ├── blimp_anaphor_number_agreement-v0-res.json │ ├── blimp_animate_subject_passive-v0-loglikelihood │ ├── blimp_animate_subject_passive-v0-res.json │ ├── blimp_animate_subject_trans-v0-loglikelihood │ ├── blimp_animate_subject_trans-v0-res.json │ ├── blimp_causative-v0-loglikelihood │ ├── blimp_causative-v0-res.json │ ├── blimp_complex_NP_island-v0-loglikelihood │ ├── blimp_complex_NP_island-v0-res.json │ ├── blimp_coordinate_structure_constraint_complex_left_branch-v0-loglikelihood │ ├── blimp_coordinate_structure_constraint_complex_left_branch-v0-res.json │ ├── blimp_coordinate_structure_constraint_object_extraction-v0-loglikelihood │ ├── blimp_coordinate_structure_constraint_object_extraction-v0-res.json │ ├── blimp_determiner_noun_agreement_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_1-v0-res.json │ ├── blimp_determiner_noun_agreement_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_2-v0-res.json │ ├── blimp_determiner_noun_agreement_irregular_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_irregular_1-v0-res.json │ ├── blimp_determiner_noun_agreement_irregular_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_irregular_2-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adj_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adj_2-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adj_irregular_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adj_irregular_1-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adj_irregular_2-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adj_irregular_2-v0-res.json │ ├── blimp_determiner_noun_agreement_with_adjective_1-v0-loglikelihood │ ├── blimp_determiner_noun_agreement_with_adjective_1-v0-res.json │ ├── blimp_distractor_agreement_relational_noun-v0-loglikelihood │ ├── blimp_distractor_agreement_relational_noun-v0-res.json │ ├── blimp_distractor_agreement_relative_clause-v0-loglikelihood │ ├── blimp_distractor_agreement_relative_clause-v0-res.json │ ├── blimp_drop_argument-v0-loglikelihood │ ├── blimp_drop_argument-v0-res.json │ ├── blimp_ellipsis_n_bar_1-v0-loglikelihood │ ├── blimp_ellipsis_n_bar_1-v0-res.json │ ├── blimp_ellipsis_n_bar_2-v0-loglikelihood │ ├── blimp_ellipsis_n_bar_2-v0-res.json │ ├── blimp_existential_there_object_raising-v0-loglikelihood │ ├── blimp_existential_there_object_raising-v0-res.json │ ├── blimp_existential_there_quantifiers_1-v0-loglikelihood │ ├── blimp_existential_there_quantifiers_1-v0-res.json │ ├── blimp_existential_there_quantifiers_2-v0-loglikelihood │ ├── blimp_existential_there_quantifiers_2-v0-res.json │ ├── blimp_existential_there_subject_raising-v0-loglikelihood │ ├── blimp_existential_there_subject_raising-v0-res.json │ ├── blimp_expletive_it_object_raising-v0-loglikelihood │ ├── blimp_expletive_it_object_raising-v0-res.json │ ├── blimp_inchoative-v0-loglikelihood │ ├── blimp_inchoative-v0-res.json │ ├── blimp_intransitive-v0-loglikelihood │ ├── blimp_intransitive-v0-res.json │ ├── blimp_irregular_past_participle_adjectives-v0-loglikelihood │ ├── blimp_irregular_past_participle_adjectives-v0-res.json │ ├── blimp_irregular_past_participle_verbs-v0-loglikelihood │ ├── blimp_irregular_past_participle_verbs-v0-res.json │ ├── blimp_irregular_plural_subject_verb_agreement_1-v0-loglikelihood │ ├── blimp_irregular_plural_subject_verb_agreement_1-v0-res.json │ ├── blimp_irregular_plural_subject_verb_agreement_2-v0-loglikelihood │ ├── blimp_irregular_plural_subject_verb_agreement_2-v0-res.json │ ├── blimp_left_branch_island_echo_question-v0-loglikelihood │ ├── blimp_left_branch_island_echo_question-v0-res.json │ ├── blimp_left_branch_island_simple_question-v0-loglikelihood │ ├── blimp_left_branch_island_simple_question-v0-res.json │ ├── blimp_matrix_question_npi_licensor_present-v0-loglikelihood │ ├── blimp_matrix_question_npi_licensor_present-v0-res.json │ ├── blimp_npi_present_1-v0-loglikelihood │ ├── blimp_npi_present_1-v0-res.json │ ├── blimp_npi_present_2-v0-loglikelihood │ ├── blimp_npi_present_2-v0-res.json │ ├── blimp_only_npi_licensor_present-v0-loglikelihood │ ├── blimp_only_npi_licensor_present-v0-res.json │ ├── blimp_only_npi_scope-v0-loglikelihood │ ├── blimp_only_npi_scope-v0-res.json │ ├── blimp_passive_1-v0-loglikelihood │ ├── blimp_passive_1-v0-res.json │ ├── blimp_passive_2-v0-loglikelihood │ ├── blimp_passive_2-v0-res.json │ ├── blimp_principle_A_c_command-v0-loglikelihood │ ├── blimp_principle_A_c_command-v0-res.json │ ├── blimp_principle_A_case_1-v0-loglikelihood │ ├── blimp_principle_A_case_1-v0-res.json │ ├── blimp_principle_A_case_2-v0-loglikelihood │ ├── blimp_principle_A_case_2-v0-res.json │ ├── blimp_principle_A_domain_1-v0-loglikelihood │ ├── blimp_principle_A_domain_1-v0-res.json │ ├── blimp_principle_A_domain_2-v0-loglikelihood │ ├── blimp_principle_A_domain_2-v0-res.json │ ├── blimp_principle_A_domain_3-v0-loglikelihood │ ├── blimp_principle_A_domain_3-v0-res.json │ ├── blimp_principle_A_reconstruction-v0-loglikelihood │ ├── blimp_principle_A_reconstruction-v0-res.json │ ├── blimp_regular_plural_subject_verb_agreement_1-v0-loglikelihood │ ├── blimp_regular_plural_subject_verb_agreement_1-v0-res.json │ ├── blimp_regular_plural_subject_verb_agreement_2-v0-loglikelihood │ ├── blimp_regular_plural_subject_verb_agreement_2-v0-res.json │ ├── blimp_sentential_negation_npi_licensor_present-v0-loglikelihood │ ├── blimp_sentential_negation_npi_licensor_present-v0-res.json │ ├── blimp_sentential_negation_npi_scope-v0-loglikelihood │ ├── blimp_sentential_negation_npi_scope-v0-res.json │ ├── blimp_sentential_subject_island-v0-loglikelihood │ ├── blimp_sentential_subject_island-v0-res.json │ ├── blimp_superlative_quantifiers_1-v0-loglikelihood │ ├── blimp_superlative_quantifiers_1-v0-res.json │ ├── blimp_superlative_quantifiers_2-v0-loglikelihood │ ├── blimp_superlative_quantifiers_2-v0-res.json │ ├── blimp_tough_vs_raising_1-v0-loglikelihood │ ├── blimp_tough_vs_raising_1-v0-res.json │ ├── blimp_tough_vs_raising_2-v0-loglikelihood │ ├── blimp_tough_vs_raising_2-v0-res.json │ ├── blimp_transitive-v0-loglikelihood │ ├── blimp_transitive-v0-res.json │ ├── blimp_wh_island-v0-loglikelihood │ ├── blimp_wh_island-v0-res.json │ ├── blimp_wh_questions_object_gap-v0-loglikelihood │ ├── blimp_wh_questions_object_gap-v0-res.json │ ├── blimp_wh_questions_subject_gap-v0-loglikelihood │ ├── blimp_wh_questions_subject_gap-v0-res.json │ ├── blimp_wh_questions_subject_gap_long_distance-v0-loglikelihood │ ├── blimp_wh_questions_subject_gap_long_distance-v0-res.json │ ├── blimp_wh_vs_that_no_gap-v0-loglikelihood │ ├── blimp_wh_vs_that_no_gap-v0-res.json │ ├── blimp_wh_vs_that_no_gap_long_distance-v0-loglikelihood │ ├── blimp_wh_vs_that_no_gap_long_distance-v0-res.json │ ├── blimp_wh_vs_that_with_gap-v0-loglikelihood │ ├── blimp_wh_vs_that_with_gap-v0-res.json │ ├── blimp_wh_vs_that_with_gap_long_distance-v0-loglikelihood │ ├── blimp_wh_vs_that_with_gap_long_distance-v0-res.json │ ├── boolq-v0-loglikelihood │ ├── boolq-v0-res.json │ ├── boolq-v1-loglikelihood │ ├── boolq-v1-res.json │ ├── cb-v0-loglikelihood │ ├── cb-v0-res.json │ ├── cb-v1-loglikelihood │ ├── cb-v1-res.json │ ├── cola-v0-loglikelihood │ ├── cola-v0-res.json │ ├── copa-v0-loglikelihood │ ├── copa-v0-res.json │ ├── coqa-v0-greedy_until │ ├── coqa-v0-res.json │ ├── coqa-v1-greedy_until │ ├── coqa-v1-res.json │ ├── crows_pairs_english-v0-loglikelihood │ ├── crows_pairs_english-v0-res.json │ ├── crows_pairs_english_age-v0-loglikelihood │ ├── crows_pairs_english_age-v0-res.json │ ├── crows_pairs_english_autre-v0-loglikelihood │ ├── crows_pairs_english_autre-v0-res.json │ ├── crows_pairs_english_disability-v0-loglikelihood │ ├── crows_pairs_english_disability-v0-res.json │ ├── crows_pairs_english_gender-v0-loglikelihood │ ├── crows_pairs_english_gender-v0-res.json │ ├── crows_pairs_english_nationality-v0-loglikelihood │ ├── crows_pairs_english_nationality-v0-res.json │ ├── crows_pairs_english_physical_appearance-v0-loglikelihood │ ├── crows_pairs_english_physical_appearance-v0-res.json │ ├── crows_pairs_english_race_color-v0-loglikelihood │ ├── crows_pairs_english_race_color-v0-res.json │ ├── crows_pairs_english_religion-v0-loglikelihood │ ├── crows_pairs_english_religion-v0-res.json │ ├── crows_pairs_english_sexual_orientation-v0-loglikelihood │ ├── crows_pairs_english_sexual_orientation-v0-res.json │ ├── crows_pairs_english_socioeconomic-v0-loglikelihood │ ├── crows_pairs_english_socioeconomic-v0-res.json │ ├── crows_pairs_french-v0-loglikelihood │ ├── crows_pairs_french-v0-res.json │ ├── crows_pairs_french_age-v0-loglikelihood │ ├── crows_pairs_french_age-v0-res.json │ ├── crows_pairs_french_autre-v0-loglikelihood │ ├── crows_pairs_french_autre-v0-res.json │ ├── crows_pairs_french_disability-v0-loglikelihood │ ├── crows_pairs_french_disability-v0-res.json │ ├── crows_pairs_french_gender-v0-loglikelihood │ ├── crows_pairs_french_gender-v0-res.json │ ├── crows_pairs_french_nationality-v0-loglikelihood │ ├── crows_pairs_french_nationality-v0-res.json │ ├── crows_pairs_french_physical_appearance-v0-loglikelihood │ ├── crows_pairs_french_physical_appearance-v0-res.json │ ├── crows_pairs_french_race_color-v0-loglikelihood │ ├── crows_pairs_french_race_color-v0-res.json │ ├── crows_pairs_french_religion-v0-loglikelihood │ ├── crows_pairs_french_religion-v0-res.json │ ├── crows_pairs_french_sexual_orientation-v0-loglikelihood │ ├── crows_pairs_french_sexual_orientation-v0-res.json │ ├── crows_pairs_french_socioeconomic-v0-loglikelihood │ ├── crows_pairs_french_socioeconomic-v0-res.json │ ├── cycle_letters-v0-greedy_until │ ├── cycle_letters-v0-res.json │ ├── drop-v0-greedy_until │ ├── drop-v0-res.json │ ├── drop-v1-greedy_until │ ├── drop-v1-res.json │ ├── ethics_cm-v0-loglikelihood │ ├── ethics_cm-v0-res.json │ ├── ethics_deontology-v0-loglikelihood │ ├── ethics_deontology-v0-res.json │ ├── ethics_justice-v0-loglikelihood │ ├── ethics_justice-v0-res.json │ ├── ethics_utilitarianism-v0-loglikelihood │ ├── ethics_utilitarianism-v0-res.json │ ├── ethics_utilitarianism_original-v0-loglikelihood │ ├── ethics_utilitarianism_original-v0-res.json │ ├── ethics_virtue-v0-loglikelihood │ ├── ethics_virtue-v0-res.json │ ├── gpt3_test_0deb8e9bde8e8327bbc48157f638ff3ba06b0cd816dad2beb8ad90f7fbe795c7.pkl │ ├── gpt3_test_8025023377febbd8c5f2b9f26705c394ff375d0cad7c89c10fd9b8e1eb66ff1c.pkl │ ├── gpt3_test_bb2cc49115e88788ed870ad0716eb00b280a885f91c7ed6e1e864435e5e2b6ac.pkl │ ├── gpt3_test_cfd11f555a5a63b6dfa114a55a932e51b724cdd44d4842586b9ce37260bf7aaa.pkl │ ├── gpt3_test_f307d52964c295e2005c5e782b688c24388e0cecadf29f1e6fc7f394236ea9c0.pkl │ ├── gsm8k-v0-greedy_until │ ├── gsm8k-v0-res.json │ ├── headqa-v0-loglikelihood │ ├── headqa-v0-res.json │ ├── headqa_en-v0-loglikelihood │ ├── headqa_en-v0-res.json │ ├── headqa_es-v0-loglikelihood │ ├── headqa_es-v0-res.json │ ├── hellaswag-v0-loglikelihood │ ├── hellaswag-v0-res.json │ ├── hendrycksTest-abstract_algebra-v0-loglikelihood │ ├── hendrycksTest-abstract_algebra-v0-res.json │ ├── hendrycksTest-anatomy-v0-loglikelihood │ ├── hendrycksTest-anatomy-v0-res.json │ ├── hendrycksTest-astronomy-v0-loglikelihood │ ├── hendrycksTest-astronomy-v0-res.json │ ├── hendrycksTest-business_ethics-v0-loglikelihood │ ├── hendrycksTest-business_ethics-v0-res.json │ ├── hendrycksTest-clinical_knowledge-v0-loglikelihood │ ├── hendrycksTest-clinical_knowledge-v0-res.json │ ├── hendrycksTest-college_biology-v0-loglikelihood │ ├── hendrycksTest-college_biology-v0-res.json │ ├── hendrycksTest-college_chemistry-v0-loglikelihood │ ├── hendrycksTest-college_chemistry-v0-res.json │ ├── hendrycksTest-college_computer_science-v0-loglikelihood │ ├── hendrycksTest-college_computer_science-v0-res.json │ ├── hendrycksTest-college_mathematics-v0-loglikelihood │ ├── hendrycksTest-college_mathematics-v0-res.json │ ├── hendrycksTest-college_medicine-v0-loglikelihood │ ├── hendrycksTest-college_medicine-v0-res.json │ ├── hendrycksTest-college_physics-v0-loglikelihood │ ├── hendrycksTest-college_physics-v0-res.json │ ├── hendrycksTest-computer_security-v0-loglikelihood │ ├── hendrycksTest-computer_security-v0-res.json │ ├── hendrycksTest-conceptual_physics-v0-loglikelihood │ ├── hendrycksTest-conceptual_physics-v0-res.json │ ├── hendrycksTest-econometrics-v0-loglikelihood │ ├── hendrycksTest-econometrics-v0-res.json │ ├── hendrycksTest-electrical_engineering-v0-loglikelihood │ ├── hendrycksTest-electrical_engineering-v0-res.json │ ├── hendrycksTest-elementary_mathematics-v0-loglikelihood │ ├── hendrycksTest-elementary_mathematics-v0-res.json │ ├── hendrycksTest-formal_logic-v0-loglikelihood │ ├── hendrycksTest-formal_logic-v0-res.json │ ├── hendrycksTest-global_facts-v0-loglikelihood │ ├── hendrycksTest-global_facts-v0-res.json │ ├── hendrycksTest-high_school_biology-v0-loglikelihood │ ├── hendrycksTest-high_school_biology-v0-res.json │ ├── hendrycksTest-high_school_chemistry-v0-loglikelihood │ ├── hendrycksTest-high_school_chemistry-v0-res.json │ ├── hendrycksTest-high_school_computer_science-v0-loglikelihood │ ├── hendrycksTest-high_school_computer_science-v0-res.json │ ├── hendrycksTest-high_school_european_history-v0-loglikelihood │ ├── hendrycksTest-high_school_european_history-v0-res.json │ ├── hendrycksTest-high_school_geography-v0-loglikelihood │ ├── hendrycksTest-high_school_geography-v0-res.json │ ├── hendrycksTest-high_school_government_and_politics-v0-loglikelihood │ ├── hendrycksTest-high_school_government_and_politics-v0-res.json │ ├── hendrycksTest-high_school_macroeconomics-v0-loglikelihood │ ├── hendrycksTest-high_school_macroeconomics-v0-res.json │ ├── hendrycksTest-high_school_mathematics-v0-loglikelihood │ ├── hendrycksTest-high_school_mathematics-v0-res.json │ ├── hendrycksTest-high_school_microeconomics-v0-loglikelihood │ ├── hendrycksTest-high_school_microeconomics-v0-res.json │ ├── hendrycksTest-high_school_physics-v0-loglikelihood │ ├── hendrycksTest-high_school_physics-v0-res.json │ ├── hendrycksTest-high_school_psychology-v0-loglikelihood │ ├── hendrycksTest-high_school_psychology-v0-res.json │ ├── hendrycksTest-high_school_statistics-v0-loglikelihood │ ├── hendrycksTest-high_school_statistics-v0-res.json │ ├── hendrycksTest-high_school_us_history-v0-loglikelihood │ ├── hendrycksTest-high_school_us_history-v0-res.json │ ├── hendrycksTest-high_school_world_history-v0-loglikelihood │ ├── hendrycksTest-high_school_world_history-v0-res.json │ ├── hendrycksTest-human_aging-v0-loglikelihood │ ├── hendrycksTest-human_aging-v0-res.json │ ├── hendrycksTest-human_sexuality-v0-loglikelihood │ ├── hendrycksTest-human_sexuality-v0-res.json │ ├── hendrycksTest-international_law-v0-loglikelihood │ ├── hendrycksTest-international_law-v0-res.json │ ├── hendrycksTest-jurisprudence-v0-loglikelihood │ ├── hendrycksTest-jurisprudence-v0-res.json │ ├── hendrycksTest-logical_fallacies-v0-loglikelihood │ ├── hendrycksTest-logical_fallacies-v0-res.json │ ├── hendrycksTest-machine_learning-v0-loglikelihood │ ├── hendrycksTest-machine_learning-v0-res.json │ ├── hendrycksTest-management-v0-loglikelihood │ ├── hendrycksTest-management-v0-res.json │ ├── hendrycksTest-marketing-v0-loglikelihood │ ├── hendrycksTest-marketing-v0-res.json │ ├── hendrycksTest-medical_genetics-v0-loglikelihood │ ├── hendrycksTest-medical_genetics-v0-res.json │ ├── hendrycksTest-miscellaneous-v0-loglikelihood │ ├── hendrycksTest-miscellaneous-v0-res.json │ ├── hendrycksTest-moral_disputes-v0-loglikelihood │ ├── hendrycksTest-moral_disputes-v0-res.json │ ├── hendrycksTest-moral_scenarios-v0-loglikelihood │ ├── hendrycksTest-moral_scenarios-v0-res.json │ ├── hendrycksTest-nutrition-v0-loglikelihood │ ├── hendrycksTest-nutrition-v0-res.json │ ├── hendrycksTest-philosophy-v0-loglikelihood │ ├── hendrycksTest-philosophy-v0-res.json │ ├── hendrycksTest-prehistory-v0-loglikelihood │ ├── hendrycksTest-prehistory-v0-res.json │ ├── hendrycksTest-professional_accounting-v0-loglikelihood │ ├── hendrycksTest-professional_accounting-v0-res.json │ ├── hendrycksTest-professional_law-v0-loglikelihood │ ├── hendrycksTest-professional_law-v0-res.json │ ├── hendrycksTest-professional_medicine-v0-loglikelihood │ ├── hendrycksTest-professional_medicine-v0-res.json │ ├── hendrycksTest-professional_psychology-v0-loglikelihood │ ├── hendrycksTest-professional_psychology-v0-res.json │ ├── hendrycksTest-public_relations-v0-loglikelihood │ ├── hendrycksTest-public_relations-v0-res.json │ ├── hendrycksTest-security_studies-v0-loglikelihood │ ├── hendrycksTest-security_studies-v0-res.json │ ├── hendrycksTest-sociology-v0-loglikelihood │ ├── hendrycksTest-sociology-v0-res.json │ ├── hendrycksTest-us_foreign_policy-v0-loglikelihood │ ├── hendrycksTest-us_foreign_policy-v0-res.json │ ├── hendrycksTest-virology-v0-loglikelihood │ ├── hendrycksTest-virology-v0-res.json │ ├── hendrycksTest-world_religions-v0-loglikelihood │ ├── hendrycksTest-world_religions-v0-res.json │ ├── iwslt17-ar-en-v0-greedy_until │ ├── iwslt17-ar-en-v0-res.json │ ├── iwslt17-en-ar-v0-greedy_until │ ├── iwslt17-en-ar-v0-res.json │ ├── lambada-v0-loglikelihood │ ├── lambada-v0-res.json │ ├── lambada_cloze-v0-loglikelihood │ ├── lambada_cloze-v0-res.json │ ├── lambada_mt_de-v0-loglikelihood │ ├── lambada_mt_de-v0-res.json │ ├── lambada_mt_en-v0-loglikelihood │ ├── lambada_mt_en-v0-res.json │ ├── lambada_mt_es-v0-loglikelihood │ ├── lambada_mt_es-v0-res.json │ ├── lambada_mt_fr-v0-loglikelihood │ ├── lambada_mt_fr-v0-res.json │ ├── lambada_mt_it-v0-loglikelihood │ ├── lambada_mt_it-v0-res.json │ ├── lambada_openai-v0-loglikelihood │ ├── lambada_openai-v0-res.json │ ├── lambada_openai_cloze-v0-loglikelihood │ ├── lambada_openai_cloze-v0-res.json │ ├── lambada_openai_mt_de-v0-loglikelihood │ ├── lambada_openai_mt_de-v0-res.json │ ├── lambada_openai_mt_en-v0-loglikelihood │ ├── lambada_openai_mt_en-v0-res.json │ ├── lambada_openai_mt_es-v0-loglikelihood │ ├── lambada_openai_mt_es-v0-res.json │ ├── lambada_openai_mt_fr-v0-loglikelihood │ ├── lambada_openai_mt_fr-v0-res.json │ ├── lambada_openai_mt_it-v0-loglikelihood │ ├── lambada_openai_mt_it-v0-res.json │ ├── lambada_standard-v0-loglikelihood │ ├── lambada_standard-v0-res.json │ ├── lambada_standard_cloze-v0-loglikelihood │ ├── lambada_standard_cloze-v0-res.json │ ├── logiqa-v0-loglikelihood │ ├── logiqa-v0-res.json │ ├── math_algebra-v0-greedy_until │ ├── math_algebra-v0-res.json │ ├── math_algebra-v1-greedy_until │ ├── math_algebra-v1-res.json │ ├── math_counting_and_prob-v0-greedy_until │ ├── math_counting_and_prob-v0-res.json │ ├── math_counting_and_prob-v1-greedy_until │ ├── math_counting_and_prob-v1-res.json │ ├── math_geometry-v0-greedy_until │ ├── math_geometry-v0-res.json │ ├── math_geometry-v1-greedy_until │ ├── math_geometry-v1-res.json │ ├── math_intermediate_algebra-v0-greedy_until │ ├── math_intermediate_algebra-v0-res.json │ ├── math_intermediate_algebra-v1-greedy_until │ ├── math_intermediate_algebra-v1-res.json │ ├── math_num_theory-v0-greedy_until │ ├── math_num_theory-v0-res.json │ ├── math_num_theory-v1-greedy_until │ ├── math_num_theory-v1-res.json │ ├── math_prealgebra-v0-greedy_until │ ├── math_prealgebra-v0-res.json │ ├── math_prealgebra-v1-greedy_until │ ├── math_prealgebra-v1-res.json │ ├── math_precalc-v0-greedy_until │ ├── math_precalc-v0-res.json │ ├── math_precalc-v1-greedy_until │ ├── math_precalc-v1-res.json │ ├── mathqa-v0-loglikelihood │ ├── mathqa-v0-res.json │ ├── mc_taco-v0-loglikelihood │ ├── mc_taco-v0-res.json │ ├── mnli-v0-loglikelihood │ ├── mnli-v0-res.json │ ├── mnli_mismatched-v0-loglikelihood │ ├── mnli_mismatched-v0-res.json │ ├── mrpc-v0-loglikelihood │ ├── mrpc-v0-res.json │ ├── multirc-v0-loglikelihood │ ├── multirc-v0-res.json │ ├── multirc-v1-loglikelihood │ ├── multirc-v1-res.json │ ├── mutual-v0-loglikelihood │ ├── mutual-v0-res.json │ ├── mutual-v1-loglikelihood │ ├── mutual-v1-res.json │ ├── mutual_plus-v0-loglikelihood │ ├── mutual_plus-v0-res.json │ ├── mutual_plus-v1-loglikelihood │ ├── mutual_plus-v1-res.json │ ├── openbookqa-v0-loglikelihood │ ├── openbookqa-v0-res.json │ ├── pile_arxiv-v0-loglikelihood_rolling │ ├── pile_arxiv-v0-res.json │ ├── pile_arxiv-v1-loglikelihood_rolling │ ├── pile_arxiv-v1-res.json │ ├── pile_bookcorpus2-v0-loglikelihood_rolling │ ├── pile_bookcorpus2-v0-res.json │ ├── pile_bookcorpus2-v1-loglikelihood_rolling │ ├── pile_bookcorpus2-v1-res.json │ ├── pile_books3-v0-loglikelihood_rolling │ ├── pile_books3-v0-res.json │ ├── pile_books3-v1-loglikelihood_rolling │ ├── pile_books3-v1-res.json │ ├── pile_dm-mathematics-v0-loglikelihood_rolling │ ├── pile_dm-mathematics-v0-res.json │ ├── pile_dm-mathematics-v1-loglikelihood_rolling │ ├── pile_dm-mathematics-v1-res.json │ ├── pile_enron-v0-loglikelihood_rolling │ ├── pile_enron-v0-res.json │ ├── pile_enron-v1-loglikelihood_rolling │ ├── pile_enron-v1-res.json │ ├── pile_europarl-v0-loglikelihood_rolling │ ├── pile_europarl-v0-res.json │ ├── pile_europarl-v1-loglikelihood_rolling │ ├── pile_europarl-v1-res.json │ ├── pile_freelaw-v0-loglikelihood_rolling │ ├── pile_freelaw-v0-res.json │ ├── pile_freelaw-v1-loglikelihood_rolling │ ├── pile_freelaw-v1-res.json │ ├── pile_github-v0-loglikelihood_rolling │ ├── pile_github-v0-res.json │ ├── pile_github-v1-loglikelihood_rolling │ ├── pile_github-v1-res.json │ ├── pile_gutenberg-v0-loglikelihood_rolling │ ├── pile_gutenberg-v0-res.json │ ├── pile_gutenberg-v1-loglikelihood_rolling │ ├── pile_gutenberg-v1-res.json │ ├── pile_hackernews-v0-loglikelihood_rolling │ ├── pile_hackernews-v0-res.json │ ├── pile_hackernews-v1-loglikelihood_rolling │ ├── pile_hackernews-v1-res.json │ ├── pile_nih-exporter-v0-loglikelihood_rolling │ ├── pile_nih-exporter-v0-res.json │ ├── pile_nih-exporter-v1-loglikelihood_rolling │ ├── pile_nih-exporter-v1-res.json │ ├── pile_opensubtitles-v0-loglikelihood_rolling │ ├── pile_opensubtitles-v0-res.json │ ├── pile_opensubtitles-v1-loglikelihood_rolling │ ├── pile_opensubtitles-v1-res.json │ ├── pile_openwebtext2-v0-loglikelihood_rolling │ ├── pile_openwebtext2-v0-res.json │ ├── pile_openwebtext2-v1-loglikelihood_rolling │ ├── pile_openwebtext2-v1-res.json │ ├── pile_philpapers-v0-loglikelihood_rolling │ ├── pile_philpapers-v0-res.json │ ├── pile_philpapers-v1-loglikelihood_rolling │ ├── pile_philpapers-v1-res.json │ ├── pile_pile-cc-v0-loglikelihood_rolling │ ├── pile_pile-cc-v0-res.json │ ├── pile_pile-cc-v1-loglikelihood_rolling │ ├── pile_pile-cc-v1-res.json │ ├── pile_pubmed-abstracts-v0-loglikelihood_rolling │ ├── pile_pubmed-abstracts-v0-res.json │ ├── pile_pubmed-abstracts-v1-loglikelihood_rolling │ ├── pile_pubmed-abstracts-v1-res.json │ ├── pile_pubmed-central-v0-loglikelihood_rolling │ ├── pile_pubmed-central-v0-res.json │ ├── pile_pubmed-central-v1-loglikelihood_rolling │ ├── pile_pubmed-central-v1-res.json │ ├── pile_stackexchange-v0-loglikelihood_rolling │ ├── pile_stackexchange-v0-res.json │ ├── pile_stackexchange-v1-loglikelihood_rolling │ ├── pile_stackexchange-v1-res.json │ ├── pile_ubuntu-irc-v0-loglikelihood_rolling │ ├── pile_ubuntu-irc-v0-res.json │ ├── pile_ubuntu-irc-v1-loglikelihood_rolling │ ├── pile_ubuntu-irc-v1-res.json │ ├── pile_uspto-v0-loglikelihood_rolling │ ├── pile_uspto-v0-res.json │ ├── pile_uspto-v1-loglikelihood_rolling │ ├── pile_uspto-v1-res.json │ ├── pile_wikipedia-v0-loglikelihood_rolling │ ├── pile_wikipedia-v0-res.json │ ├── pile_wikipedia-v1-loglikelihood_rolling │ ├── pile_wikipedia-v1-res.json │ ├── pile_youtubesubtitles-v0-loglikelihood_rolling │ ├── pile_youtubesubtitles-v0-res.json │ ├── pile_youtubesubtitles-v1-loglikelihood_rolling │ ├── pile_youtubesubtitles-v1-res.json │ ├── piqa-v0-loglikelihood │ ├── piqa-v0-res.json │ ├── prost-v0-loglikelihood │ ├── prost-v0-res.json │ ├── pubmedqa-v0-loglikelihood │ ├── pubmedqa-v0-res.json │ ├── qa4mre_2011-v0-loglikelihood │ ├── qa4mre_2011-v0-res.json │ ├── qa4mre_2012-v0-loglikelihood │ ├── qa4mre_2012-v0-res.json │ ├── qa4mre_2013-v0-loglikelihood │ ├── qa4mre_2013-v0-res.json │ ├── qnli-v0-loglikelihood │ ├── qnli-v0-res.json │ ├── qqp-v0-loglikelihood │ ├── qqp-v0-res.json │ ├── race-v0-loglikelihood │ ├── race-v0-res.json │ ├── random_insertion-v0-greedy_until │ ├── random_insertion-v0-res.json │ ├── record-v0-loglikelihood │ ├── record-v0-res.json │ ├── reversed_words-v0-greedy_until │ ├── reversed_words-v0-res.json │ ├── rte-v0-loglikelihood │ ├── rte-v0-res.json │ ├── sciq-v0-loglikelihood │ ├── sciq-v0-res.json │ ├── squad2-v0-greedy_until │ ├── squad2-v0-loglikelihood │ ├── squad2-v0-res.json │ ├── squad2-v1-greedy_until │ ├── squad2-v1-loglikelihood │ ├── squad2-v1-res.json │ ├── sst-v0-loglikelihood │ ├── sst-v0-res.json │ ├── swag-v0-loglikelihood │ ├── swag-v0-res.json │ ├── textsynth_test_0a89c2739f9598b4be2674b0a8e43931d7f3f0b696970bcba31f9b52bdf12297.pkl │ ├── textsynth_test_0c1c14571add7903b89e588c8212572b95bb57b334fc0752c89a7e045a5f63ae.pkl │ ├── textsynth_test_3092d07756f3e1d010c07524cc8a2ecba7f0c19f9e39f2aaf2bf440bfe328004.pkl │ ├── textsynth_test_434076260b6af3a46b7a5eaceec3306a5872c400a3872f744280b237455a0f8e.pkl │ ├── textsynth_test_49c47ae40e11f349f2f6b492128188b1b2bc103a421c676ee4b2142a68b43516.pkl │ ├── textsynth_test_4fd8d66a6dad7f602b40e5d7dc298d6fe329299d086a4659743a41f4a4012659.pkl │ ├── textsynth_test_51b5302f157cf224f694ccad973f255ae19e9e061d533256bdf75b04e0a917ab.pkl │ ├── textsynth_test_6d6c62dd70caaa208712bf766deaf419cfac89538d4ab7745621e339394c0c23.pkl │ ├── textsynth_test_7209c4617547bfe17cb9e7f5f735fe35822d650aefdc5fbeeaf0c1724effbe09.pkl │ ├── textsynth_test_7afdc285388e51094e12645f305328c759574fa3ec9751631025f8ad5ebf9f3e.pkl │ ├── textsynth_test_9d5f33dbfe1e254928c89f5ed85e4c010d888065f55a8f1b863bc1eb0340a5f2.pkl │ ├── textsynth_test_abcbcba648d89e5d81a50511a6d24ddeb538de2ffe108c1370dd74ce6ac8038d.pkl │ ├── textsynth_test_b1cbb29666cce5e31a1e97695858137398a0885ca5d5d98f515404fb6aeb99e7.pkl │ ├── textsynth_test_e7ad1e9f52a39e1ddd1e50f3c57ffa4546728dd150a67c0a0ddc8675c04e15d1.pkl │ ├── textsynth_test_f4bfe4beb605bd52a8ab6be3c9293639e7e2261d98de58159d15ccb83131bf4e.pkl │ ├── toxigen-v0-loglikelihood │ ├── toxigen-v0-res.json │ ├── triviaqa-v0-loglikelihood │ ├── triviaqa-v0-res.json │ ├── triviaqa-v1-loglikelihood │ ├── triviaqa-v1-res.json │ ├── truthfulqa_gen-v0-greedy_until │ ├── truthfulqa_gen-v0-res.json │ ├── truthfulqa_gen-v1-greedy_until │ ├── truthfulqa_gen-v1-res.json │ ├── truthfulqa_mc-v0-loglikelihood │ ├── truthfulqa_mc-v0-res.json │ ├── truthfulqa_mc-v1-loglikelihood │ ├── truthfulqa_mc-v1-res.json │ ├── webqs-v0-loglikelihood │ ├── webqs-v0-res.json │ ├── wic-v0-loglikelihood │ ├── wic-v0-res.json │ ├── wikitext-v0-loglikelihood_rolling │ ├── wikitext-v0-res.json │ ├── wikitext-v1-loglikelihood_rolling │ ├── wikitext-v1-res.json │ ├── winogrande-v0-loglikelihood │ ├── winogrande-v0-res.json │ ├── wmt14-en-fr-v0-greedy_until │ ├── wmt14-en-fr-v0-res.json │ ├── wmt14-fr-en-v0-greedy_until │ ├── wmt14-fr-en-v0-res.json │ ├── wmt16-de-en-v0-greedy_until │ ├── wmt16-de-en-v0-res.json │ ├── wmt16-en-de-v0-greedy_until │ ├── wmt16-en-de-v0-res.json │ ├── wmt16-en-ro-v0-greedy_until │ ├── wmt16-en-ro-v0-res.json │ ├── wmt16-ro-en-v0-greedy_until │ ├── wmt16-ro-en-v0-res.json │ ├── wmt20-cs-en-v0-greedy_until │ ├── wmt20-cs-en-v0-res.json │ ├── wmt20-de-en-v0-greedy_until │ ├── wmt20-de-en-v0-res.json │ ├── wmt20-de-fr-v0-greedy_until │ ├── wmt20-de-fr-v0-res.json │ ├── wmt20-en-cs-v0-greedy_until │ ├── wmt20-en-cs-v0-res.json │ ├── wmt20-en-de-v0-greedy_until │ ├── wmt20-en-de-v0-res.json │ ├── wmt20-en-iu-v0-greedy_until │ ├── wmt20-en-iu-v0-res.json │ ├── wmt20-en-ja-v0-greedy_until │ ├── wmt20-en-ja-v0-res.json │ ├── wmt20-en-ja-v1-greedy_until │ ├── wmt20-en-ja-v1-res.json │ ├── wmt20-en-km-v0-greedy_until │ ├── wmt20-en-km-v0-res.json │ ├── wmt20-en-pl-v0-greedy_until │ ├── wmt20-en-pl-v0-res.json │ ├── wmt20-en-ps-v0-greedy_until │ ├── wmt20-en-ps-v0-res.json │ ├── wmt20-en-ru-v0-greedy_until │ ├── wmt20-en-ru-v0-res.json │ ├── wmt20-en-ta-v0-greedy_until │ ├── wmt20-en-ta-v0-res.json │ ├── wmt20-en-zh-v0-greedy_until │ ├── wmt20-en-zh-v0-res.json │ ├── wmt20-en-zh-v1-greedy_until │ ├── wmt20-en-zh-v1-res.json │ ├── wmt20-fr-de-v0-greedy_until │ ├── wmt20-fr-de-v0-res.json │ ├── wmt20-iu-en-v0-greedy_until │ ├── wmt20-iu-en-v0-res.json │ ├── wmt20-ja-en-v0-greedy_until │ ├── wmt20-ja-en-v0-res.json │ ├── wmt20-km-en-v0-greedy_until │ ├── wmt20-km-en-v0-res.json │ ├── wmt20-pl-en-v0-greedy_until │ ├── wmt20-pl-en-v0-res.json │ ├── wmt20-ps-en-v0-greedy_until │ ├── wmt20-ps-en-v0-res.json │ ├── wmt20-ru-en-v0-greedy_until │ ├── wmt20-ru-en-v0-res.json │ ├── wmt20-ta-en-v0-greedy_until │ ├── wmt20-ta-en-v0-res.json │ ├── wmt20-zh-en-v0-greedy_until │ ├── wmt20-zh-en-v0-res.json │ ├── wnli-v0-loglikelihood │ ├── wnli-v0-res.json │ ├── wnli-v1-loglikelihood │ ├── wnli-v1-res.json │ ├── wsc-v0-loglikelihood │ ├── wsc-v0-res.json │ ├── wsc273-v0-loglikelihood │ └── wsc273-v0-res.json └── scripts ├── aggregate_result.py ├── column-path-key.csv ├── evaluate_english_bbh.sh ├── evaluate_english_general.sh ├── evaluate_english_gpqa.sh ├── evaluate_english_humaneval-unstripped.sh ├── evaluate_english_math.sh ├── evaluate_english_mbpp.sh ├── evaluate_english_mmlu.sh ├── evaluate_ja_humaneval-unstripped.sh ├── evaluate_ja_llmjp.sh ├── evaluate_ja_mbpp.sh ├── evaluate_ja_mgsm.sh ├── evaluate_ja_mt_bench.sh ├── evaluate_ja_wmt20_enja.sh ├── evaluate_ja_wmt20_jaen.sh └── evaluate_ja_xlsum.sh /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/.gitignore -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ghcr.io/bigcode-project/evaluation-harness 2 | 3 | RUN pip install datasets==2.21.0 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/LICENSE -------------------------------------------------------------------------------- /MEMO.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/MEMO.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/README.md -------------------------------------------------------------------------------- /bigcode-evaluation-harness/.github/workflows/ci.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/.github/workflows/ci.yml -------------------------------------------------------------------------------- /bigcode-evaluation-harness/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/.gitignore -------------------------------------------------------------------------------- /bigcode-evaluation-harness/CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/CONTRIBUTING.md -------------------------------------------------------------------------------- /bigcode-evaluation-harness/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/Dockerfile -------------------------------------------------------------------------------- /bigcode-evaluation-harness/Dockerfile-multiple: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/Dockerfile-multiple -------------------------------------------------------------------------------- /bigcode-evaluation-harness/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/LICENSE -------------------------------------------------------------------------------- /bigcode-evaluation-harness/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/README.md -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/arguments.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/arguments.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/base.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/custom_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/custom_utils.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/evaluator.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/generation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/generation.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/tasks/__init__.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/apps.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/tasks/apps.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/conala.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/tasks/conala.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/concode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/tasks/concode.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/custom_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/custom_metrics/multiple_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/evil_programs/block_on_inputs.py: -------------------------------------------------------------------------------- 1 | while True: 2 | input() 3 | -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/custom_metrics/multiple_metrics/safe_subprocess/evil_programs/fork_bomb.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | while True: 4 | os.fork() 5 | -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/custom_metrics/pal_metric/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/ds1000.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/tasks/ds1000.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/gsm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/tasks/gsm.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/humaneval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/tasks/humaneval.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/mbpp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/tasks/mbpp.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/mbpp_ja.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/tasks/mbpp_ja.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/mbppplus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/tasks/mbppplus.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/multiple.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/tasks/multiple.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/parity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/tasks/parity.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/quixbugs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/tasks/quixbugs.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/tasks/recode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/tasks/recode.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/bigcode_eval/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/bigcode_eval/utils.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/docs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/docs/README.md -------------------------------------------------------------------------------- /bigcode-evaluation-harness/docs/guide.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/docs/guide.md -------------------------------------------------------------------------------- /bigcode-evaluation-harness/finetuning/APPS/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/finetuning/APPS/README.md -------------------------------------------------------------------------------- /bigcode-evaluation-harness/finetuning/APPS/apps_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/finetuning/APPS/apps_dataset.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/finetuning/APPS/apps_train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/finetuning/APPS/apps_train.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/finetuning/CodeClone/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/finetuning/CodeClone/README.md -------------------------------------------------------------------------------- /bigcode-evaluation-harness/finetuning/CodeClone/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/finetuning/CodeClone/train.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/finetuning/CodeComplex/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/finetuning/CodeComplex/train.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/finetuning/CodeDefect/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/finetuning/CodeDefect/README.md -------------------------------------------------------------------------------- /bigcode-evaluation-harness/finetuning/CodeDefect/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/finetuning/CodeDefect/train.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/finetuning/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/finetuning/README.md -------------------------------------------------------------------------------- /bigcode-evaluation-harness/leaderboard/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/leaderboard/README.md -------------------------------------------------------------------------------- /bigcode-evaluation-harness/leaderboard/group_jsons.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/leaderboard/group_jsons.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/leaderboard/multiple_eval.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/leaderboard/multiple_eval.slurm -------------------------------------------------------------------------------- /bigcode-evaluation-harness/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/main.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/makefile -------------------------------------------------------------------------------- /bigcode-evaluation-harness/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/requirements.txt -------------------------------------------------------------------------------- /bigcode-evaluation-harness/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/setup.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/templates/new_task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/templates/new_task.py -------------------------------------------------------------------------------- /bigcode-evaluation-harness/tests/data/mbpp_eval_gens.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/tests/data/mbpp_eval_gens.json -------------------------------------------------------------------------------- /bigcode-evaluation-harness/tests/data/mbpp_gen_gens.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/tests/data/mbpp_gen_gens.json -------------------------------------------------------------------------------- /bigcode-evaluation-harness/tests/data/mbpp_gen_refs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/tests/data/mbpp_gen_refs.json -------------------------------------------------------------------------------- /bigcode-evaluation-harness/tests/test_prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/bigcode-evaluation-harness/tests/test_prompts.py -------------------------------------------------------------------------------- /fastchat/.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/.github/PULL_REQUEST_TEMPLATE.md -------------------------------------------------------------------------------- /fastchat/.github/workflows/python-package.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/.github/workflows/python-package.yml -------------------------------------------------------------------------------- /fastchat/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/.gitignore -------------------------------------------------------------------------------- /fastchat/.pylintrc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/.pylintrc -------------------------------------------------------------------------------- /fastchat/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/LICENSE -------------------------------------------------------------------------------- /fastchat/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/README.md -------------------------------------------------------------------------------- /fastchat/assets/demo_narrow.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/assets/demo_narrow.gif -------------------------------------------------------------------------------- /fastchat/assets/qa_browser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/assets/qa_browser.png -------------------------------------------------------------------------------- /fastchat/assets/screenshot_cli.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/assets/screenshot_cli.png -------------------------------------------------------------------------------- /fastchat/assets/screenshot_gui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/assets/screenshot_gui.png -------------------------------------------------------------------------------- /fastchat/assets/server_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/assets/server_arch.png -------------------------------------------------------------------------------- /fastchat/assets/vicuna_logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/assets/vicuna_logo.jpeg -------------------------------------------------------------------------------- /fastchat/docker/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docker/Dockerfile -------------------------------------------------------------------------------- /fastchat/docker/docker-compose.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docker/docker-compose.yml -------------------------------------------------------------------------------- /fastchat/docs/arena.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/arena.md -------------------------------------------------------------------------------- /fastchat/docs/awq.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/awq.md -------------------------------------------------------------------------------- /fastchat/docs/commands/conv_release.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/commands/conv_release.md -------------------------------------------------------------------------------- /fastchat/docs/commands/data_cleaning.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/commands/data_cleaning.md -------------------------------------------------------------------------------- /fastchat/docs/commands/leaderboard.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/commands/leaderboard.md -------------------------------------------------------------------------------- /fastchat/docs/commands/local_cluster.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/commands/local_cluster.md -------------------------------------------------------------------------------- /fastchat/docs/commands/pypi.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/commands/pypi.md -------------------------------------------------------------------------------- /fastchat/docs/commands/webserver.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/commands/webserver.md -------------------------------------------------------------------------------- /fastchat/docs/dashinfer_integration.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/dashinfer_integration.md -------------------------------------------------------------------------------- /fastchat/docs/dataset_release.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/dataset_release.md -------------------------------------------------------------------------------- /fastchat/docs/exllama_v2.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/exllama_v2.md -------------------------------------------------------------------------------- /fastchat/docs/gptq.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/gptq.md -------------------------------------------------------------------------------- /fastchat/docs/langchain_integration.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/langchain_integration.md -------------------------------------------------------------------------------- /fastchat/docs/lightllm_integration.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/lightllm_integration.md -------------------------------------------------------------------------------- /fastchat/docs/mlx_integration.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/mlx_integration.md -------------------------------------------------------------------------------- /fastchat/docs/model_support.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/model_support.md -------------------------------------------------------------------------------- /fastchat/docs/openai_api.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/openai_api.md -------------------------------------------------------------------------------- /fastchat/docs/server_arch.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/server_arch.md -------------------------------------------------------------------------------- /fastchat/docs/third_party_ui.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/third_party_ui.md -------------------------------------------------------------------------------- /fastchat/docs/training.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/training.md -------------------------------------------------------------------------------- /fastchat/docs/vicuna_weights_version.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/vicuna_weights_version.md -------------------------------------------------------------------------------- /fastchat/docs/vllm_integration.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/vllm_integration.md -------------------------------------------------------------------------------- /fastchat/docs/xFasterTransformer.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/docs/xFasterTransformer.md -------------------------------------------------------------------------------- /fastchat/fastchat/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.36" 2 | -------------------------------------------------------------------------------- /fastchat/fastchat/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/constants.py -------------------------------------------------------------------------------- /fastchat/fastchat/conversation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/conversation.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fastchat/fastchat/data/clean_sharegpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/clean_sharegpt.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/convert_alpaca.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/convert_alpaca.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/extract_gpt4_only.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/extract_gpt4_only.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/extract_single_round.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/extract_single_round.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/filter_wrong_format.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/filter_wrong_format.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/get_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/get_stats.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/hardcoded_questions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/hardcoded_questions.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/inspect_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/inspect_data.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/merge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/merge.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/optional_clean.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/optional_clean.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/optional_replace.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/optional_replace.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/prepare_all.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/prepare_all.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/pretty_json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/pretty_json.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/sample.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/sample.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/split_long_conversation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/split_long_conversation.py -------------------------------------------------------------------------------- /fastchat/fastchat/data/split_train_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/data/split_train_test.py -------------------------------------------------------------------------------- /fastchat/fastchat/llm_judge/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/llm_judge/README.md -------------------------------------------------------------------------------- /fastchat/fastchat/llm_judge/clean_judgment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/llm_judge/clean_judgment.py -------------------------------------------------------------------------------- /fastchat/fastchat/llm_judge/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/llm_judge/common.py -------------------------------------------------------------------------------- /fastchat/fastchat/llm_judge/compute_agreement.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/llm_judge/compute_agreement.py -------------------------------------------------------------------------------- /fastchat/fastchat/llm_judge/custom_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/llm_judge/custom_utils.py -------------------------------------------------------------------------------- /fastchat/fastchat/llm_judge/data/judge_prompts.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/llm_judge/data/judge_prompts.jsonl -------------------------------------------------------------------------------- /fastchat/fastchat/llm_judge/data/mt_bench/misc/radar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/llm_judge/data/mt_bench/misc/radar.png -------------------------------------------------------------------------------- /fastchat/fastchat/llm_judge/data/mt_bench/question.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/llm_judge/data/mt_bench/question.jsonl -------------------------------------------------------------------------------- /fastchat/fastchat/llm_judge/gen_api_answer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/llm_judge/gen_api_answer.py -------------------------------------------------------------------------------- /fastchat/fastchat/llm_judge/gen_judgment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/llm_judge/gen_judgment.py -------------------------------------------------------------------------------- /fastchat/fastchat/llm_judge/gen_model_answer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/llm_judge/gen_model_answer.py -------------------------------------------------------------------------------- /fastchat/fastchat/llm_judge/qa_browser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/llm_judge/qa_browser.py -------------------------------------------------------------------------------- /fastchat/fastchat/llm_judge/show_result.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/llm_judge/show_result.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/__init__.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/apply_delta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/apply_delta.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/apply_lora.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/apply_lora.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/compression.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/compression.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/convert_fp16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/convert_fp16.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/llama_condense_monkey_patch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/llama_condense_monkey_patch.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/make_delta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/make_delta.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/model_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/model_adapter.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/model_chatglm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/model_chatglm.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/model_cllm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/model_cllm.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/model_codet5p.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/model_codet5p.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/model_exllama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/model_exllama.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/model_falcon.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/model_falcon.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/model_registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/model_registry.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/model_xfastertransformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/model_xfastertransformer.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/model_yuan2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/model_yuan2.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/monkey_patch_non_inplace.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/monkey_patch_non_inplace.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/rwkv_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/rwkv_model.py -------------------------------------------------------------------------------- /fastchat/fastchat/model/upload_hub.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/model/upload_hub.py -------------------------------------------------------------------------------- /fastchat/fastchat/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fastchat/fastchat/modules/awq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/modules/awq.py -------------------------------------------------------------------------------- /fastchat/fastchat/modules/exllama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/modules/exllama.py -------------------------------------------------------------------------------- /fastchat/fastchat/modules/gptq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/modules/gptq.py -------------------------------------------------------------------------------- /fastchat/fastchat/modules/xfastertransformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/modules/xfastertransformer.py -------------------------------------------------------------------------------- /fastchat/fastchat/protocol/api_protocol.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/protocol/api_protocol.py -------------------------------------------------------------------------------- /fastchat/fastchat/protocol/openai_api_protocol.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/protocol/openai_api_protocol.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fastchat/fastchat/serve/api_provider.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/api_provider.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/base_model_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/base_model_worker.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/call_monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/call_monitor.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/cli.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/controller.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/controller.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/dashinfer_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/dashinfer_worker.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/example_images/distracted.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/example_images/distracted.jpg -------------------------------------------------------------------------------- /fastchat/fastchat/serve/example_images/fridge.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/example_images/fridge.jpg -------------------------------------------------------------------------------- /fastchat/fastchat/serve/gateway/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/gateway/README.md -------------------------------------------------------------------------------- /fastchat/fastchat/serve/gateway/nginx.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/gateway/nginx.conf -------------------------------------------------------------------------------- /fastchat/fastchat/serve/gradio_block_arena_anony.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/gradio_block_arena_anony.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/gradio_block_arena_named.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/gradio_block_arena_named.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/gradio_block_arena_vision.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/gradio_block_arena_vision.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/gradio_block_arena_vision_anony.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/gradio_block_arena_vision_anony.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/gradio_block_arena_vision_named.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/gradio_block_arena_vision_named.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/gradio_global_state.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/gradio_global_state.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/gradio_web_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/gradio_web_server.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/gradio_web_server_multi.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/gradio_web_server_multi.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/huggingface_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/huggingface_api.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/huggingface_api_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/huggingface_api_worker.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/inference.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/launch_all_serve.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/launch_all_serve.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/lightllm_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/lightllm_worker.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/mlx_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/mlx_worker.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/model_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/model_worker.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/add_markdown_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/add_markdown_info.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/basic_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/basic_stats.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/classify/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/classify/README.md -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/classify/category.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/classify/category.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/classify/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/classify/config.yaml -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/classify/display_score.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/classify/display_score.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/classify/label.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/classify/label.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/clean_battle_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/clean_battle_data.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/clean_chat_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/clean_chat_data.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/code_tagger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/code_tagger.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/criteria_labeling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/criteria_labeling.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/deduplication.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/deduplication.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/elo_analysis.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/elo_analysis.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/inspect_conv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/inspect_conv.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/intersect_conv_file.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/intersect_conv_file.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/leaderboard_csv_to_html.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/leaderboard_csv_to_html.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/monitor.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/monitor_md.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/monitor_md.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/rating_systems.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/rating_systems.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/summarize_cluster.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/summarize_cluster.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/tag_openai_moderation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/tag_openai_moderation.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/topic_clustering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/topic_clustering.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/vote_time_stats/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/vote_time_stats/README.md -------------------------------------------------------------------------------- /fastchat/fastchat/serve/monitor/vote_time_stats/plot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/monitor/vote_time_stats/plot.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/multi_model_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/multi_model_worker.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/openai_api_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/openai_api_server.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/register_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/register_worker.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/remote_logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/remote_logger.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/sglang_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/sglang_worker.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/shutdown_serve.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/shutdown_serve.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/test_message.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/test_message.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/test_throughput.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/test_throughput.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/vision/create_vqa_examples_dir.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/vision/create_vqa_examples_dir.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/vision/create_vqa_examples_json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/vision/create_vqa_examples_json.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/vision/image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/vision/image.py -------------------------------------------------------------------------------- /fastchat/fastchat/serve/vllm_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/serve/vllm_worker.py -------------------------------------------------------------------------------- /fastchat/fastchat/train/llama2_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/train/llama2_flash_attn_monkey_patch.py -------------------------------------------------------------------------------- /fastchat/fastchat/train/llama_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/train/llama_flash_attn_monkey_patch.py -------------------------------------------------------------------------------- /fastchat/fastchat/train/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/train/train.py -------------------------------------------------------------------------------- /fastchat/fastchat/train/train_baichuan.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/train/train_baichuan.py -------------------------------------------------------------------------------- /fastchat/fastchat/train/train_flant5.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/train/train_flant5.py -------------------------------------------------------------------------------- /fastchat/fastchat/train/train_lora.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/train/train_lora.py -------------------------------------------------------------------------------- /fastchat/fastchat/train/train_lora_t5.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/train/train_lora_t5.py -------------------------------------------------------------------------------- /fastchat/fastchat/train/train_mem.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/train/train_mem.py -------------------------------------------------------------------------------- /fastchat/fastchat/train/train_with_template.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/train/train_with_template.py -------------------------------------------------------------------------------- /fastchat/fastchat/train/train_xformers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/train/train_xformers.py -------------------------------------------------------------------------------- /fastchat/fastchat/train/train_yuan2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/train/train_yuan2.py -------------------------------------------------------------------------------- /fastchat/fastchat/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/fastchat/utils.py -------------------------------------------------------------------------------- /fastchat/format.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/format.sh -------------------------------------------------------------------------------- /fastchat/playground/FastChat_API_GoogleColab.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/playground/FastChat_API_GoogleColab.ipynb -------------------------------------------------------------------------------- /fastchat/playground/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fastchat/playground/benchmark/benchmark_api_provider.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/playground/benchmark/benchmark_api_provider.py -------------------------------------------------------------------------------- /fastchat/playground/deepspeed_config_s2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/playground/deepspeed_config_s2.json -------------------------------------------------------------------------------- /fastchat/playground/deepspeed_config_s3.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/playground/deepspeed_config_s3.json -------------------------------------------------------------------------------- /fastchat/playground/test_embedding/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/playground/test_embedding/README.md -------------------------------------------------------------------------------- /fastchat/playground/test_embedding/test_classification.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/playground/test_embedding/test_classification.py -------------------------------------------------------------------------------- /fastchat/playground/test_embedding/test_semantic_search.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/playground/test_embedding/test_semantic_search.py -------------------------------------------------------------------------------- /fastchat/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/pyproject.toml -------------------------------------------------------------------------------- /fastchat/scripts/build-api.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/scripts/build-api.sh -------------------------------------------------------------------------------- /fastchat/scripts/test_readme_train.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/scripts/test_readme_train.sh -------------------------------------------------------------------------------- /fastchat/scripts/train_lora.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/scripts/train_lora.sh -------------------------------------------------------------------------------- /fastchat/scripts/train_vicuna_13b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/scripts/train_vicuna_13b.sh -------------------------------------------------------------------------------- /fastchat/scripts/train_vicuna_7b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/scripts/train_vicuna_7b.sh -------------------------------------------------------------------------------- /fastchat/scripts/upload_pypi.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/scripts/upload_pypi.sh -------------------------------------------------------------------------------- /fastchat/tests/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/tests/README.md -------------------------------------------------------------------------------- /fastchat/tests/killall_python.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/tests/killall_python.sh -------------------------------------------------------------------------------- /fastchat/tests/launch_openai_api_test_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/tests/launch_openai_api_test_server.py -------------------------------------------------------------------------------- /fastchat/tests/load_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/tests/load_test.py -------------------------------------------------------------------------------- /fastchat/tests/test_cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/tests/test_cli.py -------------------------------------------------------------------------------- /fastchat/tests/test_cli_inputs.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/tests/test_cli_inputs.txt -------------------------------------------------------------------------------- /fastchat/tests/test_image_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/tests/test_image_utils.py -------------------------------------------------------------------------------- /fastchat/tests/test_openai_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/tests/test_openai_api.py -------------------------------------------------------------------------------- /fastchat/tests/test_openai_langchain.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/tests/test_openai_langchain.py -------------------------------------------------------------------------------- /fastchat/tests/test_openai_vision_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/fastchat/tests/test_openai_vision_api.py -------------------------------------------------------------------------------- /llm-jp-eval/.github/dependabot.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/.github/dependabot.yml -------------------------------------------------------------------------------- /llm-jp-eval/.github/workflows/lint.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/.github/workflows/lint.yml -------------------------------------------------------------------------------- /llm-jp-eval/.github/workflows/requirements.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/.github/workflows/requirements.yml -------------------------------------------------------------------------------- /llm-jp-eval/.github/workflows/test.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/.github/workflows/test.yml -------------------------------------------------------------------------------- /llm-jp-eval/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/.gitignore -------------------------------------------------------------------------------- /llm-jp-eval/.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/.pre-commit-config.yaml -------------------------------------------------------------------------------- /llm-jp-eval/CITATION.cff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/CITATION.cff -------------------------------------------------------------------------------- /llm-jp-eval/DATASET.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/DATASET.md -------------------------------------------------------------------------------- /llm-jp-eval/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/LICENSE -------------------------------------------------------------------------------- /llm-jp-eval/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/README.md -------------------------------------------------------------------------------- /llm-jp-eval/README_en.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/README_en.md -------------------------------------------------------------------------------- /llm-jp-eval/REFERENCES.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/REFERENCES.md -------------------------------------------------------------------------------- /llm-jp-eval/configs/config_no-sample.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/configs/config_no-sample.yaml -------------------------------------------------------------------------------- /llm-jp-eval/configs/config_template.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/configs/config_template.yaml -------------------------------------------------------------------------------- /llm-jp-eval/configs/model/llm-jp_llm-jp-1.3b-v1.0.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/configs/model/llm-jp_llm-jp-1.3b-v1.0.yaml -------------------------------------------------------------------------------- /llm-jp-eval/configs/tokenizer/llm-jp_llm-jp-1.3b-v1.0.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/configs/tokenizer/llm-jp_llm-jp-1.3b-v1.0.yaml -------------------------------------------------------------------------------- /llm-jp-eval/poetry.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/poetry.lock -------------------------------------------------------------------------------- /llm-jp-eval/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/pyproject.toml -------------------------------------------------------------------------------- /llm-jp-eval/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/requirements.txt -------------------------------------------------------------------------------- /llm-jp-eval/scripts/evaluate_llm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/scripts/evaluate_llm.py -------------------------------------------------------------------------------- /llm-jp-eval/scripts/jmmlu_statistics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/scripts/jmmlu_statistics.py -------------------------------------------------------------------------------- /llm-jp-eval/scripts/preprocess_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/scripts/preprocess_dataset.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/__init__.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/alt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/alt.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/base.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/chabsa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/chabsa.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/jamp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/jamp.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/janli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/janli.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/jblimp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/jblimp.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/jcola.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/jcola.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/jcommonsenseqa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/jcommonsenseqa.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/jemhopqa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/jemhopqa.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/jmmlu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/jmmlu.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/jnli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/jnli.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/jsem.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/jsem.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/jsick.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/jsick.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/jsquad.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/jsquad.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/jsts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/jsts.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/mawps.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/mawps.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/mmluen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/mmluen.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/niilc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/niilc.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/wiki/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/wiki/__init__.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/wiki/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/wiki/base.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/wiki/coreference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/wiki/coreference.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/wiki/dependency.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/wiki/dependency.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/wiki/ner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/wiki/ner.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/wiki/pas.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/wiki/pas.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/wiki/reading.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/wiki/reading.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/datasets/wikicorpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/datasets/wikicorpus.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/evaluator.py -------------------------------------------------------------------------------- /llm-jp-eval/src/llm_jp_eval/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/src/llm_jp_eval/utils.py -------------------------------------------------------------------------------- /llm-jp-eval/tests/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/tests/conftest.py -------------------------------------------------------------------------------- /llm-jp-eval/tests/data/wiki00132787.knp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/tests/data/wiki00132787.knp -------------------------------------------------------------------------------- /llm-jp-eval/tests/data/wiki00268469.knp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/tests/data/wiki00268469.knp -------------------------------------------------------------------------------- /llm-jp-eval/tests/data/wiki00280639.knp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/tests/data/wiki00280639.knp -------------------------------------------------------------------------------- /llm-jp-eval/tests/datasets/test_wiki_coreference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/tests/datasets/test_wiki_coreference.py -------------------------------------------------------------------------------- /llm-jp-eval/tests/datasets/test_wiki_dependency.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/tests/datasets/test_wiki_dependency.py -------------------------------------------------------------------------------- /llm-jp-eval/tests/datasets/test_wiki_ner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/tests/datasets/test_wiki_ner.py -------------------------------------------------------------------------------- /llm-jp-eval/tests/datasets/test_wiki_pas.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/tests/datasets/test_wiki_pas.py -------------------------------------------------------------------------------- /llm-jp-eval/tests/datasets/test_wiki_reading.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/llm-jp-eval/tests/datasets/test_wiki_reading.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/.coveragerc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/.coveragerc -------------------------------------------------------------------------------- /lm-evaluation-harness-en/.flake8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/.flake8 -------------------------------------------------------------------------------- /lm-evaluation-harness-en/.github/workflows/new_tasks.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/.github/workflows/new_tasks.yml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/.github/workflows/publish.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/.github/workflows/publish.yml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/.github/workflows/unit_tests.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/.github/workflows/unit_tests.yml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/.gitignore -------------------------------------------------------------------------------- /lm-evaluation-harness-en/.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/.pre-commit-config.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/CITATION.bib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/CITATION.bib -------------------------------------------------------------------------------- /lm-evaluation-harness-en/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @haileyschoelkopf @lintangsutawika 2 | -------------------------------------------------------------------------------- /lm-evaluation-harness-en/LICENSE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/LICENSE.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/docs/CONTRIBUTING.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/docs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/docs/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/docs/decontamination.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/docs/decontamination.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/docs/img/fewshot_example_gpt3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/docs/img/fewshot_example_gpt3.png -------------------------------------------------------------------------------- /lm-evaluation-harness-en/docs/interface.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/docs/interface.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/docs/model_guide.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/docs/model_guide.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/docs/new_task_guide.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/docs/new_task_guide.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/docs/task_guide.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/docs/task_guide.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/examples/lm-eval-overview.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/examples/lm-eval-overview.ipynb -------------------------------------------------------------------------------- /lm-evaluation-harness-en/examples/visualize-wandb.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/examples/visualize-wandb.ipynb -------------------------------------------------------------------------------- /lm-evaluation-harness-en/examples/visualize-zeno.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/examples/visualize-zeno.ipynb -------------------------------------------------------------------------------- /lm-evaluation-harness-en/ignore.txt: -------------------------------------------------------------------------------- 1 | ROUGE 2 | rouge 3 | nin 4 | maka 5 | mor 6 | te 7 | ond 8 | extraversion 9 | -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/__init__.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/__main__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/__main__.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/api/filter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/api/filter.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/api/instance.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/api/instance.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/api/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/api/metrics.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/api/model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/api/model.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/api/registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/api/registry.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/api/samplers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/api/samplers.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/api/task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/api/task.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/caching/cache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/caching/cache.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/decontamination/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/evaluator.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/evaluator_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/evaluator_utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/filters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/filters/__init__.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/filters/extraction.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/filters/extraction.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/filters/selection.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/filters/selection.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/filters/transformation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/filters/transformation.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/logging_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/logging_utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/models/__init__.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/models/anthropic_llms.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/models/anthropic_llms.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/models/dummy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/models/dummy.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/models/gguf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/models/gguf.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/models/huggingface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/models/huggingface.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/models/mamba_lm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/models/mamba_lm.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/models/neuron_optimum.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/models/neuron_optimum.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/models/optimum_lm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/models/optimum_lm.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/models/textsynth.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/models/textsynth.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/models/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/models/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/models/vllm_causallms.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/models/vllm_causallms.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/prompts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/prompts/__init__.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/__init__.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/aexams/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/aexams/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/agieval/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/agieval/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/agieval/math.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/agieval/math.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/agieval/sat-en.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/agieval/sat-en.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/agieval/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/agieval/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/ammlu/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/ammlu/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/anli/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/anli/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/anli/anli_r1.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/anli/anli_r1.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/anli/anli_r2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/anli/anli_r2.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/anli/anli_r3.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/anli/anli_r3.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/arc/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/arc/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/arc/arc_easy.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/arc/arc_easy.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/asdiv/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/asdiv/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/asdiv/default.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/asdiv/default.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/babi/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/babi/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/babi/babi.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/babi/babi.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/bbh/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/bbh/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/belebele/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/belebele/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/bigbench/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/bigbench/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/blimp/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/blimp/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/ceval/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/ceval/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/cmmlu/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/cmmlu/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/coqa/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/coqa/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/coqa/default.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/coqa/default.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/coqa/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/coqa/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/csatqa/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/csatqa/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/drop/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/drop/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/drop/default.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/drop/default.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/drop/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/drop/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/eq_bench/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/eq_bench/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/eq_bench/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/eq_bench/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/fld/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/fld/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/fld/fld_star.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/fld/fld_star.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/glue/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/glue/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/glue/mnli/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/glue/mnli/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/gpqa/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/gpqa/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/gsm8k/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/gsm8k/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/gsm8k/gsm8k.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/gsm8k/gsm8k.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/haerae/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/haerae/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/headqa/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/headqa/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/hellaswag/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/hellaswag/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/hellaswag/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/hellaswag/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/ifeval/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/ifeval/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/ifeval/ifeval.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/ifeval/ifeval.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/ifeval/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/ifeval/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/kmmlu/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/kmmlu/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/kobest/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/kobest/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/kobest/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/kobest/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/lambada/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/lambada/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/logiqa/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/logiqa/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/logiqa/logiqa.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/logiqa/logiqa.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/logiqa2/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/logiqa2/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/math_500/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/math_500/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/math_500/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/math_500/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/mathqa/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/mathqa/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/mathqa/mathqa.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/mathqa/mathqa.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/mathqa/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/mathqa/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/mc_taco/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/mc_taco/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/medqa/medqa.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/medqa/medqa.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/mgsm/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/mgsm/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/mgsm/gen_yaml.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/mgsm/gen_yaml.sh -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/mgsm/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/mgsm/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/mutual/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/mutual/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/mutual/mutual.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/mutual/mutual.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/mutual/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/mutual/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/nq_open/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/paws-x/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/paws-x/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/paws-x/paws_de.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/paws-x/paws_de.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/paws-x/paws_en.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/paws-x/paws_en.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/paws-x/paws_es.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/paws-x/paws_es.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/paws-x/paws_fr.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/paws-x/paws_fr.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/paws-x/paws_ja.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/paws-x/paws_ja.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/paws-x/paws_ko.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/paws-x/paws_ko.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/paws-x/paws_zh.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/paws-x/paws_zh.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/pile/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/pile/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/piqa/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/piqa/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/piqa/piqa.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/piqa/piqa.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/polemo2/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/polemo2/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/prost/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/prost/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/pubmedqa/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/pubmedqa/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/qa4mre/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/qa4mre/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/qasper/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/qasper/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/qasper/bool.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/qasper/bool.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/qasper/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/qasper/metrics.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/qasper/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/qasper/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/race/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/race/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/race/race.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/race/race.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/sciq/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/sciq/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/sciq/sciq.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/sciq/sciq.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/scrolls/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/scrolls/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/scrolls/task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/scrolls/task.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/siqa/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/siqa/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/siqa/siqa.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/siqa/siqa.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/squadv2/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/squadv2/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/squadv2/task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/squadv2/task.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/swag/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/swag/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/swag/swag.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/swag/swag.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/toxigen/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/toxigen/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/toxigen/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/toxigen/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/triviaqa/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/triviaqa/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/truthfulqa/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/truthfulqa/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/webqs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/webqs/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/webqs/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/webqs/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/webqs/webqs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/webqs/webqs.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/wikitext/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/wikitext/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/wmdp/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/wmdp/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/wmdp/wmdp_bio.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/wmdp/wmdp_bio.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/wmdp/wmdp_chem.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/wmdp/wmdp_chem.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/wmt2016/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/wmt2016/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/wmt2016/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/wmt2016/metrics.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/wsc273/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/wsc273/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/wsc273/default.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/wsc273/default.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/wsc273/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/wsc273/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xcopa/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xcopa/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xcopa/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xcopa/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_ar.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_ar.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_bg.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_bg.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_de.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_de.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_el.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_el.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_en.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_en.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_es.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_es.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_fr.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_fr.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_hi.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_hi.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_ru.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_ru.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_sw.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_sw.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_th.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_th.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_tr.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_tr.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_ur.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_ur.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_vi.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_vi.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_zh.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xnli/xnli_zh.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xwinograd/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xwinograd/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/tasks/xwinograd/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/tasks/xwinograd/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/lm_eval/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/lm_eval/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/mypy.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/mypy.ini -------------------------------------------------------------------------------- /lm-evaluation-harness-en/pile_statistics.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/pile_statistics.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/pyproject.toml -------------------------------------------------------------------------------- /lm-evaluation-harness-en/requirements.txt: -------------------------------------------------------------------------------- 1 | -e . 2 | -------------------------------------------------------------------------------- /lm-evaluation-harness-en/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-en/scripts/build_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/scripts/build_benchmark.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/scripts/clean_training_data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-en/scripts/cost_estimate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/scripts/cost_estimate.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/scripts/get_prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/scripts/get_prompts.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/scripts/make_gpt2_test_cases.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/scripts/make_gpt2_test_cases.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/scripts/make_table_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/scripts/make_table_results.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/scripts/make_table_tasks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/scripts/make_table_tasks.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/scripts/model_comparator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/scripts/model_comparator.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/scripts/regression.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/scripts/regression.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/scripts/requests_caching.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/scripts/requests_caching.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/scripts/write_out.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/scripts/write_out.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/scripts/zeno_visualize.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/scripts/zeno_visualize.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/setup.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tasks.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tasks.txt -------------------------------------------------------------------------------- /lm-evaluation-harness-en/templates/new_yaml_task/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/templates/new_yaml_task/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-en/templates/new_yaml_task/blank_yaml.yaml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/models/test_gguf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/models/test_gguf.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/models/test_huggingface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/models/test_huggingface.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/models/test_openvino.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/models/test_openvino.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/models/test_vllm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/models/test_vllm.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/test_cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/test_cli.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/test_evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/test_evaluator.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/test_janitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/test_janitor.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/test_misc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/test_misc.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/test_requests_caching.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/test_requests_caching.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/test_tasks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/test_tasks.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/test_utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/boolq-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/boolq-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/boolq-v1-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/boolq-v1-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/cb-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/cb-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/cb-v1-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/cb-v1-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/cola-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/cola-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/copa-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/copa-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/coqa-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/coqa-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/coqa-v1-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/coqa-v1-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/drop-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/drop-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/drop-v1-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/drop-v1-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/gsm8k-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/gsm8k-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/headqa-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/headqa-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/logiqa-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/logiqa-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/mathqa-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/mathqa-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/mnli-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/mnli-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/mrpc-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/mrpc-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/mutual-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/mutual-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/mutual-v1-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/mutual-v1-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/piqa-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/piqa-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/prost-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/prost-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/qnli-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/qnli-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/qqp-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/qqp-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/race-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/race-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/record-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/record-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/rte-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/rte-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/sciq-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/sciq-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/squad2-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/squad2-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/squad2-v1-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/squad2-v1-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/sst-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/sst-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/swag-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/swag-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/webqs-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/webqs-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/wic-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/wic-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/wnli-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/wnli-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/wnli-v1-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/wnli-v1-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/wsc-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/wsc-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/testdata/wsc273-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/testdata/wsc273-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-en/tests/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-en/tests/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/.coveragerc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/.coveragerc -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/.flake8: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/.flake8 -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/.gitignore -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/.pre-commit-config.yaml -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/CITATION.bib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/CITATION.bib -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @jon-tow @StellaAthena 2 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/LICENSE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/LICENSE.md -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/docs/decontamination.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/docs/decontamination.md -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/docs/description_guide.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/docs/description_guide.md -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/docs/jptasks.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/docs/jptasks.md -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/docs/prompt_templates.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/docs/prompt_templates.md -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/docs/task_guide.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/docs/task_guide.md -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/docs/task_table.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/docs/task_table.md -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/ignore.txt: -------------------------------------------------------------------------------- 1 | ROUGE 2 | rouge 3 | nin 4 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/base.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/datasets/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/asdiv/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/asdiv/asdiv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/datasets/asdiv/asdiv.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/coqa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/coqa/coqa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/datasets/coqa/coqa.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/drop/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/drop/drop.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/datasets/drop/drop.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/headqa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/hendrycks_ethics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/hendrycks_math/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/lambada_ja/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/logiqa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/mutual/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/pile/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/pile/pile.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/datasets/pile/pile.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/quac/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/quac/quac.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/datasets/quac/quac.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/sat_analogies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/triviaqa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/datasets/unscramble/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/decontamination/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/evaluator.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/jasquad/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/jasquad/README.md -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/jasquad/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/jasquad/evaluate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/jasquad/evaluate.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/jasquad/jasquad.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/jasquad/jasquad.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/jasquad/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/metrics.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/models/__init__.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/models/dummy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/models/dummy.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/models/gpt2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/models/gpt2.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/models/gpt3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/models/gpt3.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/models/huggingface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/models/huggingface.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/models/textsynth.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/models/textsynth.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/__init__.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/anli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/anli.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/arc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/arc.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/arithmetic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/arithmetic.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/asdiv.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/asdiv.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/blimp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/blimp.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/cbt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/cbt.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/coqa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/coqa.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/crowspairs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/crowspairs.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/drop.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/drop.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/glue.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/glue.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/gsm8k.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/gsm8k.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/headqa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/headqa.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/hellaswag.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/hellaswag.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/hendrycks_math.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/hendrycks_math.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/hendrycks_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/hendrycks_test.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/ja/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/ja/__init__.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/ja/jaqket_v1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/ja/jaqket_v1.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/ja/jaqket_v2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/ja/jaqket_v2.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/ja/jaquad.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/ja/jaquad.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/ja/jblimp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/ja/jblimp.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/ja/jcola.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/ja/jcola.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/ja/jnli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/ja/jnli.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/ja/jsquad.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/ja/jsquad.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/ja/marc_ja.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/ja/marc_ja.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/ja/mgsm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/ja/mgsm.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/ja/wikilingua.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/ja/wikilingua.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/ja/xlsum_ja.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/ja/xlsum_ja.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/ja/xwinograd.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/ja/xwinograd.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/lambada.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/lambada.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/lambada_cloze.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/lambada_cloze.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/logiqa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/logiqa.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/mathqa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/mathqa.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/mc_taco.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/mc_taco.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/mutual.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/mutual.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/naturalqs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/naturalqs.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/openbookqa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/openbookqa.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/pile.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/pile.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/piqa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/piqa.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/prost.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/prost.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/pubmedqa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/pubmedqa.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/qa4mre.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/qa4mre.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/qasper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/qasper.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/quac.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/quac.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/race.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/race.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/sat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/sat.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/sciq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/sciq.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/squad.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/squad.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/storycloze.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/storycloze.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/superglue.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/superglue.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/swag.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/swag.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/toxigen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/toxigen.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/translation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/translation.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/triviaqa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/triviaqa.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/truthfulqa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/truthfulqa.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/unscramble.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/unscramble.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/webqs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/webqs.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/wikitext.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/wikitext.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/winogrande.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/winogrande.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/tasks/wsc273.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/tasks/wsc273.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/lm_eval/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/lm_eval/utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/main.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/models/harness.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/models/harness.conf -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/models/rinna/harness.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/models/rinna/harness.conf -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/models/stablelm/harness.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/models/stablelm/harness.conf -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/pile_statistics.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/pile_statistics.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/requirements-ja.txt: -------------------------------------------------------------------------------- 1 | emoji 2 | fugashi==1.2.1 3 | neologdn>=0.5.2 4 | unidic-lite==1.0.8 5 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/requirements.txt: -------------------------------------------------------------------------------- 1 | -e . 2 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/scripts/clean_training_data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/scripts/cost_estimate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/scripts/cost_estimate.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/scripts/generate_harness.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/scripts/generate_harness.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/scripts/get_prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/scripts/get_prompts.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/scripts/make_gpt2_test_cases.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/scripts/make_gpt2_test_cases.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/scripts/make_leaderboard.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/scripts/make_leaderboard.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/scripts/make_table_tasks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/scripts/make_table_tasks.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/scripts/merge_json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/scripts/merge_json.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/scripts/models.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/scripts/models.txt -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/scripts/run_task.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/scripts/run_task.sh -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/scripts/run_task_batch.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/scripts/run_task_batch.sh -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/scripts/run_task_for_models.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/scripts/run_task_for_models.sh -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/scripts/write_out.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/scripts/write_out.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/setup.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/templates/new_task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/templates/new_task.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/test_description_dict.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/test_description_dict.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/test_evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/test_evaluator.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/test_generate_13_grams.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/test_generate_13_grams.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/test_janitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/test_janitor.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/test_misc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/test_misc.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/test_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/test_models.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/test_tasks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/test_tasks.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/test_utils.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/test_version_stable.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/test_version_stable.py -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/cb-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/cb-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/cb-v1-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/cb-v1-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/cola-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/cola-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/copa-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/copa-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/coqa-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/coqa-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/coqa-v1-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/coqa-v1-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/drop-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/drop-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/drop-v1-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/drop-v1-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/mnli-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/mnli-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/mrpc-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/mrpc-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/piqa-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/piqa-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/qnli-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/qnli-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/qqp-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/qqp-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/race-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/race-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/rte-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/rte-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/sciq-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/sciq-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/sst-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/sst-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/swag-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/swag-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/wic-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/wic-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/wnli-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/wnli-v0-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/wnli-v1-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/wnli-v1-res.json -------------------------------------------------------------------------------- /lm-evaluation-harness-jp/tests/testdata/wsc-v0-res.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/lm-evaluation-harness-jp/tests/testdata/wsc-v0-res.json -------------------------------------------------------------------------------- /scripts/aggregate_result.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/aggregate_result.py -------------------------------------------------------------------------------- /scripts/column-path-key.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/column-path-key.csv -------------------------------------------------------------------------------- /scripts/evaluate_english_bbh.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/evaluate_english_bbh.sh -------------------------------------------------------------------------------- /scripts/evaluate_english_general.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/evaluate_english_general.sh -------------------------------------------------------------------------------- /scripts/evaluate_english_gpqa.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/evaluate_english_gpqa.sh -------------------------------------------------------------------------------- /scripts/evaluate_english_humaneval-unstripped.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/evaluate_english_humaneval-unstripped.sh -------------------------------------------------------------------------------- /scripts/evaluate_english_math.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/evaluate_english_math.sh -------------------------------------------------------------------------------- /scripts/evaluate_english_mbpp.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/evaluate_english_mbpp.sh -------------------------------------------------------------------------------- /scripts/evaluate_english_mmlu.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/evaluate_english_mmlu.sh -------------------------------------------------------------------------------- /scripts/evaluate_ja_humaneval-unstripped.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/evaluate_ja_humaneval-unstripped.sh -------------------------------------------------------------------------------- /scripts/evaluate_ja_llmjp.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/evaluate_ja_llmjp.sh -------------------------------------------------------------------------------- /scripts/evaluate_ja_mbpp.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/evaluate_ja_mbpp.sh -------------------------------------------------------------------------------- /scripts/evaluate_ja_mgsm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/evaluate_ja_mgsm.sh -------------------------------------------------------------------------------- /scripts/evaluate_ja_mt_bench.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/evaluate_ja_mt_bench.sh -------------------------------------------------------------------------------- /scripts/evaluate_ja_wmt20_enja.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/evaluate_ja_wmt20_enja.sh -------------------------------------------------------------------------------- /scripts/evaluate_ja_wmt20_jaen.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/evaluate_ja_wmt20_jaen.sh -------------------------------------------------------------------------------- /scripts/evaluate_ja_xlsum.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swallow-llm/swallow-evaluation/HEAD/scripts/evaluate_ja_xlsum.sh --------------------------------------------------------------------------------