├── .github └── workflows │ └── black.yaml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CITATION.cff ├── CONTRIBUTING.md ├── Makefile ├── README.md ├── configs ├── full_gemma9b_gpt4omini0718.yaml ├── full_gpt4omini0718.yaml ├── light_gemma9b_gpt4omini0718.yaml ├── light_gpt4omini0718.yaml ├── light_gpt4omini0718_jsc.yaml ├── reasoning.yaml ├── reasoning_lite.yaml └── single_task │ ├── gpqa.yaml │ ├── livebench.yaml │ ├── livecodebench.yaml │ └── math500.yaml ├── create_csv.sh ├── create_csv_helper.py ├── database ├── README.md ├── __init__.py ├── config.py ├── models.py └── utils.py ├── eval ├── __init__.py ├── chat_benchmarks │ ├── AIME24 │ │ ├── data │ │ │ └── aime24.json │ │ └── eval_instruct.py │ ├── AIME25 │ │ ├── data │ │ │ └── aime25.json │ │ └── eval_instruct.py │ ├── AIW │ │ ├── data │ │ │ └── aiw_data.json │ │ └── eval_instruct.py │ ├── AMC23 │ │ ├── data │ │ │ └── amc23.json │ │ └── eval_instruct.py │ ├── BigCodeBench │ │ ├── README.md │ │ ├── data │ │ │ ├── BigCodeBench-complete-hard.json │ │ │ ├── BigCodeBench-complete.json │ │ │ ├── BigCodeBench-instruct-hard.json │ │ │ └── BigCodeBench-instruct.json │ │ ├── docker │ │ │ └── Dockerfile │ │ ├── eval_instruct.py │ │ ├── evaluation.py │ │ ├── execution.py │ │ ├── requirements │ │ │ ├── requirements-eval.txt │ │ │ └── requirements.txt │ │ ├── sanitize.py │ │ └── syncheck.py │ ├── CodeElo │ │ ├── codeelo_utils.py │ │ └── eval_instruct.py │ ├── CodeForces │ │ ├── codeforces_utils.py │ │ └── eval_instruct.py │ ├── CruxEval │ │ ├── data │ │ │ └── cruxeval.jsonl │ │ ├── eval_instruct.py │ │ ├── evaluation.py │ │ └── execution.py │ ├── GPQADiamond │ │ ├── eval_instruct.py │ │ └── testing_utils.py │ ├── HLE │ │ ├── eval_instruct.py │ │ ├── run_judge_results.py │ │ └── testing_utils.py │ ├── HMMT │ │ ├── eval_instruct.py │ │ └── matharena │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── README_judges.md │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── configs.py │ │ │ ├── cot_solver.py │ │ │ ├── grader.py │ │ │ ├── parse_manual.py │ │ │ ├── parser.py │ │ │ ├── possible_issues.py │ │ │ ├── pyproject.toml │ │ │ ├── runner.py │ │ │ ├── usamo_report.pdf │ │ │ └── utils.py │ ├── HumanEval │ │ ├── __init__.py │ │ ├── data │ │ │ ├── humaneval-python.jsonl │ │ │ ├── humaneval-sh │ │ │ └── humaneval-sh.jsonl │ │ ├── eval_instruct.py │ │ ├── human_eval │ │ │ ├── __init__.py │ │ │ ├── data.py │ │ │ ├── evaluate_functional_correctness.py │ │ │ ├── evaluation.py │ │ │ └── execution.py │ │ ├── humaneval.py │ │ ├── javatuples-1.2.jar │ │ └── utils │ │ │ ├── dataset.py │ │ │ └── utils.py │ ├── HumanEvalPlus │ │ ├── __init__.py │ │ ├── data │ │ │ └── humanevalplus-python.jsonl │ │ ├── eval_instruct.py │ │ ├── human_eval_plus │ │ │ ├── __init__.py │ │ │ ├── data.py │ │ │ ├── evaluate_functional_correctness.py │ │ │ ├── evaluation.py │ │ │ └── execution.py │ │ ├── humaneval.py │ │ └── utils │ │ │ ├── dataset.py │ │ │ └── utils.py │ ├── IFEval │ │ ├── README.md │ │ ├── __init__.py │ │ ├── data │ │ │ ├── input_data.jsonl │ │ │ └── input_response_data_gpt4_20231107_145030.jsonl │ │ ├── eval_instruct.py │ │ ├── evaluation.py │ │ ├── evaluation_main.py │ │ ├── instructions.py │ │ ├── instructions_registry.py │ │ ├── instructions_test.py │ │ ├── instructions_util.py │ │ ├── requirements.txt │ │ └── run.sh │ ├── JEEBench │ │ ├── eval_instruct.py │ │ └── utils.py │ ├── LiveBench │ │ ├── LICENSE │ │ ├── README-es_CO.md │ │ ├── README.md │ │ ├── assets │ │ │ ├── livebench-2024-07-20.png │ │ │ ├── livebench-2024-07-23.png │ │ │ ├── livebench-2024-07-24.png │ │ │ ├── livebench-2024-07-28.png │ │ │ ├── livebench-2024-08-02.png │ │ │ ├── livebench-2024-08-06.png │ │ │ ├── livebench-2024-08-30.png │ │ │ ├── livebench-2024-09-30.png │ │ │ └── livebench-2024-12-01.png │ │ ├── changelog.md │ │ ├── docs │ │ │ ├── AUTHOR_RESPONSIBILITY.md │ │ │ ├── CODE_OF_CONDUCT.md │ │ │ ├── CONTRIBUTING.md │ │ │ ├── DATASHEET.md │ │ │ └── MAINTENANCE_PLAN.md │ │ ├── eval_instruct.py │ │ ├── livebench │ │ │ ├── __init__.py │ │ │ ├── common.py │ │ │ ├── conversation.py │ │ │ ├── download_leaderboard.py │ │ │ ├── download_questions.py │ │ │ ├── gen_api_answer.py │ │ │ ├── gen_ground_truth_judgment.py │ │ │ ├── gen_model_answer.py │ │ │ ├── if_runner │ │ │ │ └── instruction_following_eval │ │ │ │ │ ├── README.md │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── evaluation_main.py │ │ │ │ │ ├── instructions.py │ │ │ │ │ ├── instructions_registry.py │ │ │ │ │ ├── instructions_test.py │ │ │ │ │ ├── instructions_util.py │ │ │ │ │ ├── instructions_util_test.py │ │ │ │ │ ├── json_formatter.py │ │ │ │ │ ├── requirements.txt │ │ │ │ │ └── run.sh │ │ │ ├── lcb_runner │ │ │ │ ├── evaluation │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── compute_code_generation_metrics.py │ │ │ │ │ ├── pass_k_utils.py │ │ │ │ │ └── testing_util.py │ │ │ │ ├── lm_styles.py │ │ │ │ └── utils │ │ │ │ │ └── extraction_utils.py │ │ │ ├── model │ │ │ │ ├── __init__.py │ │ │ │ ├── api_models.py │ │ │ │ ├── completions.py │ │ │ │ ├── model_adapter.py │ │ │ │ └── models.py │ │ │ ├── process_results │ │ │ │ ├── coding │ │ │ │ │ └── utils.py │ │ │ │ ├── data_analysis │ │ │ │ │ ├── cta │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── tablejoin │ │ │ │ │ │ └── utils.py │ │ │ │ │ └── tablereformat │ │ │ │ │ │ └── utils.py │ │ │ │ ├── instruction_following │ │ │ │ │ └── utils.py │ │ │ │ ├── math │ │ │ │ │ ├── AMPS_Hard │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── math_competitions │ │ │ │ │ │ └── utils.py │ │ │ │ │ └── olympiad │ │ │ │ │ │ └── utils.py │ │ │ │ ├── reasoning │ │ │ │ │ ├── house_traversal │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── spatial │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── web_of_lies_v2 │ │ │ │ │ │ └── utils.py │ │ │ │ │ └── zebra_puzzle │ │ │ │ │ │ └── utils.py │ │ │ │ ├── util.py │ │ │ │ └── writing │ │ │ │ │ ├── connections │ │ │ │ │ └── utils.py │ │ │ │ │ ├── plot_unscrambling │ │ │ │ │ └── utils.py │ │ │ │ │ └── typos │ │ │ │ │ └── utils.py │ │ │ ├── scripts │ │ │ │ ├── answer_csv_to_jsonl.py │ │ │ │ ├── code_question_to_csv.py │ │ │ │ ├── error_check │ │ │ │ ├── find_differential_question.py │ │ │ │ ├── find_hardest_question.py │ │ │ │ ├── rerun_failed_questions.py │ │ │ │ ├── run_livebench │ │ │ │ ├── run_livebench_parallel │ │ │ │ ├── run_livebench_parallel_models │ │ │ │ ├── run_livebench_sequential │ │ │ │ └── run_livebench_sequential_local_model │ │ │ └── show_livebench_result.py │ │ └── pyproject.toml │ ├── LiveCodeBench │ │ ├── eval_instruct.py │ │ └── livecodebench_utils.py │ ├── LiveCodeBenchv5 │ │ ├── eval_instruct.py │ │ └── livecodebench_utils.py │ ├── LiveCodeBenchv5_official │ │ ├── eval_instruct.py │ │ └── livecodebench_utils.py │ ├── MATH500 │ │ ├── data │ │ │ └── math500.jsonl │ │ └── eval_instruct.py │ ├── MBPP │ │ ├── data │ │ │ ├── mbpp.jsonl │ │ │ └── mbpp_test.jsonl │ │ ├── eval_instruct.py │ │ ├── human_eval │ │ │ ├── __init__.py │ │ │ ├── data.py │ │ │ ├── evaluate_functional_correctness.py │ │ │ ├── evaluation.py │ │ │ └── execution.py │ │ ├── mbpp.py │ │ └── utils │ │ │ ├── dataset.py │ │ │ └── utils.py │ ├── MBPPPlus │ │ ├── __init__.py │ │ ├── data │ │ │ └── mbppplus.jsonl │ │ ├── eval_instruct.py │ │ ├── mbpp_plus │ │ │ ├── __init__.py │ │ │ ├── data.py │ │ │ ├── evaluation.py │ │ │ └── execution.py │ │ ├── mbppplus.py │ │ └── utils │ │ │ ├── dataset.py │ │ │ └── utils.py │ ├── MMLUPro │ │ ├── eval_instruct.py │ │ └── initial_prompt.txt │ ├── MTBench │ │ ├── .github │ │ │ ├── PULL_REQUEST_TEMPLATE.md │ │ │ └── workflows │ │ │ │ └── python-package.yml │ │ ├── .gitignore │ │ ├── .pylintrc │ │ ├── LICENSE │ │ ├── README.md │ │ ├── assets │ │ │ ├── demo_narrow.gif │ │ │ ├── qa_browser.png │ │ │ ├── screenshot_cli.png │ │ │ ├── screenshot_gui.png │ │ │ ├── server_arch.png │ │ │ └── vicuna_logo.jpeg │ │ ├── docker │ │ │ ├── Dockerfile │ │ │ └── docker-compose.yml │ │ ├── docs │ │ │ ├── arena.md │ │ │ ├── awq.md │ │ │ ├── commands │ │ │ │ ├── conv_release.md │ │ │ │ ├── data_cleaning.md │ │ │ │ ├── leaderboard.md │ │ │ │ ├── local_cluster.md │ │ │ │ ├── pypi.md │ │ │ │ └── webserver.md │ │ │ ├── dashinfer_integration.md │ │ │ ├── dataset_release.md │ │ │ ├── exllama_v2.md │ │ │ ├── gptq.md │ │ │ ├── langchain_integration.md │ │ │ ├── lightllm_integration.md │ │ │ ├── mlx_integration.md │ │ │ ├── model_support.md │ │ │ ├── openai_api.md │ │ │ ├── server_arch.md │ │ │ ├── third_party_ui.md │ │ │ ├── training.md │ │ │ ├── vicuna_weights_version.md │ │ │ ├── vllm_integration.md │ │ │ └── xFasterTransformer.md │ │ ├── eval_instruct.py │ │ ├── fastchat │ │ │ ├── __init__.py │ │ │ ├── constants.py │ │ │ ├── conversation.py │ │ │ ├── data │ │ │ │ ├── __init__.py │ │ │ │ ├── clean_sharegpt.py │ │ │ │ ├── convert_alpaca.py │ │ │ │ ├── extract_gpt4_only.py │ │ │ │ ├── extract_single_round.py │ │ │ │ ├── filter_wrong_format.py │ │ │ │ ├── get_stats.py │ │ │ │ ├── hardcoded_questions.py │ │ │ │ ├── inspect_data.py │ │ │ │ ├── merge.py │ │ │ │ ├── optional_clean.py │ │ │ │ ├── optional_replace.py │ │ │ │ ├── prepare_all.py │ │ │ │ ├── pretty_json.py │ │ │ │ ├── sample.py │ │ │ │ ├── split_long_conversation.py │ │ │ │ └── split_train_test.py │ │ │ ├── llm_judge │ │ │ │ ├── README.md │ │ │ │ ├── clean_judgment.py │ │ │ │ ├── common.py │ │ │ │ ├── compute_agreement.py │ │ │ │ ├── data │ │ │ │ │ ├── judge_prompts.jsonl │ │ │ │ │ ├── mt_bench │ │ │ │ │ │ ├── misc │ │ │ │ │ │ │ └── radar.png │ │ │ │ │ │ ├── question.jsonl │ │ │ │ │ │ └── reference_answer │ │ │ │ │ │ │ ├── gpt-4.jsonl │ │ │ │ │ │ │ └── gpt-4o-mini-2024-07-18.jsonl │ │ │ │ │ └── vicuna_bench │ │ │ │ │ │ ├── question.jsonl │ │ │ │ │ │ └── reference_answer │ │ │ │ │ │ └── gpt-4.jsonl │ │ │ │ ├── download_mt_bench_pregenerated.py │ │ │ │ ├── gen_api_answer.py │ │ │ │ ├── gen_judgment.py │ │ │ │ ├── gen_model_answer.py │ │ │ │ ├── qa_browser.py │ │ │ │ └── show_result.py │ │ │ ├── model │ │ │ │ ├── __init__.py │ │ │ │ ├── apply_delta.py │ │ │ │ ├── apply_lora.py │ │ │ │ ├── compression.py │ │ │ │ ├── convert_fp16.py │ │ │ │ ├── llama_condense_monkey_patch.py │ │ │ │ ├── make_delta.py │ │ │ │ ├── model_adapter.py │ │ │ │ ├── model_chatglm.py │ │ │ │ ├── model_cllm.py │ │ │ │ ├── model_codet5p.py │ │ │ │ ├── model_exllama.py │ │ │ │ ├── model_falcon.py │ │ │ │ ├── model_registry.py │ │ │ │ ├── model_xfastertransformer.py │ │ │ │ ├── model_yuan2.py │ │ │ │ ├── monkey_patch_non_inplace.py │ │ │ │ ├── rwkv_model.py │ │ │ │ └── upload_hub.py │ │ │ ├── modules │ │ │ │ ├── __init__.py │ │ │ │ ├── awq.py │ │ │ │ ├── exllama.py │ │ │ │ ├── gptq.py │ │ │ │ └── xfastertransformer.py │ │ │ ├── protocol │ │ │ │ ├── api_protocol.py │ │ │ │ └── openai_api_protocol.py │ │ │ ├── serve │ │ │ │ ├── __init__.py │ │ │ │ ├── api_provider.py │ │ │ │ ├── base_model_worker.py │ │ │ │ ├── call_monitor.py │ │ │ │ ├── cli.py │ │ │ │ ├── controller.py │ │ │ │ ├── dashinfer_worker.py │ │ │ │ ├── example_images │ │ │ │ │ ├── distracted.jpg │ │ │ │ │ └── fridge.jpg │ │ │ │ ├── gateway │ │ │ │ │ ├── README.md │ │ │ │ │ └── nginx.conf │ │ │ │ ├── gradio_block_arena_anony.py │ │ │ │ ├── gradio_block_arena_named.py │ │ │ │ ├── gradio_block_arena_vision.py │ │ │ │ ├── gradio_block_arena_vision_anony.py │ │ │ │ ├── gradio_block_arena_vision_named.py │ │ │ │ ├── gradio_global_state.py │ │ │ │ ├── gradio_web_server.py │ │ │ │ ├── gradio_web_server_multi.py │ │ │ │ ├── huggingface_api.py │ │ │ │ ├── huggingface_api_worker.py │ │ │ │ ├── inference.py │ │ │ │ ├── launch_all_serve.py │ │ │ │ ├── lightllm_worker.py │ │ │ │ ├── mlx_worker.py │ │ │ │ ├── model_worker.py │ │ │ │ ├── monitor │ │ │ │ │ ├── add_markdown_info.py │ │ │ │ │ ├── basic_stats.py │ │ │ │ │ ├── classify │ │ │ │ │ │ ├── README.md │ │ │ │ │ │ ├── category.py │ │ │ │ │ │ ├── config.yaml │ │ │ │ │ │ ├── display_score.py │ │ │ │ │ │ └── label.py │ │ │ │ │ ├── clean_battle_data.py │ │ │ │ │ ├── clean_chat_data.py │ │ │ │ │ ├── code_tagger.py │ │ │ │ │ ├── criteria_labeling.py │ │ │ │ │ ├── dataset_release_scripts │ │ │ │ │ │ ├── arena_33k │ │ │ │ │ │ │ ├── count_unique_users.py │ │ │ │ │ │ │ ├── filter_bad_conv.py │ │ │ │ │ │ │ ├── merge_field.py │ │ │ │ │ │ │ ├── sample.py │ │ │ │ │ │ │ └── upload_hf_dataset.py │ │ │ │ │ │ └── lmsys_chat_1m │ │ │ │ │ │ │ ├── approve_all.py │ │ │ │ │ │ │ ├── compute_stats.py │ │ │ │ │ │ │ ├── filter_bad_conv.py │ │ │ │ │ │ │ ├── final_post_processing.py │ │ │ │ │ │ │ ├── instructions.md │ │ │ │ │ │ │ ├── merge_oai_tag.py │ │ │ │ │ │ │ ├── process_all.sh │ │ │ │ │ │ │ ├── sample.py │ │ │ │ │ │ │ └── upload_hf_dataset.py │ │ │ │ │ ├── deduplication.py │ │ │ │ │ ├── elo_analysis.py │ │ │ │ │ ├── inspect_conv.py │ │ │ │ │ ├── intersect_conv_file.py │ │ │ │ │ ├── leaderboard_csv_to_html.py │ │ │ │ │ ├── monitor.py │ │ │ │ │ ├── monitor_md.py │ │ │ │ │ ├── rating_systems.py │ │ │ │ │ ├── summarize_cluster.py │ │ │ │ │ ├── tag_openai_moderation.py │ │ │ │ │ ├── topic_clustering.py │ │ │ │ │ └── vote_time_stats │ │ │ │ │ │ ├── README.md │ │ │ │ │ │ ├── analyze_data.py │ │ │ │ │ │ └── plot.py │ │ │ │ ├── multi_model_worker.py │ │ │ │ ├── openai_api_server.py │ │ │ │ ├── register_worker.py │ │ │ │ ├── remote_logger.py │ │ │ │ ├── sglang_worker.py │ │ │ │ ├── shutdown_serve.py │ │ │ │ ├── test_message.py │ │ │ │ ├── test_throughput.py │ │ │ │ ├── vision │ │ │ │ │ ├── create_vqa_examples_dir.py │ │ │ │ │ ├── create_vqa_examples_json.py │ │ │ │ │ └── image.py │ │ │ │ └── vllm_worker.py │ │ │ ├── train │ │ │ │ ├── llama2_flash_attn_monkey_patch.py │ │ │ │ ├── llama_flash_attn_monkey_patch.py │ │ │ │ ├── llama_xformers_attn_monkey_patch.py │ │ │ │ ├── train.py │ │ │ │ ├── train_baichuan.py │ │ │ │ ├── train_flant5.py │ │ │ │ ├── train_lora.py │ │ │ │ ├── train_lora_t5.py │ │ │ │ ├── train_mem.py │ │ │ │ ├── train_with_template.py │ │ │ │ ├── train_xformers.py │ │ │ │ └── train_yuan2.py │ │ │ └── utils.py │ │ ├── format.sh │ │ ├── playground │ │ │ ├── FastChat_API_GoogleColab.ipynb │ │ │ ├── __init__.py │ │ │ ├── benchmark │ │ │ │ └── benchmark_api_provider.py │ │ │ ├── deepspeed_config_s2.json │ │ │ ├── deepspeed_config_s3.json │ │ │ └── test_embedding │ │ │ │ ├── README.md │ │ │ │ ├── test_classification.py │ │ │ │ ├── test_semantic_search.py │ │ │ │ └── test_sentence_similarity.py │ │ ├── pyproject.toml │ │ ├── scripts │ │ │ ├── build-api.sh │ │ │ ├── test_readme_train.sh │ │ │ ├── train_lora.sh │ │ │ ├── train_vicuna_13b.sh │ │ │ ├── train_vicuna_7b.sh │ │ │ └── upload_pypi.sh │ │ └── tests │ │ │ ├── README.md │ │ │ ├── killall_python.sh │ │ │ ├── launch_openai_api_test_server.py │ │ │ ├── load_test.py │ │ │ ├── test_cli.py │ │ │ ├── test_cli_inputs.txt │ │ │ ├── test_image_utils.py │ │ │ ├── test_openai_api.py │ │ │ ├── test_openai_langchain.py │ │ │ └── test_openai_vision_api.py │ ├── MixEval │ │ ├── README.md │ │ ├── client │ │ │ ├── __init__.py │ │ │ └── snova_client.py │ │ ├── eval_instruct.py │ │ ├── mix_eval │ │ │ ├── __init__.py │ │ │ ├── api │ │ │ │ ├── __init__.py │ │ │ │ └── registry.py │ │ │ ├── compute_metrics.py │ │ │ ├── data │ │ │ │ ├── mixeval-2024-06-01 │ │ │ │ │ ├── mixeval-hard │ │ │ │ │ │ ├── free-form.json │ │ │ │ │ │ └── multiple-choice.json │ │ │ │ │ └── mixeval │ │ │ │ │ │ ├── free-form.json │ │ │ │ │ │ └── multiple-choice.json │ │ │ │ └── mixeval-2024-08-11 │ │ │ │ │ ├── mixeval-hard │ │ │ │ │ ├── free-form.json │ │ │ │ │ └── multiple-choice.json │ │ │ │ │ └── mixeval │ │ │ │ │ ├── free-form.json │ │ │ │ │ └── multiple-choice.json │ │ │ ├── evaluate.py │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── baichuan2_13b_chat.py │ │ │ │ ├── baichuan2_7b_chat.py │ │ │ │ ├── baichuan_13b_chat.py │ │ │ │ ├── base.py │ │ │ │ ├── base_api.py │ │ │ │ ├── claude_3_5_sonnet.py │ │ │ │ ├── claude_3_haiku.py │ │ │ │ ├── claude_3_opus.py │ │ │ │ ├── claude_3_sonnet.py │ │ │ │ ├── command_r.py │ │ │ │ ├── command_r_plus.py │ │ │ │ ├── dbrx_base.py │ │ │ │ ├── dbrx_instruct.py │ │ │ │ ├── deepseek_67b.py │ │ │ │ ├── deepseek_67b_chat.py │ │ │ │ ├── deepseek_7b.py │ │ │ │ ├── deepseek_7b_chat.py │ │ │ │ ├── deepseek_moe_16b.py │ │ │ │ ├── deepseek_moe_16b_chat.py │ │ │ │ ├── deepseek_v2.py │ │ │ │ ├── gemini_10_pro.py │ │ │ │ ├── gemini_10_pro_gcloud.py │ │ │ │ ├── gemini_10_ultra.py │ │ │ │ ├── gemini_10_ultra_gcloud.py │ │ │ │ ├── gemini_15_pro.py │ │ │ │ ├── gemini_15_pro_gcloud.py │ │ │ │ ├── gemma_11_2b_instruct.py │ │ │ │ ├── gemma_11_7b_instruct.py │ │ │ │ ├── gemma_2_27b_instruct.py │ │ │ │ ├── gemma_2_9b_instruct.py │ │ │ │ ├── gemma_2b.py │ │ │ │ ├── gemma_7b.py │ │ │ │ ├── gpt_35_turbo_0125.py │ │ │ │ ├── gpt_35_turbo_1106.py │ │ │ │ ├── gpt_4_0125_preview.py │ │ │ │ ├── gpt_4_0314.py │ │ │ │ ├── gpt_4_0613.py │ │ │ │ ├── gpt_4_1106_preview.py │ │ │ │ ├── gpt_4_turbo_2024_04_09.py │ │ │ │ ├── gpt_4o.py │ │ │ │ ├── gpt_4o_mini.py │ │ │ │ ├── internlm2_chat_7b.py │ │ │ │ ├── internlm_chat_7b.py │ │ │ │ ├── jet_moe.py │ │ │ │ ├── jet_moe_chat.py │ │ │ │ ├── llama_2_70b.py │ │ │ │ ├── llama_2_70b_chat.py │ │ │ │ ├── llama_2_7b.py │ │ │ │ ├── llama_2_7b_chat.py │ │ │ │ ├── llama_3_405b.py │ │ │ │ ├── llama_3_70b.py │ │ │ │ ├── llama_3_70b_instruct.py │ │ │ │ ├── llama_3_8b.py │ │ │ │ ├── llama_3_8b_instruct.py │ │ │ │ ├── llama_3p1_405b_instruct.py │ │ │ │ ├── llama_3p1_8b_instruct.py │ │ │ │ ├── lm_chat_model.py │ │ │ │ ├── local_base.py │ │ │ │ ├── local_chat.py │ │ │ │ ├── mammooth2_8_7b_plus.py │ │ │ │ ├── mistral_7b.py │ │ │ │ ├── mistral_7b_instruct_v02.py │ │ │ │ ├── mistral_8_22b_instruct_v01.py │ │ │ │ ├── mistral_8_7b_instruct_v01.py │ │ │ │ ├── mistral_large.py │ │ │ │ ├── mistral_large_2.py │ │ │ │ ├── mistral_medium.py │ │ │ │ ├── mistral_nemo.py │ │ │ │ ├── mistral_small.py │ │ │ │ ├── mixtral_8_22b.py │ │ │ │ ├── mixtral_8_7b.py │ │ │ │ ├── moss_moon_003_sft.py │ │ │ │ ├── mpt_30b.py │ │ │ │ ├── mpt_30b_chat.py │ │ │ │ ├── mpt_7b.py │ │ │ │ ├── mpt_7b_chat.py │ │ │ │ ├── mpt_7b_instruct.py │ │ │ │ ├── notus_7b_v1.py │ │ │ │ ├── olmo_7b.py │ │ │ │ ├── olmo_7b_instruct.py │ │ │ │ ├── openai_o1.py │ │ │ │ ├── openai_o1_mini.py │ │ │ │ ├── phi_2.py │ │ │ │ ├── qwen15_18b_chat.py │ │ │ │ ├── qwen_15_110b.py │ │ │ │ ├── qwen_15_110b_chat.py │ │ │ │ ├── qwen_15_18b_chat.py │ │ │ │ ├── qwen_15_32b.py │ │ │ │ ├── qwen_15_32b_chat.py │ │ │ │ ├── qwen_15_4b.py │ │ │ │ ├── qwen_15_4b_chat.py │ │ │ │ ├── qwen_15_72b.py │ │ │ │ ├── qwen_15_72b_chat.py │ │ │ │ ├── qwen_15_7b.py │ │ │ │ ├── qwen_15_7b_chat.py │ │ │ │ ├── qwen_15_moe_a27b.py │ │ │ │ ├── qwen_15_moe_a27b_chat.py │ │ │ │ ├── qwen_2_5_72b_instruct.py │ │ │ │ ├── qwen_2_72b_instruct.py │ │ │ │ ├── qwen_2_7b_instruct.py │ │ │ │ ├── qwen_7b_chat.py │ │ │ │ ├── qwen_max_0428.py │ │ │ │ ├── reka_core.py │ │ │ │ ├── reka_edge.py │ │ │ │ ├── reka_flash.py │ │ │ │ ├── solar_107b_instruct_v1.py │ │ │ │ ├── starling_lm_7b_beta.py │ │ │ │ ├── tigerbot_13b_chat_v1.py │ │ │ │ ├── tigerbot_13b_chat_v2.py │ │ │ │ ├── tigerbot_13b_chat_v3.py │ │ │ │ ├── tigerbot_7b_sft_v1.py │ │ │ │ ├── tigerbot_7b_sft_v2.py │ │ │ │ ├── tulu_v2_dpo_70b.py │ │ │ │ ├── tulu_v2_dpo_7b.py │ │ │ │ ├── vicuna_13b_v13.py │ │ │ │ ├── vicuna_13b_v15_16k.py │ │ │ │ ├── vicuna_33b_v13.py │ │ │ │ ├── vicuna_7b_v13.py │ │ │ │ ├── vicuna_7b_v15.py │ │ │ │ ├── vicuna_7b_v15_16k.py │ │ │ │ ├── xverse_13b_chat.py │ │ │ │ ├── xverse_7b_chat.py │ │ │ │ ├── xwin_lm_7b_v01.py │ │ │ │ ├── yi_15_34b_chat.py │ │ │ │ ├── yi_15_9b_chat.py │ │ │ │ ├── yi_34b.py │ │ │ │ ├── yi_34b_chat.py │ │ │ │ ├── yi_6b.py │ │ │ │ ├── yi_6b_chat.py │ │ │ │ ├── yi_large.py │ │ │ │ ├── yulan_chat_2_13b.py │ │ │ │ └── zephyr_7b_beta.py │ │ │ ├── prompts │ │ │ │ ├── __init__.py │ │ │ │ ├── evaluation_prompts.py │ │ │ │ └── judge_prompts.py │ │ │ ├── run_eval_example.sh │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── check_eval_complete.py │ │ │ │ ├── common_utils.py │ │ │ │ ├── count_token.py │ │ │ │ ├── dataset.py │ │ │ │ ├── judge_freeform_parser.py │ │ │ │ ├── judge_multichoice_parser.py │ │ │ │ ├── metric_utils.py │ │ │ │ └── plot_results.py │ │ └── setup.py │ ├── MultiPLE │ │ ├── README.md │ │ ├── data │ │ │ ├── multipl-e-adb.json │ │ │ ├── multipl-e-clj.json │ │ │ ├── multipl-e-cpp.json │ │ │ ├── multipl-e-cs.json │ │ │ ├── multipl-e-d.json │ │ │ ├── multipl-e-dart.json │ │ │ ├── multipl-e-elixir.json │ │ │ ├── multipl-e-go.json │ │ │ ├── multipl-e-hs.json │ │ │ ├── multipl-e-humaneval-cpp.json │ │ │ ├── multipl-e-humaneval-cs.json │ │ │ ├── multipl-e-humaneval-java.json │ │ │ ├── multipl-e-humaneval-js.json │ │ │ ├── multipl-e-humaneval-php.json │ │ │ ├── multipl-e-humaneval-sh.json │ │ │ ├── multipl-e-humaneval-ts.json │ │ │ ├── multipl-e-java.json │ │ │ ├── multipl-e-js.json │ │ │ ├── multipl-e-julia.json │ │ │ ├── multipl-e-lua.json │ │ │ ├── multipl-e-mbpp-java.json │ │ │ ├── multipl-e-mbpp-js.json │ │ │ ├── multipl-e-ml.json │ │ │ ├── multipl-e-php.json │ │ │ ├── multipl-e-pl.json │ │ │ ├── multipl-e-r.json │ │ │ ├── multipl-e-racket.json │ │ │ ├── multipl-e-rb.json │ │ │ ├── multipl-e-rs.json │ │ │ ├── multipl-e-scala.json │ │ │ ├── multipl-e-sh.json │ │ │ ├── multipl-e-swift.json │ │ │ └── multipl-e-ts.json │ │ ├── docker │ │ │ └── Dockerfile │ │ ├── eval_instruct.py │ │ ├── multiple │ │ │ ├── __init__.py │ │ │ ├── containerized_eval.py │ │ │ ├── eval_adb.py │ │ │ ├── eval_clj.py │ │ │ ├── eval_cpp.py │ │ │ ├── eval_cs.py │ │ │ ├── eval_dart.py │ │ │ ├── eval_dfy.py │ │ │ ├── eval_dlang.py │ │ │ ├── eval_elixir.py │ │ │ ├── eval_fs.py │ │ │ ├── eval_go.py │ │ │ ├── eval_hs.py │ │ │ ├── eval_java.py │ │ │ ├── eval_javascript.py │ │ │ ├── eval_julia.py │ │ │ ├── eval_lean.py │ │ │ ├── eval_lua.py │ │ │ ├── eval_luau.py │ │ │ ├── eval_matlab.py │ │ │ ├── eval_ocaml.py │ │ │ ├── eval_php.py │ │ │ ├── eval_pl.py │ │ │ ├── eval_python.py │ │ │ ├── eval_r.py │ │ │ ├── eval_racket.py │ │ │ ├── eval_ruby.py │ │ │ ├── eval_rust.py │ │ │ ├── eval_scala.py │ │ │ ├── eval_sh.py │ │ │ ├── eval_swift.py │ │ │ ├── eval_ts.py │ │ │ ├── eval_v.py │ │ │ ├── evaluation.py │ │ │ ├── execution.py │ │ │ ├── generic_eval.py │ │ │ ├── libeval.py │ │ │ ├── main.py │ │ │ ├── safe_subprocess │ │ │ │ ├── .gitignore │ │ │ │ ├── __init__.py │ │ │ │ ├── evil_programs │ │ │ │ │ ├── block_on_inputs.py │ │ │ │ │ ├── close_outputs.py │ │ │ │ │ ├── fork_bomb.py │ │ │ │ │ ├── fork_once.py │ │ │ │ │ ├── sleep_forever.py │ │ │ │ │ └── unbounded_output.py │ │ │ │ └── module_test.py │ │ │ └── simple_eval.py │ │ └── utils.py │ ├── RepoBench │ │ ├── .gitignore │ │ ├── LICENSE │ │ ├── README.md │ │ ├── archive_data │ │ │ └── utils.py │ │ ├── assets │ │ │ ├── repobench_dark.png │ │ │ ├── repobench_light.png │ │ │ └── repobench_logo.png │ │ ├── data │ │ │ ├── README.md │ │ │ ├── data │ │ │ │ ├── README.md │ │ │ │ └── utils.py │ │ │ └── utils.py │ │ ├── eval.py │ │ ├── eval_instruct.py │ │ ├── evaluation │ │ │ └── metrics.py │ │ ├── requirements.txt │ │ └── run.py │ ├── SWEbench │ │ └── eval_instruct.py │ ├── WildBench │ │ ├── .github │ │ │ ├── ISSUE_TEMPLATE │ │ │ │ └── add-new-model.md │ │ │ └── workflows │ │ │ │ └── static.yml │ │ ├── .gitignore │ │ ├── EVAL.md │ │ ├── LICENSE │ │ ├── README.md │ │ ├── docs │ │ │ ├── README.md │ │ │ ├── WildBench_paper.pdf │ │ │ ├── css │ │ │ │ ├── addons │ │ │ │ │ ├── datatables-select.min.css │ │ │ │ │ ├── datatables.min.css │ │ │ │ │ ├── directives.min.css │ │ │ │ │ ├── flag.min.css │ │ │ │ │ ├── jquery.zmd.hierarchical-display.min.css │ │ │ │ │ └── rating.min.css │ │ │ │ ├── animate.css │ │ │ │ ├── bootstrap.css │ │ │ │ ├── bootstrap.min 2.css │ │ │ │ ├── bootstrap.min.css │ │ │ │ ├── font-awesome.min.css │ │ │ │ ├── main.css │ │ │ │ ├── mdb.css │ │ │ │ ├── mdb.lite.css │ │ │ │ ├── mdb.lite.min.css │ │ │ │ ├── mdb.min.css │ │ │ │ ├── modules │ │ │ │ │ └── animations-extended.min.css │ │ │ │ └── style.css │ │ │ ├── fonts │ │ │ │ ├── fontawesome-webfont.eot │ │ │ │ ├── fontawesome-webfont.svg │ │ │ │ └── fontawesome-webfont.ttf │ │ │ ├── gray_banner.png │ │ │ ├── images │ │ │ │ ├── ai2logo_large.png │ │ │ │ ├── blank.gif │ │ │ │ ├── ico_example │ │ │ │ │ ├── apple-touch-icon-114-precomposed.png │ │ │ │ │ ├── apple-touch-icon-144-precomposed.png │ │ │ │ │ ├── apple-touch-icon-57-precomposed.png │ │ │ │ │ ├── apple-touch-icon-72-precomposed.png │ │ │ │ │ └── favicon.ico │ │ │ │ ├── icons │ │ │ │ │ ├── clear.png │ │ │ │ │ ├── deep.png │ │ │ │ │ ├── engage.png │ │ │ │ │ ├── factual.png │ │ │ │ │ ├── helpful.png │ │ │ │ │ └── safety.png │ │ │ │ └── placehold │ │ │ │ │ └── slides │ │ │ │ │ ├── kdd_phd.jpg │ │ │ │ │ └── newbook.png │ │ │ ├── index.html │ │ │ ├── js │ │ │ │ ├── addons │ │ │ │ │ ├── datatables-select.min.js │ │ │ │ │ ├── datatables.min.js │ │ │ │ │ ├── directives.min.js │ │ │ │ │ ├── flag.min.js │ │ │ │ │ ├── imagesloaded.pkgd.min.js │ │ │ │ │ ├── jquery.zmd.hierarchical-display.min.js │ │ │ │ │ ├── masonry.pkgd.min.js │ │ │ │ │ └── rating.min.js │ │ │ │ ├── bootstrap-table.min.css │ │ │ │ ├── bootstrap-table.min.js │ │ │ │ ├── bootstrap.js │ │ │ │ ├── bootstrap.min 2.js │ │ │ │ ├── bootstrap.min.js │ │ │ │ ├── jquery-1.10.2.min.js │ │ │ │ ├── jquery.js │ │ │ │ ├── jquery.min.js │ │ │ │ ├── jquery.tablesorter.js │ │ │ │ ├── mdb.js │ │ │ │ ├── mdb.min.js │ │ │ │ ├── min │ │ │ │ │ ├── custom.min.js │ │ │ │ │ ├── modernizr.min.js │ │ │ │ │ └── plugins.min.js │ │ │ │ ├── modules │ │ │ │ │ ├── animations-extended.min.js │ │ │ │ │ ├── forms-free.min.js │ │ │ │ │ ├── scrolling-navbar.min.js │ │ │ │ │ ├── treeview.min.js │ │ │ │ │ └── wow.min.js │ │ │ │ ├── popper.js │ │ │ │ └── popper.min.js │ │ │ ├── static │ │ │ │ └── images │ │ │ │ │ ├── ai2logo.png │ │ │ │ │ ├── urial_ai2.logo.png │ │ │ │ │ └── uwlogo.png │ │ │ ├── style.css │ │ │ ├── wb_corr.png │ │ │ ├── wb_eval.png │ │ │ ├── wb_radar.png │ │ │ ├── wb_stat.png │ │ │ ├── wb_table.png │ │ │ └── wildbench_logo.png │ │ ├── eval_instruct.py │ │ ├── evaluation │ │ │ ├── eval_template.pairwise.v2.md │ │ │ ├── eval_template.score.v2.md │ │ │ ├── run_all_eval_batch.sh │ │ │ ├── run_all_eval_instant.sh │ │ │ ├── run_eval_v2_batch.score.sh │ │ │ ├── run_eval_v2_batch.sh │ │ │ ├── run_eval_v2_instant.score.sh │ │ │ ├── run_eval_v2_instant.sh │ │ │ ├── run_haiku_eval_batch.sh │ │ │ └── run_score_eval_batch.sh │ │ ├── leaderboard │ │ │ ├── data_dir │ │ │ │ ├── _create_tables.py │ │ │ │ ├── _merge_results.py │ │ │ │ ├── all_stat.json │ │ │ │ ├── all_stat_wildbench.-1.json │ │ │ │ ├── all_stat_wildbench.100.json │ │ │ │ ├── all_stat_wildbench.1000.json │ │ │ │ ├── all_stat_wildbench.1500.json │ │ │ │ ├── all_stat_wildbench.2000.json │ │ │ │ ├── all_stat_wildbench.300.json │ │ │ │ ├── all_stat_wildbench.3000.json │ │ │ │ ├── all_stat_wildbench.500.json │ │ │ │ ├── pairwise-gpt4t-K=-1.json │ │ │ │ ├── pairwise-gpt4t-K=1000.json │ │ │ │ ├── pairwise-gpt4t-K=1500.json │ │ │ │ ├── pairwise-gpt4t-K=500.json │ │ │ │ ├── pairwise-haiku-K=-1.json │ │ │ │ ├── pairwise-haiku-K=1000.json │ │ │ │ ├── pairwise-haiku-K=1500.json │ │ │ │ ├── pairwise-haiku-K=500.json │ │ │ │ ├── pairwise-llama-K=-1.json │ │ │ │ ├── pairwise-llama-K=1000.json │ │ │ │ ├── pairwise-llama-K=1500.json │ │ │ │ ├── pairwise-llama-K=500.json │ │ │ │ ├── score-sonnet.json │ │ │ │ ├── score.json │ │ │ │ ├── wb_elo.txt │ │ │ │ └── wb_elo_results.json │ │ │ ├── data_utils.py │ │ │ ├── show_eval.sh │ │ │ ├── show_table.py │ │ │ └── wb_elo.py │ │ ├── requirements.txt │ │ ├── scripts │ │ │ ├── Hermes-2-Theta-Llama-3-8B.sh │ │ │ ├── Llama-2-13b-chat-hf.sh │ │ │ ├── Llama-2-70b-chat-hf.sh │ │ │ ├── Llama-2-7b-chat-hf.sh │ │ │ ├── Llama-3-8B-OpenHermes-243K.sh │ │ │ ├── Llama-3-8B-ShareGPT-112K.sh │ │ │ ├── Llama-3-8B-Tulu-330K.sh │ │ │ ├── Llama-3-8B-Ultrachat-200K.sh │ │ │ ├── Llama-3-8B-WildChat.sh │ │ │ ├── Llama-3-8B-WizardLM-196K.sh │ │ │ ├── Llama-3-Instruct-8B-SimPO-ExPO.sh │ │ │ ├── Llama-3-Instruct-8B-SimPO.sh │ │ │ ├── Magpie-Pro-SFT-v0.1.sh │ │ │ ├── Meta-Llama-3-70B-Instruct.sh │ │ │ ├── Meta-Llama-3-8B-Instruct.sh │ │ │ ├── Mistral-7B-Instruct-v0.1.sh │ │ │ ├── Mistral-7B-Instruct-v0.2.sh │ │ │ ├── Mistral-Large-2.sh │ │ │ ├── Mixtral-8x7B-Instruct-v0.1.sh │ │ │ ├── Nous-Hermes-2-Mixtral-8x7B-DPO.sh │ │ │ ├── Phi-3-medium-128k-instruct.sh │ │ │ ├── Phi-3-mini-128k-instruct.sh │ │ │ ├── Qwen1.5-72B-Chat.sh │ │ │ ├── Qwen1.5-7B-Chat@together.sh │ │ │ ├── Qwen2-72B-Instruct.sh │ │ │ ├── SELM-Zephyr-7B-iter-3.sh │ │ │ ├── Starling-LM-7B-beta-ExPO.sh │ │ │ ├── Starling-LM-7B-beta.sh │ │ │ ├── Yi-1.5-34B-Chat.sh │ │ │ ├── Yi-1.5-6B-Chat.sh │ │ │ ├── Yi-1.5-9B-Chat.sh │ │ │ ├── Yi-34B-Chat.sh │ │ │ ├── _common_hf.sh │ │ │ ├── _common_openai.sh │ │ │ ├── _common_vllm.sh │ │ │ ├── claude-3-haiku-20240307.sh │ │ │ ├── claude-3-opus-20240229.sh │ │ │ ├── claude-3-sonnet-20240229.sh │ │ │ ├── cohere-command-r-plus.sh │ │ │ ├── cohere-command-r.sh │ │ │ ├── cohere-command.sh │ │ │ ├── dbrx-instruct.sh │ │ │ ├── dbrx-instruct@together.sh │ │ │ ├── gemini-1.0-pro.sh │ │ │ ├── gemini-1.5-flash.sh │ │ │ ├── gemini-1.5-pro.sh │ │ │ ├── gemma-2-27b-it@together.sh │ │ │ ├── gemma-2b-it.sh │ │ │ ├── gemma-7b-it.sh │ │ │ ├── gpt-3.5-turbo-0125.sh │ │ │ ├── gpt-4-0125-preview.sh │ │ │ ├── gpt-4-turbo-2024-04-09.sh │ │ │ ├── gpt-4o-2024-05-13.sh │ │ │ ├── gpt-4o-mini-2024-07-18.sh │ │ │ ├── mistral-large-2402.sh │ │ │ ├── reka-core-20240501.sh │ │ │ ├── reka-edge-20240208.sh │ │ │ ├── reka-flash-20240226.sh │ │ │ ├── tulu-2-dpo-70b-ExPO.sh │ │ │ ├── tulu-2-dpo-70b.sh │ │ │ ├── vicuna-13b-v1.5.sh │ │ │ ├── yi-large.sh │ │ │ ├── zephyr-7b-beta.sh │ │ │ └── zephyr-7b-gemma-v0.1.sh │ │ └── src │ │ │ ├── __init__.py │ │ │ ├── eval.py │ │ │ ├── fastchat_conversation.py │ │ │ ├── global_configs.py │ │ │ ├── hf_models.py │ │ │ ├── merge_results.py │ │ │ ├── openai_batch_eval │ │ │ ├── add_new_model.sh │ │ │ ├── batch_format_all.py │ │ │ ├── batch_results_format.py │ │ │ ├── cancel_all.py │ │ │ ├── check_batch_status.py │ │ │ ├── check_batch_status_with_id.py │ │ │ ├── check_batch_status_with_model_name.py │ │ │ ├── submit_all.py │ │ │ └── submit_batch.py │ │ │ ├── unified_infer.py │ │ │ ├── unified_utils.py │ │ │ ├── upload_results.py │ │ │ └── view_wb_eval.py │ ├── __init__.py │ ├── alpaca_eval │ │ ├── .github │ │ │ └── workflows │ │ │ │ ├── integration_tests.yml │ │ │ │ ├── set_version.py │ │ │ │ ├── test_update_leaderboard.yml │ │ │ │ ├── unit_tests.yml │ │ │ │ ├── update_leaderboard.yml │ │ │ │ └── update_pypi.yml │ │ ├── .gitignore │ │ ├── .pre-commit-config.yaml │ │ ├── CITATION.cff │ │ ├── LICENSE │ │ ├── MANIFEST.in │ │ ├── README.md │ │ ├── client_configs │ │ │ └── README.md │ │ ├── docs │ │ │ ├── AlpacaFarm_small.png │ │ │ ├── check_unwanted_files.py │ │ │ ├── data_AlpacaEval │ │ │ │ ├── alpaca_eval_gpt4_leaderboard.csv │ │ │ │ ├── chatgpt_fn_leaderboard.csv │ │ │ │ └── claude_leaderboard.csv │ │ │ ├── data_AlpacaEval_2 │ │ │ │ ├── alpaca_eval_cot_gpt4_turbo_fn_leaderboard.csv │ │ │ │ ├── alpaca_eval_gpt4_turbo_fn_leaderboard.csv │ │ │ │ ├── claude_3_opus_ranking_leaderboard.csv │ │ │ │ ├── mistral-large-2402_ranking_leaderboard.csv │ │ │ │ └── weighted_alpaca_eval_gpt4_turbo_leaderboard.csv │ │ │ ├── format_export_leaderboards.py │ │ │ ├── format_sample_sheets.py │ │ │ └── index.html │ │ ├── eval_instruct.py │ │ ├── figures │ │ │ ├── all_metrics_length.png │ │ │ ├── annotator_bias.png │ │ │ ├── bias_vs_variance.pdf │ │ │ ├── causal_graph.png │ │ │ ├── chat_correlations.png │ │ │ ├── chat_correlations_no_ae.png │ │ │ ├── different_baselines.png │ │ │ ├── lc_ae_leaderboard.png │ │ │ ├── length_gameability.png │ │ │ ├── plot_paired_ttest_nsamples.png │ │ │ ├── plot_paired_ttests_per_dataset.png │ │ │ ├── plot_quality_vs_price_and_time.png │ │ │ ├── plot_winrate_correlations.png │ │ │ ├── plot_winrate_correlations_alpaca_eval.png │ │ │ └── verified.png │ │ ├── pytest.ini │ │ ├── requirements.txt │ │ ├── scripts │ │ │ ├── __init__.py │ │ │ └── precompute.py │ │ ├── setup.py │ │ ├── src │ │ │ └── alpaca_eval │ │ │ │ ├── __init__.py │ │ │ │ ├── analyze.py │ │ │ │ ├── annotators │ │ │ │ ├── __init__.py │ │ │ │ ├── base.py │ │ │ │ └── pairwise_evaluator.py │ │ │ │ ├── completion_parsers.py │ │ │ │ ├── constants.py │ │ │ │ ├── decoders │ │ │ │ ├── __init__.py │ │ │ │ ├── anthropic.py │ │ │ │ ├── bedrock_anthropic.py │ │ │ │ ├── cache.py │ │ │ │ ├── cohere.py │ │ │ │ ├── google.py │ │ │ │ ├── huggingface_api.py │ │ │ │ ├── huggingface_local.py │ │ │ │ ├── jinachat.py │ │ │ │ ├── openai.py │ │ │ │ ├── replicate.py │ │ │ │ ├── sambanova_api.py │ │ │ │ ├── test.py │ │ │ │ └── vllm_local.py │ │ │ │ ├── evaluators_configs │ │ │ │ ├── LlaMa-3-405B-SambaNova │ │ │ │ │ ├── configs.yaml │ │ │ │ │ ├── leaderboard.csv │ │ │ │ │ └── prompt.txt │ │ │ │ ├── LlaMa-3-70B-SambaNova │ │ │ │ │ ├── configs.yaml │ │ │ │ │ ├── leaderboard.csv │ │ │ │ │ └── prompt.txt │ │ │ │ ├── LlaMa-3-8B-SambaNova │ │ │ │ │ ├── configs.yaml │ │ │ │ │ ├── leaderboard.csv │ │ │ │ │ └── prompt.txt │ │ │ │ ├── README.md │ │ │ │ ├── alpaca_eval_clf_cot_gpt4_turbo │ │ │ │ │ ├── alpaca_eval_clf_cot.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── alpaca_eval_clf_gpt4_turbo │ │ │ │ │ ├── alpaca_eval_clf.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── alpaca_eval_cot_gpt4_turbo_fn │ │ │ │ │ ├── alpaca_eval_fn.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── alpaca_eval_gpt4 │ │ │ │ │ ├── alpaca_eval.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── alpaca_eval_gpt4_0314 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── alpaca_eval_gpt4_0613 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── alpaca_eval_gpt4_fn │ │ │ │ │ ├── alpaca_eval_fn.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── alpaca_eval_gpt4_turbo_fn │ │ │ │ │ ├── alpaca_eval_fn.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── alpaca_eval_llama3_70b_fn │ │ │ │ │ ├── alpaca_eval_fn.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── alpaca_eval_vllm_llama3_70b_fn │ │ │ │ │ ├── alpaca_eval_fn.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── alpaca_farm │ │ │ │ │ ├── chatml_b1_chat_v0_without_inputs.txt │ │ │ │ │ ├── chatml_b1_chat_without_inputs.txt │ │ │ │ │ ├── chatml_b1_cot_json_without_inputs.txt │ │ │ │ │ ├── chatml_b1_without_inputs.txt │ │ │ │ │ ├── chatml_b4_cot_json_without_inputs.txt │ │ │ │ │ ├── chatml_b5_diana_without_inputs.txt │ │ │ │ │ ├── chatml_b5_joe_without_inputs.txt │ │ │ │ │ ├── chatml_b5_without_inputs.txt │ │ │ │ │ ├── configs.yaml │ │ │ │ │ ├── text_b1_v0_without_inputs.txt │ │ │ │ │ ├── text_b1_without_inputs.txt │ │ │ │ │ ├── text_b4_reasoning_without_inputs.txt │ │ │ │ │ └── text_b5_without_inputs.txt │ │ │ │ ├── alpaca_farm_greedy_gpt4 │ │ │ │ │ ├── chatml_b5_without_inputs.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── aviary_gpt4 │ │ │ │ │ ├── aviary_prompt.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── bedrock_claude │ │ │ │ │ └── configs.yaml │ │ │ │ ├── bedrock_claude_2 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── chatgpt │ │ │ │ │ ├── basic_prompt.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── chatgpt_fn │ │ │ │ │ ├── basic_function_prompt.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── claude │ │ │ │ │ ├── basic_prompt.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── claude_2 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── claude_3_opus_ranking │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── ranking_prompt.txt │ │ │ │ ├── claude_ranking │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── ranking_prompt.txt │ │ │ │ ├── cohere │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt-3.5-turbo-1106_ranking │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── ranking_prompt.txt │ │ │ │ ├── gpt35_turbo_instruct │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt4 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt4_turbo │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt4_turbo_clf │ │ │ │ │ ├── basic_clf_prompt.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt4_turbo_cot_clf │ │ │ │ │ ├── basic_clf_cot_prompt.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt4_turbo_cot_logprob │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt4_turbo_logprob │ │ │ │ │ └── configs.yaml │ │ │ │ ├── guanaco_33b │ │ │ │ │ ├── basic_prompt.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── improved_aviary_gpt4 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── improved_lmsys_gpt4 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── lmsys_gpt4 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── lmsys_prompt.txt │ │ │ │ ├── mistral-large-2402_ranking │ │ │ │ │ └── configs.yaml │ │ │ │ ├── oasst_pythia_12b │ │ │ │ │ ├── basic_prompt.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── test │ │ │ │ │ └── configs.yaml │ │ │ │ ├── text_davinci_003 │ │ │ │ │ ├── basic_prompt.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── weighted_alpaca_eval_cot_gpt4_turbo │ │ │ │ │ └── configs.yaml │ │ │ │ ├── weighted_alpaca_eval_gpt-4o-mini-2024-07-18 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── weighted_alpaca_eval_gpt4_turbo │ │ │ │ │ └── configs.yaml │ │ │ │ ├── weighted_alpaca_eval_gpt4_turbo_greedy │ │ │ │ │ └── configs.yaml │ │ │ │ └── weighted_alpaca_eval_gpt4_turbo_new │ │ │ │ │ └── configs.yaml │ │ │ │ ├── leaderboards │ │ │ │ ├── data_AlpacaEval │ │ │ │ │ ├── alpaca_eval_gpt4_leaderboard.csv │ │ │ │ │ ├── chatgpt_fn_leaderboard.csv │ │ │ │ │ ├── claude_leaderboard.csv │ │ │ │ │ └── text_davinci_003_leaderboard.csv │ │ │ │ ├── data_AlpacaEval_2 │ │ │ │ │ ├── alpaca_eval_cot_gpt4_turbo_fn_leaderboard.csv │ │ │ │ │ ├── alpaca_eval_gpt4_turbo_fn_leaderboard.csv │ │ │ │ │ ├── claude_3_opus_ranking_leaderboard.csv │ │ │ │ │ ├── mistral-large-2402_ranking_leaderboard.csv │ │ │ │ │ └── weighted_alpaca_eval_gpt4_turbo_leaderboard.csv │ │ │ │ └── evaluators │ │ │ │ │ └── evaluators_leaderboard.csv │ │ │ │ ├── main.py │ │ │ │ ├── metrics │ │ │ │ ├── __init__.py │ │ │ │ ├── glm_winrate.py │ │ │ │ ├── helpers.py │ │ │ │ ├── weights │ │ │ │ │ ├── claude_3_opus_ranking │ │ │ │ │ │ └── length_controlled_v1 │ │ │ │ │ │ │ └── baseline_gpt4_1106_preview.csv │ │ │ │ │ ├── mistral-large-2402_ranking │ │ │ │ │ │ └── length_controlled_v1 │ │ │ │ │ │ │ └── baseline_gpt4_1106_preview.csv │ │ │ │ │ ├── weighted_alpaca_eval_gpt-4o-mini-2024-07-18 │ │ │ │ │ │ └── length_controlled_v1 │ │ │ │ │ │ │ └── baseline_gpt4_1106_preview.csv │ │ │ │ │ └── weighted_alpaca_eval_gpt4_turbo │ │ │ │ │ │ └── length_controlled_v1 │ │ │ │ │ │ └── baseline_gpt4_1106_preview.csv │ │ │ │ └── winrate.py │ │ │ │ ├── models_configs │ │ │ │ ├── Conifer-7B-DPO │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Contextual-KTO-Mistral-PairRM-Verified │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Contextual-KTO-Mistral-PairRM │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Ein-70B-v0.1 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── FsfairX-Zephyr-Chat-v0.1 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Infinity-Instruct-3M-0613-Llama3-70B │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Infinity-Instruct-3M-0613-Mistral-7B │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Infinity-Instruct-3M-0625-Llama3-70B │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Infinity-Instruct-3M-0625-Llama3-8B │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Infinity-Instruct-3M-0625-Mistral-7B │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Infinity-Instruct-3M-0625-Qwen2-7B │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Infinity-Instruct-3M-0625-Yi-1.5-9B │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Infinity-Instruct-7M-Gen-Llama3_1-70B │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Infinity-Instruct-7M-Gen-Llama3_1-8B │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Infinity-Instruct-7M-Gen-mistral-7B │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── LMCocktail-10.7B-v1 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Llama-3-Instruct-8B-SimPO-ExPO │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Llama-3-Instruct-8B-SimPO │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Llama-3-Instruct-8B-WPO-HB-v2 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Llama3-PBM-Nova-70B │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Meta-Llama-3-70B-Instruct │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Meta-Llama-3-8B-Instruct │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Meta-Llama-3.1-405B-Instruct-Turbo │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Meta-Llama-3.1-70B-Instruct-Turbo │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Meta-Llama-3.1-8B-Instruct-Turbo │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Mistral-7B+RAHF-DUAL+LoRA │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Mistral-7B-Instruct-v0.2 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Mistral-7B-Instruct-v0.3 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Mistral-7B-ReMax-v0.1 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Mixtral-8x22B-Instruct-v0.1 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Mixtral-8x7B-Instruct-v0.1 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── togetherai_prompt.txt │ │ │ │ ├── Mixtral-8x7B-Instruct-v0.1_concise │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── togetherai_prompt_concise.txt │ │ │ │ ├── Mixtral-8x7B-Instruct-v0.1_verbose │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── togetherai_prompt_verbose.txt │ │ │ │ ├── Nanbeige-Plus-Chat-v0.1 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Nanbeige2-16B-Chat │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Nanbeige2-8B-Chat │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── OpenHermes-2.5-Mistral-7B │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── togetherai_prompt.txt │ │ │ │ ├── Qwen-14B-Chat │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Qwen1.5-1.8B-Chat │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Qwen1.5-110B-Chat │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Qwen1.5-14B-Chat │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Qwen1.5-72B-Chat │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Qwen1.5-7B-Chat │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Qwen2-72B-Instruct │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── REBEL-Llama-3-8B-Instruct-Armo │ │ │ │ │ └── configs.yaml │ │ │ │ ├── REBEL-Llama-3-8B-Instruct │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── SPPO-Gemma-2-9B-It-PairRM │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── SPPO-Llama-3-Instruct-8B-PairRM │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── SPPO-Mistral7B-PairRM-ExPO │ │ │ │ │ └── configs.yaml │ │ │ │ ├── SPPO-Mistral7B-PairRM │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Samba-CoE-v0.1 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Samba-CoE-v0.2-best-of-16 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Samba-CoE-v0.2 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Shopee-SlimMoA-v1 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Snorkel-Mistral-PairRM-DPO-best-of-16 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Snorkel-Mistral-PairRM-DPO │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Starling-LM-7B-alpha-ExPO │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Starling-LM-7B-alpha │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Starling-LM-7B-beta-ExPO │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Storm-7B-best-of-64 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Storm-7B │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── TempNet-LLaMA2-Chat-13B-v0.1 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── TempNet-LLaMA2-Chat-70B-v0.1 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── TempNet-LLaMA2-Chat-7B-v0.1 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Together-MoA-Lite │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Together-MoA │ │ │ │ │ └── configs.yaml │ │ │ │ ├── Yi-34B-Chat-Verified │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── Yi-34B-Chat │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── airoboros-33b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── airoboros-65b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── aligner-2b_claude-3-opus-20240229 │ │ │ │ │ ├── config.yaml │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── aligner-2b_gpt-4-turbo-2024-04-09 │ │ │ │ │ ├── config.yaml │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── aligner-2b_qwen1.5-72b-chat │ │ │ │ │ ├── config.yaml │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── alpaca-7b-neft │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── alpaca-7b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── alpaca-7b_concise │ │ │ │ │ └── configs.yaml │ │ │ │ ├── alpaca-7b_verbose │ │ │ │ │ └── configs.yaml │ │ │ │ ├── alpaca-farm-ppo-human │ │ │ │ │ └── configs.yaml │ │ │ │ ├── alpaca-farm-ppo-sim-gpt4-20k │ │ │ │ │ └── configs.yaml │ │ │ │ ├── baichuan-13b-chat │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── baize-v2-13b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── baize-v2-7b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── bedrock_claude │ │ │ │ │ └── configs.yaml │ │ │ │ ├── bedrock_claude_2 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── blendaxai-gm-l3-v35 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── blendaxai-gm-l6-vo31 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── causallm-14b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── chatglm2-6b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── claude-2.1 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── claude-2.1_concise │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt_concise.txt │ │ │ │ ├── claude-2.1_verbose │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt_verbose.txt │ │ │ │ ├── claude-2 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── claude-3-5-sonnet-20240620 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt_chatml.txt │ │ │ │ ├── claude-3-opus-20240229 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── claude-3-sonnet-20240229 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt_chatml.txt │ │ │ │ ├── claude-instant-1.2 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── claude │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── claude2-alpaca-13b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── cohere │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── cut-13b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── dbrx-instruct │ │ │ │ │ └── configs.yaml │ │ │ │ ├── deepseek-llm-67b-chat │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── deita-7b-v1.0 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── dolphin-2.2.1-mistral-7b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── evo-7b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── evo-v2-7b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── falcon-40b-instruct │ │ │ │ │ └── configs.yaml │ │ │ │ ├── falcon-7b-instruct │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gemini-pro │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── gemma-2-9b-it-DPO │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── gemma-2-9b-it-SimPO │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gemma-2-9b-it-WPO-HB │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gemma-2b-it │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gemma-7b-it │ │ │ │ │ └── configs.yaml │ │ │ │ ├── ghost-7b-alpha │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── ghost-8b-beta-disl-0x5 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── gpt-3.5-turbo-0301 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt-3.5-turbo-0613 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt-3.5-turbo-1106 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt-3.5-turbo-1106_concise │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt-3.5-turbo-1106_verbose │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt-3.5-turbo-16k-0613 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt-4-0125-preview │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt-4-turbo-2024-04-09 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt-4o-2024-05-13 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt-4o-2024-08-06 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt-4o-mini-2024-07-18 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt35_turbo_instruct │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt4 │ │ │ │ │ ├── chatml_prompt.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt4_0314 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt4_0613 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt4_0613_concise │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt4_0613_verbose │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt4_1106_preview │ │ │ │ │ ├── chatml_prompt.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt4_1106_preview_concise │ │ │ │ │ ├── chatml_prompt_concise.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt4_1106_preview_verbose │ │ │ │ │ ├── chatml_prompt_verbose.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── gpt4_gamed │ │ │ │ │ └── configs.yaml │ │ │ │ ├── guanaco-13b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── guanaco-33b-api │ │ │ │ │ └── configs.yaml │ │ │ │ ├── guanaco-33b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── guanaco-65b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── guanaco-7b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── higgs-llama-3-70b-v2 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── humpback-llama-65b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── humpback-llama2-70b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── internlm2-chat-20b-ExPO │ │ │ │ │ └── configs.yaml │ │ │ │ ├── internlm2-chat-20b-ppo │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── internlm2-chat-7b-ExPO │ │ │ │ │ └── configs.yaml │ │ │ │ ├── jina-chat │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── llama-2-13b-chat-hf │ │ │ │ │ └── configs.yaml │ │ │ │ ├── llama-2-70b-chat-hf │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── llama-2-7b-chat-hf │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── llama-2-chat-7b-evol70k-neft │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── merlinite-7B-AOT │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── minichat-1.5-3b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── minichat-3b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── minotaur-13b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── mistral-large-2402 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── mistral-medium │ │ │ │ │ ├── basic_chatml_prompt.txt │ │ │ │ │ └── configs.yaml │ │ │ │ ├── mistral-orpo-beta │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── nous-hermes-13b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── oasst-rlhf-llama-33b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── oasst-sft-llama-33b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── oasst-sft-pythia-12b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── openbuddy-falcon-40b-v9 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── openbuddy-falcon-7b-v6 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── openbuddy-llama-30b-v7.1 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── openbuddy-llama-65b-v8 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── openbuddy-llama2-13b-v11.1 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── openbuddy-llama2-70b-v10.1 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── openchat-13b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── openchat-v2-13b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── openchat-v2-w-13b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── openchat-v3.1-13b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── openchat8192-13b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── opencoderplus-15b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── openpipe-moa-gpt-4-turbo-v1 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── pairrm-Yi-34B-Chat │ │ │ │ │ └── configs.yaml │ │ │ │ ├── pairrm-tulu-2-13b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── pairrm-tulu-2-70b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── pairrm-zephyr-7b-beta │ │ │ │ │ └── configs.yaml │ │ │ │ ├── phi-2-dpo │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── phi-2-sft │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── phi-2 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── platolm-7b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── pythia-12b-mix-sft │ │ │ │ │ └── configs.yaml │ │ │ │ ├── recycled-wizardlm-7b-v1.0 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── recycled-wizardlm-7b-v2.0 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── text_davinci_001 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── text_davinci_003 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── tulu-2-dpo-13b-ExPO │ │ │ │ │ └── configs.yaml │ │ │ │ ├── tulu-2-dpo-13b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── tulu-2-dpo-70b-ExPO │ │ │ │ │ └── configs.yaml │ │ │ │ ├── tulu-2-dpo-70b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── tulu-2-dpo-7b-ExPO │ │ │ │ │ └── configs.yaml │ │ │ │ ├── tulu-2-dpo-7b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── ultralm-13b-best-of-16 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── ultralm-13b-v2.0-best-of-16 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── ultralm-13b-v2.0 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── ultralm-13b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── vicuna-13b-v1.3 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── vicuna-13b-v1.5-togetherai │ │ │ │ │ └── configs.yaml │ │ │ │ ├── vicuna-13b-v1.5 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── vicuna-13b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── vicuna-33b-v1.3 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── vicuna-7b-v1.3 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── vicuna-7b-v1.5 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── vicuna-7b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── wizardlm-13b-v1.1 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── wizardlm-13b-v1.2 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── wizardlm-13b │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── wizardlm-70b │ │ │ │ │ └── configs.yaml │ │ │ │ ├── xwinlm-13b-v0.1 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── xwinlm-70b-v0.1 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── xwinlm-70b-v0.3 │ │ │ │ │ └── configs.yaml │ │ │ │ ├── xwinlm-7b-v0.1 │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── yi-large-preview │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── zephyr-7b-alpha-ExPO │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── zephyr-7b-alpha │ │ │ │ │ ├── configs.yaml │ │ │ │ │ └── prompt.txt │ │ │ │ ├── zephyr-7b-beta-ExPO │ │ │ │ │ └── configs.yaml │ │ │ │ └── zephyr-7b-beta │ │ │ │ │ └── configs.yaml │ │ │ │ ├── plotting.py │ │ │ │ ├── processors.py │ │ │ │ ├── types.py │ │ │ │ └── utils.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── integration_tests │ │ │ ├── test_decoders_integration.py │ │ │ └── test_example_integration.py │ │ │ ├── test_analyze.py │ │ │ ├── test_decoders_unit.py │ │ │ ├── test_main.py │ │ │ └── test_pairwise_evaluator.py │ ├── curator_lm.py │ ├── precomputed_hf_lm.py │ ├── upload_to_hf_lm.py │ └── zeroeval │ │ ├── .github │ │ └── workflows │ │ │ └── static.yml │ │ ├── .gitignore │ │ ├── CITATION.cff │ │ ├── LICENSE │ │ ├── README.md │ │ ├── data_prep │ │ ├── crux.py │ │ ├── gsm8k.py │ │ ├── mmlu-redux.py │ │ └── zebra_difficulty.py │ │ ├── docs │ │ ├── index.html │ │ └── zebra │ │ │ └── zebra_banner.png │ │ ├── eval_instruct.py │ │ ├── requirements.txt │ │ ├── scripts │ │ ├── _ALPACA_EVAL.MD │ │ ├── _GSM.md │ │ ├── _MMLU_redux.md │ │ ├── _ZebraLogic.md │ │ ├── eval_all.sh │ │ └── local │ │ │ ├── _more.sh │ │ │ ├── athena.sh │ │ │ ├── command_r.sh │ │ │ ├── crux.sh │ │ │ ├── deepseek_coder.sh │ │ │ ├── gemini-1.5-pro-exp.sh │ │ │ ├── gemma.sh │ │ │ ├── gemma2.sh │ │ │ ├── gpt-4o-new.sh │ │ │ ├── gsm.sh │ │ │ ├── hyperbolic.sh │ │ │ ├── lepton.sh │ │ │ ├── llama3-1.sh │ │ │ ├── math_l5.sh │ │ │ ├── merge_legacy.py │ │ │ ├── mistral-large-2.sh │ │ │ ├── mistral_nemo.sh │ │ │ ├── mixtral.sh │ │ │ ├── numersense-v2.sh │ │ │ ├── o1_zebra.sh │ │ │ ├── phi-3.sh │ │ │ ├── run_all_mmlu.sh │ │ │ ├── sambanova.sh │ │ │ └── yi.sh │ │ ├── src │ │ ├── _TEMPLATES.py │ │ ├── __init__.py │ │ ├── evaluation │ │ │ ├── alpaca_eval_formatting.py │ │ │ ├── crux_eval.py │ │ │ ├── eval_utils.py │ │ │ ├── legacy │ │ │ │ └── mcqa_eval.py │ │ │ ├── math_eval.py │ │ │ ├── mcqa_eval.py │ │ │ ├── summarize.py │ │ │ └── zebra_grid_eval.py │ │ ├── fastchat_conversation.py │ │ ├── global_configs.py │ │ ├── hf_models.py │ │ ├── merge_results.py │ │ ├── task_configs.py │ │ ├── templates │ │ │ ├── MCQA.py │ │ │ ├── OEQA.py │ │ │ └── ZEBRA_GRID.py │ │ ├── unified_infer.py │ │ ├── unified_utils.py │ │ ├── upload_results.py │ │ └── view_wb_eval.py │ │ ├── state_of_limit │ │ ├── README.md │ │ ├── html │ │ │ └── all_tasks.html │ │ ├── result_summary.py │ │ └── write_html_single_file.py │ │ ├── zero_eval_api.sh │ │ └── zero_eval_local.sh ├── constants.py ├── distributed │ ├── README.md │ ├── SETUP_CAPELLA.md │ ├── SETUP_JURECA.md │ ├── SETUP_LEONARDO.md │ ├── benchmark_plot.py │ ├── benchmarking_capella.png │ ├── benchmarking_comparison.png │ ├── benchmarking_leonardo.png │ ├── launch.py │ ├── launch_local.py │ ├── launch_simple.py │ ├── process_shard.py │ ├── process_shards_capella.sbatch │ ├── process_shards_capella_tp.sbatch │ ├── process_shards_jureca.sbatch │ ├── process_shards_leonardo.sbatch │ ├── process_shards_leonardo_tp.sbatch │ ├── process_shards_local.sh │ ├── process_shards_local_tp.sh │ ├── process_shards_tacc.sbatch │ ├── process_shards_tacc_tp.sbatch │ ├── run_evaluations_capella.sbatch │ ├── run_evaluations_tacc.sbatch │ ├── simple_alpha.sbatch │ ├── simple_claix.sbatch │ ├── simple_jureca.sbatch │ ├── simple_leonardo.sbatch │ ├── simple_tacc.sbatch │ └── simple_zih.sbatch ├── eval.py ├── eval_tracker.py ├── examples │ ├── alpaca_diff_annotator.sh │ ├── database_logging.sh │ ├── mistral_on_alpaca.sh │ ├── mistral_on_alpaca_greedy_bs1.sh │ ├── mistral_on_alpaca_greedy_bs32.sh │ ├── mistral_on_many_evals.sh │ └── multi_gpu.sh ├── listener │ └── tacc.py └── task.py ├── image.png ├── pyproject.toml ├── reasoning_csv.sh ├── reproduced_benchmarks.md ├── requirements.txt └── score.sh /.github/workflows/black.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/.github/workflows/black.yaml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/CITATION.cff -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/Makefile -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/README.md -------------------------------------------------------------------------------- /configs/full_gemma9b_gpt4omini0718.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/configs/full_gemma9b_gpt4omini0718.yaml -------------------------------------------------------------------------------- /configs/full_gpt4omini0718.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/configs/full_gpt4omini0718.yaml -------------------------------------------------------------------------------- /configs/light_gemma9b_gpt4omini0718.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/configs/light_gemma9b_gpt4omini0718.yaml -------------------------------------------------------------------------------- /configs/light_gpt4omini0718.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/configs/light_gpt4omini0718.yaml -------------------------------------------------------------------------------- /configs/light_gpt4omini0718_jsc.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/configs/light_gpt4omini0718_jsc.yaml -------------------------------------------------------------------------------- /configs/reasoning.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/configs/reasoning.yaml -------------------------------------------------------------------------------- /configs/reasoning_lite.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/configs/reasoning_lite.yaml -------------------------------------------------------------------------------- /configs/single_task/gpqa.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/configs/single_task/gpqa.yaml -------------------------------------------------------------------------------- /configs/single_task/livebench.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/configs/single_task/livebench.yaml -------------------------------------------------------------------------------- /configs/single_task/livecodebench.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/configs/single_task/livecodebench.yaml -------------------------------------------------------------------------------- /configs/single_task/math500.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/configs/single_task/math500.yaml -------------------------------------------------------------------------------- /create_csv.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/create_csv.sh -------------------------------------------------------------------------------- /create_csv_helper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/create_csv_helper.py -------------------------------------------------------------------------------- /database/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/database/README.md -------------------------------------------------------------------------------- /database/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /database/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/database/config.py -------------------------------------------------------------------------------- /database/models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/database/models.py -------------------------------------------------------------------------------- /database/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/database/utils.py -------------------------------------------------------------------------------- /eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/AIME24/data/aime24.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/AIME24/data/aime24.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/AIME24/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/AIME24/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/AIME25/data/aime25.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/AIME25/data/aime25.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/AIME25/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/AIME25/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/AIW/data/aiw_data.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/AIW/data/aiw_data.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/AIW/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/AIW/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/AMC23/data/amc23.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/AMC23/data/amc23.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/AMC23/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/AMC23/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/BigCodeBench/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/BigCodeBench/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/BigCodeBench/docker/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/BigCodeBench/docker/Dockerfile -------------------------------------------------------------------------------- /eval/chat_benchmarks/BigCodeBench/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/BigCodeBench/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/BigCodeBench/evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/BigCodeBench/evaluation.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/BigCodeBench/execution.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/BigCodeBench/execution.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/BigCodeBench/sanitize.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/BigCodeBench/sanitize.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/BigCodeBench/syncheck.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/BigCodeBench/syncheck.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/CodeElo/codeelo_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/CodeElo/codeelo_utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/CodeElo/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/CodeElo/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/CodeForces/codeforces_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/CodeForces/codeforces_utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/CodeForces/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/CodeForces/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/CruxEval/data/cruxeval.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/CruxEval/data/cruxeval.jsonl -------------------------------------------------------------------------------- /eval/chat_benchmarks/CruxEval/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/CruxEval/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/CruxEval/evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/CruxEval/evaluation.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/CruxEval/execution.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/CruxEval/execution.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/GPQADiamond/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/GPQADiamond/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/GPQADiamond/testing_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/GPQADiamond/testing_utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HLE/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HLE/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HLE/run_judge_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HLE/run_judge_results.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HLE/testing_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HLE/testing_utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/.gitignore -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/LICENSE -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/README_judges.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/README_judges.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/__init__.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/api.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/configs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/configs.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/cot_solver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/cot_solver.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/grader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/grader.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/parse_manual.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/parse_manual.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/parser.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/possible_issues.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/possible_issues.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/pyproject.toml -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/runner.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/runner.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/usamo_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/usamo_report.pdf -------------------------------------------------------------------------------- /eval/chat_benchmarks/HMMT/matharena/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HMMT/matharena/utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEval/data/humaneval-python.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEval/data/humaneval-python.jsonl -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEval/data/humaneval-sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEval/data/humaneval-sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEval/data/humaneval-sh.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEval/data/humaneval-sh.jsonl -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEval/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEval/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEval/human_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEval/human_eval/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEval/human_eval/data.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEval/human_eval/evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEval/human_eval/evaluation.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEval/human_eval/execution.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEval/human_eval/execution.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEval/humaneval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEval/humaneval.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEval/javatuples-1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEval/javatuples-1.2.jar -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEval/utils/dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEval/utils/dataset.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEval/utils/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEval/utils/utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEvalPlus/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEvalPlus/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEvalPlus/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEvalPlus/human_eval_plus/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEvalPlus/human_eval_plus/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEvalPlus/human_eval_plus/data.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEvalPlus/humaneval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEvalPlus/humaneval.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEvalPlus/utils/dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEvalPlus/utils/dataset.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/HumanEvalPlus/utils/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/HumanEvalPlus/utils/utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/IFEval/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/IFEval/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/IFEval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/IFEval/data/input_data.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/IFEval/data/input_data.jsonl -------------------------------------------------------------------------------- /eval/chat_benchmarks/IFEval/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/IFEval/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/IFEval/evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/IFEval/evaluation.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/IFEval/evaluation_main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/IFEval/evaluation_main.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/IFEval/instructions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/IFEval/instructions.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/IFEval/instructions_registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/IFEval/instructions_registry.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/IFEval/instructions_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/IFEval/instructions_test.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/IFEval/instructions_util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/IFEval/instructions_util.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/IFEval/requirements.txt: -------------------------------------------------------------------------------- 1 | absl 2 | langdetect 3 | nltk 4 | immutabledict 5 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/IFEval/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/IFEval/run.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/JEEBench/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/JEEBench/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/JEEBench/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/JEEBench/utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/LICENSE -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/README-es_CO.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/README-es_CO.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/assets/livebench-2024-07-20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/assets/livebench-2024-07-20.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/assets/livebench-2024-07-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/assets/livebench-2024-07-23.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/assets/livebench-2024-07-24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/assets/livebench-2024-07-24.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/assets/livebench-2024-07-28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/assets/livebench-2024-07-28.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/assets/livebench-2024-08-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/assets/livebench-2024-08-02.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/assets/livebench-2024-08-06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/assets/livebench-2024-08-06.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/assets/livebench-2024-08-30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/assets/livebench-2024-08-30.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/assets/livebench-2024-09-30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/assets/livebench-2024-09-30.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/assets/livebench-2024-12-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/assets/livebench-2024-12-01.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/changelog.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/changelog.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/docs/AUTHOR_RESPONSIBILITY.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/docs/AUTHOR_RESPONSIBILITY.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/docs/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/docs/CODE_OF_CONDUCT.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/docs/CONTRIBUTING.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/docs/DATASHEET.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/docs/DATASHEET.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/docs/MAINTENANCE_PLAN.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/docs/MAINTENANCE_PLAN.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/livebench/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.2" 2 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/livebench/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/livebench/common.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/livebench/conversation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/livebench/conversation.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/livebench/download_questions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/livebench/download_questions.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/livebench/gen_api_answer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/livebench/gen_api_answer.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/livebench/gen_model_answer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/livebench/gen_model_answer.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/livebench/if_runner/instruction_following_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/livebench/if_runner/instruction_following_eval/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py 2 | langdetect 3 | nltk 4 | immutabledict 5 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/livebench/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/livebench/model/__init__.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/livebench/model/api_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/livebench/model/api_models.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/livebench/model/completions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/livebench/model/completions.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/livebench/model/models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/livebench/model/models.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/livebench/scripts/error_check: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/livebench/scripts/error_check -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/livebench/scripts/run_livebench: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/livebench/scripts/run_livebench -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveBench/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveBench/pyproject.toml -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveCodeBench/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveCodeBench/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveCodeBench/livecodebench_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveCodeBench/livecodebench_utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveCodeBenchv5/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveCodeBenchv5/livecodebench_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveCodeBenchv5/livecodebench_utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/LiveCodeBenchv5_official/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/LiveCodeBenchv5_official/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MATH500/data/math500.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MATH500/data/math500.jsonl -------------------------------------------------------------------------------- /eval/chat_benchmarks/MATH500/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MATH500/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPP/data/mbpp.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPP/data/mbpp.jsonl -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPP/data/mbpp_test.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPP/data/mbpp_test.jsonl -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPP/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPP/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPP/human_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPP/human_eval/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPP/human_eval/data.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPP/human_eval/evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPP/human_eval/evaluation.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPP/human_eval/execution.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPP/human_eval/execution.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPP/mbpp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPP/mbpp.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPP/utils/dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPP/utils/dataset.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPP/utils/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPP/utils/utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPPPlus/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPPPlus/data/mbppplus.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPPPlus/data/mbppplus.jsonl -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPPPlus/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPPPlus/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPPPlus/mbpp_plus/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPPPlus/mbpp_plus/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPPPlus/mbpp_plus/data.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPPPlus/mbpp_plus/evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPPPlus/mbpp_plus/evaluation.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPPPlus/mbpp_plus/execution.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPPPlus/mbpp_plus/execution.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPPPlus/mbppplus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPPPlus/mbppplus.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPPPlus/utils/dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPPPlus/utils/dataset.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MBPPPlus/utils/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MBPPPlus/utils/utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MMLUPro/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MMLUPro/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MMLUPro/initial_prompt.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MMLUPro/initial_prompt.txt -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/.github/PULL_REQUEST_TEMPLATE.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/.gitignore -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/.pylintrc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/.pylintrc -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/LICENSE -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/assets/demo_narrow.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/assets/demo_narrow.gif -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/assets/qa_browser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/assets/qa_browser.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/assets/screenshot_cli.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/assets/screenshot_cli.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/assets/screenshot_gui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/assets/screenshot_gui.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/assets/server_arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/assets/server_arch.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/assets/vicuna_logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/assets/vicuna_logo.jpeg -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docker/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docker/Dockerfile -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docker/docker-compose.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docker/docker-compose.yml -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/arena.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/arena.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/awq.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/awq.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/commands/conv_release.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/commands/conv_release.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/commands/data_cleaning.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/commands/data_cleaning.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/commands/leaderboard.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/commands/leaderboard.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/commands/local_cluster.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/commands/local_cluster.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/commands/pypi.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/commands/pypi.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/commands/webserver.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/commands/webserver.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/dashinfer_integration.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/dashinfer_integration.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/dataset_release.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/dataset_release.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/exllama_v2.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/exllama_v2.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/gptq.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/gptq.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/langchain_integration.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/langchain_integration.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/lightllm_integration.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/lightllm_integration.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/mlx_integration.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/mlx_integration.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/model_support.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/model_support.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/openai_api.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/openai_api.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/server_arch.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/server_arch.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/third_party_ui.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/third_party_ui.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/training.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/training.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/vicuna_weights_version.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/vicuna_weights_version.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/vllm_integration.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/vllm_integration.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/docs/xFasterTransformer.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/docs/xFasterTransformer.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.36" 2 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/constants.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/conversation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/conversation.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/data/clean_sharegpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/data/clean_sharegpt.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/data/convert_alpaca.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/data/convert_alpaca.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/data/get_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/data/get_stats.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/data/inspect_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/data/inspect_data.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/data/merge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/data/merge.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/data/optional_clean.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/data/optional_clean.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/data/optional_replace.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/data/optional_replace.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/data/prepare_all.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/data/prepare_all.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/data/pretty_json.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/data/pretty_json.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/data/sample.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/data/sample.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/data/split_train_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/data/split_train_test.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/llm_judge/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/llm_judge/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/llm_judge/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/llm_judge/common.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/llm_judge/qa_browser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/llm_judge/qa_browser.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/llm_judge/show_result.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/llm_judge/show_result.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/__init__.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/apply_delta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/apply_delta.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/apply_lora.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/apply_lora.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/compression.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/compression.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/convert_fp16.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/convert_fp16.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/make_delta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/make_delta.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/model_adapter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/model_adapter.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/model_chatglm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/model_chatglm.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/model_cllm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/model_cllm.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/model_codet5p.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/model_codet5p.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/model_exllama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/model_exllama.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/model_falcon.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/model_falcon.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/model_registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/model_registry.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/model_yuan2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/model_yuan2.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/rwkv_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/rwkv_model.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/model/upload_hub.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/model/upload_hub.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/modules/awq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/modules/awq.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/modules/exllama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/modules/exllama.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/modules/gptq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/modules/gptq.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/protocol/api_protocol.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/protocol/api_protocol.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/api_provider.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/api_provider.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/call_monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/call_monitor.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/cli.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/controller.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/controller.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/gateway/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/gateway/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/gateway/nginx.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/gateway/nginx.conf -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/huggingface_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/huggingface_api.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/inference.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/lightllm_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/lightllm_worker.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/mlx_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/mlx_worker.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/model_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/model_worker.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/monitor/monitor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/monitor/monitor.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/register_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/register_worker.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/remote_logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/remote_logger.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/sglang_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/sglang_worker.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/shutdown_serve.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/shutdown_serve.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/test_message.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/test_message.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/test_throughput.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/test_throughput.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/vision/image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/vision/image.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/serve/vllm_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/serve/vllm_worker.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/train/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/train/train.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/train/train_baichuan.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/train/train_baichuan.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/train/train_flant5.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/train/train_flant5.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/train/train_lora.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/train/train_lora.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/train/train_lora_t5.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/train/train_lora_t5.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/train/train_mem.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/train/train_mem.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/train/train_xformers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/train/train_xformers.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/train/train_yuan2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/train/train_yuan2.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/fastchat/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/fastchat/utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/format.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/format.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/playground/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/pyproject.toml -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/scripts/build-api.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/scripts/build-api.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/scripts/test_readme_train.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/scripts/test_readme_train.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/scripts/train_lora.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/scripts/train_lora.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/scripts/train_vicuna_13b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/scripts/train_vicuna_13b.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/scripts/train_vicuna_7b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/scripts/train_vicuna_7b.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/scripts/upload_pypi.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/scripts/upload_pypi.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/tests/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/tests/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/tests/killall_python.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/tests/killall_python.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/tests/load_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/tests/load_test.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/tests/test_cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/tests/test_cli.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/tests/test_cli_inputs.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/tests/test_cli_inputs.txt -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/tests/test_image_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/tests/test_image_utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/tests/test_openai_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/tests/test_openai_api.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/tests/test_openai_langchain.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/tests/test_openai_langchain.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MTBench/tests/test_openai_vision_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MTBench/tests/test_openai_vision_api.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/client/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/client/snova_client.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/client/snova_client.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/api/registry.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/api/registry.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/compute_metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/compute_metrics.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/evaluate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/evaluate.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/__init__.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/base.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/base_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/base_api.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/claude_3_haiku.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/claude_3_haiku.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/claude_3_opus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/claude_3_opus.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/command_r.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/command_r.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/command_r_plus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/command_r_plus.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/dbrx_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/dbrx_base.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/dbrx_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/dbrx_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/deepseek_67b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/deepseek_67b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/deepseek_7b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/deepseek_7b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/deepseek_v2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/deepseek_v2.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/gemini_10_pro.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/gemini_10_pro.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/gemini_15_pro.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/gemini_15_pro.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/gemma_2b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/gemma_2b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/gemma_7b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/gemma_7b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/gpt_4_0314.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/gpt_4_0314.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/gpt_4_0613.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/gpt_4_0613.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/gpt_4o.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/gpt_4o.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/gpt_4o_mini.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/gpt_4o_mini.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/jet_moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/jet_moe.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/jet_moe_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/jet_moe_chat.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/llama_2_70b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/llama_2_70b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/llama_2_7b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/llama_2_7b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/llama_3_405b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/llama_3_405b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/llama_3_70b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/llama_3_70b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/llama_3_8b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/llama_3_8b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/lm_chat_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/lm_chat_model.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/local_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/local_base.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/local_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/local_chat.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/mistral_7b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/mistral_7b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/mistral_large.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/mistral_large.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/mistral_medium.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/mistral_medium.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/mistral_nemo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/mistral_nemo.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/mistral_small.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/mistral_small.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/mixtral_8_22b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/mixtral_8_22b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/mixtral_8_7b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/mixtral_8_7b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/mpt_30b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/mpt_30b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/mpt_30b_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/mpt_30b_chat.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/mpt_7b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/mpt_7b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/mpt_7b_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/mpt_7b_chat.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/notus_7b_v1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/notus_7b_v1.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/olmo_7b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/olmo_7b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/openai_o1.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/openai_o1.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/openai_o1_mini.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/openai_o1_mini.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/phi_2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/phi_2.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/qwen_15_110b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/qwen_15_110b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/qwen_15_32b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/qwen_15_32b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/qwen_15_4b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/qwen_15_4b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/qwen_15_72b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/qwen_15_72b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/qwen_15_7b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/qwen_15_7b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/qwen_7b_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/qwen_7b_chat.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/qwen_max_0428.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/qwen_max_0428.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/reka_core.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/reka_core.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/reka_edge.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/reka_edge.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/reka_flash.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/reka_flash.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/tulu_v2_dpo_7b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/tulu_v2_dpo_7b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/vicuna_13b_v13.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/vicuna_13b_v13.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/vicuna_33b_v13.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/vicuna_33b_v13.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/vicuna_7b_v13.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/vicuna_7b_v13.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/vicuna_7b_v15.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/vicuna_7b_v15.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/xverse_7b_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/xverse_7b_chat.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/xwin_lm_7b_v01.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/xwin_lm_7b_v01.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/yi_15_34b_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/yi_15_34b_chat.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/yi_15_9b_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/yi_15_9b_chat.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/yi_34b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/yi_34b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/yi_34b_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/yi_34b_chat.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/yi_6b.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/yi_6b.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/yi_6b_chat.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/yi_6b_chat.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/yi_large.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/yi_large.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/models/zephyr_7b_beta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/models/zephyr_7b_beta.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/prompts/judge_prompts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/prompts/judge_prompts.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/run_eval_example.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/run_eval_example.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/utils/common_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/utils/common_utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/utils/count_token.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/utils/count_token.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/utils/dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/utils/dataset.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/utils/metric_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/utils/metric_utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/mix_eval/utils/plot_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/mix_eval/utils/plot_results.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MixEval/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MixEval/setup.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-adb.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-adb.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-clj.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-clj.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-cpp.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-cpp.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-cs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-cs.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-d.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-d.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-dart.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-dart.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-elixir.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-elixir.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-go.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-go.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-hs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-hs.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-humaneval-cs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-humaneval-cs.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-humaneval-js.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-humaneval-js.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-humaneval-sh.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-humaneval-sh.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-humaneval-ts.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-humaneval-ts.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-java.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-java.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-js.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-js.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-julia.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-julia.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-lua.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-lua.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-mbpp-java.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-mbpp-java.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-mbpp-js.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-mbpp-js.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-ml.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-ml.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-php.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-php.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-pl.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-pl.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-r.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-r.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-racket.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-racket.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-rb.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-rb.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-rs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-rs.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-scala.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-scala.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-sh.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-sh.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-swift.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-swift.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/data/multipl-e-ts.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/data/multipl-e-ts.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/docker/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/docker/Dockerfile -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/containerized_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/containerized_eval.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_adb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_adb.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_clj.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_clj.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_cpp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_cpp.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_cs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_cs.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_dart.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_dart.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_dfy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_dfy.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_dlang.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_dlang.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_elixir.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_elixir.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_fs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_fs.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_go.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_go.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_hs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_hs.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_java.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_java.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_javascript.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_javascript.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_julia.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_julia.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_lean.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_lean.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_lua.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_lua.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_luau.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_luau.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_matlab.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_matlab.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_ocaml.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_ocaml.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_php.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_php.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_pl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_pl.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_python.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_python.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_r.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_r.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_racket.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_racket.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_ruby.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_ruby.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_rust.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_rust.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_scala.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_scala.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_sh.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_sh.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_swift.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_swift.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_ts.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_ts.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/eval_v.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/eval_v.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/evaluation.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/execution.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/execution.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/generic_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/generic_eval.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/libeval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/libeval.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/main.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/safe_subprocess/evil_programs/block_on_inputs.py: -------------------------------------------------------------------------------- 1 | while True: 2 | input() 3 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/safe_subprocess/evil_programs/fork_bomb.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | while True: 4 | os.fork() 5 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/multiple/simple_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/multiple/simple_eval.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/MultiPLE/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/MultiPLE/utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/.gitignore -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/LICENSE -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/archive_data/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/archive_data/utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/assets/repobench_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/assets/repobench_dark.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/assets/repobench_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/assets/repobench_light.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/assets/repobench_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/assets/repobench_logo.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/data/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/data/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/data/data/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/data/data/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/data/data/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/data/data/utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/data/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/data/utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/eval.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/evaluation/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/evaluation/metrics.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/requirements.txt -------------------------------------------------------------------------------- /eval/chat_benchmarks/RepoBench/run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/RepoBench/run.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/SWEbench/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/SWEbench/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/.github/workflows/static.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/.github/workflows/static.yml -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/.gitignore -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/EVAL.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/EVAL.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/LICENSE -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/README.md: -------------------------------------------------------------------------------- 1 | # WildBenchwebsite 2 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/WildBench_paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/WildBench_paper.pdf -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/css/addons/flag.min.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/css/addons/flag.min.css -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/css/addons/rating.min.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/css/addons/rating.min.css -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/css/animate.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/css/animate.css -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/css/bootstrap.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/css/bootstrap.css -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/css/bootstrap.min 2.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/css/bootstrap.min 2.css -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/css/bootstrap.min.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/css/bootstrap.min.css -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/css/font-awesome.min.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/css/font-awesome.min.css -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/css/main.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/css/main.css -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/css/mdb.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/css/mdb.css -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/css/mdb.lite.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/css/mdb.lite.css -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/css/mdb.lite.min.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/css/mdb.lite.min.css -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/css/mdb.min.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/css/mdb.min.css -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/css/style.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/css/style.css -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/gray_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/gray_banner.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/images/ai2logo_large.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/images/ai2logo_large.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/images/blank.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/images/blank.gif -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/images/icons/clear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/images/icons/clear.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/images/icons/deep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/images/icons/deep.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/images/icons/engage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/images/icons/engage.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/images/icons/factual.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/images/icons/factual.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/images/icons/helpful.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/images/icons/helpful.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/images/icons/safety.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/images/icons/safety.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/index.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/index.html -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/addons/flag.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/addons/flag.min.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/addons/rating.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/addons/rating.min.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/bootstrap-table.min.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/bootstrap-table.min.css -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/bootstrap-table.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/bootstrap-table.min.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/bootstrap.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/bootstrap.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/bootstrap.min 2.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/bootstrap.min 2.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/bootstrap.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/bootstrap.min.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/jquery-1.10.2.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/jquery-1.10.2.min.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/jquery.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/jquery.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/jquery.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/jquery.min.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/jquery.tablesorter.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/jquery.tablesorter.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/mdb.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/mdb.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/mdb.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/mdb.min.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/min/custom.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/min/custom.min.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/min/modernizr.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/min/modernizr.min.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/min/plugins.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/min/plugins.min.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/modules/treeview.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/modules/treeview.min.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/modules/wow.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/modules/wow.min.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/popper.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/popper.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/js/popper.min.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/js/popper.min.js -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/static/images/ai2logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/static/images/ai2logo.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/static/images/uwlogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/static/images/uwlogo.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/style.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/style.css -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/wb_corr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/wb_corr.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/wb_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/wb_eval.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/wb_radar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/wb_radar.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/wb_stat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/wb_stat.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/wb_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/wb_table.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/docs/wildbench_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/docs/wildbench_logo.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/evaluation/run_eval_v2_batch.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/evaluation/run_eval_v2_batch.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/leaderboard/data_dir/score.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/leaderboard/data_dir/score.json -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/leaderboard/data_dir/wb_elo.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/leaderboard/data_dir/wb_elo.txt -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/leaderboard/data_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/leaderboard/data_utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/leaderboard/show_eval.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/leaderboard/show_eval.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/leaderboard/show_table.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/leaderboard/show_table.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/leaderboard/wb_elo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/leaderboard/wb_elo.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/requirements.txt -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/Llama-2-13b-chat-hf.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/Llama-2-13b-chat-hf.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/Llama-2-70b-chat-hf.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/Llama-2-70b-chat-hf.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/Llama-2-7b-chat-hf.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/Llama-2-7b-chat-hf.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/Llama-3-8B-Tulu-330K.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/Llama-3-8B-Tulu-330K.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/Llama-3-8B-WildChat.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/Llama-3-8B-WildChat.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/Magpie-Pro-SFT-v0.1.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/Magpie-Pro-SFT-v0.1.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/Mistral-Large-2.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/Mistral-Large-2.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/Qwen1.5-72B-Chat.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/Qwen1.5-72B-Chat.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/Qwen2-72B-Instruct.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/Qwen2-72B-Instruct.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/Starling-LM-7B-beta.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/Starling-LM-7B-beta.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/Yi-1.5-34B-Chat.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/Yi-1.5-34B-Chat.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/Yi-1.5-6B-Chat.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/Yi-1.5-6B-Chat.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/Yi-1.5-9B-Chat.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/Yi-1.5-9B-Chat.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/Yi-34B-Chat.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/Yi-34B-Chat.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/_common_hf.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/_common_hf.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/_common_openai.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/_common_openai.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/_common_vllm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/_common_vllm.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/cohere-command-r.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/cohere-command-r.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/cohere-command.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/cohere-command.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/dbrx-instruct.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/dbrx-instruct.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/gemini-1.0-pro.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/gemini-1.0-pro.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/gemini-1.5-flash.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/gemini-1.5-flash.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/gemini-1.5-pro.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/gemini-1.5-pro.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/gemma-2b-it.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/gemma-2b-it.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/gemma-7b-it.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/gemma-7b-it.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/gpt-3.5-turbo-0125.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/gpt-3.5-turbo-0125.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/gpt-4-0125-preview.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/gpt-4-0125-preview.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/gpt-4o-2024-05-13.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/gpt-4o-2024-05-13.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/mistral-large-2402.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/mistral-large-2402.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/reka-core-20240501.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/reka-core-20240501.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/reka-edge-20240208.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/reka-edge-20240208.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/reka-flash-20240226.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/reka-flash-20240226.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/tulu-2-dpo-70b-ExPO.sh: -------------------------------------------------------------------------------- 1 | # TODO -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/tulu-2-dpo-70b.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/tulu-2-dpo-70b.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/vicuna-13b-v1.5.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/vicuna-13b-v1.5.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/yi-large.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/yi-large.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/zephyr-7b-beta.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/zephyr-7b-beta.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/scripts/zephyr-7b-gemma-v0.1.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/scripts/zephyr-7b-gemma-v0.1.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/src/eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/src/eval.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/src/fastchat_conversation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/src/fastchat_conversation.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/src/global_configs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/src/global_configs.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/src/hf_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/src/hf_models.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/src/merge_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/src/merge_results.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/src/unified_infer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/src/unified_infer.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/src/unified_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/src/unified_utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/src/upload_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/src/upload_results.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/WildBench/src/view_wb_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/WildBench/src/view_wb_eval.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/.gitignore -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/.pre-commit-config.yaml -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/CITATION.cff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/CITATION.cff -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/LICENSE -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/MANIFEST.in -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/client_configs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/client_configs/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/docs/AlpacaFarm_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/docs/AlpacaFarm_small.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/docs/check_unwanted_files.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/docs/check_unwanted_files.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/docs/format_sample_sheets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/docs/format_sample_sheets.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/docs/index.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/docs/index.html -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/figures/annotator_bias.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/figures/annotator_bias.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/figures/bias_vs_variance.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/figures/bias_vs_variance.pdf -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/figures/causal_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/figures/causal_graph.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/figures/chat_correlations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/figures/chat_correlations.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/figures/lc_ae_leaderboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/figures/lc_ae_leaderboard.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/figures/verified.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/figures/verified.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/pytest.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/pytest.ini -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/requirements.txt -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/scripts/precompute.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/scripts/precompute.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/setup.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/__init__.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/analyze.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/analyze.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/main.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/Contextual-KTO-Mistral-PairRM-Verified/prompt.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>user 2 | {instruction} 3 | <|im_end|> -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/Contextual-KTO-Mistral-PairRM/prompt.txt: -------------------------------------------------------------------------------- 1 | <|user|> 2 | {instruction} 3 | <|assistant|> -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/Infinity-Instruct-3M-0625-Yi-1.5-9B/prompt.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>user 2 | {instruction}<|im_end|> -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/Mixtral-8x7B-Instruct-v0.1/togetherai_prompt.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>user 2 | {instruction} 3 | <|im_end|> -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/OpenHermes-2.5-Mistral-7B/togetherai_prompt.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>user 2 | {instruction} 3 | <|im_end|> -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/Qwen2-72B-Instruct/prompt.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>user 2 | {instruction} 3 | <|im_end|> -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/Yi-34B-Chat-Verified/prompt.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>user 2 | {instruction}<|im_end|> 3 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/baichuan-13b-chat/prompt.txt: -------------------------------------------------------------------------------- 1 | {instruction} 2 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/chatglm2-6b/prompt.txt: -------------------------------------------------------------------------------- 1 | [Round 1]\n\n问:{instruction}\n\n答: 2 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/claude-3-5-sonnet-20240620/prompt_chatml.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>user 2 | {instruction} 3 | <|im_end|> -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/claude-3-sonnet-20240229/prompt_chatml.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>user 2 | {instruction} 3 | <|im_end|> -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/claude/prompt.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | Human: {instruction} 4 | 5 | Assistant: -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/cohere/prompt.txt: -------------------------------------------------------------------------------- 1 | {instruction} -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/deepseek-llm-67b-chat/prompt.txt: -------------------------------------------------------------------------------- 1 | User: {instruction} 2 | 3 | Assistant: -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/gemini-pro/prompt.txt: -------------------------------------------------------------------------------- 1 | {instruction} -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/mistral-medium/basic_chatml_prompt.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>user 2 | {instruction} 3 | <|im_end|> -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/mistral-orpo-beta/prompt.txt: -------------------------------------------------------------------------------- 1 | <|user|> 2 | {instruction} 3 | <|assistant|> 4 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/oasst-sft-llama-33b/prompt.txt: -------------------------------------------------------------------------------- 1 | <|prompter|>{instruction}<|assistant|> -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/oasst-sft-pythia-12b/prompt.txt: -------------------------------------------------------------------------------- 1 | <|prompter|>{instruction}<|endoftext|><|assistant|> -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/openchat-13b/prompt.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>user 2 | {instruction} 3 | <|im_end|> -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/pairrm-tulu-2-70b/prompt.txt: -------------------------------------------------------------------------------- 1 | <|user|> 2 | {instruction} 3 | <|assistant|> 4 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/phi-2-dpo/prompt.txt: -------------------------------------------------------------------------------- 1 | ### Human: {instruction} 2 | 3 | ### Assistant: -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/phi-2-sft/prompt.txt: -------------------------------------------------------------------------------- 1 | ### Human: {instruction} 2 | 3 | ### Assistant: -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/tulu-2-dpo-70b/prompt.txt: -------------------------------------------------------------------------------- 1 | <|user|> 2 | {instruction} 3 | <|assistant|> 4 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/yi-large-preview/prompt.txt: -------------------------------------------------------------------------------- 1 | <|im_start|>user 2 | {instruction}<|im_end|> 3 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/models_configs/zephyr-7b-alpha-ExPO/prompt.txt: -------------------------------------------------------------------------------- 1 | <|user|> 2 | {instruction} 3 | <|assistant|> 4 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/types.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/types.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/src/alpaca_eval/utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/tests/test_analyze.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/tests/test_analyze.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/alpaca_eval/tests/test_main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/alpaca_eval/tests/test_main.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/curator_lm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/curator_lm.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/precomputed_hf_lm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/precomputed_hf_lm.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/upload_to_hf_lm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/upload_to_hf_lm.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/.github/workflows/static.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/.github/workflows/static.yml -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/.gitignore -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/CITATION.cff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/CITATION.cff -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/LICENSE -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/data_prep/crux.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/data_prep/crux.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/data_prep/gsm8k.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/data_prep/gsm8k.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/data_prep/mmlu-redux.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/data_prep/mmlu-redux.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/data_prep/zebra_difficulty.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/data_prep/zebra_difficulty.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/docs/index.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/docs/index.html -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/docs/zebra/zebra_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/docs/zebra/zebra_banner.png -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/eval_instruct.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/eval_instruct.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/requirements.txt -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/_ALPACA_EVAL.MD: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/_ALPACA_EVAL.MD -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/_GSM.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/_GSM.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/_MMLU_redux.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/_MMLU_redux.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/_ZebraLogic.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/_ZebraLogic.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/eval_all.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/eval_all.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/_more.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/_more.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/athena.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/athena.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/command_r.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/command_r.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/crux.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/crux.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/gemma.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/gemma.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/gemma2.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/gemma2.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/gpt-4o-new.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/gpt-4o-new.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/gsm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/gsm.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/hyperbolic.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/hyperbolic.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/lepton.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/lepton.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/llama3-1.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/llama3-1.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/math_l5.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/math_l5.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/merge_legacy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/merge_legacy.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/mistral_nemo.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/mistral_nemo.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/mixtral.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/mixtral.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/o1_zebra.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/o1_zebra.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/phi-3.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/phi-3.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/run_all_mmlu.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/run_all_mmlu.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/sambanova.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/sambanova.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/scripts/local/yi.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/scripts/local/yi.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/_TEMPLATES.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/_TEMPLATES.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/evaluation/crux_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/evaluation/crux_eval.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/evaluation/eval_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/evaluation/eval_utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/evaluation/math_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/evaluation/math_eval.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/evaluation/mcqa_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/evaluation/mcqa_eval.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/evaluation/summarize.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/evaluation/summarize.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/fastchat_conversation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/fastchat_conversation.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/global_configs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/global_configs.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/hf_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/hf_models.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/merge_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/merge_results.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/task_configs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/task_configs.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/templates/MCQA.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/templates/MCQA.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/templates/OEQA.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/templates/OEQA.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/templates/ZEBRA_GRID.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/templates/ZEBRA_GRID.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/unified_infer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/unified_infer.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/unified_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/unified_utils.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/upload_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/upload_results.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/src/view_wb_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/src/view_wb_eval.py -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/state_of_limit/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/state_of_limit/README.md -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/zero_eval_api.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/zero_eval_api.sh -------------------------------------------------------------------------------- /eval/chat_benchmarks/zeroeval/zero_eval_local.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/chat_benchmarks/zeroeval/zero_eval_local.sh -------------------------------------------------------------------------------- /eval/constants.py: -------------------------------------------------------------------------------- 1 | LIST_OPENAI_MODELS = ["auto", "gpt-4o-mini-2024-07-18"] 2 | -------------------------------------------------------------------------------- /eval/distributed/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/README.md -------------------------------------------------------------------------------- /eval/distributed/SETUP_CAPELLA.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/SETUP_CAPELLA.md -------------------------------------------------------------------------------- /eval/distributed/SETUP_JURECA.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/SETUP_JURECA.md -------------------------------------------------------------------------------- /eval/distributed/SETUP_LEONARDO.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/SETUP_LEONARDO.md -------------------------------------------------------------------------------- /eval/distributed/benchmark_plot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/benchmark_plot.py -------------------------------------------------------------------------------- /eval/distributed/benchmarking_capella.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/benchmarking_capella.png -------------------------------------------------------------------------------- /eval/distributed/benchmarking_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/benchmarking_comparison.png -------------------------------------------------------------------------------- /eval/distributed/benchmarking_leonardo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/benchmarking_leonardo.png -------------------------------------------------------------------------------- /eval/distributed/launch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/launch.py -------------------------------------------------------------------------------- /eval/distributed/launch_local.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/launch_local.py -------------------------------------------------------------------------------- /eval/distributed/launch_simple.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/launch_simple.py -------------------------------------------------------------------------------- /eval/distributed/process_shard.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/process_shard.py -------------------------------------------------------------------------------- /eval/distributed/process_shards_capella.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/process_shards_capella.sbatch -------------------------------------------------------------------------------- /eval/distributed/process_shards_capella_tp.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/process_shards_capella_tp.sbatch -------------------------------------------------------------------------------- /eval/distributed/process_shards_jureca.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/process_shards_jureca.sbatch -------------------------------------------------------------------------------- /eval/distributed/process_shards_leonardo.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/process_shards_leonardo.sbatch -------------------------------------------------------------------------------- /eval/distributed/process_shards_leonardo_tp.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/process_shards_leonardo_tp.sbatch -------------------------------------------------------------------------------- /eval/distributed/process_shards_local.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/process_shards_local.sh -------------------------------------------------------------------------------- /eval/distributed/process_shards_local_tp.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/process_shards_local_tp.sh -------------------------------------------------------------------------------- /eval/distributed/process_shards_tacc.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/process_shards_tacc.sbatch -------------------------------------------------------------------------------- /eval/distributed/process_shards_tacc_tp.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/process_shards_tacc_tp.sbatch -------------------------------------------------------------------------------- /eval/distributed/run_evaluations_capella.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/run_evaluations_capella.sbatch -------------------------------------------------------------------------------- /eval/distributed/run_evaluations_tacc.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/run_evaluations_tacc.sbatch -------------------------------------------------------------------------------- /eval/distributed/simple_alpha.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/simple_alpha.sbatch -------------------------------------------------------------------------------- /eval/distributed/simple_claix.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/simple_claix.sbatch -------------------------------------------------------------------------------- /eval/distributed/simple_jureca.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/simple_jureca.sbatch -------------------------------------------------------------------------------- /eval/distributed/simple_leonardo.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/simple_leonardo.sbatch -------------------------------------------------------------------------------- /eval/distributed/simple_tacc.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/simple_tacc.sbatch -------------------------------------------------------------------------------- /eval/distributed/simple_zih.sbatch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/distributed/simple_zih.sbatch -------------------------------------------------------------------------------- /eval/eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/eval.py -------------------------------------------------------------------------------- /eval/eval_tracker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/eval_tracker.py -------------------------------------------------------------------------------- /eval/examples/alpaca_diff_annotator.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/examples/alpaca_diff_annotator.sh -------------------------------------------------------------------------------- /eval/examples/database_logging.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/examples/database_logging.sh -------------------------------------------------------------------------------- /eval/examples/mistral_on_alpaca.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/examples/mistral_on_alpaca.sh -------------------------------------------------------------------------------- /eval/examples/mistral_on_alpaca_greedy_bs1.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/examples/mistral_on_alpaca_greedy_bs1.sh -------------------------------------------------------------------------------- /eval/examples/mistral_on_alpaca_greedy_bs32.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/examples/mistral_on_alpaca_greedy_bs32.sh -------------------------------------------------------------------------------- /eval/examples/mistral_on_many_evals.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/examples/mistral_on_many_evals.sh -------------------------------------------------------------------------------- /eval/examples/multi_gpu.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/examples/multi_gpu.sh -------------------------------------------------------------------------------- /eval/listener/tacc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/listener/tacc.py -------------------------------------------------------------------------------- /eval/task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/eval/task.py -------------------------------------------------------------------------------- /image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/image.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/pyproject.toml -------------------------------------------------------------------------------- /reasoning_csv.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/reasoning_csv.sh -------------------------------------------------------------------------------- /reproduced_benchmarks.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/reproduced_benchmarks.md -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/requirements.txt -------------------------------------------------------------------------------- /score.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlfoundations/evalchemy/HEAD/score.sh --------------------------------------------------------------------------------