├── .env_sample ├── .github ├── ISSUE_TEMPLATE │ ├── 1-bug-report.yml │ ├── 2-feature-request.yml │ ├── 3-new-eval-request.yml │ ├── 4-documentation-improve.yml │ └── config.yml └── pull_request_template.md ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── assets ├── Evalverse_Color.png ├── Evalverse_White.png ├── architecture.png ├── intro-evalverse.png ├── overview.png └── sample_report.png ├── contribution └── CONTRIBUTING.md ├── evalverse ├── README.md ├── __init__.py ├── connector.py ├── evaluator.py ├── reporter.py ├── slack_bot.py ├── tests │ ├── test_evaluator.py │ ├── test_reporter.py │ ├── test_reproducibility.py │ └── test_results │ │ └── SOLAR-10.7B-Instruct-v1.0 │ │ └── h6_en │ │ ├── arc_challenge_25.json │ │ ├── gsm8k_5.json │ │ ├── hellaswag_10.json │ │ ├── mmlu_5.json │ │ ├── truthfulqa_mc2_0.json │ │ └── winogrande_5.json └── utils.py ├── examples ├── 01_basic_usage.ipynb ├── 02_advanced_usage.ipynb ├── README.md ├── db │ ├── figures │ │ └── figure_20240402_105011.jpeg │ ├── score_df.csv │ └── scores │ │ └── table_20240402_105011.csv └── results │ ├── Llama-2-7b-chat-hf │ ├── eq_bench │ │ ├── benchmark_results.csv │ │ └── raw_results.json │ ├── h6_en │ │ ├── arc_challenge_25.json │ │ ├── gsm8k_5.json │ │ ├── hellaswag_10.json │ │ ├── mmlu_5.json │ │ ├── truthfulqa_mc2_0.json │ │ └── winogrande_5.json │ ├── ifeval │ │ ├── eval_results_loose.jsonl │ │ ├── eval_results_strict.jsonl │ │ ├── output.jsonl │ │ └── scores.txt │ └── mt_bench │ │ ├── model_answer │ │ └── Llama-2-7b-chat-hf.jsonl │ │ ├── model_judgment │ │ └── gpt-4_single.jsonl │ │ └── scores.txt │ └── SOLAR-10.7B-Instruct-v1.0 │ ├── eq_bench │ ├── benchmark_results.csv │ └── raw_results.json │ ├── h6_en │ ├── arc_challenge_25.json │ ├── gsm8k_5.json │ ├── hellaswag_10.json │ ├── mmlu_5.json │ ├── truthfulqa_mc2_0.json │ └── winogrande_5.json │ ├── ifeval │ ├── eval_results_loose.jsonl │ ├── eval_results_strict.jsonl │ ├── output.jsonl │ └── scores.txt │ └── mt_bench │ ├── model_answer │ └── SOLAR-10.7B-Instruct-v1.0.jsonl │ ├── model_judgment │ └── gpt-4_single.jsonl │ └── scores.txt ├── poetry.lock └── pyproject.toml /.env_sample: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/.env_sample -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/1-bug-report.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/.github/ISSUE_TEMPLATE/1-bug-report.yml -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/2-feature-request.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/.github/ISSUE_TEMPLATE/2-feature-request.yml -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/3-new-eval-request.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/.github/ISSUE_TEMPLATE/3-new-eval-request.yml -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/4-documentation-improve.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/.github/ISSUE_TEMPLATE/4-documentation-improve.yml -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/.github/pull_request_template.md -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/.gitignore -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/.gitmodules -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/README.md -------------------------------------------------------------------------------- /assets/Evalverse_Color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/assets/Evalverse_Color.png -------------------------------------------------------------------------------- /assets/Evalverse_White.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/assets/Evalverse_White.png -------------------------------------------------------------------------------- /assets/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/assets/architecture.png -------------------------------------------------------------------------------- /assets/intro-evalverse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/assets/intro-evalverse.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/assets/overview.png -------------------------------------------------------------------------------- /assets/sample_report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/assets/sample_report.png -------------------------------------------------------------------------------- /contribution/CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/contribution/CONTRIBUTING.md -------------------------------------------------------------------------------- /evalverse/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/README.md -------------------------------------------------------------------------------- /evalverse/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/__init__.py -------------------------------------------------------------------------------- /evalverse/connector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/connector.py -------------------------------------------------------------------------------- /evalverse/evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/evaluator.py -------------------------------------------------------------------------------- /evalverse/reporter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/reporter.py -------------------------------------------------------------------------------- /evalverse/slack_bot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/slack_bot.py -------------------------------------------------------------------------------- /evalverse/tests/test_evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/tests/test_evaluator.py -------------------------------------------------------------------------------- /evalverse/tests/test_reporter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/tests/test_reporter.py -------------------------------------------------------------------------------- /evalverse/tests/test_reproducibility.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/tests/test_reproducibility.py -------------------------------------------------------------------------------- /evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/arc_challenge_25.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/arc_challenge_25.json -------------------------------------------------------------------------------- /evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/gsm8k_5.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/gsm8k_5.json -------------------------------------------------------------------------------- /evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/hellaswag_10.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/hellaswag_10.json -------------------------------------------------------------------------------- /evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/mmlu_5.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/mmlu_5.json -------------------------------------------------------------------------------- /evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/truthfulqa_mc2_0.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/truthfulqa_mc2_0.json -------------------------------------------------------------------------------- /evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/winogrande_5.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/winogrande_5.json -------------------------------------------------------------------------------- /evalverse/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/evalverse/utils.py -------------------------------------------------------------------------------- /examples/01_basic_usage.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/01_basic_usage.ipynb -------------------------------------------------------------------------------- /examples/02_advanced_usage.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/02_advanced_usage.ipynb -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/README.md -------------------------------------------------------------------------------- /examples/db/figures/figure_20240402_105011.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/db/figures/figure_20240402_105011.jpeg -------------------------------------------------------------------------------- /examples/db/score_df.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/db/score_df.csv -------------------------------------------------------------------------------- /examples/db/scores/table_20240402_105011.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/db/scores/table_20240402_105011.csv -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/eq_bench/benchmark_results.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/Llama-2-7b-chat-hf/eq_bench/benchmark_results.csv -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/eq_bench/raw_results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/Llama-2-7b-chat-hf/eq_bench/raw_results.json -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/h6_en/arc_challenge_25.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/Llama-2-7b-chat-hf/h6_en/arc_challenge_25.json -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/h6_en/gsm8k_5.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/Llama-2-7b-chat-hf/h6_en/gsm8k_5.json -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/h6_en/hellaswag_10.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/Llama-2-7b-chat-hf/h6_en/hellaswag_10.json -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/h6_en/mmlu_5.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/Llama-2-7b-chat-hf/h6_en/mmlu_5.json -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/h6_en/truthfulqa_mc2_0.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/Llama-2-7b-chat-hf/h6_en/truthfulqa_mc2_0.json -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/h6_en/winogrande_5.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/Llama-2-7b-chat-hf/h6_en/winogrande_5.json -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/ifeval/eval_results_loose.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/Llama-2-7b-chat-hf/ifeval/eval_results_loose.jsonl -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/ifeval/eval_results_strict.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/Llama-2-7b-chat-hf/ifeval/eval_results_strict.jsonl -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/ifeval/output.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/Llama-2-7b-chat-hf/ifeval/output.jsonl -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/ifeval/scores.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/Llama-2-7b-chat-hf/ifeval/scores.txt -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/mt_bench/model_answer/Llama-2-7b-chat-hf.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/Llama-2-7b-chat-hf/mt_bench/model_answer/Llama-2-7b-chat-hf.jsonl -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/mt_bench/model_judgment/gpt-4_single.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/Llama-2-7b-chat-hf/mt_bench/model_judgment/gpt-4_single.jsonl -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/mt_bench/scores.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/Llama-2-7b-chat-hf/mt_bench/scores.txt -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/eq_bench/benchmark_results.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/SOLAR-10.7B-Instruct-v1.0/eq_bench/benchmark_results.csv -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/eq_bench/raw_results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/SOLAR-10.7B-Instruct-v1.0/eq_bench/raw_results.json -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/arc_challenge_25.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/arc_challenge_25.json -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/gsm8k_5.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/gsm8k_5.json -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/hellaswag_10.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/hellaswag_10.json -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/mmlu_5.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/mmlu_5.json -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/truthfulqa_mc2_0.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/truthfulqa_mc2_0.json -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/winogrande_5.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/winogrande_5.json -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/ifeval/eval_results_loose.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/SOLAR-10.7B-Instruct-v1.0/ifeval/eval_results_loose.jsonl -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/ifeval/eval_results_strict.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/SOLAR-10.7B-Instruct-v1.0/ifeval/eval_results_strict.jsonl -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/ifeval/output.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/SOLAR-10.7B-Instruct-v1.0/ifeval/output.jsonl -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/ifeval/scores.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/SOLAR-10.7B-Instruct-v1.0/ifeval/scores.txt -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/mt_bench/model_answer/SOLAR-10.7B-Instruct-v1.0.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/SOLAR-10.7B-Instruct-v1.0/mt_bench/model_answer/SOLAR-10.7B-Instruct-v1.0.jsonl -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/mt_bench/model_judgment/gpt-4_single.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/SOLAR-10.7B-Instruct-v1.0/mt_bench/model_judgment/gpt-4_single.jsonl -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/mt_bench/scores.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/examples/results/SOLAR-10.7B-Instruct-v1.0/mt_bench/scores.txt -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/poetry.lock -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/HEAD/pyproject.toml --------------------------------------------------------------------------------