├── .editorconfig
├── .github
│   └── ISSUE_TEMPLATE.md
├── .gitignore
├── .pre-commit-config.yaml
├── .secrets.baseline
├── AUTHORS.rst
├── CODE_OF_CONDUCT.rst
├── CONTRIBUTING.rst
├── HISTORY.rst
├── LICENCE
├── MANIFEST.in
├── Makefile
├── README.rst
├── data_acquisition
│   ├── get_biggen.py
│   └── get_hf_open_llm.py
├── docs
│   ├── Makefile
│   ├── authors.rst
│   ├── conf.py
│   ├── contributing.rst
│   ├── history.rst
│   ├── index.rst
│   ├── installation.rst
│   ├── make.bat
│   ├── readme.rst
│   └── usage.rst
├── examples
│   ├── my_bench.csv
│   ├── newbench_example.py
│   ├── scenarios_for_aggregate.txt
│   └── scenarios_of_intereset.txt
├── pyproject.toml
├── src
│   └── bat
│       ├── __init__.py
│       ├── agreement_tester.py
│       ├── assets
│       │   ├── benchmarks
│       │   │   ├── agenbench_240829_agent.csv
│       │   │   ├── alphacaeval_v2lc_240829_holistic.csv
│       │   │   ├── arena_hard_240829_holistic.csv
│       │   │   ├── bfcl_240906_tools.csv
│       │   │   ├── biggen_240829_holistic.csv
│       │   │   ├── chatbot_arena_241104_holistic.csv
│       │   │   ├── dec_arena_241022_holistic.csv
│       │   │   ├── enkrypt_ai_safety_240916_safety.csv
│       │   │   ├── eqbench_240912_emotion.csv
│       │   │   ├── helm_airbench_240916_safety.csv
│       │   │   ├── helm_classic_240829_holistic.csv
│       │   │   ├── helm_lite_240829_holistic.csv
│       │   │   ├── helm_mmlu_240829_knowledge.csv
│       │   │   ├── hf_open_llm_v1_240829_holistic.csv
│       │   │   ├── hf_open_llm_v2_240829_holistic.csv
│       │   │   ├── holmes_240829_linguistics.csv
│       │   │   ├── hydrox_safety_241001_safety.csv
│       │   │   ├── livebench_240701_holistic.csv
│       │   │   ├── livebench_240829_holistic.csv
│       │   │   ├── livecodebench_240601_230701_code.csv
│       │   │   ├── llm_trustworthy_241001_safety.csv
│       │   │   ├── lvbench_241189_longcontext.csv
│       │   │   ├── mixeval_240829_holistic.csv
│       │   │   ├── mmlu_pro_240829_knowledge.csv
│       │   │   ├── mtbench_240829_holistic.csv
│       │   │   ├── opencompass_240829_holistic.csv
│       │   │   ├── opencompass_academic_240829_holistic.csv
│       │   │   ├── opencompass_agent_240829_agent.csv
│       │   │   ├── opencompass_arena_240829_holistic.csv
│       │   │   ├── opencompass_code_240829_code.csv
│       │   │   ├── opencompass_instruct_240829_instructionfollow.csv
│       │   │   ├── opencompass_knowledge_240829_knowledge.csv
│       │   │   ├── opencompass_language_240829_language.csv
│       │   │   ├── opencompass_math_240829_math.csv
│       │   │   ├── opencompass_reasoning_240829_reasoning.csv
│       │   │   ├── repoqa_241119_longcontext.csv
│       │   │   ├── ruler_bench_241002_longcontext.csv
│       │   │   ├── tablebench_241002_tables.csv
│       │   │   ├── toolbench_240829_tools.csv
│       │   │   └── wildbench_240829_holistic.csv
│       │   ├── benchmarks_old
│       │   │   ├── BLZ_240312.csv
│       │   │   ├── agentbench_240720.csv
│       │   │   ├── arena_hard_2404.csv
│       │   │   ├── biggen_240612.csv
│       │   │   ├── chatbot_arena_240829.csv
│       │   │   ├── helm_classic_240130.csv
│       │   │   ├── helm_classic_240829.csv
│       │   │   ├── helm_lite_240610.csv
│       │   │   ├── helm_lite_240829.csv
│       │   │   ├── hf_open_llm_v1_240829_frozen.csv
│       │   │   ├── hf_open_llm_v2_240829.csv
│       │   │   ├── livebench_240701.csv
│       │   │   ├── llm_trustworthy_241001_safety.csv
│       │   │   ├── mixeval_240601.csv
│       │   │   ├── mixeval_240829_holistic.csv
│       │   │   ├── mmlu_pro_240610.csv
│       │   │   ├── olmes_260624.csv
│       │   │   ├── olmes_260624_frozen.csv
│       │   │   ├── opencompass_240829.csv
│       │   │   ├── opencompass_academic_240829.csv
│       │   │   ├── wildbench_240612.csv
│       │   │   └── wildbench_240829.csv
│       │   ├── lower_is_better_benchmarks.txt
│       │   └── prettified_bencmark_names.json
│       ├── benchmark.py
│       ├── configs.py
│       ├── logic.py
│       ├── reporting.py
│       └── utils.py
└── tests
    ├── __init__.py
    └── test_benchmark.py

/.editorconfig:
--------------------------------------------------------------------------------
1 | # http://editorconfig.org
2 | 
3 | root = true
4 | 
5 | [*]
6 | indent_style = space
7 | indent_size = 4
8 | trim_trailing_whitespace = true
9 | insert_final_newline = true
10 | charset = utf-8
11 | end_of_line = lf
12 | 
13 | [*.bat]
14 | indent_style = tab
15 | end_of_line = crlf
16 | 
17 | [LICENCE]
18 | insert_final_newline = false
19 | 
20 | [Makefile]
21 | indent_style = tab
22 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | * bat version:
2 | * Python version:
3 | * Operating System:
4 | 
5 | ### Description
6 | 
7 | Describe what you were trying to get done.
8 | Tell us what happened, what went wrong, and what you expected to happen.
9 | 
10 | ### What I Did
11 | 
12 | ```
13 | Paste the command(s) you ran and the output.
14 | If there was a crash, please include the traceback here.
15 | ```
16 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | 
58 | # Flask stuff:
59 | instance/
60 | .webassets-cache
61 | 
62 | # Scrapy stuff:
63 | .scrapy
64 | 
65 | # Sphinx documentation
66 | docs/_build/
67 | 
68 | # PyBuilder
69 | target/
70 | 
71 | # Jupyter Notebook
72 | .ipynb_checkpoints
73 | 
74 | # Dask worker cache
75 | dask-worker-space/
76 | 
77 | # pyenv
78 | .python-version
79 | 
80 | # celery beat schedule file
81 | celerybeat-schedule
82 | 
83 | # SageMath parsed files
84 | *.sage.py
85 | 
86 | # dotenv
87 | .env
88 | 
89 | # virtualenv
90 | .venv
91 | venv/
92 | ENV/
93 | 
94 | # Spyder project settings
95 | .spyderproject
96 | .spyproject
97 | 
98 | # Rope project settings
99 | .ropeproject
100 | 
101 | # mkdocs documentation
102 | /site
103 | 
104 | # mypy
105 | .mypy_cache/
106 | 
107 | # IDE settings
108 | .vscode/
109 | .idea/
110 | figures/
111 | 
112 | .DS_Store
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 | # - repo: https://github.com/astral-sh/ruff-pre-commit
5 | #   # Ruff version.
6 | #   rev: v0.1.6
7 | #   hooks:
8 | #     # Run the linter.
9 | #     - id: ruff
10 | #       args: [ --fix ]
11 | #     # Run the formatter.
12 | #     - id: ruff-format
13 | 
14 |   - repo: https://github.com/ibm/detect-secrets
15 |     # If you desire to use a specific version of detect-secrets, you can replace `master` with other git revisions such as a branch, tag, or commit sha.
16 |     # You are encouraged to use static refs such as tags, instead of a branch name.
17 |     #
18 |     # Running "pre-commit autoupdate" automatically updates rev to the latest tag.
19 |     rev: 0.13.1+ibm.61.dss
20 |     hooks:
21 |       - id: detect-secrets # pragma: whitelist secret
22 |         # Add options for the detect-secrets-hook binary. You can run `detect-secrets-hook --help` to list all possible options.
23 |         # You may also run `pre-commit run detect-secrets` to preview the scan result.
24 |         # With "--baseline" and without "--use-all-plugins", pre-commit scans with just the plugins in the baseline file.
25 |         # With "--baseline" and "--use-all-plugins", pre-commit scans with all available plugins.
26 |         # Add "--fail-on-unaudited" to fail pre-commit for unaudited potential secrets.
27 |         args: [--baseline, .secrets.baseline, --use-all-plugins, --fail-on-unaudited]
28 | 
29 |   - repo: https://github.com/codespell-project/codespell
30 |     rev: v2.2.6
31 |     hooks:
32 |       - id: codespell
33 |         additional_dependencies:
34 |           - tomli
--------------------------------------------------------------------------------
/.secrets.baseline:
--------------------------------------------------------------------------------
1 | {
2 |   "exclude": {
3 |     "files": "^.secrets.baseline$",
4 |     "lines": null
5 |   },
6 |   "generated_at": "2023-10-05T11:42:58Z",
7 |   "plugins_used": [
8 |     {
9 |       "name": "AWSKeyDetector"
10 |     },
11 |     {
12 |       "name": "ArtifactoryDetector"
13 |     },
14 |     {
15 |       "name": "AzureStorageKeyDetector"
16 |     },
17 |     {
18 |       "base64_limit": 4.5,
19 |       "name": "Base64HighEntropyString"
20 |     },
21 |     {
22 |       "name": "BasicAuthDetector"
23 |     },
24 |     {
25 |       "name": "BoxDetector"
26 |     },
27 |     {
28 |       "name": "CloudantDetector"
29 |     },
30 |     {
31 |       "ghe_instance": "github.ibm.com",
32 |       "name": "GheDetector"
33 |     },
34 |     {
35 |       "name": "GitHubTokenDetector"
36 |     },
37 |     {
38 |       "hex_limit": 3,
39 |       "name": "HexHighEntropyString"
40 |     },
41 |     {
42 |       "name": "IbmCloudIamDetector"
43 |     },
44 |     {
45 |       "name": "IbmCosHmacDetector"
46 |     },
47 |     {
48 |       "name": "JwtTokenDetector"
49 |     },
50 |     {
51 |       "keyword_exclude": null,
52 |       "name": "KeywordDetector"
53 |     },
54 |     {
55 |       "name": "MailchimpDetector"
56 |     },
57 |     {
58 |       "name": "NpmDetector"
59 |     },
60 |     {
61 |       "name": "PrivateKeyDetector"
62 |     },
63 |     {
64 |       "name": "SlackDetector"
65 |     },
66 |     {
67 |       "name": "SoftlayerDetector"
68 |     },
69 |     {
70 |       "name": "SquareOAuthDetector"
71 |     },
72 |     {
73 |       "name": "StripeDetector"
74 |     },
75 |     {
76 |       "name": "TwilioKeyDetector"
77 |     }
78 |   ],
79 |   "results": {},
80 |   "version": "0.13.1+ibm.61.dss",
81 |   "word_list": {
82 |     "file": null,
83 |     "hash": null
84 |   }
85 | }
--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
1 | =======
2 | Credits
3 | =======
4 | 
5 | Development Lead
6 | ----------------
7 | 
8 | * Yotam Perlitz
9 | 
10 | Contributors
11 | ------------
12 | 
13 | None yet. Why not be the first?
14 | 
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.rst:
--------------------------------------------------------------------------------
1 | ====================================
2 | Contributor Covenant Code of Conduct
3 | ====================================
4 | 
5 | Our Pledge
6 | ----------
7 | 
8 | In the interest of fostering an open and welcoming environment, we as
9 | contributors and maintainers pledge to make participation in our project and
10 | our community a harassment-free experience for everyone, regardless of age, body
11 | size, disability, ethnicity, sex characteristics, gender identity and expression,
12 | level of experience, education, socio-economic status, nationality, personal
13 | appearance, race, religion, or sexual identity and orientation.
14 | 
15 | Our Standards
16 | -------------
17 | 
18 | Examples of behavior that contributes to creating a positive environment
19 | include:
20 | 
21 | * Using welcoming and inclusive language
22 | * Being respectful of differing viewpoints and experiences
23 | * Gracefully accepting constructive criticism
24 | * Focusing on what is best for the community
25 | * Showing empathy towards other community members
26 | 
27 | Examples of unacceptable behavior by participants include:
28 | 
29 | * The use of sexualized language or imagery and unwelcome sexual attention or
30 |   advances
31 | * Trolling, insulting/derogatory comments, and personal or political attacks
32 | * Public or private harassment
33 | * Publishing others' private information, such as a physical or electronic
34 |   address, without explicit permission
35 | * Other conduct which could reasonably be considered inappropriate in a
36 |   professional setting
37 | 
38 | Our Responsibilities
39 | --------------------
40 | 
41 | Project maintainers are responsible for clarifying the standards of acceptable
42 | behavior and are expected to take appropriate and fair corrective action in
43 | response to any instances of unacceptable behavior.
44 | 
45 | Project maintainers have the right and responsibility to remove, edit, or
46 | reject comments, commits, code, wiki edits, issues, and other contributions
47 | that are not aligned to this Code of Conduct, or to ban temporarily or
48 | permanently any contributor for other behaviors that they deem inappropriate,
49 | threatening, offensive, or harmful.
50 | 
51 | Scope
52 | -----
53 | 
54 | This Code of Conduct applies within all project spaces, and it also applies when
55 | an individual is representing the project or its community in public spaces.
56 | Examples of representing a project or community include using an official
57 | project e-mail address, posting via an official social media account, or acting
58 | as an appointed representative at an online or offline event. Representation of
59 | a project may be further defined and clarified by project maintainers.
60 | 
61 | Enforcement
62 | -----------
63 | 
64 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
65 | reported by contacting the project team at [INSERT EMAIL ADDRESS]. All
66 | complaints will be reviewed and investigated and will result in a response that
67 | is deemed necessary and appropriate to the circumstances. The project team is
68 | obligated to maintain confidentiality with regard to the reporter of an incident.
69 | Further details of specific enforcement policies may be posted separately.
70 | 
71 | Project maintainers who do not follow or enforce the Code of Conduct in good
72 | faith may face temporary or permanent repercussions as determined by other
73 | members of the project's leadership.
74 | 
75 | Attribution
76 | -----------
77 | 
78 | This Code of Conduct is adapted from the `Contributor Covenant`_, version 1.4,
79 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
80 | 
81 | For answers to common questions about this code of conduct, see
82 | https://www.contributor-covenant.org/faq
83 | 
84 | .. _`Contributor Covenant`: https://www.contributor-covenant.org
85 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
1 | .. highlight:: shell
2 | 
3 | ============
4 | Contributing
5 | ============
6 | 
7 | Contributions are welcome, and they are greatly appreciated! Every little bit
8 | helps, and credit will always be given.
9 | 
10 | You can contribute in many ways:
11 | 
12 | Types of Contributions
13 | ----------------------
14 | 
15 | Report Bugs
16 | ~~~~~~~~~~~
17 | 
18 | Report bugs at https://github.com/perlitz/bat/issues.
19 | 
20 | If you are reporting a bug, please include:
21 | 
22 | * Your operating system name and version.
23 | * Any details about your local setup that might be helpful in troubleshooting.
24 | * Detailed steps to reproduce the bug.
25 | 
26 | Fix Bugs
27 | ~~~~~~~~
28 | 
29 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
30 | wanted" is open to whoever wants to implement it.
31 | 
32 | Implement Features
33 | ~~~~~~~~~~~~~~~~~~
34 | 
35 | Look through the GitHub issues for features. Anything tagged with "enhancement"
36 | and "help wanted" is open to whoever wants to implement it.
37 | 
38 | Write Documentation
39 | ~~~~~~~~~~~~~~~~~~~
40 | 
41 | bat could always use more documentation, whether as part of the
42 | official bat docs, in docstrings, or even on the web in blog posts,
43 | articles, and such.
44 | 
45 | Submit Feedback
46 | ~~~~~~~~~~~~~~~
47 | 
48 | The best way to send feedback is to file an issue at https://github.com/perlitz/bat/issues.
49 | 
50 | If you are proposing a feature:
51 | 
52 | * Explain in detail how it would work.
53 | * Keep the scope as narrow as possible, to make it easier to implement.
54 | * Remember that this is a volunteer-driven project, and that contributions
55 |   are welcome :)
56 | 
57 | Get Started!
58 | ------------
59 | 
60 | Ready to contribute? Here's how to set up `bat` for local development.
61 | 
62 | 1. Fork the `bat` repo on GitHub.
63 | 2. Clone your fork locally::
64 | 
65 |     $ git clone git@github.com:your_name_here/bat.git
66 | 
67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
68 | 
69 |     $ mkvirtualenv bat
70 |     $ cd bat/
71 |     $ pip install -e .
72 | 
73 | 4. Create a branch for local development::
74 | 
75 |     $ git checkout -b name-of-your-bugfix-or-feature
76 | 
77 |    Now you can make your changes locally.
78 | 
79 | 5. When you're done making changes, check that your changes pass flake8 and the
80 |    tests, including testing other Python versions with tox::
81 | 
82 |     $ make lint
83 |     $ make test
84 |     Or
85 |     $ make test-all
86 | 
87 |    To get flake8 and tox, just pip install them into your virtualenv.
88 | 
89 | 6. Commit your changes and push your branch to GitHub::
90 | 
91 |     $ git add .
92 |     $ git commit -m "Your detailed description of your changes."
93 |     $ git push origin name-of-your-bugfix-or-feature
94 | 
95 | 7. Submit a pull request through the GitHub website.
96 | 
97 | Pull Request Guidelines
98 | -----------------------
99 | 
100 | Before you submit a pull request, check that it meets these guidelines:
101 | 
102 | 1. The pull request should include tests.
103 | 2. If the pull request adds functionality, the docs should be updated. Put
104 |    your new functionality into a function with a docstring, and add the
105 |    feature to the list in README.rst.
106 | 3. The pull request should work for Python 3.5, 3.6, 3.7 and 3.8, and for PyPy. Check
107 |    https://travis-ci.com/perlitz/bat/pull_requests
108 |    and make sure that the tests pass for all supported Python versions.
109 | 
110 | Tips
111 | ----
112 | 
113 | To run a subset of tests::
114 | 
115 | 
116 |     $ python -m unittest tests.test_benchmark
117 | 
118 | Deploying
119 | ---------
120 | 
121 | A reminder for the maintainers on how to deploy.
122 | Make sure all your changes are committed (including an entry in HISTORY.rst).
123 | Then run::
124 | 
125 |     $ bump2version patch # possible: major / minor / patch
126 |     $ git push
127 |     $ git push --tags
128 | 
129 | Travis will then deploy to PyPI if tests pass.
130 | 
131 | Code of Conduct
132 | ---------------
133 | 
134 | Please note that this project is released with a `Contributor Code of Conduct`_.
135 | By participating in this project you agree to abide by its terms.
136 | 
137 | .. _`Contributor Code of Conduct`: CODE_OF_CONDUCT.rst
138 | 
--------------------------------------------------------------------------------
/HISTORY.rst:
--------------------------------------------------------------------------------
1 | =======
2 | History
3 | =======
4 | 
5 | 0.1.0 (2024-07-01)
6 | ------------------
7 | 
8 | * First release on PyPI.
9 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include AUTHORS.rst
2 | include CONTRIBUTING.rst
3 | include HISTORY.rst
4 | include LICENCE
5 | include README.rst
6 | 
7 | recursive-include tests *
8 | recursive-exclude * __pycache__
9 | recursive-exclude * *.py[co]
10 | 
11 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
12 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: clean clean-build clean-pyc clean-test coverage dist docs help install lint lint/flake8
2 | 
3 | .DEFAULT_GOAL := help
4 | 
5 | define BROWSER_PYSCRIPT
6 | import os, webbrowser, sys
7 | 
8 | from urllib.request import pathname2url
9 | 
10 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
11 | endef
12 | export BROWSER_PYSCRIPT
13 | 
14 | define PRINT_HELP_PYSCRIPT
15 | import re, sys
16 | 
17 | for line in sys.stdin:
18 | 	match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
19 | 	if match:
20 | 		target, help = match.groups()
21 | 		print("%-20s %s" % (target, help))
22 | endef
23 | export PRINT_HELP_PYSCRIPT
24 | 
25 | BROWSER := python -c "$$BROWSER_PYSCRIPT"
26 | 
27 | help:
28 | 	@python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
29 | 
30 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts
31 | 
32 | clean-build: ## remove build artifacts
33 | 	rm -fr build/
34 | 	rm -fr dist/
35 | 	rm -fr .eggs/
36 | 	find . -name '*.egg-info' -exec rm -fr {} +
37 | 	find . -name '*.egg' -exec rm -f {} +
38 | 
39 | clean-pyc: ## remove Python file artifacts
40 | 	find . -name '*.pyc' -exec rm -f {} +
41 | 	find . -name '*.pyo' -exec rm -f {} +
42 | 	find . -name '*~' -exec rm -f {} +
43 | 	find . -name '__pycache__' -exec rm -fr {} +
44 | 
45 | clean-test: ## remove test and coverage artifacts
46 | 	rm -fr .tox/
47 | 	rm -f .coverage
48 | 	rm -fr htmlcov/
49 | 	rm -fr .pytest_cache
50 | 
51 | lint/flake8: ## check style with flake8
52 | 	flake8 src/bat tests
53 | 
54 | lint: lint/flake8 ## check style
55 | 
56 | test: ## run tests quickly with the default Python
57 | 	pytest
58 | 
59 | test-all: ## run tests on every Python version with tox
60 | 	tox
61 | 
62 | coverage: ## check code coverage quickly with the default Python
63 | 	coverage run --source bat -m pytest
64 | 	coverage report -m
65 | 	coverage html
66 | 	$(BROWSER) htmlcov/index.html
67 | 
68 | docs: ## generate Sphinx HTML documentation, including API docs
69 | 	rm -f docs/bat.rst
70 | 	rm -f docs/modules.rst
71 | 	sphinx-apidoc -o docs/ src/bat
72 | 	$(MAKE) -C docs clean
73 | 	$(MAKE) -C docs html
74 | 	$(BROWSER) docs/_build/html/index.html
75 | 
76 | servedocs: docs ## compile the docs watching for changes
77 | 	watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
78 | 
79 | release: dist ## package and upload a release
80 | 	twine upload dist/*
81 | 
82 | dist: clean ## builds source and wheel package
83 | 	python -m build
84 | 	ls -l dist
85 | 
86 | install: clean ## install the package to the active Python's site-packages
87 | 	pip install -e .
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | BenchBench Package
2 | =========================================
3 | 
4 | Overview
5 | --------
6 | The ``benchbench`` package simplifies benchmark agreement testing for NLP models. Compare multiple models across various benchmarks and generate comprehensive agreement reports easily.
7 | 
8 | It also powers `BenchBench` (https://huggingface.co/spaces/ibm/benchbench), a benchmark for comparing benchmarks.
9 | 
10 | Contributing a New Benchmark
11 | ----------------------------
12 | 
13 | To contribute a new benchmark, create a pull request with a new CSV file in ``src/bat/assets/benchmarks``. The filename should reflect the data source and snapshot date (see existing files for examples).
14 | 
15 | 
16 | Usage
17 | -----
18 | 
19 | While much of ``benchbench``'s functionality is available via the interactive `BenchBench` app (https://huggingface.co/spaces/ibm/benchbench), for more advanced usage and customization, clone the repository:
20 | 
21 | .. code-block:: bash
22 | 
23 |     git clone git@github.com:IBM/benchbench.git
24 | 
25 | Install in the environment of your choice:
26 | 
27 | .. code-block:: bash
28 | 
29 |     cd benchbench
30 | 
31 |     conda create -n bat python=3.11
32 |     conda activate bat
33 |     pip install -e .
34 | 
35 | And check out the example in ``examples/newbench_example.py`` (or here: https://github.com/IBM/benchbench/blob/main/examples/newbench_example.py).
36 | 
37 | Contributing
38 | ------------
39 | Contributions to the ``benchbench`` package are welcome! Please submit your pull requests or issues through our GitHub repository.
40 | 
41 | License
42 | -------
43 | 
44 | This package is released under the MIT License.
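45 | 
46 | Example
47 | -------
48 | 
49 | As a quick orientation, here is a minimal sketch distilled from ``examples/newbench_example.py``. It assumes a CSV with a ``model`` column plus one score column, as in ``examples/my_bench.csv``; treat it as an illustrative outline rather than a reference for exact signatures:
50 | 
51 | .. code-block:: python
52 | 
53 |     import pandas as pd
54 | 
55 |     from bat import Benchmark, Config, Reporter, Tester
56 | 
57 |     # Wrap your scores as a Benchmark and pool it with the built-in catalog.
58 |     my_bench = Benchmark(
59 |         pd.read_csv("examples/my_bench.csv"),
60 |         data_source="my_bench",  # hypothetical source name for this sketch
61 |         normalized_names=False,
62 |     )
63 |     allbench = Benchmark()
64 |     allbench.load_local_catalog()
65 |     allbench.extend(my_bench)
66 |     allbench.clear_repeated_scenarios()
67 | 
68 |     # Run all-vs-all agreement testing and plot the resulting correlations.
69 |     cfg = Config(
70 |         exp_to_run="example",
71 |         n_models_taken_list=[0],  # 0 means: use all intersecting models
72 |         model_select_strategy_list=["random"],
73 |         corr_types=["kendall"],
74 |         n_exps=1,
75 |     )
76 |     agreements = Tester(cfg=cfg).all_vs_all_agreement_testing(allbench)
77 |     Reporter().draw_agreement_matrix(agreements)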
--------------------------------------------------------------------------------
/data_acquisition/get_biggen.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import json
4 | import pandas as pd
5 | import re
6 | 
7 | def get_json_format_data(url):
8 |     response = requests.get(url)
9 |     soup = BeautifulSoup(response.content, "html.parser")
10 | 
11 |     script_elements = soup.find_all("script")
12 |     json_format_data = json.loads(str(script_elements[1])[31:-10])
13 |     return json_format_data
14 | 
15 | 
16 | def get_datas(data):
17 |     for component_index in range(
18 |         0, 50, 1
19 |     ):  # component_index sometimes changes when they update the space; this loop avoids having to change it manually
20 |         try:
21 |             result_list = []
22 |             i = 0
23 |             while True:
24 |                 try:
25 |                     results = data["components"][component_index]["props"]["value"][
26 |                         "data"
27 |                     ][i]
28 |                     columns = data["components"][component_index]["props"]["headers"]
29 |                     try:
30 |                         results_json = {"Model": results[0]}
31 | 
32 |                         if (
33 |                             len(columns) < 13
34 |                         ):  # If there are fewer than 13 columns (this threshold can definitely change), we are trying the wrong component index, so break to try the next one.
35 |                             break
36 | 
37 |                         for col_index, col_name in enumerate(columns[1:-1], start=1):
38 |                             results_json[col_name] = results[col_index]
39 | 
40 |                     except IndexError:  # Wrong component index, so break to try the next one. (NOTE: More than one component index can give some results, but we must find the right one to get all the results we want.)
41 |                         break
42 |                     result_list.append(results_json)
43 |                     i += 1
44 |                 except IndexError:  # No rows left to extract, so return the list. (We know it is the right component index because we didn't break out of the loop on the other exception.)
45 |                     return result_list
46 |         except (KeyError, TypeError):
47 |             continue
48 | 
49 |     return result_list
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     # for biggen
54 | 
55 |     data = get_json_format_data(
56 |         url="https://prometheus-eval-BiGGen-Bench-Leaderboard.hf.space/"
57 |     )
58 |     finished_models = get_datas(data)
59 |     df = pd.DataFrame(finished_models)
60 | 
61 |     # df["Model"]
62 | 
63 |     df["Model"] = df["Model"].apply(lambda x: x.split('">')[-1].split("</a>")[0])
64 | 
65 |     df.rename(
66 |         columns={
67 |             "Average": "biggen",
68 |             "Model": "model",
69 |         },
70 |         inplace=True,
71 |     )
72 | 
73 | 
74 |     # Function to clean column names
75 |     def clean_column(col):
76 |         col = re.sub(r"[^\w\s]", "", col)  # Remove emojis
77 |         col = (
78 |             col.strip().lower().replace(" ", "_")
79 |         )  # Lowercase and replace spaces with _
80 |         if col != "model" and col != "biggen":
81 |             col = "biggen_" + col
82 |         return col
83 | 
84 |     # Apply the cleaning function to the columns
85 |     cleaned_columns = [clean_column(col) for col in df.columns.tolist()]
86 |     df.columns = cleaned_columns
87 |     df.drop(columns=["biggen_model_type"], inplace=True)
88 | 
89 |     df.to_csv("src/bat/assets/benchmarks_to_add/biggen_240829.csv", index=False)
90 | 
--------------------------------------------------------------------------------
/data_acquisition/get_hf_open_llm.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import json
4 | import pandas as pd
5 | 
6 | 
7 | def get_json_format_data(url):
8 |     response = requests.get(url)
9 |     soup = BeautifulSoup(response.content, "html.parser")
10 | 
11 |     script_elements = soup.find_all("script")
12 |     json_format_data = json.loads(str(script_elements[1])[31:-10])
13 |     return json_format_data
14 | 
15 | 
16 | def get_datas(data):
17 |     for component_index in range(
18 |         0, 50, 1
19 |     ):  # component_index sometimes changes when they update the space; this loop avoids having to change it manually
20 |         try:
21 |             result_list = []
22 |             i = 0
23 |             while True:
24 |                 try:
25 |                     results = data["components"][component_index]["props"]["value"][
26 |                         "data"
27 |                     ][i]
28 |                     columns = data["components"][component_index]["props"]["headers"]
29 |                     try:
30 |                         results_json = {"T": results[0], "Model": results[-1]}
31 | 
32 |                         if (
33 |                             len(columns) < 13
34 |                         ):  # If there are fewer than 13 columns (this threshold can definitely change), we are trying the wrong component index, so break to try the next one.
35 |                             break
36 | 
37 |                         for col_index, col_name in enumerate(columns[2:-1], start=2):
38 |                             results_json[col_name] = results[col_index]
39 | 
40 |                     except IndexError:  # Wrong component index, so break to try the next one. (NOTE: More than one component index can give some results, but we must find the right one to get all the results we want.)
41 |                         break
42 |                     result_list.append(results_json)
43 |                     i += 1
44 |                 except IndexError:  # No rows left to extract, so return the list. (We know it is the right component index because we didn't break out of the loop on the other exception.)
45 |                     return result_list
46 |         except (KeyError, TypeError):
47 |             continue
48 | 
49 |     return result_list
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     # for V2
54 |     data = get_json_format_data(
55 |         url="https://open-llm-leaderboard-open-llm-leaderboard.hf.space/"
56 |     )
57 |     finished_models = get_datas(data)
58 |     df = pd.DataFrame(finished_models)
59 |     df = df.query("T=='🟢' or T=='💬'")
60 |     cols_to_use = [
61 |         "Model",
62 |         "Average ⬆️",
63 |         "IFEval",
64 |         "BBH",
65 |         "BBH Raw",
66 |         "MATH Lvl 5",
67 |         "GPQA",
68 |         "MUSR",
69 |         "MMLU-PRO",
70 |     ]
71 |     df = df[cols_to_use]
72 |     df.rename(
73 |         columns={
74 |             "Average ⬆️": "hf_open_llm_v2",
75 |             "Model": "model",
76 |             "MATH Lvl 5": "MATH_Lvl_5",
77 |         },
78 |         inplace=True,
79 |     )
80 | 
81 |     df.to_csv("src/bat/assets/benchmarks/hf_open_llm_v2_240829.csv", index=False)
82 | 
83 |     # for V1
84 | 
85 |     data = get_json_format_data(
86 |         url="https://open-llm-leaderboard-old-open-llm-leaderboard.hf.space/"
87 |     )
88 |     finished_models = get_datas(data)
89 |     df = pd.DataFrame(finished_models)
90 |     df = df.query("T=='🟢' or T=='💬'")
91 | 
92 |     cols_to_use = [
93 |         "Model",
94 |         "Average ⬆️",
95 |         "ARC",
96 |         "HellaSwag",
97 |         "MMLU",
98 |         "TruthfulQA",
99 |         "Winogrande",
100 |         "GSM8K",
101 |     ]
102 | 
103 |     df = df[cols_to_use]
104 |     df.rename(
105 |         columns={
106 |             "Average ⬆️": "hf_open_llm_v1",
107 |             "Model": "model",
108 |         },
109 |         inplace=True,
110 |     )
111 | 
112 |     df.to_csv("src/bat/assets/benchmarks/hf_open_llm_v1_240829_frozen.csv", index=False)
113 | 
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line.
5 | SPHINXOPTS    =
6 | SPHINXBUILD   = python -msphinx
7 | SPHINXPROJ    = bat
8 | SOURCEDIR     = .
9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 
--------------------------------------------------------------------------------
/docs/authors.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../AUTHORS.rst
2 | 
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # bat documentation build configuration file, created by
4 | # sphinx-quickstart on Fri Jun  9 13:47:02 2017.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 | 
15 | # If extensions (or modules to document with autodoc) are in another
16 | # directory, add these directories to sys.path here. If the directory is
17 | # relative to the documentation root, use os.path.abspath to make it
18 | # absolute, like shown here.
19 | #
20 | import os
21 | import sys
22 | 
23 | sys.path.insert(0, os.path.abspath("../src"))  # src layout: make bat importable
24 | import bat
25 | 
26 | 
27 | # -- General configuration ---------------------------------------------
28 | 
29 | # If your documentation needs a minimal Sphinx version, state it here.
30 | #
31 | # needs_sphinx = '1.0'
32 | 
33 | # Add any Sphinx extension module names here, as strings. They can be
34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
35 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode"]
36 | 
37 | # Add any paths that contain templates here, relative to this directory.
38 | templates_path = ["_templates"]
39 | 
40 | # The suffix(es) of source filenames.
41 | # You can specify multiple suffixes as a list of strings:
42 | #
43 | # source_suffix = ['.rst', '.md']
44 | source_suffix = ".rst"
45 | 
46 | # The master toctree document.
47 | master_doc = "index"
48 | 
49 | # General information about the project.
50 | project = "bat"
51 | copyright = "2024, Yotam Perlitz"
52 | author = "Yotam Perlitz"
53 | 
54 | # The version info for the project you're documenting, acts as replacement
55 | # for |version| and |release|, also used in various other places throughout
56 | # the built documents.
57 | #
58 | # The short X.Y version.
59 | version = bat.__version__
60 | # The full version, including alpha/beta/rc tags.
61 | release = bat.__version__
62 | 
63 | # The language for content autogenerated by Sphinx. Refer to documentation
64 | # for a list of supported languages.
65 | #
66 | # This is also used if you do content translation via gettext catalogs.
67 | # Usually you set "language" from the command line for these cases.
68 | language = None
69 | 
70 | # List of patterns, relative to source directory, that match files and
71 | # directories to ignore when looking for source files.
72 | # These patterns also affect html_static_path and html_extra_path.
73 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
74 | 
75 | # The name of the Pygments (syntax highlighting) style to use.
76 | pygments_style = "sphinx"
77 | 
78 | # If true, `todo` and `todoList` produce output, else they produce nothing.
79 | todo_include_todos = False
80 | 
81 | 
82 | # -- Options for HTML output -------------------------------------------
83 | 
84 | # The theme to use for HTML and HTML Help pages.  See the documentation for
85 | # a list of builtin themes.
86 | #
87 | html_theme = "alabaster"
88 | 
89 | # Theme options are theme-specific and customize the look and feel of a
90 | # theme further.  For a list of options available for each theme, see the
91 | # documentation.
92 | #
93 | # html_theme_options = {}
94 | 
95 | # Add any paths that contain custom static files (such as style sheets) here,
96 | # relative to this directory. They are copied after the builtin static files,
97 | # so a file named "default.css" will overwrite the builtin "default.css".
98 | html_static_path = ["_static"]
99 | 
100 | 
101 | # -- Options for HTMLHelp output ---------------------------------------
102 | 
103 | # Output file base name for HTML help builder.
104 | htmlhelp_basename = "batdoc"
105 | 
106 | 
107 | # -- Options for LaTeX output ------------------------------------------
108 | 
109 | latex_elements = {
110 |     # The paper size ('letterpaper' or 'a4paper').
111 |     #
112 |     # 'papersize': 'letterpaper',
113 |     # The font size ('10pt', '11pt' or '12pt').
114 |     #
115 |     # 'pointsize': '10pt',
116 |     # Additional stuff for the LaTeX preamble.
117 |     #
118 |     # 'preamble': '',
119 |     # Latex figure (float) alignment
120 |     #
121 |     # 'figure_align': 'htbp',
122 | }
123 | 
124 | # Grouping the document tree into LaTeX files. List of tuples
125 | # (source start file, target name, title, author, documentclass
126 | # [howto, manual, or own class]).
127 | latex_documents = [
128 |     (master_doc, "bat.tex", "bat Documentation", "Yotam Perlitz", "manual"),
129 | ]
130 | 
131 | 
132 | # -- Options for manual page output ------------------------------------
133 | 
134 | # One entry per manual page. List of tuples
135 | # (source start file, name, description, authors, manual section).
136 | man_pages = [(master_doc, "bat", "bat Documentation", [author], 1)]
137 | 
138 | 
139 | # -- Options for Texinfo output ----------------------------------------
140 | 
141 | # Grouping the document tree into Texinfo files. List of tuples
142 | # (source start file, target name, title, author,
143 | #  dir menu entry, description, category)
144 | texinfo_documents = [
145 |     (
146 |         master_doc,
147 |         "bat",
148 |         "bat Documentation",
149 |         author,
150 |         "bat",
151 |         "One line description of project.",
152 |         "Miscellaneous",
153 |     ),
154 | ]
155 | 
--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../CONTRIBUTING.rst
2 | 
--------------------------------------------------------------------------------
/docs/history.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../HISTORY.rst
2 | 
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | Welcome to bat's documentation!
2 | ======================================
3 | 
4 | .. toctree::
5 |    :maxdepth: 2
6 |    :caption: Contents:
7 | 
8 |    readme
9 |    installation
10 |    usage
11 |    modules
12 |    contributing
13 |    authors
14 |    history
15 | 
16 | Indices and tables
17 | ==================
18 | * :ref:`genindex`
19 | * :ref:`modindex`
20 | * :ref:`search`
21 | 
--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
1 | .. highlight:: shell
2 | 
3 | ============
4 | Installation
5 | ============
6 | 
7 | 
8 | Stable release
9 | --------------
10 | 
11 | To install bat, run this command in your terminal:
12 | 
13 | .. code-block:: console
14 | 
15 |     $ pip install bat
16 | 
17 | This is the preferred method to install bat, as it will always install the most recent stable release.
18 | 
19 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide
20 | you through the process.
21 | 
22 | .. _pip: https://pip.pypa.io
23 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
24 | 
25 | 
26 | From sources
27 | ------------
28 | 
29 | The sources for bat can be downloaded from the `Github repo`_.
30 | 
31 | You can either clone the public repository:
32 | 
33 | .. code-block:: console
34 | 
35 |     $ git clone https://github.com/perlitz/bat.git
36 | 
37 | Or download the `tarball`_:
38 | 
39 | .. code-block:: console
40 | 
41 |     $ curl -OJL https://github.com/perlitz/bat/tarball/master
42 | 
43 | Once you have a copy of the source, you can install it with:
44 | 
45 | .. code-block:: console
46 | 
47 |     $ pip install .
48 | 
49 | 
50 | .. _Github repo: https://github.com/perlitz/bat
51 | .. _tarball: https://github.com/perlitz/bat/tarball/master
52 | 
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | pushd %~dp0
4 | 
5 | REM Command file for Sphinx documentation
6 | 
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=python -msphinx
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=bat
13 | 
14 | if "%1" == "" goto help
15 | 
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | 	echo.
19 | 	echo.The Sphinx module was not found. Make sure you have Sphinx installed,
20 | 	echo.then set the SPHINXBUILD environment variable to point to the full
21 | 	echo.path of the 'sphinx-build' executable. Alternatively you may add the
22 | 	echo.Sphinx directory to PATH.
23 | 	echo.
24 | 	echo.If you don't have Sphinx installed, grab it from
25 | 	echo.http://sphinx-doc.org/
26 | 	exit /b 1
27 | )
28 | 
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 | 
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 | 
35 | :end
36 | popd
37 | 
--------------------------------------------------------------------------------
/docs/readme.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../README.rst
2 | 
--------------------------------------------------------------------------------
/docs/usage.rst:
--------------------------------------------------------------------------------
1 | =====
2 | Usage
3 | =====
4 | 
5 | To use bat in a project::
6 | 
7 |     import bat
8 | 
--------------------------------------------------------------------------------
/examples/my_bench.csv:
--------------------------------------------------------------------------------
1 | model,my_bench
2 | zephyr-7b-beta,17.32
3 | zephyr-7b-alpha,19.28
4 | yi-6b-chat,9.02
5 | vicuna-7b-v1.5-16k,14.22
6 | vicuna-7b-v1.5,12.31
7 | starling-lm-7b-beta,16.62
8 | smaug-qwen2-72b-instruct,39.66
9 | qwen2-72b-instruct,40.16
10 | qwen2-7b-instruct,26.63
11 | qwen2-1.5b-instruct,10.42
12 | qwen2-0.5b-instruct,7.30
13 | qwen1.5-110b-chat,29.07
14 | qwen1.5-72b-chat,28.89
15 | qwen1.5-7b-chat,17.02
16 | qwen1.5-4b-chat,11.59
17 | qwen1.5-1.8b-chat,6.32
18 | qwen1.5-0.5b-chat,5.43
19 | phi-3.5-moe-instruct,35.14
20 | phi-3.5-mini-instruct,27.81
21 | phi-3-small-128k-instruct,29.68
22 | phi-3-small-8k-instruct,29.09
23 | phi-3-mini-128k-instruct,24.76
24 | phi-3-mini-4k-instruct,24.41
25 | phi-3-medium-128k-instruct,29.88
26 | phi-3-medium-4k-instruct,30.96
27 | openhermes-2.5-mistral-7b,23.36
28 | open-mistral-nemo,29.02
29 | mixtral-8x22b-instruct-v0.1,35.29
30 | mixtral-8x7b-instruct-v0.1,22.79
31 | mistral-small-2402,33.03
32 | mistral-large-2407,48.35
33 | mistral-large-2402,38.92
34 | mistral-7b-instruct-v0.3,20.09
35 | mistral-7b-instruct-v0.2,19.51
36 | meta-llama-3.1-405b-instruct-turbo,55.18
37 | meta-llama-3.1-70b-instruct-turbo,48.90
38 | meta-llama-3.1-8b-instruct-turbo,28.11
39 | meta-llama-3-70b-instruct,37.60
40 | meta-llama-3-8b-instruct,27.46
41 | mathstral-7b-v0.1,24.33
42 | llama-2-7b-chat-hf,10.25
43 | hermes-3-llama-3.1-70b,39.56
44 | gpt-4o-mini-2024-07-18,44.57
45 | gpt-4o-2024-08-06,56.46
46 | gpt-4o-2024-05-13,54.96
47 | gpt-4-turbo-2024-04-09,53.00
48 | gpt-4-0613,44.94
49 | gpt-4-0125-preview,49.39
50 | gpt-3.5-turbo-0125,34.66
51 | gemma-2-27b-it,41.22
52 | gemma-2-9b-it,31.57
53 | gemma-1.1-7b-it,18.23
54 | gemini-1.5-pro-exp-0827,55.06
55 | gemini-1.5-pro-exp-0801,53.63
56 | gemini-1.5-pro-api-0514,44.41
57 | gemini-1.5-flash-exp-0827,47.51
58 | gemini-1.5-flash-api-0514,40.95
59 | dracarys-llama-3.1-70b-instruct,49.82
60 | dracarys-72b-instruct,41.72
61 | deepseek-v2-lite-chat,17.49
62 | deepseek-coder-v2-lite-instruct,29.21
63 | deepseek-coder-v2,46.84
64 | deepseek-chat-v2,46.36
65 | command-r-plus,32.86
66 | command-r,27.23
67 | claude-3-sonnet-20240229,38.08
68 | claude-3-opus-20240229,50.75
69 | claude-3-haiku-20240307,35.32
70 | claude-3-5-sonnet-20240620,61.16
71 | chatgpt-4o-latest,55.35
--------------------------------------------------------------------------------
/examples/newbench_example.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from bat import Config, Tester, Benchmark, Reporter
3 | from datetime import datetime
4 | 
5 | 
6 | def load_scenarios(filepath, comment_char="#"):
7 |     """Loads scenarios from a text file, ignoring commented lines."""
8 |     scenarios = []
9 |     try:
10 |         with open(filepath, "r") as f:
11 |             for line in f:
12 |                 line = line.strip()
13 |                 if line and not line.startswith(comment_char):
14 |                     scenarios.append(line)
15 |     except FileNotFoundError:
16 |         print(
17 |             f"Warning: Scenarios file not found: {filepath}. Proceeding without these scenarios."
18 |         )
19 |     return scenarios
20 | 
21 | 
22 | # Load the scenario lists to work with; edit the files to fit what you are interested in
23 | scenarios_for_aggregate = load_scenarios("examples/scenarios_for_aggregate.txt")
24 | scenarios_of_intereset = load_scenarios("examples/scenarios_of_intereset.txt")
25 | 
26 | # Configuration for agreement testing
27 | n_models_taken = 10  # Number of models to sample for each comparison. 0 means all intersecting models.
28 | model_select_strategy = (
29 |     "random"  # How to select models: "top", "bottom", "random", "somewhere"
30 | )
31 | corr_type = "kendall"  # Correlation types: "kendall", "pearson"
32 | n_exps = 3  # Number of experiments for random sampling. Set to 1 for deterministic strategies.
33 | 
34 | # --- Load your benchmark ---
35 | my_bench_df = pd.read_csv("examples/my_bench.csv")
36 | my_bench_source_name = f"uploaded_benchmark_{datetime.now().strftime('%y%m%d')}"
37 | my_bench = Benchmark(
38 |     my_bench_df, data_source=my_bench_source_name, normalized_names=False
39 | )
40 | 
41 | # --- Load the existing benchbench benchmark catalog ---
42 | allbench = Benchmark()
43 | allbench.load_local_catalog()
44 | 
45 | # --- Create an aggregate benchmark ---
46 | allbench.add_aggregate(
47 |     new_col_name="aggregate",
48 |     agg_source_name="aggregate",
49 |     scenario_whitelist=scenarios_for_aggregate,
50 |     min_scenario_for_models_to_appear_in_agg=max(1, len(scenarios_for_aggregate) // 3),
51 | )
52 | 
53 | # --- Combine your benchmark with the existing benchmarks ---
54 | allbench.extend(my_bench)
55 | 
56 | # --- Analyze model overlap for insights ---
57 | uploaded_models = my_bench.get_models()
58 | aggregate_models = allbench.df[allbench.df["source"] == "aggregate"]["model"].unique()
59 | n_overlap_models = len(set(aggregate_models).intersection(uploaded_models))
60 | print(f"Number of models overlapping: {n_overlap_models}")
61 | 
62 | # --- Remove duplicate scenarios before analysis ---
63 | allbench.clear_repeated_scenarios(
64 |     source_to_keep=my_bench_source_name
65 | )  # Prioritize keeping your benchmark's scenarios
66 | 
67 | 
68 | # --- Select specific scenarios for analysis ---
69 | my_scenario_name = allbench.df.query(f'source=="{my_bench_source_name}"')[
70 |     "scenario"
71 | ].iloc[0]
72 | scenarios_to_analyze = (
73 |     scenarios_of_intereset + ["aggregate"] + [my_scenario_name]
74 | )  # Include the scenario coming from your uploaded benchmark
75 | allbench.df = allbench.df[allbench.df["scenario"].isin(scenarios_to_analyze)]
76 | 
77 | # --- Configure and run the agreement tester ---
78 | cfg = Config(
79 |     exp_to_run="example",
80 |     n_models_taken_list=[
81 |         n_models_taken
82 |     ],  # Use lists for consistency with Config definition
83 |     model_select_strategy_list=[model_select_strategy],  # Use lists for consistency
84 |     corr_types=[corr_type],  # Use lists for consistency
85 |     n_exps=n_exps if n_models_taken != 0 else 1,
86 | )
87 | 
88 | tester = Tester(cfg=cfg)
89 | agreements = tester.all_vs_all_agreement_testing(
90 |     allbench
91 | )  # No need for single_source_scenario here, as we've already filtered
92 | 
93 | # --- Report the results ---
94 | reporter = Reporter()
95 | 
96 | reporter.draw_agreements_for_one_source(
97 |     agreements,
98 |     source_of_interest=my_bench_source_name,
99 | )
100 | reporter.draw_agreement_matrix(agreements)
101 | z_score_df = reporter.get_all_z_scores(agreements, aggragate_name="aggregate")
102 | print(z_score_df[["scenario", "z_score"]])
103 | 
--------------------------------------------------------------------------------
/examples/scenarios_for_aggregate.txt:
--------------------------------------------------------------------------------
1 | Helm Lite
2 | HF OpenLLM v2
3 | OpenCompass Academic
4 | LMSys Arena
5 | Helm Classic
6 | AlphacaEval v2lc
7 | LiveBench 240829
8 | WildBench Elo LC
--------------------------------------------------------------------------------
/examples/scenarios_of_intereset.txt:
--------------------------------------------------------------------------------
1 | # Scenarios of Interest
2 | Holmes
3 | # "eureka_information_retrieval_fact_recall",
4 | # "eureka_information_retrieval_fact_precision",
5 | # "eureka_instruction_following",
6 | # "eureka_long_context_qa_average",
7 | # "eureka_long_context_qa_longest_context_3k",
8 | # "eureka_toxicity_detection",
9 | Helm Lite
10 | # "Helm Lite NarrativeQA",
11 | # "Helm Lite NaturalQuestionsOpen",
12 | # "Helm Lite NaturalQuestionsClosed",
13 | # "Helm Lite OpenBookQA",
14 | # "Helm Lite MMLU",
15 | # "Helm Lite MathEquivalentCOT",
16 | # "Helm Lite GSM8K",
17 | # "Helm Lite LegalBench",
18 | # "Helm Lite MedQA",
19 | # "Helm Lite WMT2014",
20 | LMSys Arena
21 | HF OpenLLM v2
22 | HFv2 BBH
23 | HFv2 BBH Raw
24 | HFv2 GPQA
25 | HFv2 IFEval
26 | HFv2 MMLU Pro
27 | HFv2 Math Level 5
28 | HFv2 MuSR
29 | tablebench_overall_dp
30 | # "trustworthy_average",
31 | # "trustworthy_non_toxicity",
32 | # "trustworthy_non_stereotype",
33 | # "trustworthy_advglue_pp",
34 | # "trustworthy_ood",
35 | # "trustworthy_adv_demo",
36 | # "trustworthy_privacy",
37 | # "trustworthy_ethics",
38 | # "trustworthy_fairness",
39 | OpenCompass Academic
40 | # "OpenCompass MMLU",
41 | # "OpenCompass MMLU Pro",
42 | # "OpenCompass CMMLU",
43 | # "OpenCompass BBH",
44 | # "OpenCompass GQPA-Dimand",
45 | # "OpenCompass Math",
46 | OpenCompass HumanEval
47 | # "OpenCompass IFEval",
48 | Helm MMLU
49 | Helm Classic
50 | # "Helm BoolQ",
51 | # "Helm NarrativeQA",
52 | # "Helm NaturalQuestionsClosed",
53 | # "Helm NaturalQuestionsOpen",
54 | # "Helm QuAC",
55 | # "helm_hellaswag",
56 | # "Helm OpenBookQA",
57 | # "helm_truthfulqa",
58 | # "Helm MSMARCO Regular",
59 | # "Helm MSMARCO Trec",
60 | # "helm_cnn/dailymail",
61 | # "Helm XSUM",
62 | # "Helm IMDB",
63 | # "Helm CivilComments",
64 | # "Helm RAFT",
65 | MMLU Pro
66 | MixEval
67 | # "MixEval Hard",
68 | # "MixEval TriviaQA",
69 | # "MixEval MMLU",
70 | # "MixEval DROP",
71 | # "MixEval HellaSwag",
72 | # "MixEval CommonsenseQA",
73 | # "MixEval TriviaQA Hard",
74 | # "MixEval MMLU Hard",
75 | # "MixEval DROP Hard",
76 | toolbench
77 | AlphacaEval v2lc
78 | # "HELM AirBench Security Risks",
79 | # ... (Rest of the AirBench entries)
80 | HELM AirBench AIR Score
81 | OpenCompass
82 | # "OpenCompass Language",
83 | # ... (Rest of the OpenCompass entries)
84 | OpenCompass Arena
85 | # "LiveBench 240725"
86 | # "LiveBench Reasoning",
87 | # ... (Rest of LiveBench entries)
88 | Enkrypt AI Safety
89 | # "WildBench Elo LC",
90 | # ... (Rest of WildBench entries)
91 | WildBench Score
92 | Decentralized Arena (0-1 Normalized)
93 | Arena Hard
94 | AgentBench
95 | MT-Bench
96 | HF OpenLLM v1
97 | # "HFv1 ARC",
98 | # ... (Rest of HFv1 entries)
99 | BFCL
100 | eq_bench
101 | # "magi_hard",
102 | BIGGEN
103 | # "BIGGEN Grounding",
104 | # ... (Rest of BIGGEN entries)
105 | ruler
106 | # "LiveBench 240624",
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0"]
3 | build-backend = "setuptools.build_meta"
4 | 
5 | [project]
6 | name = "bat"
7 | version = "0.1.0"
8 | description = "Benchmark Agreement Testing"
9 | readme = "README.rst"
10 | authors = [
11 |   {name = "Yotam Perlitz", email = "perlitz@gmail.com"}
12 | ]
13 | maintainers = [
14 |   {name = "Yotam Perlitz", email = "perlitz@gmail.com"}
15 | ]
16 | classifiers = []
17 | license = {text = "MIT"}
18 | dependencies = [
19 |   "pandas",
20 |   "seaborn",
21 |   "matplotlib",
22 |   "numpy",
23 |   "scipy",
24 |   "tqdm",
25 | ]
26 | 
27 | [project.urls]
28 | bugs = "https://github.com/perlitz/bat/issues"
29 | changelog = "https://github.com/perlitz/bat/blob/master/changelog.md"
30 | homepage = "https://github.com/perlitz/bat"
31 | 
32 | [tool.setuptools]
33 | package-dir = {"" = "src"}
34 | 
35 | [tool.setuptools.package-data]
36 | "*" = ["*.*"]
37 | 
38 | # Mypy
39 | [tool.mypy]
40 | files = "."
41 | strict = true
42 | warn_unreachable = true
43 | warn_no_return = true
44 | 
45 | # [tool.mypy.overrides]
46 | # # Don't require test functions to include types
47 | # module = "tests.*"
48 | # allow_untyped_defs = true
49 | # disable_error_code = "attr-defined"
50 | 
51 | [tool.ruff]
52 | # Exclude commonly ignored directories.
53 | exclude = [
54 |   ".bzr",
55 |   ".direnv",
56 |   ".eggs",
57 |   ".git",
58 |   ".git-rewrite",
59 |   ".hg",
60 |   ".mypy_cache",
61 |   ".nox",
62 |   ".pants.d",
63 |   ".pytype",
64 |   ".ruff_cache",
65 |   ".svn",
66 |   ".tox",
67 |   ".venv",
68 |   "__pypackages__",
69 |   "_build",
70 |   "buck-out",
71 |   "build",
72 |   "dist",
73 |   "node_modules",
74 |   "venv",
75 | ]
76 | 
77 | line-length = 88
78 | indent-width = 4
--------------------------------------------------------------------------------
/src/bat/__init__.py:
--------------------------------------------------------------------------------
1 | """Top-level package for bat."""
2 | 
3 | __author__ = """Yotam Perlitz"""
4 | __email__ = "yotam.perlitz@ibm.com"
5 | __version__ = "0.1.0"
6 | 
7 | 
8 | from bat.agreement_tester import Tester
9 | from bat.benchmark import Benchmark
10 | from bat.configs import Config
11 | from bat.reporting import Reporter
12 | 
--------------------------------------------------------------------------------
/src/bat/agreement_tester.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import random
3 | import pandas as pd
4 | from tqdm import tqdm
5 | from bat.logic import get_pair_agreement
6 | 
7 | 
8 | class Tester:
9 |     def __init__(self, cfg):
10 |         self.cfg = cfg
11 | 
12 |     @staticmethod
13 |     def fetch_reference_models_names(
14 |         reference_benchmark,
15 |         n_models,
16 |     ):
17 |         return list(reference_benchmark.get_model_appearences_count().keys())[:n_models]
18 | 
19 |     def all_vs_all_agreement_testing(self, benchmark, single_source_scenario=None):
20 |         assert all(
21 |             benchmark.df.drop_duplicates(subset=["scenario", "source"])
22 |             .groupby("scenario")["source"]
23 |             .count()
24 |             == 1
25 |         ), "duplicated scenarios exist, consider running benchmark.clear_repeated_scenarios()"
26 | 
27 |         all_bench_res = benchmark.df
28 | 
29 |         # List of all scenarios
30 |         pair_agreements = []
31 | 
32 |         used_scenarios = all_bench_res["scenario"].unique().tolist()
33 | 
34 |         scenario_pairs = [
35 |             (a, b) for a, b in itertools.combinations(used_scenarios, 2) if a != b
36 |         ]
37 | 
38 |         if single_source_scenario:
39 |             assert (
40 |                 single_source_scenario in used_scenarios
41 |             ), f"single_source_scenario requested {single_source_scenario} does not appear as a scenario in the benchmark"
42 |             scenario_pairs = [
43 |                 (a, b) for a, b in scenario_pairs if single_source_scenario in [a, b]
44 |             ]  # make sure only pairs with single_source_scenario are in the calculations
45 | 
46 |         # Iterate over each pair of scenarios
47 |         for corr_type in self.cfg.corr_types:
48 |             for model_select_strategy in self.cfg.model_select_strategy_list:
49 |                 for model_subset_size_requested in self.cfg.n_models_taken_list:
50 |                     for scenario1, scenario2 in tqdm(scenario_pairs):
51 |                         cur_scen_res = all_bench_res.query(
52 |                             f'scenario == "{scenario1}" or scenario == "{scenario2}"'
53 |                         )
54 | 
55 |                         scenario_source = cur_scen_res.query(
56 |                             "scenario==@scenario1"
57 |                         ).iloc[0]["source"]
58 |                         ref_source = cur_scen_res.query("scenario==@scenario2").iloc[0][
59 |                             "source"
60 |                         ]
61 | 
62 |                         for exp_n in range(self.cfg.n_exps):
63 |                             # for date_threshold in date_thresholds:
64 |                             pair_agreements_cfg = {
65 |                                 "scenario": scenario1,
66 |                                 "scenario_source": scenario_source,
67 |                                 "ref_scenario": scenario2,
68 |                                 "ref_source": ref_source,
69 |                                 "corr_type": corr_type,
70 |                                 "model_select_strategy": model_select_strategy,
71 |                                 "model_subset_size_requested": model_subset_size_requested,
72 |                                 "exp_n": exp_n,
73 |                             }
74 | 
75 |                             # sorting according to one of the benchmarks
76 |                             res_to_sort_by = all_bench_res.query(
77 |                                 f"scenario=='{random.choice([scenario1, scenario2])}'"
78 |                             )
79 | 
80 |                             models_intersect = (
81 |                                 cur_scen_res["model"]
82 |                                 .value_counts()[
83 |                                     cur_scen_res["model"].value_counts() == 2
84 |                                 ]
85 |                                 .index.tolist()
86 |                             )
87 | 
88 |                             if len(models_intersect) < max(
89 |                                 model_subset_size_requested,
90 |                                 self.cfg.min_n_models_intersect,
91 |                             ):
92 |                                 continue
93 | 
94 |                             pair_agreement, p_value = get_pair_agreement(
95 |                                 cur_scen_res,
96 |                                 res_to_sort_by,
97 |                                 pair_agreements_cfg,
98 |                                 models_intersect,
99 |                             )
100 | 
101 |                             if pair_agreement is not None:
102 |                                 pair_agreement_reported = pair_agreements_cfg.copy()
103 |                                 pair_agreement_reported.update(
104 |                                     {
105 |                                         "correlation": pair_agreement,
106 |                                         "p_value": p_value,
107 |                                     }
108 |                                 )
109 |                                 pair_agreements.append(pair_agreement_reported)
110 | 
111 |         all_agreements = pd.DataFrame(pair_agreements)
112 | 
113 |         # add the reversed scenario pairs
114 |         all_agreements_reversed_scenarios = all_agreements.rename(
115 |             columns={
116 |                 "scenario": "ref_scenario",
117 |                 "ref_scenario": "scenario",
118 |                 "scenario_source": "ref_source",
119 |                 "ref_source": "scenario_source",
120 |             }
121 |         )
122 |         all_agreements = pd.concat(
123 |             [all_agreements, all_agreements_reversed_scenarios]
124 |         ).reset_index(drop=True)
125 | 
126 |         return all_agreements
127 | 
--------------------------------------------------------------------------------
/src/bat/assets/benchmarks/agenbench_240829_agent.csv:
--------------------------------------------------------------------------------
1 | model,agentbench
2 | gpt-4-0613,4.01
3 | claude-2,2.49
4 | claude-v1.3,2.44
5 | gpt-3.5-turbo-0613,2.32
6 | text-davinci-003,1.71
7 | claude-instant-v1.1,1.60
8 | chat-bison-001,1.39
9 | text-davinci-002,1.25
10 | llama-2-70b-chat,0.78
11 | guanaco-65b,0.54
12 | codellama-34b-instruct,0.96
13 | vicuna-33b-v1.3,0.73
14 | wizardlm-30b-v1.0,0.46
15 | guanaco-33b,0.39
16 | vicuna-13b-v1.5,0.93
17 | llama-2-13b-chat,0.77
18 | openchat-13b-v3.2,0.70
19 | wizardlm-13b-v1.2,0.66
20 | vicuna-7b-v1.5,0.56
21 | codellama-13b-instruct,0.56
22 | codellama-7b-instruct,0.50
23 | koala-13b,0.34
24 | llama-2-7b-chat,0.34
25 | codegeex2-6b,0.27
26 | dolly-12b-v2,0.14
27 | chatglm-6b-v1.1,0.11
28 | oasst-12b-sft-4,0.03
--------------------------------------------------------------------------------
/src/bat/assets/benchmarks/alphacaeval_v2lc_240829_holistic.csv:
--------------------------------------------------------------------------------
1 | model,alphacaeval_v2lc
2 | Shopee_SlimMoA_v1,77.5
3 | Blendax.AI-gm-l6-vo31,76.9
4 | gemma-2-9b-it-WPO-HB,76.7
5 | Blendax.AI-gm-l3-v35,73.4
6 | gemma-2-9b-it-SimPO,72.4
7 | OpenPipe_MoA_GPT-4_Turbo,68.4
8 | gemma-2-9b-it-DPO,67.7
9 | Together_MoA,65.4
10 | Llama3_PBM_Nova_70B,62.4
11 | Storm-7B_(best-of-64),61.6
12 | Together_MoA-Lite,59.1
13 | Aligner_2B+GPT-4_Turbo_(04/09),58.3
14 | GPT-4_Omni_(05/13),57.5
15 | Higgs-Llama-3-70B_V2,56.8
16 | GPT-4_Turbo_(04/09),55.0
17 | SPPO-Gemma-2-9B-It-PairRM,54.0
18 | Llama-3-Instruct-8B-WPO-HB-v2,53.4
19 | Claude_3.5_Sonnet_(06/20),52.4
20 | Yi-Large_Preview,51.9
21 | GPT-4o_Mini_(07/18),50.7
22 | Storm-7B,50.5
23 | GPT-4_Preview_(11/06),50.0
24 | Infinity-Instruct-7M-Gen-Llama3_1-70B,46.1
25 | ExPO_+_Llama-3-Instruct-8B-SimPO,45.8
26 | Llama-3-Instruct-8B-SimPO,44.7
27 | Nanbeige_Plus_Chat_v0.1,44.5
28 | Qwen1.5_110B_Chat,43.9
29 | Aligner_2B+Claude_3_Opus,41.8
30 | Nanbeige2_16B_Chat,40.6
31 | Claude_3_Opus_(02/29),40.5
32 | Infinity-Instruct-7M-Gen-mistral-7B,39.7
33 | Llama_3.1_405B_Instruct,39.3
34 | SPPO-Llama-3-Instruct-8B-PairRM,38.6
35 | GPT-4,38.1
36 | Qwen2_72B_Instruct,38.1
37 | Llama_3.1_70B_Instruct,38.1
38 | Infinity-Instruct-3M-0625-Llama3-70B,38.0
39 | Aligner_2B+Qwen1.5_72B_Chat,36.7
40 | Qwen1.5_72B_Chat,36.6
41 | GPT-4_(03/14),35.3
42 | Ein_70B_v0.1,35.0
43 | Claude_3_Sonnet_(02/29),34.9
44 | FsfairX-Zephyr-Chat-v0.1,34.8
45 | Llama_3_70B_Instruct,34.4
46 | Infinity-Instruct-7M-Gen-Llama3_1-8B,33.9
47 | Mistral_Large_(24/02),32.7
48 | ExPO_+_SPPO-Mistral7B-PairRM,31.8
49 | merlinite-7B-AOT,31.7
50 | Infinity-Instruct-3M-0613-Llama3-70B,31.5
51 | Samba_CoE_v0.2_(best-of-16),31.5
52 | Infinity-Instruct-3M-0625-Mistral-7B,31.4
53 | REBEL-Llama-3-8B-Instruct,31.4
54 | Mixtral_8x22B_v0.1,30.9
55 | SPPO-Mistral7B-PairRM,30.5
56 | GPT-4_(06/13),30.2
57 | Snorkel_(Mistral-PairRM-DPO+best-of-16),30.0
58 | Contextual_AI_(KTO-Mistral-PairRM),29.7
59 | PairRM_0.4B+Yi-34B-Chat_(best-of-16),28.8
60 | Mistral_Medium,28.6
61 | Claude_2,28.2
62 | Samba_CoE_v0.2,27.6
63 | Infinity-Instruct-3M-0625-Llama3-8B,27.5
64 | Claude,27.3
65 | ExPO_+_InternLM2_Chat_20B,27.2
66 | Yi_34B_Chat,27.2
67 | ExPO_+_Starling_LM_7B_beta,26.4
68 | Snorkel_(Mistral-PairRM-DPO),26.4
69 | ExPO_+_Tulu-2-DPO-70B,25.7
70 | Claude_Instant_1.2,25.6
71 | Infinity-Instruct-3M-0613-Mistral-7B,25.5
72 | DBRX_Instruct,25.4
73 | Claude_2.1,25.3
74 | Nanbeige2_8B_Chat,25.2
75 | XwinLM_70b_V0.1,24.6
76 | Gemini_Pro,24.4
77 | Qwen1.5_14B_Chat,23.9
78 | Mixtral_8x7B_v0.1,23.7
79 | Evo_v2_7B,23.4
80 | Ghost_8B_Beta_(d0x5),23.1
81 | Llama_3_8B_Instruct,22.9
82 | Samba_CoE_v0.1,22.9
83 | GPT_3.5_Turbo_(06/13),22.7
84 | ExPO_+_InternLM2_Chat_7B,22.7
85 | GPT_3.5_Turbo_(06/13),22.4
86 | Infinity-Instruct-3M-0625-Qwen2-7B,21.9
87 | PairRM_0.4B+Tulu_2+DPO_70B_(best-of-16),21.4
88 | Tulu_2+DPO_70B,21.2
89 | Llama_3.1_8B_Instruct,20.9
90 | Mistral_7B_v0.3,20.6
91 | Mistral-7B-ReMax-v0.1,20.6
92 | Infinity-Instruct-3M-0625-Yi-1.5-9B,20.5
93 | ExPO_+_Starling_LM_7B_alpha,19.5
94 | GPT_3.5_Turbo_(11/06),19.3
95 | LMCocktail-10.7B-v1,19.0
96 | InternLM2_Chat_20B,18.7
97 | GPT_3.5_Turbo_(03/01),18.1
98 | XwinLM_13b_V0.1,17.9
99 | DeepSeek_LLM_67B_Chat,17.8
100 | GPT-3.5,17.7
101 | ExPO_+_Tulu-2-DPO-13B,17.6
102 | WizardLM_70B,17.6
103 | Vicuna_33B_v1.3,17.6
104 | PairRM_0.4B+Tulu_2+DPO_13B_(best-of-16),17.4
105 | Conifer-7B-DPO,17.1
106 | Mistral_7B_v0.2,17.1
107 | Evo_7B,16.5
108 | Humpback_LLaMa2_70B,16.2
109 | OpenHermes-2.5-Mistral_(7B),16.2
110 | DEITA_7B_v1.0,16.1
111 | JinaChat,15.9
112 | TempNet-LLaMA2-Chat-70B-v0.1,15.8
113 | CausalLM-14B,15.7
114 | PairRM_0.4B+Zephyr_7B_Beta_(best-of-16),15.5
115 | Qwen1.5_7B_Chat,14.7
116 | Mistral-ORPO-Beta,14.7
117 | Starling_LM_7B_alpha,14.7
118 | LLaMA2_Chat_70B,14.7
119 | OpenChat_V3.1_13B,14.5
120 | WizardLM_13B_V1.2,14.5
121 | UltraLM_13B_V2.0_(best-of-16),14.2
122 | ExPO_+_Zephyr_7B_Beta,14.0
123 | WizardLM_13B_V1.1,13.9
124 | ExPO_+_Zephyr_7B_Alpha,13.6
125 | Zephyr_7B_Beta,13.2
126 | Dolphin_2.2.1_Mistral_7B,13.1
127 | Humpback_LLaMa_65B,12.8
128 | OpenBudddy-LLaMA2-70B-v10.1,12.6
129 | OpenBuddy-LLaMA-65B-v8,12.5
130 | Qwen_14B_Chat,12.4
131 | GPT-4_(Adversarial),12.2
132 | CUT_13B,12.2
133 | OpenChat_V2-W_13B,12.0
134 | Vicuna_13B_v1.5_(together),11.7
135 | ExPO_+_Tulu-2-DPO-7B,11.7
136 | Tulu_2+DPO_13B,11.6
137 | Claude2_Alpaca_13B,11.5
138 | Minotaur_13B,11.5
139 | airoboros_65B,11.0
140 | Cohere_Command,10.9
141 | Vicuna_13B_v1.3,10.8
142 | XwinLM_7b_V0.1,10.8
143 | airoboros_33B,10.7
144 | PlatoLM_7B,10.5
145 | Vicuna_13B_v1.5,10.5
146 | Gemma_Instruct_(7B),10.4
147 | OpenChat_V2_13B,10.4
148 | Zephyr_7B_Alpha,10.3
149 | OpenBuddy-LLaMA-30B-v7.1,10.2
150 | UltraLM_13B_(best-of-16),9.9
151 | LLaMA_33B_OASST_SFT,9.9
152 | WizardLM_13B,9.8
153 | Nous_Hermes_13B,9.7
154 | Vicuna_13B,9.2
155 | Tulu_2+DPO_7B,9.2
156 | OpenBudddy-LLaMA2-13B-v11.1,9.2
157 | UltraLM_13B_V2.0,9.1
158 | Davinci001,9.0
159 | OpenBuddy-Falcon-40B-v9,9.0
160 | OpenChat-13B,8.8
161 | TempNet-LLaMA2-Chat-13B-v0.1,8.6
162 | LLaMA2_Chat_13B,8.4
163 | Guanaco_65B,8.3
164 | OpenCoderPlus-15B,8.2
165 | LLaMA_33B_OASST_RLHF,8.0
166 | OpenChat8192-13B,7.9
167 | Phi-2_DPO,7.8
168 | MiniChat_1.5_3B,7.7
169 | Vicuna_7B_v1.5,7.6
170 | LLaMA2_Chat_7B_Evol70k-NEFT,7.5
171 | Recycled_WizardLM_7B_V2.0,7.5
172 | Vicuna_7B_v1.3,7.2
173 | Alpaca_Farm_PPO_Sim_(GPT-4)_7B,7.1
174 | UltraLM_13B,7.1
175 | Baize-v2_13B,7.0
176 | Recycled_WizardLM_7B_V1.0,6.9
177 | Ghost_7B_Alpha,6.9
178 | Alpaca_Farm_PPO_Human_7B,6.4
179 | Vicuna_7B,6.3
180 | Alpaca_7B,5.9
181 | Phi-2_SFT,5.9
182 | TempNet-LLaMA2-Chat-7B-v0.1,5.7
183 | MiniChat_3B,5.7
184 | Guanaco_33B,5.7
185 | Falcon_40B_Instruct,5.6
186 | Gemma_Instruct_(2B),5.4
187 | LLaMA2_Chat_7B,5.4
188 | OpenBuddy-Falcon-7b-v6,4.8
189 | Phi_2,4.4
190 | Baize-v2_7B,4.4
191 | ChatGLM2-6B,4.4
192 | Pythia_12B_SFT,4.2
193 | Falcon_7B_Instruct,4.0
194 | Pythia_12B_OASST_SFT,3.3
195 | Guanaco_13B,3.0
196 | Guanaco_7B,2.9
197 | Qwen1.5_1.8B_Chat,2.6
198 | Baichuan-13B-Chat,2.1
--------------------------------------------------------------------------------
/src/bat/assets/benchmarks/arena_hard_240829_holistic.csv:
--------------------------------------------------------------------------------
1 | model,arena_hard
2 | claude-3-5-sonnet-20240620,79.3
3 | gpt-4o-2024-05-13,79.2
4 | gpt-4-0125-preview,78.0
5 | gpt-4o-2024-08-06,77.9
6 | athene-70b,77.6
7 | gpt-4o-mini,74.9
8 | gemini-1.5-pro-api-preview,72.0
9 | mistral-large-2407,70.4
10 | llama-3.1-405b-instruct,64.1
11 | glm-4-0520,63.8
12 | yi-large,63.7
13 | 
deepseek-coder-v2,62.3 14 | claude-3-opus-20240229,60.4 15 | gemma-2-27b-it,57.5 16 | llama-3.1-70b-instruct,55.7 17 | glm-4-0116,55.7 18 | glm-4-air,50.9 19 | gpt-4-0314,50.0 20 | gemini-1.5-flash-api-preview,49.6 21 | qwen2-72b-instruct,46.9 22 | claude-3-sonnet-20240229,46.8 23 | llama-3-70b-instruct,46.6 24 | claude-3-haiku-20240307,41.5 25 | gpt-4-0613,37.9 26 | mistral-large-2402,37.7 27 | mixtral-8x22b-instruct-v0.1,36.4 28 | Qwen1.5-72B-Chat,36.1 29 | phi-3-medium-4k-instruct,33.4 30 | command-r-plus,33.1 31 | mistral-medium,31.9 32 | internlm2.5-20b-chat,31.2 33 | phi-3-small-8k-instruct,29.8 34 | mistral-next,27.4 35 | gpt-3.5-turbo-0613,24.8 36 | dbrx-instruct-preview,24.6 37 | internlm2-20b-chat,24.4 38 | claude-2.0,24.0 39 | Mixtral-8x7B-Instruct-v0.1,23.4 40 | gpt-3.5-turbo-0125,23.3 41 | Yi-34B-Chat,23.1 42 | Starling-LM-7B-beta,23.0 43 | claude-2.1,22.8 44 | llama-3.1-8b-instruct,21.3 45 | Snorkel-Mistral-PairRM-DPO,20.7 46 | llama-3-8b-instruct,20.6 47 | gpt-3.5-turbo-1106,18.9 48 | gpt-3.5-turbo-0301,18.1 49 | gemini-1.0-pro,17.8 50 | snowflake-arctic-instruct,17.6 51 | command-r,17.0 52 | phi-3-mini-128k-instruct,15.4 53 | tulu-2-dpo-70b,15.0 54 | Starling-LM-7B-alpha,12.8 55 | mistral-7b-instruct,12.6 56 | gemma-1.1-7b-it,12.1 57 | Llama-2-70b-chat-hf,11.6 58 | vicuna-33b-v1.3,8.6 59 | gemma-7b-it,7.5 60 | Llama-2-7b-chat-hf,4.6 61 | gemma-1.1-2b-it,3.4 62 | gemma-2b-it,3.0 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/bfcl_240906_tools.csv: -------------------------------------------------------------------------------- 1 | model,BFCL 2 | GPT-4-0125-Preview,85.79 3 | GPT-4-1106-Preview,85 4 | GPT-4-0613,84.74 5 | GPT-4-turbo-2024-04-09,83.89 6 | GPT-4o-mini-2024-07-18,83.35 7 | GPT-4o-2024-05-13,83.13 8 | Functionary-Medium-v3.1,82.55 9 | GPT-4-1106-Preview,81.78 10 | Meta-Llama-3-70B-Instruct,81.59 11 | Claude-3-Opus-20240229,80.88 12 | Nemotron-4-340b-instruct,80.23 13 | Functionary-Small-v3.1,80.21 14 | mistral-large-2407,79.66 15 | GPT-4o-2024-05-13,79.55 16 | xLAM-7b-fc-r,79.41 17 | GPT-4o-mini-2024-07-18,79.25 18 | Open-Mixtral-8x22b,79.14 19 | Gorilla-OpenFunctions-v2,79.1 20 | GPT-4-turbo-2024-04-09,79.09 21 | Functionary-Small-v3.2,78.96 22 | GPT-4o-2024-08-06,78.87 23 | mistral-large-2407,78.78 24 | Claude-3-Sonnet-20240229,77.92 25 | FireFunction-v2,77.45 26 | Granite-20b-FunctionCalling,76.63 27 | Open-Mistral-Nemo-2407,76.31 28 | Claude-3.5-Sonnet-20240620,76.29 29 | GPT-3.5-Turbo-0125,75.41 30 | Open-Mistral-Nemo-2407,74.97 31 | xLAM-1b-fc-r,74.9 32 | Hermes-2-Pro-Llama-3-70B,74.78 33 | Gemini-1.5-Pro-Preview-0514,74.75 34 | Claude-2.1,74.57 35 | Gemini-1.5-Pro-Preview-0409,74.56 36 | GPT-4o-2024-08-06,74.12 37 | Command-R-Plus (Original),74.11 38 | Open-Mistral-Nemo-2407,73.12 39 | Mistral-Medium-2312,72.19 40 | Gemini-1.5-Flash-Preview-0514,70.75 41 | DBRX-Instruct,69.55 42 | Claude-3.5-Sonnet-20240620,68.88 43 | GPT-3.5-Turbo-0125,66.19 44 | Hermes-2-Pro-Llama-3-8B,66.18 45 | Hermes-2-Pro-Mistral-7B,65.44 46 | Hermes-2-Theta-Llama-3-8B,64.83 47 | Meta-Llama-3-8B-Instruct,62.7 48 | Claude-3-Opus-20240229,61.89 49 | Open-Mixtral-8x7b,60.82 50 | Claude-3-Haiku-20240307,60.34 51 | Open-Mixtral-8x22b,58.89 52 | Open-Mixtral-8x22b,58.37 53 | Gemini-1.0-Pro-001,57.81 54 | Mistral-small-2402,55.36 55 | FireFunction-v1,48.11 56 | Claude-3-Sonnet-20240229,47.97 57 | Claude-instant-1.2,47.95 58 | Claude-3-Haiku-20240307,47.03 59 | GPT-4-0613,45.61 60 | Snowflake/snowflake-arctic-instruct,42.46 61 | 
mistral-large-2407,27.87 62 | Mistral-tiny-2312,21.17 63 | Deepseek-v1.5,11.18 64 | Gemma-7b-it,10.3 65 | Hermes-2-Theta-Llama-3-70B,10 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/biggen_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,biggen 2 | gpt-4-1106-preview,4.22 3 | gpt-4-0125-preview,4.19 4 | gpt-4o-2024-05-13,4.141 5 | gpt-4-turbo-2024-04-09,4.132 6 | claude-3-opus-20240229,4.103 7 | meta-llama/Meta-Llama-3-70B-Instruct,4.012 8 | claude-3-sonnet-20240229,4.011 9 | qwen/qwen-110b-chat,3.979 10 | claude-3-haiku-20240307,3.954 11 | gemini-pro-1.5,3.953 12 | MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-AWQ,3.936 13 | mistral-medium,3.935 14 | mistral-large,3.927 15 | google/gemini-flash-1.5,3.899 16 | alpindale/c4ai-command-r-plus-GPTQ,3.839 17 | Qwen/Qwen1.5-72B-Chat,3.832 18 | microsoft/Phi-3-mini-4k-instruct,3.821 19 | Qwen/Qwen1.5-32B-Chat,3.813 20 | Starling-LM-7B-beta,3.756 21 | meta-llama/Meta-Llama-3-8B-Instruct,3.753 22 | NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO,3.737 23 | 01-ai/Yi-34B-Chat,3.701 24 | mistralai/Mixtral-8x7B-Instruct-v0.1,3.695 25 | gpt-3.5-turbo-0125,3.689 26 | allenai/tulu-2-dpo-70b,3.683 27 | microsoft/Phi-3-mini-128k-instruct,3.679 28 | gpt-3.5-turbo-1106,3.678 29 | CohereForAI/c4ai-command-r-v01,3.677 30 | upstage/SOLAR-10.7B-Instruct-v1.0,3.672 31 | meta-llama/Llama-2-70b-chat-hf,3.668 32 | gemini-1.0-pro,3.64 33 | mistralai/Mistral-7B-Instruct-v0.2,3.619 34 | mistral-community/Mixtral-8x22B-v0.1-AWQ,3.606 35 | NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT,3.596 36 | openchat/openchat-3.5-0106,3.581 37 | MaziyarPanahi/zephyr-orpo-141b-A35b-v0.1-AWQ,3.573 38 | Qwen/Qwen1.5-14B-Chat,3.573 39 | Qwen/Qwen1.5-7B-Chat,3.556 40 | Starling-LM-7B-alpha,3.537 41 | HuggingFaceH4/zephyr-7b-beta,3.522 42 | NousResearch/Nous-Hermes-2-Mistral-7B-DPO,3.493 43 | NousResearch/Nous-Hermes-2-Yi-34B,3.476 44 | kaist-ai/mistral-orpo-beta,3.473 45 | meta-llama/Llama-2-13b-chat-hf,3.467 46 | teknium/OpenHermes-2.5-Mistral-7B,3.462 47 | mistralai/Mixtral-8x7B-v0.1,3.445 48 | kaist-ai/mistral-orpo-alpha,3.441 49 | allenai/tulu-2-dpo-13b,3.423 50 | Qwen/Qwen1.5-72B,3.422 51 | allenai/codetulu-2-34b,3.421 52 | google/gemma-1.1-7b-it,3.407 53 | teknium/OpenHermes-2-Mistral-7B,3.394 54 | codellama/CodeLlama-34b-Instruct-hf,3.363 55 | 01-ai/Yi-34B,3.322 56 | meta-llama/Llama-2-70b-hf,3.317 57 | Qwen/Qwen1.5-32B,3.312 58 | meta-llama/Llama-2-7b-chat-hf,3.307 59 | allenai/tulu-2-dpo-7b,3.28 60 | allenai/codetulu-2-13b,3.254 61 | upstage/SOLAR-10.7B-v1.0,3.248 62 | allenai/tulu-2-13b,3.211 63 | codellama/CodeLlama-13b-Instruct-hf,3.206 64 | 01-ai/Yi-6B-Chat,3.204 65 | codellama/CodeLlama-7b-Instruct-hf,3.14 66 | google/gemma-7b-it,3.132 67 | meta-llama/Meta-Llama-3-70B,3.122 68 | Qwen/Qwen1.5-14B,3.106 69 | google/gemma-1.1-2b-it,3.072 70 | allenai/codetulu-2-7b,3.07 71 | allenai/tulu-2-7b,3.041 72 | mistral-community/Mistral-7B-v0.2,3.024 73 | mistralai/Mistral-7B-v0.1,3.006 74 | Qwen/Qwen1.5-4B-Chat,2.976 75 | allenai/OLMo-7B-Instruct,2.974 76 | google/gemma-2b-it,2.932 77 | Qwen/Qwen1.5-7B,2.872 78 | microsoft/phi-2,2.859 79 | allenai/OLMo-7B-SFT,2.827 80 | codellama/CodeLlama-70b-Instruct-hf,2.805 81 | EleutherAI/llemma_34b,2.771 82 | meta-llama/Meta-Llama-3-8B,2.743 83 | Qwen/Qwen1.5-1.8B-Chat,2.741 84 | Qwen/Qwen1.5-4B,2.708 85 | meta-llama/Llama-2-13b-hf,2.703 86 | 01-ai/Yi-6B,2.635 87 | codellama/CodeLlama-70b-hf,2.593 88 | codellama/CodeLlama-34b-hf,2.509 
89 | microsoft/phi-1_5,2.497 90 | microsoft/Orca-2-13b,2.489 91 | meta-llama/Llama-2-7b-hf,2.457 92 | Qwen/Qwen1.5-1.8B,2.364 93 | EleutherAI/llemma_7b,2.27 94 | google/gemma-2b,2.262 95 | codellama/CodeLlama-13b-hf,2.134 96 | Qwen/Qwen1.5-0.5B-Chat,2.108 97 | microsoft/Orca-2-7b,2.083 98 | allenai/OLMo-7B,2.081 99 | codellama/CodeLlama-7b-hf,1.954 100 | Qwen/Qwen1.5-0.5B,1.834 101 | allenai/OLMo-1B,1.648 102 | CohereForAI/aya-101,1.447 103 | google/gemma-7b,1.411 104 | microsoft/phi-1,1.135 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/chatbot_arena_241104_holistic.csv: -------------------------------------------------------------------------------- 1 | model,arena_elo 2 | chatglm-6b,855.910565721209 3 | koala-13b,901.4444159097708 4 | oasst-pythia-12b,812.3918514404036 5 | alpaca-13b,851.3113435573603 6 | vicuna-13b,874.2126379649785 7 | dolly-v2-12b,781.4370567093974 8 | stablelm-tuned-alpha-7b,829.7609284591157 9 | llama-13b,800.0 10 | fastchat-t5-3b,794.3748535699036 11 | gpt-3.5-turbo-0314,1051.024508411953 12 | gpt-4-0314,980.6906633214737 13 | RWKV-4-Raven-14B,874.536173297737 14 | claude-1,1039.7803750141782 15 | mpt-7b-chat,869.0762171208861 16 | palm-2,922.5218005276811 17 | claude-instant-1,991.8056867962612 18 | vicuna-7b,910.6856107758757 19 | wizardlm-13b,971.8432912657483 20 | gpt4all-13b-snoozy,885.7452637089059 21 | guanaco-33b,974.3076720194276 22 | vicuna-33b,906.4317166108785 23 | mpt-30b-chat,971.1057122702123 24 | gpt-3.5-turbo-0613,999.7201069046866 25 | gpt-4-0613,960.3770824361335 26 | llama-2-7b-chat,895.4706517283653 27 | claude-2.0,1016.5801503367938 28 | llama-2-13b-chat,963.7146661400922 29 | chatglm2-6b,835.3074735731766 30 | llama-2-70b-chat,1007.6844327159829 31 | codellama-34b-instruct,934.0457254208728 32 | wizardlm-70b,979.5605650746356 33 | falcon-180b-chat,923.054729229491 34 | mistral-7b-instruct,895.9405753947756 35 | qwen-14b-chat,921.4887868532272 36 | zephyr-7b-alpha,946.9339607858802 37 | zephyr-7b-beta,913.246312461937 38 | openchat-3.5,948.9893819327425 39 | gpt-4-1106-preview,1001.256303019811 40 | gpt-3.5-turbo-1106,937.6322384103785 41 | chatglm3-6b,814.5480014217649 42 | claude-2.1,979.8637705131841 43 | tulu-2-dpo-70b,961.7298633389956 44 | yi-34b-chat,932.0283635154187 45 | starling-lm-7b-alpha,945.1430459412007 46 | openhermes-2.5-mistral-7b,935.5573447997912 47 | pplx-70b-online,931.0576338876376 48 | pplx-7b-online,948.7421850358356 49 | dolphin-2.2.1-mistral-7b,977.0069489193058 50 | mixtral-8x7b-instruct-v0.1,867.9036424292025 51 | gemini-pro,1006.251403062337 52 | solar-10.7b-instruct-v1.0,958.6549095565917 53 | mistral-medium,965.0537859905727 54 | llama2-70b-steerlm-chat,965.6376159085758 55 | gemini-pro-dev-api,1019.3566145491036 56 | stripedhyena-nous-7b,919.5708420570646 57 | bard-jan-24-gemini-pro,1041.261256012453 58 | deepseek-llm-67b-chat,958.7276958964317 59 | gpt-4-0125-preview,997.1712467949897 60 | gpt-3.5-turbo-0125,898.9675086846296 61 | nous-hermes-2-mixtral-8x7b-dpo,972.2639217501226 62 | mistral-7b-instruct-v0.2,892.8914241485261 63 | qwen1.5-72b-chat,947.9919390672214 64 | openchat-3.5-0106,956.5639851579056 65 | qwen1.5-4b-chat,857.8615305194531 66 | qwen1.5-7b-chat,937.5784150291832 67 | codellama-70b-instruct,873.7635218944325 68 | mistral-next,969.0249137331155 69 | gemma-2b-it,865.630898513726 70 | gemma-7b-it,913.3020846629596 71 | mistral-large-2402,939.5529442890696 72 | olmo-7b-instruct,875.880001693062 73 | 
claude-3-sonnet-20240229,970.6832692453123 74 | claude-3-opus-20240229,1021.9572137608475 75 | claude-3-haiku-20240307,946.756591266114 76 | starling-lm-7b-beta,967.1740802373936 77 | command-r,915.3923710382185 78 | dbrx-instruct-preview,930.1149113654316 79 | qwen1.5-14b-chat,932.8461519507623 80 | qwen1.5-32b-chat,917.6067239158654 81 | command-r-plus,981.9316261444285 82 | gemma-1.1-7b-it,888.863535227059 83 | gpt-4-turbo-2024-04-09,1001.9508367594701 84 | zephyr-orpo-141b-A35b-v0.1,992.2709969445073 85 | gemma-1.1-2b-it,839.3449619004468 86 | gemini-1.5-pro-api-0409-preview,1106.8697777575628 87 | reka-flash-21b-20240226-online,967.873277488609 88 | reka-flash-21b-20240226,939.8601363871353 89 | mixtral-8x22b-instruct-v0.1,911.463562145636 90 | llama-3-8b-instruct,925.300077951389 91 | llama-3-70b-instruct,987.92132812523 92 | phi-3-mini-128k-instruct,875.3830177408651 93 | snowflake-arctic-instruct,908.9578096804898 94 | reka-core-20240501,960.871641047353 95 | qwen1.5-110b-chat,970.825546150876 96 | qwen-max-0428,991.8829133949346 97 | gpt-4o-2024-05-13,1033.7736651812086 98 | yi-large-preview,1007.9055342457846 99 | glm-4-0116,996.2388680185245 100 | phi-3-mini-4k-instruct,875.486575120554 101 | gemini-advanced-0514,1034.5901919978594 102 | gemini-1.5-pro-api-0514,1006.938590226684 103 | gemini-1.5-flash-api-0514,988.4260721445921 104 | yi-1.5-34b-chat,935.8573439301474 105 | phi-3-small-8k-instruct,877.7438151636035 106 | phi-3-medium-4k-instruct,866.7539620360035 107 | qwen2-72b-instruct,930.7722721046769 108 | yi-large,991.7868427711801 109 | nemotron-4-340b-instruct,1011.0291063554423 110 | reka-flash-preview-20240611,937.4782906143831 111 | glm-4-0520,1012.3461462160476 112 | deepseek-coder-v2,968.7272337322494 113 | claude-3-5-sonnet-20240620,1026.059060767346 114 | gemma-2-9b-it,950.0755523266928 115 | gemma-2-27b-it,977.8470656596851 116 | phi-3-mini-4k-instruct-june-2024,860.4379813139254 117 | deepseek-v2-api-0628,989.5345921181047 118 | athene-70b-0725,1020.8101504540734 119 | gemini-1.5-pro-exp-0801,1074.9371768117894 120 | gpt-4o-mini-2024-07-18,1026.236414405759 121 | deepseek-coder-v2-0724,990.94288841608 122 | gemma-2-2b-it,906.320768087545 123 | llama-3.1-8b-instruct,949.3125757952853 124 | llama-3.1-405b-instruct,1005.4497444176718 125 | llama-3.1-70b-instruct,1034.402372751568 126 | mistral-large-2407,1005.1771608005986 127 | reka-core-20240722,1006.9821508042021 128 | reka-flash-20240722,950.5542647646221 129 | chatgpt-4o-latest,1073.7429047571106 130 | gpt-4o-2024-08-06,1032.650635133711 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/dec_arena_241022_holistic.csv: -------------------------------------------------------------------------------- 1 | model,decentralized_arena 2 | o1-mini,1.000000 3 | o1-preview,0.988296 4 | ChatGPT-4o-latest (2024-09-03),0.971391 5 | yi-lightning,0.955415 6 | glm-4-plus,0.910273 7 | claude-3.5-sonnet,0.897083 8 | gpt-4o-2024-05-13,0.894297 9 | gpt-4o-2024-08-06,0.889095 10 | nemotron-70b,0.881107 11 | gpt-4o-mini-2024-07-18,0.873119 12 | gpt-4-turbo-2024-04-09,0.865131 13 | gemini-1.5-pro-001,0.854542 14 | qwen2-72b-instruct,0.814787 15 | claude-3-opus,0.804198 16 | gpt4-1106,0.761657 17 | gemini-1.5-flash-001,0.761657 18 | meta-llama-3.1-70b-instruct,0.759056 19 | gemma-2-9b-it-simpo,0.736950 20 | gemma-2-27b-it,0.716515 21 | google-gemma-2-9b-it,0.687349 22 | yi-1.5-34b-chat,0.671373 23 | llama-3-70b-instruct,0.658183 24 | claude-3-haiku,0.591863 25 | 
qwen1.5-72b-chat,0.583875 26 | meta-llama-3.1-8b-instruct,0.533346 27 | qwen1.5-32b-chat,0.533346 28 | claude-2.1,0.509567 29 | claude-2.0,0.501579 30 | starling-lm-7b-beta,0.464425 31 | qwen1.5-14b-chat,0.437860 32 | mistral-8x7b-instruct-v0.1,0.437860 33 | llama3-8b-instruct,0.421884 34 | gemma-2-2b-it,0.414081 35 | gpt3.5-turbo-0125,0.411295 36 | command-r-(08-2024),0.392718 37 | openchat-3.5-0106,0.387516 38 | openchat-3.5,0.374141 39 | command-r-(04-2024),0.339773 40 | gemma-1.1-7b-it,0.336987 41 | starling-lm-7b-alpha,0.331785 42 | gemini-1.0-pro-001,0.326398 43 | mistral-7b-instruct-2,0.260078 44 | llama-3.2-3b-it,0.252090 45 | vicuna-33b,0.238900 46 | gemma-7b-it,0.228311 47 | qwen1.5-4b-chat,0.146015 48 | mistral-7b-instruct-1,0.143229 49 | vicuna-13b,0.140628 50 | gemma-1.1-2b-it,0.135426 51 | llama2-7b-chat,0.127438 52 | llama2-13b-chat,0.116849 53 | gemma-2b-it,0.087498 54 | vicuna-7b,0.071707 55 | zephyr-7b-beta,0.058332 56 | koala-13b,0.026565 57 | openassistant-pythia-12b,0.000000 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/enkrypt_ai_safety_240916_safety.csv: -------------------------------------------------------------------------------- 1 | model,Enkrypt_AI_Safety 2 | gemini-1.5-pro-exp-0801,84 3 | gemini-1.5-pro-latest,81 4 | gemma-2-27b-it,79 5 | Reflection-Llama-3.1-70B,81 6 | Llama-2-7B-Chat-GGUF-8bit,80 7 | Llama-2-7B-Chat-GGUF-4bit,80 8 | SmolLM-360M-Instruct,80 9 | Llama-2-7b-chat-hf,78 10 | flan-ul2,76 11 | o1-preview,76 12 | Llama-3-8B-Instruct-RR,81 13 | claude-3-opus-20240229,75 14 | gpt-4-0125-preview,79 15 | sarvam-2b-v0.5,75 16 | Llama-3-8B-Instruct-MopeyMule,73 17 | claude-3-5-sonnet-20240620,71 18 | sea-lion-7b-instruct,73 19 | claude-instant-1.2,76 20 | gpt-4-turbo-2024-04-09,75 21 | Meta-Llama-3.1-8B-Instruct-Turbo,70 22 | RakutenAI-7B-chat,68 23 | gemma-2-2b-it,67 24 | Meta-Llama-3-8B-Instruct,72 25 | o1-mini,71 26 | Mistral-7B-v0.1,70 27 | Llama-2-13b-chat-hf,72 28 | h2o-danube3-500m-chat,68 29 | Llama-2-70b-chat-hf,68 30 | gemma-2-9b-it,67 31 | internlm2-chat-20b,59 32 | gemma-2-9b,64 33 | NexusRaven-V2-13B,63 34 | komodo-7b-base,61 35 | gpt-4o,64 36 | phi-2,58 37 | phi3-medium-128K,61 38 | gemma-7b-it,61 39 | claude-3-haiku-20240307,67 40 | Meta-Llama-3.1-405B-Instruct-Turbo,61 41 | SmolLM-1.7B-Instruct,60 42 | gpt-4o-2024-08-06,60 43 | PowerLM-3b,53 44 | Meta-Llama-3-70B-Instruct,62 45 | Starling-LM-7B-beta-GGUF-4bit,54 46 | Smaug-72B-v0.1,61 47 | gpt-3.5-turbo,62 48 | CodeLlama-7b-Instruct-hf,56 49 | Smaug-Llama-3-70B-Instruct,56 50 | Mixtral-8x7B-Instruct-v0.1,54 51 | jamba-instruct-preview,51 52 | Mixtral-8x22B-Instruct-v0.1,53 53 | SeaLLM-7B-v2,58 54 | Qwen2-72B-Instruct,55 55 | OLMo-7B-Instruct,47 56 | Phi-3-mini-128k-instruct,55 57 | dbrx-instruct,51 58 | falcon-mamba-7b-instruct,49 59 | gpt-4o-mini,55 60 | Phi-3.5-MoE-instruct,54 61 | Qwen1.5-14B-Chat,51 62 | c4ai-command-r-plus,48 63 | Smaug-34B-v0.1,56 64 | Qwen2-7B-Instruct,50 65 | Mistral-7B-Instruct-v0.2-GGUF-4bit,48 66 | Meta-Llama-3.1-70B-Instruct-Turbo,48 67 | K2-Chat,50 68 | Phi-3-mini-4k-instruct,50 69 | Starling-LM-7B-beta,51 70 | OLMoE-1B-7B-0924-Instruct,49 71 | Mistral-7B-Instruct-v0.2-GGUF-8bit,48 72 | h2o-danube3-4b-chat,47 73 | RakutenAI-7B-instruct,44 74 | Mistral-7B-Instruct-v0.2,46 75 | jamba-1.5-mini,48 76 | aya-23-35B,47 77 | jamba-1.5-large,47 78 | Phi-3-small-8k-instruct,48 79 | Phi-3-small-128k-instruct,46 80 | zephyr-7b-beta,43 81 | PowerMoE-3b,47 82 | LongWriter-glm4-9b,46 83 | 
Mistral-7B-Instruct-v0.1-GGUF-4bit,39 84 | snowflake-arctic-instruct,45 85 | Qwen2-57B-A14B-Instruct,45 86 | palm-2-chat-bison,40 87 | Mistral-7B-Instruct-v0.1-GGUF-8bit,40 88 | glm-4-9b-chat,43 89 | Phi-3-medium-4k-instruct,43 90 | aya-23-8B,40 91 | Mistral-7B-Instruct-v0.3,39 92 | Phi-3.5-mini-instruct,37 93 | dolphin-2.5-mixtral-8x7b,32 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/eqbench_240912_emotion.csv: -------------------------------------------------------------------------------- 1 | model,EQ-Bench 2 | Meta-Llama-3.1-405B-Instruct,83.0 3 | claude-3-5-sonnet-20240620,86.36 4 | gpt-4o,83.51 5 | gpt-4-turbo-2024-04-09,86.35 6 | RYS-XLarge-Base,85.05 7 | gpt-4-0613,84.79 8 | gpt-4-0314,85.73 9 | RYS-XLarge,84.55 10 | gpt-4-1106-preview,86.05 11 | gpt-4-0125-preview,83.87 12 | claude-3-opus-20240229,82.19 13 | mistral-large-2407,85.05 14 | Qwen2-72B-Instruct,81.35 15 | mistral-large-2402,85.17 16 | Meta-Llama-3-70B-Instruct,82.13 17 | Qwen1.5-110B-Chat,83.68 18 | solar-pro-preview-instruct,78.52 19 | Senku-70B-Full,84.89 20 | Smaug-Llama-3-70B-Instruct,80.69 21 | ECE-TW3-JRGL-V1,83.07 22 | miiqu-f16,83.17 23 | Qwen1.5-72B-Chat,82.81 24 | miqu-1-70b,82.91 25 | mistral-medium,82.57 26 | gemma-2-27b-it,80.55 27 | gpt-4o-mini,76.93 28 | 🆕Phi-3.5-MoE-instruct,76.97 29 | DeepSeek-V2-Chat-0628,83.18 30 | miquella-120b,82.15 31 | Phi-3-medium-4k-instruct,76.34 32 | claude-3-sonnet-20240229,80.45 33 | Tess-72B-v1.5b,81.78 34 | Mixtral-8x22B-Instruct-v0.1,78.79 35 | Qwen-72B-Chat,80.7 36 | Smaug-72B-v0.1,79.75 37 | gemma-2-9b-it,80.46 38 | Yi-1.5-34B-Chat,72.93 39 | Mixtral_34Bx2_MoE_60B,72.69 40 | Phi-3-small-8k-instruct,73.49 41 | WizardLM-2-8x22B,77.91 42 | miquliz-120b-v2.0,82.21 43 | Quyen-Pro-Max-v0.1,77.16 44 | Qwen1.5-32B-Chat,75.59 45 | 🆕gemma-2-Ifable-9B,79.93 46 | dolphin-2_2-yi-34b,75.52 47 | Nous-Hermes-2-Yi-34B,72.68 48 | MegaDolphin-120b,80.21 49 | dbrx-instruct,76.82 50 | Meta-Llama-3-8B-Instruct,68.88 51 | DiscoLM-120b,78.48 52 | mistral-small-2402,80.36 53 | dolphin-2.2-70b,79.6 54 | Yi-34B-Chat,71.62 55 | tulu-2-dpo-70b,76.63 56 | Tess-XL-v1.0,78.46 57 | Yi-1.5-9B-Chat,70.37 58 | goliath-120b,76.09 59 | c4ai-command-r-plus,76.11 60 | Samantha-120b,76.44 61 | Nous-Hermes-2-Mixtral-8x7B-SFT,72.91 62 | Qwen1.5-14B-Chat,74.99 63 | SynthIA-70B-v1.5,73.71 64 | gemini-pro,75.08 65 | Mistral-Nemo-Instruct-2407,77.13 66 | Mixtral-8x7B-Instruct-v0.1,72.37 67 | Quyen-Pro-v0.1,70.75 68 | gpt-3.5-turbo-0301,70.67 69 | Midnight-Miqu-70B-v1.0,75.9 70 | meow,73.94 71 | LMCocktail-10.7B-v1,73.67 72 | Experiment26-7B,77.21 73 | Beyonder-4x7B-v3,77.01 74 | SauerkrautLM-UNA-SOLAR-Instruct,73.56 75 | NeuralBeagle14-7B,74.79 76 | NeuralMonarch-7B,76.26 77 | SOLAR-10.7b-Instruct-dpo,73.21 78 | Beagle14-7B,74.45 79 | Monarch-7B,75.8 80 | WestLake-7B-v2,78.7 81 | AlphaMonarch-7B,76.08 82 | GML-Mistral-merged-v1,74.01 83 | gpt-3.5-turbo-1106,71.74 84 | Starling-LM-7B-beta,73.82 85 | SOLAR-10.7B-Instruct-v1.0,73.53 86 | Phi-3-mini-4k-instruct,58.15 87 | claude-3-haiku-20240307,63.65 88 | openchat-3.5-1210,72.52 89 | NeuralMarcoro14-7B,74.15 90 | WizardLM-70B-V1.0,71.28 91 | Starling-LM-7B-alpha,73.9 92 | gpt-3.5-turbo-0613,69.35 93 | openchat_3.5,72.18 94 | 🆕EXAONE-3.0-7.8B-Instruct,66.72 95 | laserxtral,71.96 96 | Llama-2-70b-chat-hf,73.59 97 | marcoroni-7b-v3-safetensor,71.68 98 | 🆕Trillama-8B,66.63 99 | 🆕Phi-3.5-mini-instruct,54.74 100 | gpt-3.5-turbo-0125,64.97 101 | Beyonder-4x7B-v2,69.23 102 | firefly-mixtral-8x7b,64.36 103 | 
Yi-1.5-6B-Chat,59.45 104 | Marcoroni-neural-chat-7B-v2,68.54 105 | WizardLM-2-7B,69.31 106 | OpenHermes-2.5-Mistral-7B,66.89 107 | NeuralHermes-2.5-Mistral-7B,65.86 108 | Snorkel-Mistral-PairRM-DPO,65.83 109 | Qwen-14B-Chat,63.47 110 | dolphin-2.2.1-mistral-7b,69.92 111 | Mistral-7B-Instruct-v0.2,68.18 112 | Mistral-7B-OpenOrca,66.55 113 | neural-chat-7b-v3-1,64.77 114 | internlm2-chat-7b,62.61 115 | Yi-6B-Chat,61.79 116 | Orion-14B-Chat,59.71 117 | una-cybertron-7b-v2-bf16,62.83 118 | c4ai-command-r-v01,56.05 119 | Mistral-7B-Instruct-v0.3,63.15 120 | vicuna-33b-v1.3,67.07 121 | Nanbeige2-8B-Chat,65.17 122 | gemma-1.1-7b-it,59.17 123 | Qwen1.5-MoE-A2.7B-Chat,58.07 124 | vicuna-13b-v1.5,67.39 125 | gemma-2-2b-it,60.86 126 | Qwen1.5-7B-Chat,54.41 127 | sparsetral-16x7B-v2,59.9 128 | zephyr-7b-beta,58.33 129 | WizardLM-13B-V1.2,63.71 130 | zephyr-7b-alpha,56.82 131 | phi-2-orange,56.94 132 | phi-2-psy,56.44 133 | gemma-7b-it,61.72 134 | phi-2-dpo,54.42 135 | phixtral-2x2_8,54.58 136 | Qwen-7B-Chat,50.11 137 | mistral-7b-instruct-v0.1,52.15 138 | Llama-2-13b-chat-hf,49.12 139 | guanaco-33b-merged,36.11 140 | Nous-Capybara-7B-V1,34.37 141 | Llama-2-7b-chat-hf,36.32 142 | Qwen1.5-4B-Chat,28.75 143 | Qwen-1_8B-Chat,30 144 | phi-2,27.6 145 | Qwen1.5-1.8B-Chat,24.12 146 | vicuna-7b-v1.1,26.12 147 | gemma-2b-it,23.26 148 | koala-7B-HF,21.54 149 | stablelm-2-zephyr-1_6b,15.04 150 | random-baseline,0.0 151 | falcon-180B-chat,56.82 152 | claude-instant-1.2,69.04 153 | claude-2.1,73.96 154 | claude-1,76.83 155 | claude-2.0,72.89 156 | pplx-70b-online,62.79 157 | pplx-7b-online,48.91 158 | TheProfessor-155b,78.82 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/helm_airbench_240916_safety.csv: -------------------------------------------------------------------------------- 1 | model,HELM_AIRBENCH_Security_Risks,HELM_AIRBENCH_Operational_Misuses,HELM_AIRBENCH_Violence_&_Extremism,HELM_AIRBENCH_Hate/Toxicity,HELM_AIRBENCH_Sexual_Content,HELM_AIRBENCH_Child_Harm,HELM_AIRBENCH_Self_harm,HELM_AIRBENCH_Political_Usage,HELM_AIRBENCH_Economic_Harm,HELM_AIRBENCH_Deception,HELM_AIRBENCH_Manipulation,HELM_AIRBENCH_Defamation,HELM_AIRBENCH_Fundamental_Rights,HELM_AIRBENCH_Discrimination/Bias,HELM_AIRBENCH_Privacy,HELM_AIRBENCH_Criminal_Activities,HELM_AIRBENCH_AIR_Score 2 | anthropic/claude-3-haiku-20240307,0.005,0.572,0.159,0.057,0.219,0.152,0.022,0.041,0.04,0.089,0.053,0.037,0,0.382,0.086,0.008,0.198 3 | anthropic/claude-3-sonnet-20240229,0.009,0.473,0.156,0.071,0.184,0.133,0,0.031,0.02,0.096,0.127,0.074,0,0.332,0.089,0,0.177 4 | meta/llama-3-8b-chat,0.065,0.694,0.214,0.188,0.208,0.276,0.089,0.177,0.14,0.259,0.353,0.38,0.027,0.521,0.225,0,0.386 5 | anthropic/claude-3-opus-20240229,0.065,0.477,0.187,0.091,0.34,0.19,0.022,0.063,0.06,0.126,0.1,0.074,0,0.27,0.096,0.017,0.177 6 | google/gemini-1.5-pro-001-safety-default,0.097,0.338,0.253,0.135,0.288,0.233,0.078,0.161,0.09,0.215,0.22,0.194,0.06,0.24,0.123,0.042,0.189 7 | google/gemini-1.5-flash-001-safety-default,0.124,0.371,0.289,0.164,0.302,0.286,0.022,0.195,0.153,0.3,0.293,0.278,0.053,0.325,0.14,0.033,0.233 8 | openai/gpt-3.5-turbo-0613,0.137,0.551,0.455,0.274,0.549,0.429,0.089,0.463,0.433,0.522,0.433,0.463,0.213,0.516,0.316,0.108,0.407 9 | openai/gpt-4-turbo-2024-04-09,0.142,0.636,0.329,0.144,0.378,0.381,0.156,0.323,0.293,0.304,0.34,0.167,0.08,0.461,0.207,0.058,0.322 10 | meta/llama-3-70b-chat,0.158,0.726,0.351,0.329,0.49,0.267,0.078,0.339,0.34,0.385,0.427,0.574,0.147,0.502,0.274,0.025,0.386 11 | 
openai/gpt-3.5-turbo-1106,0.275,0.636,0.589,0.433,0.559,0.629,0.322,0.609,0.623,0.659,0.573,0.481,0.333,0.589,0.39,0.267,0.511 12 | openai/gpt-4o-2024-05-13,0.297,0.813,0.527,0.327,0.524,0.552,0.189,0.601,0.587,0.504,0.54,0.426,0.267,0.575,0.45,0.233,0.506 13 | openai/gpt-3.5-turbo-0125,0.405,0.768,0.664,0.51,0.667,0.752,0.422,0.725,0.71,0.748,0.7,0.593,0.52,0.624,0.471,0.45,0.593 14 | qwen/qwen1.5-72b-chat,0.453,0.772,0.579,0.371,0.635,0.686,0.356,0.616,0.623,0.733,0.633,0.63,0.467,0.571,0.546,0.35,0.558 15 | deepseek-ai/deepseek-llm-67b-chat,0.457,0.709,0.541,0.365,0.622,0.643,0.344,0.532,0.567,0.648,0.573,0.407,0.373,0.584,0.515,0.3,0.533 16 | 01-ai/yi-34b-chat,0.509,0.691,0.558,0.377,0.576,0.624,0.289,0.52,0.503,0.681,0.533,0.491,0.227,0.559,0.436,0.275,0.507 17 | mistralai/mixtral-8x22b-instruct-v0.1,0.671,0.744,0.726,0.417,0.569,0.767,0.322,0.747,0.647,0.726,0.66,0.463,0.573,0.593,0.593,0.646,0.611 18 | mistralai/mixtral-8x7b-instruct-v0.1,0.777,0.818,0.733,0.504,0.632,0.848,0.533,0.808,0.74,0.822,0.687,0.602,0.627,0.592,0.579,0.742,0.645 19 | cohere/command-r,0.782,0.878,0.775,0.586,0.712,0.824,0.578,0.861,0.82,0.822,0.813,0.648,0.773,0.678,0.699,0.717,0.722 20 | cohere/command-r-plus,0.829,0.881,0.816,0.653,0.729,0.819,0.578,0.895,0.897,0.867,0.853,0.815,0.8,0.68,0.709,0.817,0.747 21 | mistralai/mistral-7b-instruct-v0.3,0.932,0.841,0.806,0.501,0.597,0.924,0.522,0.909,0.91,0.889,0.853,0.648,0.893,0.624,0.717,0.942,0.718 22 | databricks/dbrx-instruct,0.955,0.874,0.841,0.624,0.684,0.924,0.722,0.963,0.953,0.926,0.953,0.75,0.947,0.675,0.817,0.967,0.786 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/helm_classic_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,helm_classic 2 | Llama 2 (70B),0.944 3 | LLaMA (65B),0.908 4 | text-davinci-002,0.905 5 | Mistral v0.1 (7B),0.884 6 | Cohere Command beta (52.4B),0.874 7 | text-davinci-003,0.872 8 | Jurassic-2 Jumbo (178B),0.824 9 | Llama 2 (13B),0.823 10 | TNLG v2 (530B),0.787 11 | gpt-3.5-turbo-0613,0.783 12 | LLaMA (30B),0.781 13 | Anthropic-LM v4-s3 (52B),0.78 14 | gpt-3.5-turbo-0301,0.76 15 | Jurassic-2 Grande (17B),0.743 16 | Palmyra X (43B),0.732 17 | Falcon (40B),0.729 18 | Falcon-Instruct (40B),0.727 19 | MPT-Instruct (30B),0.716 20 | MPT (30B),0.714 21 | J1-Grande v2 beta (17B),0.706 22 | Vicuna v1.3 (13B),0.706 23 | Cohere Command beta (6.1B),0.675 24 | Cohere xlarge v20221108 (52.4B),0.664 25 | Luminous Supreme (70B),0.662 26 | Vicuna v1.3 (7B),0.625 27 | OPT (175B),0.609 28 | Llama 2 (7B),0.607 29 | LLaMA (13B),0.595 30 | InstructPalmyra (30B),0.568 31 | Cohere xlarge v20220609 (52.4B),0.56 32 | Jurassic-2 Large (7.5B),0.553 33 | davinci (175B),0.538 34 | LLaMA (7B),0.533 35 | RedPajama-INCITE-Instruct (7B),0.524 36 | J1-Jumbo v1 (178B),0.517 37 | GLM (130B),0.512 38 | Luminous Extended (30B),0.485 39 | OPT (66B),0.448 40 | BLOOM (176B),0.446 41 | J1-Grande v1 (17B),0.433 42 | Alpaca (7B),0.381 43 | Falcon (7B),0.378 44 | RedPajama-INCITE-Base (7B),0.378 45 | Cohere large v20220720 (13.1B),0.372 46 | RedPajama-INCITE-Instruct-v1 (3B),0.366 47 | text-curie-001,0.36 48 | GPT-NeoX (20B),0.351 49 | Luminous Base (13B),0.315 50 | Cohere medium v20221108 (6.1B),0.312 51 | RedPajama-INCITE-Base-v1 (3B),0.311 52 | TNLG v2 (6.7B),0.309 53 | J1-Large v1 (7.5B),0.285 54 | GPT-J (6B),0.273 55 | Pythia (12B),0.257 56 | curie (6.7B),0.247 57 | Falcon-Instruct (7B),0.244 58 | Cohere medium v20220720 (6.1B),0.23 59 | 
text-babbage-001,0.229 60 | T0pp (11B),0.197 61 | Pythia (6.9B),0.196 62 | UL2 (20B),0.167 63 | T5 (11B),0.131 64 | babbage (1.3B),0.114 65 | Cohere small v20220720 (410M),0.109 66 | ada (350M),0.108 67 | text-ada-001,0.107 68 | YaLM (100B),0.075 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/helm_lite_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,helm_lite 2 | GPT-4o (2024-05-13),0.963 3 | Claude 3.5 Sonnet (20240620),0.915 4 | GPT-4 (0613),0.915 5 | GPT-4 Turbo (2024-04-09),0.908 6 | Llama 3.1 Instruct Turbo (405B),0.896 7 | Llama 3.1 Instruct Turbo (70B),0.858 8 | Llama 3 (70B),0.838 9 | Qwen2 Instruct (72B),0.827 10 | Mistral Large 2 (2407),0.803 11 | Gemini 1.5 Pro (001),0.793 12 | GPT-4o mini (2024-07-18),0.776 13 | Mixtral (8x22B),0.767 14 | GPT-4 Turbo (1106 preview),0.758 15 | Palmyra X V3 (72B),0.749 16 | Gemma 2 Instruct (27B),0.742 17 | Gemini 1.5 Flash (001),0.733 18 | Claude 3 Opus (20240229),0.722 19 | PaLM-2 (Unicorn),0.703 20 | Qwen1.5 (72B),0.68 21 | Palmyra X V2 (33B),0.659 22 | Gemma 2 Instruct (9B),0.639 23 | Yi (34B),0.634 24 | Qwen1.5 Chat (110B),0.619 25 | Qwen1.5 (32B),0.615 26 | Claude v1.3,0.594 27 | PaLM-2 (Bison),0.584 28 | Mixtral (8x7B 32K seqlen),0.582 29 | Phi-3 (14B),0.579 30 | Claude 2.0,0.56 31 | DeepSeek LLM Chat (67B),0.556 32 | Phi-3 (7B),0.545 33 | Llama 2 (70B),0.537 34 | Yi Large (Preview),0.53 35 | Command R Plus,0.509 36 | GPT-3.5 (text-davinci-003),0.503 37 | Claude 2.1,0.503 38 | Qwen1.5 (14B),0.491 39 | Gemini 1.0 Pro (002),0.484 40 | Claude Instant 1.2,0.464 41 | Llama 3 (8B),0.441 42 | GPT-3.5 Turbo (0613),0.42 43 | Claude 3 Sonnet (20240229),0.42 44 | Mistral NeMo (2402),0.401 45 | Arctic Instruct,0.399 46 | Gemma (7B),0.392 47 | GPT-3.5 (text-davinci-002),0.392 48 | LLaMA (65B),0.39 49 | Mistral Large (2402),0.382 50 | Command,0.365 51 | Command R,0.35 52 | Llama 3.1 Instruct Turbo (8B),0.347 53 | Mistral Small (2402),0.342 54 | DBRX Instruct,0.341 55 | Jamba Instruct,0.339 56 | Mistral v0.1 (7B),0.338 57 | Mistral Medium (2312),0.318 58 | Qwen1.5 (7B),0.317 59 | Claude 3 Haiku (20240307),0.309 60 | Yi (6B),0.289 61 | Llama 2 (13B),0.273 62 | Jurassic-2 Jumbo (178B),0.254 63 | Falcon (40B),0.249 64 | Mistral Instruct v0.3 (7B),0.233 65 | Jurassic-2 Grande (17B),0.203 66 | Phi-2,0.202 67 | Llama 2 (7B),0.18 68 | Luminous Supreme (70B),0.172 69 | Command Light,0.125 70 | Luminous Extended (30B),0.093 71 | Falcon (7B),0.078 72 | OLMo (7B),0.063 73 | Luminous Base (13B),0.052 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/helm_mmlu_240829_knowledge.csv: -------------------------------------------------------------------------------- 1 | model,helm_mmlu 2 | Claude 3.5 Sonnet (20240620),0.865 3 | Claude 3 Opus (20240229),0.846 4 | Llama 3.1 Instruct Turbo (405B),0.845 5 | GPT-4o (2024-05-13),0.842 6 | Gemini 1.5 Pro (001),0.827 7 | GPT-4 (0613),0.824 8 | Qwen2 Instruct (72B),0.824 9 | GPT-4 Turbo (2024-04-09),0.813 10 | Gemini 1.5 Pro (0409 preview),0.81 11 | Llama 3.1 Instruct Turbo (70B),0.801 12 | Mistral Large 2 (2407),0.8 13 | GPT-4 Turbo (1106 preview),0.796 14 | Llama 3 (70B),0.793 15 | Yi Large (Preview),0.793 16 | Palmyra X V3 (72B),0.786 17 | PaLM-2 (Unicorn),0.786 18 | Gemini 1.5 Flash (001),0.779 19 | Mixtral (8x22B),0.778 20 | Gemini 1.5 Flash (0514 preview),0.778 21 | Phi-3 (14B),0.775 22 | Qwen1.5 (72B),0.774 23 | Qwen1.5 Chat (110B),0.768 24 | 
GPT-4o mini (2024-07-18),0.767 25 | Yi (34B),0.762 26 | Claude 3 Sonnet (20240229),0.759 27 | Gemma 2 (27B),0.757 28 | Phi-3 (7B),0.757 29 | Qwen1.5 (32B),0.744 30 | DBRX Instruct,0.741 31 | Claude 3 Haiku (20240307),0.738 32 | Claude 2.1,0.735 33 | DeepSeek LLM Chat (67B),0.725 34 | Gemma 2 (9B),0.721 35 | Mixtral (8x7B 32K seqlen),0.717 36 | Gemini 1.0 Pro (001),0.7 37 | Llama 2 (70B),0.695 38 | Command R Plus,0.694 39 | PaLM-2 (Bison),0.692 40 | GPT-3.5 Turbo (0613),0.689 41 | Claude Instant 1.2,0.688 42 | Mistral Large (2402),0.688 43 | Mistral Small (2402),0.687 44 | Qwen1.5 (14B),0.686 45 | Arctic Instruct,0.677 46 | Llama 3 (8B),0.668 47 | Gemma (7B),0.661 48 | Jamba Instruct,0.659 49 | Mistral NeMo (2402),0.653 50 | Command R,0.652 51 | Yi (6B),0.64 52 | Qwen1.5 (7B),0.626 53 | Mistral Instruct v0.3 (7B),0.599 54 | Phi-2,0.584 55 | Mistral v0.1 (7B),0.566 56 | Llama 3.1 Instruct Turbo (8B),0.561 57 | Llama 2 (13B),0.554 58 | OLMo 1.7 (7B),0.538 59 | Llama 2 (7B),0.458 60 | OLMo (7B),0.295 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/holmes_240829_linguistics.csv: -------------------------------------------------------------------------------- 1 | model,holmes 2 | google/flan-ul2,72.2 3 | google/flan-t5-xxl,70.5 4 | google/t5-xxl-lm-adapt,70.2 5 | lmsys/vicuna-13b-v1.5,68.6 6 | meta-llama/Llama-2-70b-chat-hf,66.3 7 | ibm/labradorite-13b,66.1 8 | meta-llama/Llama-2-13b-hf,65.0 9 | meta-llama/Llama-2-13b-chat-hf,64.1 10 | EleutherAI/pythia-12b-deduped,63.1 11 | facebook/bart-base,63.0 12 | microsoft/Orca-2-13b,62.7 13 | EleutherAI/pythia-6.9b-deduped,62.3 14 | google/ul2,60.5 15 | google/flan-t5-xl,60.0 16 | google/t5-xl-lm-adapt,59.5 17 | google/electra-base-discriminator,58.3 18 | databricks/dolly-v2-12b,58.2 19 | EleutherAI/pythia-12b,58.0 20 | allenai/tulu-2-13b,57.6 21 | EleutherAI/pythia-6.9b,56.6 22 | microsoft/deberta-v3-base,56.0 23 | EleutherAI/pythia-2.8b-deduped,56.0 24 | meta-llama/Llama-2-70b-hf,55.9 25 | allenai/tulu-2-dpo-13b,55.5 26 | WizardLM/WizardLM-13B-V1.2,55.4 27 | microsoft/deberta-base,55.3 28 | EleutherAI/pythia-1.4b,54.2 29 | EleutherAI/pythia-2.8b,54.0 30 | allenai/tulu-2-70b,53.5 31 | mistralai/Mistral-7B-Instruct-v0.1,52.9 32 | albert-base-v2,52.3 33 | allenai/tk-instruct-11b-def,51.7 34 | allenai/tulu-2-dpo-70b,51.2 35 | google/flan-t5-large,50.9 36 | google/t5-base-lm-adapt,48.7 37 | google/flan-t5-base,48.7 38 | EleutherAI/pythia-1b-deduped,47.5 39 | meta-llama/Llama-2-7b-hf,47.2 40 | EleutherAI/pythia-1.4b-deduped,47.2 41 | mistralai/Mixtral-8x7B-Instruct-v0.1,46.5 42 | bert-base-uncased,45.3 43 | mistralai/Mistral-7B-v0.1,45.2 44 | meta-llama/Llama-2-7b-chat-hf,45.0 45 | ibm/merlinite-7b,44.1 46 | roberta-base,43.2 47 | google/t5-large-lm-adapt,42.4 48 | mistralai/Mixtral-8x7B-v0.1,42.2 49 | gpt2,40.6 50 | EleutherAI/pythia-410m,40.0 51 | google/flan-t5-small,38.9 52 | google/t5-small-lm-adapt,36.0 53 | EleutherAI/pythia-410m-deduped,31.3 54 | Glove.840B,26.6 55 | EleutherAI/pythia-160m-deduped,17.2 56 | EleutherAI/pythia-160m,16.3 57 | EleutherAI/pythia-70m,15.6 58 | EleutherAI/pythia-70m-deduped,14.4 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/hydrox_safety_241001_safety.csv: -------------------------------------------------------------------------------- 1 | model,hydrox_safety,hydrox_privacy,hydrox_security,hydrox_integrity,hydrox_overall_score 2 | hydroxai/hydro-safe-Mistral-7B-v0.1-dpo-full,89.41,99.62,96.66,98.16,94.44 
3 | anthropic/claude-3-5-sonnet,94.75,93.83,92.61,95.56,94.18 4 | anthropic/claude-3-sonnet,92.33,94.36,94.62,94.14,93.62 5 | anthropic/claude-3-opus,92.50,91.26,90.47,94.08,92.02 6 | google/gemma-2-2b-it,92.15,92.43,89.22,93.14,91.66 7 | hydroxai/hydro-safe-Mistral-7B-Instruct-v0.1-dpo-full-1-epoch,86.56,96.21,91.35,97.74,91.60 8 | anthropic/claude-3-haiku,91.52,93.69,91.39,89.53,91.59 9 | OpenAI/gpt-4-0613,79.94,91.79,92.00,96.04,85.43 10 | hydroxai/hydro-safe-llama2-7b-chat-dpo-full-3-epoch,79.83,90.63,84.68,84.27,83.93 11 | meta-llama/Meta-Llama-3-8B-Instruct,83.32,88.61,82.51,80.86,83.72 12 | OpenAI/gpt-4o-mini-2024-07-18,80.87,82.32,77.55,81.38,80.43 13 | google/gemini-1.0-pro-latest,69.20,87.82,77.91,88.61,78.29 14 | hydroxai/hydro-safe-zephyr-td-full,78.18,0.00,0.00,0.00,78.18 15 | meta-llama/Llama-3.2-3B-Instruct,79.46,77.90,72.51,79.24,77.42 16 | google/gemini-1.0-pro,65.18,90.39,79.93,87.11,77.20 17 | meta-llama/Llama-3.2-1B-Instruct,76.25,75.71,74.20,76.98,75.78 18 | meta-llama/Meta-Llama-3-70B-Instruct,74.65,80.65,70.21,73.55,74.44 19 | google/gemini-1.5-flash,77.61,83.33,72.05,60.00,74.43 20 | google/gemini-pro,63.56,90.60,67.49,84.42,73.04 21 | OpenAI/gpt-3.5-turbo-0613,56.94,90.00,93.43,80.84,72.04 22 | Qwen/Qwen2-72B-Instruct,77.10,73.40,65.19,70.13,71.86 23 | OpenAI/gpt-4o-2024-05-13,67.11,68.46,60.89,63.54,65.26 24 | hydroxai/hydro-safe-zephyr-td-full,69.64,49.70,66.63,71.25,65.23 25 | h2oai/h2ogpt-4096-llama2-70b-chat,60.65,73.46,59.38,65.75,63.67 26 | h2oai/h2ogpt-4096-llama2-70b-chat,63.64,65.15,63.34,59.50,63.19 27 | OpenAI/gpt-4-0314,56.36,76.67,72.79,54.00,62.51 28 | meta-llama/Llama-2-70b-chat-hf,61.00,68.87,59.58,63.00,62.50 29 | meta-llama/Llama-2-13b-chat-hf,58.60,63.37,57.85,62.67,60.00 30 | meta-llama/Llama-2-7b-chat-hf,52.30,55.30,46.71,51.63,51.26 31 | deepseek-ai/DeepSeek-V2-Chat-0628,50.00,0.00,0.00,0.00,50.00 32 | deepseek-ai/DeepSeek-V2-Lite-Chat,44.26,48.84,41.91,45.93,44.91 33 | google/gemini-1.5-pro,46.99,40.63,41.65,40.84,43.27 34 | 01-ai/Yi-6B-Chat,37.35,45.36,31.49,36.02,37.00 35 | mistralai/Mistral-7B-Instruct-v0.2,41.71,37.18,32.24,32.52,36.82 36 | lmsys/vicuna-13b-v1.5,38.46,29.78,30.71,36.08,34.07 37 | meta-llama/Llama-2-7b-chat-hf,30.37,38.61,26.57,45.52,33.47 38 | hydroxai/hydro-safe-Sheared-LLaMA-1.3B-dpo-full,26.44,45.30,27.07,35.98,31.87 39 | hydroxai/hydro-safe-dolly-v2-7b-dpo-full-3-epoch,22.95,32.34,25.64,35.51,27.81 40 | tiiuae/falcon-40b-instruct,28.10,30.83,22.97,30.32,27.55 41 | google/gemma-2-2b,25.61,27.04,24.50,24.88,25.50 42 | upstage/SOLAR-0-70b-16bit,22.40,33.80,17.55,30.25,24.50 43 | HuggingFaceH4/zephyr-7b-beta,21.20,30.60,22.40,24.95,23.80 44 | mistralai/Mixtral-8x7B-Instruct-v0.1,27.70,25.04,18.24,21.23,23.75 45 | hydroxai/zephyr-reproduction-dpo-full,19.35,21.65,21.22,26.05,21.38 46 | argilla/notus-7b-v1,26.55,22.05,15.53,19.50,21.30 47 | microsoft/Orca-2-7b,18.30,18.31,20.52,22.09,19.53 48 | lmsys/vicuna-13b-v1.5-16k,21.14,17.01,16.99,22.25,19.31 49 | minimax/abab5-5s-chat,22.54,20.63,14.17,19.46,19.12 50 | Intel/neural-chat-7b-v3-1,15.86,22.28,14.72,22.84,17.86 51 | Intel/neural-chat-7b-v3-2,19.68,14.36,18.62,15.33,17.82 52 | lmsys/vicuna-33b-v1.3,18.42,21.34,13.89,18.64,17.64 53 | microsoft/Orca-2-13b,33.06,27.78,0.00,0.00,17.48 54 | mistralai/Mistral-7B-Instruct-v0.1,26.91,12.08,10.86,12.39,16.74 55 | lmsys/vicuna-7b-v1.5,22.47,10.91,12.61,11.74,15.37 56 | tiiuae/falcon-7b-instruct,14.64,11.30,14.01,15.76,14.01 57 | hydroxai/zephyr-reproduction-sft-full,14.92,14.94,9.50,13.61,13.10 58 | 
mistralai/Mistral-7B-Instruct-v0.1,13.92,8.63,6.29,8.04,9.68 59 | google/gemma-2-27b-it,8.10,11.11,10.00,10.94,9.67 60 | mistralai/Mixtral-8x7B-v0.1,10.61,8.81,6.73,8.16,8.81 61 | google/gemma-2b,8.55,8.27,8.09,6.39,7.99 62 | databricks/dolly-v2-7b,9.92,8.33,4.96,8.33,7.79 63 | hydroxai/hydro-safe-dolly-v2-7b-dpo-full,11.03,6.16,5.10,5.96,7.64 64 | LumiOpen/Viking-13B,7.75,8.32,5.76,7.68,7.33 65 | mistralai/Mistral-7B-v0.1,11.38,4.18,2.86,8.53,7.32 66 | LumiOpen/Viking-33B,6.87,6.48,6.92,6.38,6.73 67 | minimax/abab5-5-chat,8.32,5.13,4.85,8.09,6.60 68 | WizardLM/WizardLM-30B-V1.0,8.00,3.88,6.49,5.58,6.41 69 | databricks/dolly-v2-12b,11.46,3.48,3.39,3.72,6.21 70 | LumiOpen/Viking-7B,5.37,3.91,7.60,9.05,6.15 71 | databricks/dolly-v2-7b,8.81,2.86,2.89,5.94,5.41 72 | TinyLlama/TinyLlama-1.1B-Chat-v1.0,6.87,3.30,4.57,5.65,5.38 73 | mistralai/Mistral-7B-v0.1,7.63,2.78,4.44,3.53,5.00 74 | Nexusflow/NexusRaven-V2-13B,3.95,3.13,4.77,4.50,4.16 75 | hydroxai/hydro-safe-Mistral-7B-Instruct-v0.1-dpo-full-1-epoch,8.26,0.00,8.16,0.00,4.04 76 | databricks/dolly-v2-3b,4.08,1.08,0.55,0.18,1.81 77 | tiiuae/falcon-40b,2.08,0.25,0.40,0.64,0.90 78 | tiiuae/falcon-7b,1.05,0.11,0.43,0.23,0.51 79 | minimax/abab5-5-chat,1.27,0.21,0.04,0.20,0.44 80 | princeton-nlp/Sheared-LLaMA-1.3B,1.14,0.05,0.03,0.04,0.29 81 | davidkim205/komt-mistral-7b-v1,0.65,0.02,0.00,0.00,0.13 82 | EleutherAI/pythia-70m-deduped,0.00,0.00,0.00,0.00,0.00 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/livebench_240701_holistic.csv: -------------------------------------------------------------------------------- 1 | model,livebench_240624,LB_Reasoning_Average,LB_Coding_Average,LB_Mathematics_Average,LB_Data_Analysis_Average,LB_Language_Average,LB_IF_Average 2 | zephyr-7b-beta,17.32,16.00,8.32,11.23,15.75,4.28,48.32 3 | zephyr-7b-alpha,19.28,17.00,11.32,9.96,17.40,7.20,52.79 4 | yi-6b-chat,9.02,8.00,1.32,8.53,4.38,4.69,27.22 5 | vicuna-7b-v1.5-16k,14.22,15.00,1.32,9.04,9.93,7.92,42.12 6 | vicuna-7b-v1.5,12.31,12.00,1.00,7.10,3.33,8.66,41.75 7 | starling-lm-7b-beta,16.62,19.00,18.26,14.86,2.00,7.26,38.32 8 | smaug-qwen2-72b-instruct,39.66,37.00,39.05,40.67,26.19,30.03,65.00 9 | qwen2-72b-instruct,40.16,42.00,31.79,43.44,26.24,29.21,68.27 10 | qwen2-7b-instruct,26.63,20.00,29.21,26.87,28.75,10.21,44.74 11 | qwen2-1.5b-instruct,10.42,8.00,5.63,9.94,10.01,3.05,25.90 12 | qwen2-0.5b-instruct,7.30,3.00,2.00,7.35,2.00,2.80,26.63 13 | qwen1.5-110b-chat,29.07,26.00,22.21,26.28,31.45,13.22,55.26 14 | qwen1.5-72b-chat,28.89,21.00,22.89,26.82,32.98,11.37,58.25 15 | qwen1.5-7b-chat,17.02,13.00,6.63,15.29,16.90,6.18,44.12 16 | qwen1.5-4b-chat,11.59,13.00,4.00,9.86,9.13,5.80,27.75 17 | qwen1.5-1.8b-chat,6.32,5.00,0.00,3.53,3.33,3.16,22.90 18 | qwen1.5-0.5b-chat,5.43,4.00,0.00,4.43,0.00,2.88,21.30 19 | phi-3.5-moe-instruct,35.14,41.00,19.26,33.30,40.46,17.07,59.73 20 | phi-3.5-mini-instruct,27.81,31.00,15.26,22.20,30.43,9.67,58.30 21 | phi-3-small-128k-instruct,29.68,28.00,24.87,28.97,27.26,15.53,53.47 22 | phi-3-small-8k-instruct,29.09,29.00,21.24,23.73,29.62,15.13,55.81 23 | phi-3-mini-128k-instruct,24.76,24.00,14.29,17.06,34.02,7.76,51.40 24 | phi-3-mini-4k-instruct,24.41,22.00,14.79,20.84,29.55,8.06,51.25 25 | phi-3-medium-128k-instruct,29.88,31.00,21.58,25.64,32.12,12.76,56.15 26 | phi-3-medium-4k-instruct,30.96,35.00,20.58,31.36,31.63,13.91,53.30 27 | openhermes-2.5-mistral-7b,23.36,17.00,11.63,20.45,26.92,11.37,52.78 28 | open-mistral-nemo,29.02,25.00,28.16,21.66,33.35,14.15,51.80 29 | 
mixtral-8x22b-instruct-v0.1,35.29,29.00,33.11,28.33,31.67,26.48,63.17 30 | mixtral-8x7b-instruct-v0.1,22.79,18.00,11.32,20.71,28.13,13.76,44.81 31 | mistral-small-2402,33.03,28.00,24.21,28.15,31.88,22.06,63.91 32 | mistral-large-2407,48.35,45.00,46.37,40.48,46.61,39.79,71.85 33 | mistral-large-2402,38.92,35.00,26.84,32.20,42.55,28.74,68.19 34 | mistral-7b-instruct-v0.3,20.09,11.00,9.00,14.56,21.77,11.85,52.37 35 | mistral-7b-instruct-v0.2,19.51,13.00,11.63,17.08,14.62,9.05,51.65 36 | meta-llama-3.1-405b-instruct-turbo,55.18,57.00,45.68,46.55,53.51,49.85,78.47 37 | meta-llama-3.1-70b-instruct-turbo,48.90,43.00,33.11,45.58,50.29,42.36,79.08 38 | meta-llama-3.1-8b-instruct-turbo,28.11,14.00,21.58,24.37,32.15,20.05,56.53 39 | meta-llama-3-70b-instruct,37.60,31.00,20.95,32.31,43.75,34.11,63.50 40 | meta-llama-3-8b-instruct,27.46,25.00,18.26,19.66,26.00,18.72,57.14 41 | mathstral-7b-v0.1,24.33,16.00,15.63,17.84,27.89,15.37,53.25 42 | llama-2-7b-chat-hf,10.25,5.00,0.00,4.78,0.00,6.86,44.88 43 | hermes-3-llama-3.1-70b,39.56,32.00,29.79,28.32,48.11,43.77,55.37 44 | gpt-4o-mini-2024-07-18,44.57,37.00,43.37,41.58,44.52,35.28,65.68 45 | gpt-4o-2024-08-06,56.46,54.00,50.63,52.29,52.89,54.37,74.58 46 | gpt-4o-2024-05-13,54.96,55.00,46.37,49.88,52.41,53.94,72.17 47 | gpt-4-turbo-2024-04-09,53.00,54.00,47.05,48.99,51.32,45.26,71.39 48 | gpt-4-0613,44.94,31.00,37.05,36.22,44.03,49.57,71.79 49 | gpt-4-0125-preview,49.39,48.00,44.05,42.75,54.06,43.55,63.92 50 | gpt-3.5-turbo-0125,34.66,26.00,29.16,26.93,41.21,24.22,60.47 51 | gemma-2-27b-it,41.22,31.00,36.74,36.23,43.58,32.40,67.37 52 | gemma-2-9b-it,31.57,19.00,22.21,23.98,35.06,27.64,61.55 53 | gemma-1.1-7b-it,18.23,10.00,11.00,15.21,18.17,10.65,44.34 54 | gemini-1.5-pro-exp-0827,55.06,56.00,42.00,56.28,50.83,49.31,75.95 55 | gemini-1.5-pro-exp-0801,53.63,55.00,43.37,47.46,50.15,46.96,78.84 56 | gemini-1.5-pro-api-0514,44.41,33.00,32.79,42.42,52.81,38.25,67.20 57 | gemini-1.5-flash-exp-0827,47.51,52.00,39.74,36.29,47.87,31.04,78.11 58 | gemini-1.5-flash-api-0514,40.95,30.00,39.05,38.89,44.03,30.69,63.01 59 | dracarys-llama-3.1-70b-instruct,49.82,50.00,36.11,45.68,47.99,41.77,77.37 60 | dracarys-72b-instruct,41.72,41.00,41.05,42.77,26.24,31.17,68.08 61 | deepseek-v2-lite-chat,17.49,13.00,8.63,14.08,18.19,9.20,41.83 62 | deepseek-coder-v2-lite-instruct,29.21,22.00,26.84,34.44,33.00,10.64,48.34 63 | deepseek-coder-v2,46.84,49.00,41.05,52.54,38.25,33.04,67.18 64 | deepseek-chat-v2,46.36,41.00,42.05,52.11,45.59,32.77,64.61 65 | command-r-plus,32.86,32.00,20.26,24.85,24.60,23.92,71.51 66 | command-r,27.23,28.00,14.95,16.92,31.69,14.64,57.16 67 | claude-3-sonnet-20240229,38.08,26.00,25.21,29.65,44.56,38.08,65.00 68 | claude-3-opus-20240229,50.75,41.00,40.05,46.54,54.32,51.72,70.87 69 | claude-3-haiku-20240307,35.32,26.00,24.53,25.72,41.54,30.07,64.03 70 | claude-3-5-sonnet-20240620,61.16,64.00,63.21,53.75,56.74,56.94,72.30 71 | chatgpt-4o-latest,55.35,57.00,46.00,52.19,54.43,49.95,72.52 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/livebench_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,livebench_240725,LB_Reasoning,LB_Coding,LB_Mathematics,LB_Data_Analysis,LB_Language,LB_IF 2 | claude-3-5-sonnet-20240620,59.87,58.67,60.85,53.75,56.74,56.94,72.30 3 | gpt-4o-2024-08-06,56.71,54.67,51.44,52.29,52.89,54.37,74.58 4 | chatgpt-4o-latest,54.71,52.00,47.15,52.19,54.43,49.95,72.52 5 | gpt-4o-2024-05-13,54.63,50.00,49.36,49.88,52.41,53.94,72.17 6 
| meta-llama-3.1-405b-instruct-turbo,54.25,53.33,43.80,46.55,53.51,49.85,78.47 7 | gemini-1.5-pro-exp-0827,53.78,49.33,40.95,56.28,50.83,49.31,75.95 8 | gpt-4-turbo-2024-04-09,52.88,51.33,49.00,48.99,51.32,45.26,71.39 9 | gemini-1.5-pro-exp-0801,52.22,48.67,41.23,47.46,50.15,46.96,78.84 10 | claude-3-opus-20240229,50.56,41.33,38.59,46.54,54.32,51.72,70.87 11 | gpt-4-0125-preview,48.90,47.33,41.80,42.75,54.06,43.55,63.92 12 | dracarys-llama-3.1-70b-instruct,48.67,44.00,35.23,45.68,47.99,41.77,77.37 13 | meta-llama-3.1-70b-instruct-turbo,48.44,40.67,32.67,45.58,50.29,42.36,79.08 14 | mistral-large-2407,47.97,42.00,47.08,40.48,46.61,39.79,71.85 15 | gemini-1.5-flash-exp-0827,46.87,47.33,40.59,36.29,47.87,31.04,78.11 16 | deepseek-coder-v2,46.31,45.33,41.51,52.54,38.25,33.04,67.18 17 | deepseek-chat-v2,46.04,40.00,41.15,52.11,45.59,32.77,64.61 18 | gpt-4-0613,45.60,34.67,37.31,36.22,44.03,49.57,71.79 19 | gemini-1.5-pro-api-0514,44.72,35.33,32.31,42.42,52.81,38.25,67.20 20 | gpt-4o-mini-2024-07-18,44.26,35.33,43.15,41.58,44.52,35.28,65.68 21 | gemma-2-27b-it,41.26,32.00,35.95,36.23,43.58,32.40,67.37 22 | dracarys-72b-instruct,41.20,40.00,38.95,42.77,26.24,31.17,68.08 23 | qwen2-72b-instruct,40.15,41.33,32.38,43.44,26.24,29.21,68.27 24 | hermes-3-llama-3.1-70b,40.05,33.33,31.38,28.32,48.11,43.77,55.37 25 | gemini-1.5-flash-api-0514,40.04,29.33,34.31,38.89,44.03,30.69,63.01 26 | smaug-qwen2-72b-instruct,39.32,36.00,38.03,40.67,26.19,30.03,65.00 27 | mistral-large-2402,39.18,36.00,27.38,32.20,42.55,28.74,68.19 28 | claude-3-sonnet-20240229,38.72,28.67,26.38,29.65,44.56,38.08,65.00 29 | meta-llama-3-70b-instruct,37.73,30.67,22.03,32.31,43.75,34.11,63.50 30 | claude-3-haiku-20240307,35.86,29.33,24.46,25.72,41.54,30.07,64.03 31 | mixtral-8x22b-instruct-v0.1,35.17,29.33,32.03,28.33,31.67,26.48,63.17 32 | phi-3.5-moe-instruct,35.16,38.67,21.74,33.30,40.46,17.07,59.73 33 | gpt-3.5-turbo-0125,34.54,26.67,27.74,26.93,41.21,24.22,60.47 34 | mistral-small-2402,32.19,26.00,21.18,28.15,31.88,22.06,63.91 35 | command-r-plus,32.17,28.67,19.46,24.85,24.60,23.92,71.51 36 | gemma-2-9b-it,31.34,17.33,22.46,23.98,35.06,27.64,61.55 37 | phi-3-medium-4k-instruct,31.22,36.67,20.46,31.36,31.63,13.91,53.30 38 | phi-3-medium-128k-instruct,30.30,34.00,21.10,25.64,32.12,12.76,56.15 39 | phi-3-small-128k-instruct,29.97,30.00,24.57,28.97,27.26,15.53,53.47 40 | qwen1.5-110b-chat,29.78,30.67,21.82,26.28,31.45,13.22,55.26 41 | deepseek-coder-v2-lite-instruct,29.53,26.00,24.74,34.44,33.00,10.64,48.34 42 | qwen1.5-72b-chat,29.26,23.33,22.82,26.82,32.98,11.37,58.25 43 | open-mistral-nemo,29.17,25.33,28.74,21.66,33.35,14.15,51.80 44 | phi-3.5-mini-instruct,28.30,33.33,15.90,22.20,30.43,9.67,58.30 45 | meta-llama-3.1-8b-instruct-turbo,28.03,15.33,19.74,24.37,32.15,20.05,56.53 46 | phi-3-small-8k-instruct,27.98,23.33,20.26,23.73,29.62,15.13,55.81 47 | meta-llama-3-8b-instruct,27.56,24.00,19.82,19.66,26.00,18.72,57.14 48 | command-r,26.83,25.33,15.26,16.92,31.69,14.64,57.16 49 | qwen2-7b-instruct,26.58,20.00,28.95,26.87,28.75,10.21,44.74 50 | phi-3-mini-128k-instruct,25.55,28.00,15.04,17.06,34.02,7.76,51.40 51 | phi-3-mini-4k-instruct,25.46,28.00,15.04,20.84,29.55,8.06,51.25 52 | mathstral-7b-v0.1,24.48,18.00,14.54,17.84,27.89,15.37,53.25 53 | openhermes-2.5-mistral-7b,24.13,20.00,13.26,20.45,26.92,11.37,52.78 54 | mixtral-8x7b-instruct-v0.1,22.73,17.33,11.62,20.71,28.13,13.76,44.81 55 | mistral-7b-instruct-v0.3,21.25,16.00,10.97,14.56,21.77,11.85,52.37 56 | mistral-7b-instruct-v0.2,20.05,14.00,13.90,17.08,14.62,9.05,51.65 57 | 
gemma-1.1-7b-it,18.78,14.67,9.62,15.21,18.17,10.65,44.34 58 | zephyr-7b-alpha,18.60,12.00,12.26,9.96,17.40,7.20,52.79 59 | qwen1.5-7b-chat,17.98,16.00,9.41,15.29,16.90,6.18,44.12 60 | deepseek-v2-lite-chat,17.74,16.00,7.13,14.08,18.19,9.20,41.83 61 | zephyr-7b-beta,16.72,12.67,8.05,11.23,15.75,4.28,48.32 62 | starling-lm-7b-beta,16.60,18.67,18.46,14.86,2.00,7.26,38.32 63 | vicuna-7b-v1.5-16k,14.50,15.33,2.64,9.04,9.93,7.92,42.12 64 | vicuna-7b-v1.5,12.57,12.67,1.92,7.10,3.33,8.66,41.75 65 | llama-2-7b-chat-hf,11.63,12.00,1.28,4.78,0.00,6.86,44.88 66 | qwen1.5-4b-chat,11.28,10.67,4.49,9.86,9.13,5.80,27.75 67 | qwen2-1.5b-instruct,10.35,8.00,5.21,9.94,10.01,3.05,25.90 68 | yi-6b-chat,9.58,10.67,2.00,8.53,4.38,4.69,27.22 69 | qwen2-0.5b-instruct,7.68,6.00,1.28,7.35,2.00,2.80,26.63 70 | qwen1.5-1.8b-chat,6.04,3.33,0.00,3.53,3.33,3.16,22.90 71 | qwen1.5-0.5b-chat,5.21,2.67,0.00,4.43,0.00,2.88,21.30 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/livecodebench_240601_230701_code.csv: -------------------------------------------------------------------------------- 1 | model,LiveCodeBench_Pass@1 2 | GPT-4-0-2024-05-13,45.6 3 | GPT-4-Turbo-2024-04-09,44.7 4 | GPT-4-Turbo-0106,39.7 5 | GPT-4-0613,36.9 6 | GeminiPro-1.5-May,35.7 7 | Claude-3-Opus,35.4 8 | Codestral-Latest,32.2 9 | Gemini-Flash-1.5-May,30 10 | LLama3-70b-Ins,28.3 11 | Claude-3-Sonnet,26.9 12 | GeminiPro-1.5-April (n=1),26.9 13 | Mixtral-8x22B-Ins,26.4 14 | Mistral-Large,26 15 | GPT-3.5-Turbo-0125,24.6 16 | Claude-3-Haiku,24.5 17 | Claude-2,24.1 18 | Claude-Instant-1,23.7 19 | DSCoder-33b-Ins,23.6 20 | Command-R+,22.9 21 | GPT-3.5-Turbo-0301,22.6 22 | OC-DS-33B,22.3 23 | PHPhind-34B-V2,21 24 | MagiCoders-DS-6.7B,20.5 25 | LLama3-70b-Base,20.1 26 | CodeGen15-7B-Chat,19.3 27 | DSCoder-6.7b-Ins,19.1 28 | OC-DS-6.7B,18.3 29 | CodeGen15-7B,16.3 30 | Command-R,15.4 31 | LLama3-8b-Ins,15.3 32 | DSCoder-33b-Base,15 33 | StarCoder2-15b,14.2 34 | CodeLlama-13b-Ins,14 35 | CodeGenma-7b-Base,13.8 36 | CodeLlama-34b-Ins,13.3 37 | DSCoder-6.7b-Base,12.9 38 | MagiCoders-CL-7B,12.7 39 | CodeLlama-34b-Base,12.3 40 | LLama3-8b-Base,12.3 41 | Mixtral-8x7B-Ins,12.3 42 | CodeLlama-7b-Ins,11.2 43 | StarCoder2-7b,11.2 44 | StarCoder2-3b,10.4 45 | Gemma-7b-Base,10 46 | CodeLlama-13b-Base,9.2 47 | DSCoder-1.3b-Ins,8.8 48 | CodeLlama-7b-Base,7 49 | CodeGenma-2b-Base,6.7 50 | DSCoder-1.3b-Base,6.4 51 | CodeLlama-70b-Base,6.2 52 | OC-DS-1.3B,4.1 53 | CodeLlama-70b-Ins,3 54 | Gemma-2b-Base,2.2 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/llm_trustworthy_241001_safety.csv: -------------------------------------------------------------------------------- 1 | model,trustworthy 2 | google/gemma-2b-it,67.18 3 | google/gemma-7b-it,66.87 4 | lmsys/vicuna-7b-v1.3,60.62 5 | meta-llama/Llama-2-7b-chat-hf,74.72 6 | meta-llama/Meta-Llama-3-8B-Instruct,80.61 7 | mosaicml/mpt-7b-chat,62.29 8 | openai/gpt-3.5-turbo-0301,72.45 9 | openai/gpt-4-0314,69.24 10 | openai/gpt-4o-2024-05-13,82.96 11 | openai/gpt-4o-mini-2024-07-18,76.31 12 | tiiuae/falcon-7b-instruct,59.49 13 | togethercomputer/RedPajama-INCITE-7B-Instruct,56.58 14 | vertexai/gemini-pro-1.0,80.61 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/lvbench_241189_longcontext.csv: -------------------------------------------------------------------------------- 1 | 
llm_model,average,factrecall-zh,loogle-cr-mixup,factrecall-en,loogle-sd-mixup,loogle-mr-mixup,lic-mixup,cmrc-mixup,multifieldqa-en-mixup,dureader-mixup,multifieldqa-zh-mixup,hotpotwikiqa-mixup 2 | ChatGLM3-6B,18.73509091,6.1,10.168,52.6,22.29,9.102,15.024,28.16,12.926,19.574,18.992,11.15 3 | BlueLM-7B,12.27709091,18.8,5.036,24.034,13.02,2.874,9.114,17.53,7.322,14.608,11.486,11.224 4 | Yi-6B,9.805636364,13.95,5.818,22.282,29.17,4.412,6.122,1.272,7.75,2.83,1.836,12.42 5 | LongChat-7B,8.119818182,4.28,8.59,9.144,14.56,6.028,6.924,9.65,6.954,10.342,5.864,6.982 6 | Llama2-7B,7.412181818,0.92,2.512,38.09,7.63,1.918,5.268,6.124,4.628,9.574,2.564,2.306 7 | Qwen-7B,4.470727273,5.45,3.136,0.8,4.78,2.702,4.772,5.806,4.516,10.416,4.574,2.226 8 | Vicuna-7B,3.258545455,0,3.256,0.09,4.68,2.314,4.004,6.044,3.438,7.178,2.888,1.952 9 | Llama2-7B-Chat,2.145090909,0,2.622,0.446,3.04,1.798,1.02,1.966,3.994,5.49,1.482,1.738 10 | GPT-3.5,6.530363636,5.28,6.092,2.874,13.988,5.868,3.532,5.162,9.782,4.866,8.506,5.884 11 | GPT-4,8.467636364,11.386,7.26,9.254,11.128,5.906,5.276,5.96,10.156,12.068,7.292,7.458 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/mixeval_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,MixEval 2 | LLaMA-3-70B,82.2 3 | Qwen1.5-72B,79.5 4 | Yi-34B,78.3 5 | Qwen1.5-32B,77.6 6 | Mixtral-8x7B,74.0 7 | LLaMA-2-70B,73.2 8 | Qwen1.5-MoE-A2.7B,70.2 9 | Qwen1.5-7B,68.2 10 | LLaMA-3-8B,65.1 11 | Mistral-7B,64.8 12 | Gemma-7B,64.7 13 | Yi-6B,63.1 14 | Qwen1.5-4B,58.2 15 | JetMoE-8B,57.1 16 | DeepSeek-7B,52.2 17 | Phi-2,51.9 18 | DeepSeekMoE-16B,51.4 19 | LLaMA-2-7B,43.1 20 | Gemma-2B,38.9 21 | OLMo-7B,31.8 22 | MPT-7B,30.8 23 | Claude 3.5 Sonnet-0620,89.9 24 | LLaMA-3.1-405B-Instruct,- 25 | GPT-4o-2024-05-13,87.9 26 | Claude 3 Opus,88.1 27 | GPT-4-Turbo-2024-04-09,88.8 28 | Gemini 1.5 Pro-API-0409,84.2 29 | Gemini 1.5 Pro-API-0514,84.8 30 | Mistral Large 2,86.1 31 | Yi-Large-preview,84.4 32 | LLaMA-3-70B-Instruct,84.0 33 | Qwen-Max-0428,86.1 34 | Claude 3 Sonnet,81.7 35 | Reka Core-20240415,83.3 36 | MAmmoTH2-8x7B-Plus,81.5 37 | DeepSeek-V2,83.7 38 | GPT-4o mini,84.2 39 | Command R+,81.5 40 | Yi-1.5-34B-Chat,81.7 41 | Mistral-Large,84.2 42 | Qwen1.5-72B-Chat,84.1 43 | Mistral-Medium,81.9 44 | Gemini 1.0 Pro,78.9 45 | Reka Flash-20240226,79.8 46 | Mistral-Small,81.2 47 | LLaMA-3-8B-Instruct,75.0 48 | Command R,77.0 49 | Qwen1.5-32B-Chat,81.0 50 | GPT-3.5-Turbo-0125,79.7 51 | Claude 3 Haiku,79.7 52 | Yi-34B-Chat,80.1 53 | Mixtral-8x7B-Instruct-v0.1,76.4 54 | Starling-LM-7B-beta,74.8 55 | Yi-1.5-9B-Chat,74.2 56 | Gemma-1.1-7B-IT,69.6 57 | Vicuna-33B-v1.3,66.3 58 | LLaMA-2-70B-Chat,74.6 59 | MAP-Neo-Instruct-v0.1,70.0 60 | Mistral-7B-Instruct-v0.2,70.0 61 | Qwen1.5-7B-Chat,71.4 62 | Reka Edge-20240208,68.5 63 | Zephyr-7B-β,69.1 64 | LLaMA-2-7B-Chat,61.7 65 | Yi-6B-Chat,65.6 66 | Qwen1.5-MoE-A2.7B-Chat,69.1 67 | Gemma-1.1-2B-IT,51.9 68 | Vicuna-7B-v1.5,60.3 69 | OLMo-7B-Instruct,55.0 70 | Qwen1.5-4B-Chat,57.2 71 | JetMoE-8B-Chat,51.6 72 | MPT-7B-Chat,43.8 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/mmlu_pro_240829_knowledge.csv: -------------------------------------------------------------------------------- 1 | model,mmlu_pro 2 | Arx-0.3,0.7824 3 | Claude-3.5-Sonnet,0.7612 4 | Grok-2,0.7546 5 | GPT-4o (2024-05-13),0.7255 6 | Grok-2-mini,0.7185 7 | Gemini-1.5-Pro,0.6903 8 | Claude-3-Opus,0.6845 9 | Qwen2-72B-Chat,0.6438 10 | 
magnum-72b-v1,0.6393 11 | GPT-4-Turbo,0.6371 12 | DeepSeek-Coder-V2-Instruct,0.6363 13 | Higgs-Llama-3-70B,0.6316 14 | GPT-4o-mini,0.6309 15 | Llama-3.1-70B-Instruct,0.6284 16 | Gemini-1.5-Flash,0.5912 17 | Yi-large,0.5809 18 | Claude-3-Sonnet,0.568 19 | Llama-3-70B-Instruct,0.562 20 | Phi3-medium-4k,0.557 21 | Qwen2-72B-32k,0.5559 22 | Deepseek-V2-Chat,0.5481 23 | Llama-3-70B,0.5278 24 | Qwen1.5-72B-Chat,0.5264 25 | Llama-3.1-70B,0.5247 26 | Yi-1.5-34B-Chat,0.5229 27 | Gemma-2-9B-it,0.5208 28 | Phi3-medium-128k,0.5191 29 | MAmmoTH2-8x7B-Plus,0.504 30 | Qwen1.5-110B,0.4993 31 | GLM-4-9B-Chat,0.4801 32 | GLM-4-9B,0.4792 33 | Phi-3.5-mini-instruct,0.4787 34 | Qwen2-7B-Instruct,0.4724 35 | Yi-1.5-9B-Chat,0.4595 36 | Phi3-mini-4k,0.4566 37 | Gemma-2-9B,0.451 38 | Mistral-Nemo-Instruct-2407,0.4481 39 | Llama-3.1-8B-Instruct,0.4425 40 | Phi3-mini-128k,0.4386 41 | MAmmoTH2-8B-Plus,0.4335 42 | Mixtral-8x7B-Instruct-v0.1,0.4327 43 | Yi-34B,0.4303 44 | Mathstral-7B-v0.1,0.42 45 | DeepSeek-Coder-V2-Lite-Instruct,0.4157 46 | Mixtral-8x7B-v0.1,0.4103 47 | Llama-3-8B-Instruct,0.4098 48 | MAmmoTH2-7B-Plus,0.4085 49 | Qwen2-7B,0.4073 50 | Mistral-Nemo-Base-2407,0.3977 51 | WizardLM-2-8x22B,0.3924 52 | Yi-1.5-6B-Chat,0.3823 53 | Qwen1.5-14B-Chat,0.3802 54 | c4ai-command-r-v01,0.379 55 | Staring-7B,0.379 56 | Llama-2-70B,0.3753 57 | OpenChat-3.5-8B,0.3724 58 | InternMath-20B-Plus,0.371 59 | Llama3-Smaug-8B,0.3693 60 | Llama-3.1-8B,0.366 61 | Llama-3-8B,0.3536 62 | DeepseekMath-7B-Instruct,0.353 63 | DeepSeek-Coder-V2-Lite-Base,0.3437 64 | Gemma-7B,0.3373 65 | InternMath-7B-Plus,0.335 66 | Zephyr-7B-Beta,0.3297 67 | Mistral-7B-v0.1,0.3088 68 | Mistral-7B-Instruct-v0.2,0.3084 69 | Mistral-7B-v0.2,0.3043 70 | Qwen1.5-7B-Chat,0.2906 71 | Yi-6B-Chat,0.2884 72 | Neo-7B-Instruct,0.2874 73 | Yi-6B,0.2651 74 | Neo-7B,0.2585 75 | Mistral-7B-Instruct-v0.1,0.2575 76 | Llama-2-13B,0.2534 77 | Llemma-7B,0.2345 78 | Qwen2-1.5B-Instruct,0.2262 79 | Qwen2-1.5B,0.2256 80 | Llama-2-7B,0.2032 81 | Qwen2-0.5B-Instruct,0.1593 82 | Gemma-2B,0.1585 83 | Qwen2-0.5B,0.1497 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/mtbench_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,MT-bench 2 | GPT-4,8.99 3 | GPT-3.5-turbo,7.94 4 | Claude-v1,7.9 5 | Claude-instant-v1,7.85 6 | Vicuna-33B,7.12 7 | WizardLM-30B,7.01 8 | Guanaco-33B,6.53 9 | Tulu-30B,6.43 10 | Guanaco-65B,6.41 11 | OpenAssistant-LLaMA-30B,6.41 12 | PaLM-Chat-Bison-001,6.4 13 | Vicuna-13B,6.39 14 | MPT-30B-chat,6.39 15 | WizardLM-13B,6.35 16 | Vicuna-7B,6.0 17 | Baize-v2-13B,5.75 18 | Nous-Hermes-13B,5.51 19 | MPT-7B-Chat,5.42 20 | GPT4All-13B-Snoozy,5.41 21 | Koala-13B,5.35 22 | MPT-30B-Instruct,5.22 23 | Falcon-40B-Instruct,5.17 24 | H2O-Oasst-OpenLLaMA-13B,4.63 25 | Alpaca-13B,4.53 26 | ChatGLM-6B,4.5 27 | OpenAssistant-Pythia-12B,4.32 28 | RWKV-4-Raven-14B,3.98 29 | Dolly-V2-12B,3.28 30 | FastChat-T5-3B,3.04 31 | StableLM-Tuned-Alpha-7B,2.75 32 | LLaMA-13B,2.61 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,opencompass 2 | Claude-3.5-Sonnet,67.9 3 | GPT-4o-20240513,67.7 4 | Mistral-Large,63.2 5 | Mistral-Large-Instruct-2407,62.5 6 | DeepSeek-V2-Chat(0618),61.7 7 | GPT-4o-mini-20240718,60.4 8 | Qwen-Max-0428,57.8 9 | Yi-Large,56.3 10 | Qwen2-72B-Instruct,55.4 11 | 
GLM-4,55.2 12 | Llama3.1-70B-Instruct,53.9 13 | Gemma-2-27B-it,53.5 14 | Qwen1.5-110B-Chat,51.9 15 | Doubao-pro-32k/240615,51 16 | Baichuan4,50.4 17 | Step-1-8K,49.9 18 | abab6.5,49.9 19 | Ernie-4.0-8K-Preview-0518,48.8 20 | Moonshot-v1-8K,48.6 21 | GLM-4-9B-Chat,47.9 22 | Yi-1.5-34B-Chat,46.9 23 | Hunyuan-Standard-256k,46.9 24 | Mixtral-8x22B-Instruct-v0.1,46.3 25 | Gemma-2-9B-it,45.5 26 | Qwen2-7B-Instruct,45.1 27 | InternLM2.5-7B-Chat,44.5 28 | Yi-1.5-9B-Chat,42.6 29 | Nanbeige2-16B-Chat,42.3 30 | Llama3.1-8B-Instruct,42.1 31 | DBRX-Instruct,37.6 32 | Yi-1.5-6B-Chat,36.5 33 | InternLM2-Chat-20B,36 34 | Mixtral-8x7B-Instruct-v0.1,34.5 35 | Mistral-7B-Instruct-v0.3,30.7 36 | DeepSeek-V2-Lite-Chat,30 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_academic_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,opencompass_academic 2 | GPT-4o-20240513,77 3 | Qwen2-72B-Instruct,73.1 4 | GPT-4o-mini-20240718,72.5 5 | Llama3-70B-Instruct,66.6 6 | Qwen1.5-110B-Chat,61.7 7 | Yi-1.5-34B-Chat,60.4 8 | InternLM2.5-Chat-7B,60.3 9 | GLM-4-9B-Chat,59.5 10 | Qwen1.5-32B-Chat,57.1 11 | Qwen1.5-72B-Chat,56.9 12 | Yi-1.5-9B-Chat,56.1 13 | Qwen2-7B-Instruct,52 14 | Llama3-8B-Instruct,50.6 15 | Qwen1.5-14B-Chat,49.7 16 | InternLM2-Chat-20B,45.2 17 | Yi-1.5-6B-Chat,43.5 18 | Mixtral-8x7B-Instruct-v0.1,42.6 19 | InternLM2-Chat-7B,42.1 20 | Qwen1.5-7B-Chat,35.4 21 | Mistral-7B-Instruct-v0.3 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_agent_240829_agent.csv: -------------------------------------------------------------------------------- 1 | model,OC_Agent 2 | Claude-3.5-Sonnet,81.7 3 | GPT-4o-20240513,84.4 4 | Mistral-Large,83.5 5 | Mistral-Large-Instruct-2407,84.5 6 | DeepSeek-V2-Chat(0618),83.7 7 | GPT-4o-mini-20240718,85.7 8 | Qwen-Max-0428,83.8 9 | Yi-Large,86.1 10 | Qwen2-72B-Instruct,85.9 11 | GLM-4,80.4 12 | Llama3.1-70B-Instruct,86.5 13 | Gemma-2-27B-it,85.5 14 | Qwen1.5-110B-Chat,79.6 15 | Doubao-pro-32k/240615,79.3 16 | Baichuan4,84.5 17 | Step-1-8K,84.2 18 | abab6.5,62.5 19 | Ernie-4.0-8K-Preview-0518,72.7 20 | Moonshot-v1-8K,63.5 21 | GLM-4-9B-Chat,81.9 22 | Yi-1.5-34B-Chat,63.5 23 | Hunyuan-Standard-256k,65.6 24 | Mixtral-8x22B-Instruct-v0.1,86 25 | Gemma-2-9B-it,69.9 26 | Qwen2-7B-Instruct,79.7 27 | InternLM2.5-7B-Chat,79 28 | Yi-1.5-9B-Chat,54.3 29 | Nanbeige2-16B-Chat,85.8 30 | Llama3.1-8B-Instruct,80.1 31 | DBRX-Instruct,75.3 32 | Yi-1.5-6B-Chat,55.4 33 | InternLM2-Chat-20B,80.3 34 | Mixtral-8x7B-Instruct-v0.1,71 35 | Mistral-7B-Instruct-v0.3,75.4 36 | DeepSeek-V2-Lite-Chat,72.4 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_arena_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,OC_arena 2 | GPT-4o-20240513,1090 3 | Qwen2-72B-Instruct,1085 4 | Qwen-Max-0428,1071 5 | Hunyuan-pro,1069 6 | Claude 3.5 Sonnet 20240620,1055 7 | ERNIE-4.0-8K-Preview-0518,1051 8 | DeepSeek-V2-Chat,1048 9 | Yi-Large,1051 10 | GPT-4-turbo-20240409,1044 11 | GLM-4-0520,1033 12 | DeepSeek-V2,1027 13 | abab6.5-chat,1027 14 | Yi-1.5-34B-Chat,1016 15 | Doubao-pro-32k/240615,1011 16 | Baichuan4,1007 17 | Qwen1.5-32B-Chat,1007 18 | Baichuan4,1007 19 | Qwen1.5-32B-Chat,1007 20 | Qwen1.5-72B-Chat,1007 21 | MoonShot-v1-32K,994 22 | InternLM2-Chat-20B,992 23 | Yi-34B-Chat,983 24 | Command-R+,977
25 | Qwen1.5-7B-Chat,970 26 | InternLM2-Chat-7B,968 27 | Qwen1.5-14B-Chat,968 28 | InternLM2.5-7B-Chat,958 29 | DeepSeek LLM 67B Chat,937 30 | Mixtral-8x22B-Instruct-v0.1,933 31 | Llama3-70B-Instruct,926 32 | Llama3-8B-Instruct,920 33 | DeepSeek MoE 16B Chat,895 34 | DBRX-Instruct,879 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_code_240829_code.csv: -------------------------------------------------------------------------------- 1 | model,OC_Code 2 | Claude-3.5-Sonnet,69.6 3 | GPT-4o-20240513,69.1 4 | Mistral-Large,65.1 5 | Mistral-Large-Instruct-2407,55.6 6 | DeepSeek-V2-Chat(0618),66.2 7 | GPT-4o-mini-20240718,63.3 8 | Qwen-Max-0428,52.4 9 | Yi-Large,54.3 10 | Qwen2-72B-Instruct,49.5 11 | GLM-4,56.3 12 | Llama3.1-70B-Instruct,53.7 13 | Gemma-2-27B-it,54.6 14 | Qwen1.5-110B-Chat,49.5 15 | Doubao-pro-32k/240615,50.2 16 | Baichuan4,44.1 17 | Step-1-8K,44.2 18 | abab6.5,50.5 19 | Ernie-4.0-8K-Preview-0518,50.6 20 | Moonshot-v1-8K,47 21 | GLM-4-9B-Chat,45.1 22 | Yi-1.5-34B-Chat,44.8 23 | Hunyuan-Standard-256k,46.1 24 | Mixtral-8x22B-Instruct-v0.1,44.7 25 | Gemma-2-9B-it,42.2 26 | Qwen2-7B-Instruct,44 27 | InternLM2.5-7B-Chat,34.8 28 | Yi-1.5-9B-Chat,41.8 29 | Nanbeige2-16B-Chat,33.3 30 | Llama3.1-8B-Instruct,39.3 31 | DBRX-Instruct,32.2 32 | Yi-1.5-6B-Chat,34.4 33 | InternLM2-Chat-20B,36.2 34 | Mixtral-8x7B-Instruct-v0.1,26.7 35 | Mistral-7B-Instruct-v0.3,23.6 36 | DeepSeek-V2-Lite-Chat,16.3 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_instruct_240829_instructionfollow.csv: -------------------------------------------------------------------------------- 1 | model,OC_Instruct 2 | Claude-3.5-Sonnet,66.2 3 | GPT-4o-20240513,60.3 4 | Mistral-Large,51.1 5 | Mistral-Large-Instruct-2407,50.3 6 | DeepSeek-V2-Chat(0618),44.1 7 | GPT-4o-mini-20240718,56 8 | Qwen-Max-0428,47.4 9 | Yi-Large,40 10 | Qwen2-72B-Instruct,34 11 | GLM-4,36.9 12 | Llama3.1-70B-Instruct,46.2 13 | Gemma-2-27B-it,45.2 14 | Qwen1.5-110B-Chat,36.8 15 | Doubao-pro-32k/240615,30.6 16 | Baichuan4,39.4 17 | Step-1-8K,38.9 18 | abab6.5,32 19 | Ernie-4.0-8K-Preview-0518,28.5 20 | Moonshot-v1-8K,35.9 21 | GLM-4-9B-Chat,36 22 | Yi-1.5-34B-Chat,38.8 23 | Hunyuan-Standard-256k,29.2 24 | Mixtral-8x22B-Instruct-v0.1,31.2 25 | Gemma-2-9B-it,40.9 26 | Qwen2-7B-Instruct,27.5 27 | InternLM2.5-7B-Chat,26.5 28 | Yi-1.5-9B-Chat,29.8 29 | Nanbeige2-16B-Chat,33.2 30 | Llama3.1-8B-Instruct,39.1 31 | DBRX-Instruct,32.5 32 | Yi-1.5-6B-Chat,26.3 33 | InternLM2-Chat-20B,18.5 34 | Mixtral-8x7B-Instruct-v0.1,28.2 35 | Mistral-7B-Instruct-v0.3,28.5 36 | DeepSeek-V2-Lite-Chat,20.6 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_knowledge_240829_knowledge.csv: -------------------------------------------------------------------------------- 1 | model,OC_Knowledge 2 | Claude-3.5-Sonnet,85 3 | GPT-4o-20240513,85.2 4 | Mistral-Large,83.4 5 | Mistral-Large-Instruct-2407,83.3 6 | DeepSeek-V2-Chat(0618),78.8 7 | GPT-4o-mini-20240718,78.7 8 | Qwen-Max-0428,79 9 | Yi-Large,75.3 10 | Qwen2-72B-Instruct,84 11 | GLM-4,77.7 12 | Llama3.1-70B-Instruct,81.4 13 | Gemma-2-27B-it,58.5 14 | Qwen1.5-110B-Chat,79.3 15 | Doubao-pro-32k/240615,78.3 16 | Baichuan4,74.2 17 | Step-1-8K,72 18 | abab6.5,69.8 19 | Ernie-4.0-8K-Preview-0518,76.4 20 | Moonshot-v1-8K,61 21 | GLM-4-9B-Chat,68.9 22 | Yi-1.5-34B-Chat,65 23 | Hunyuan-Standard-256k,69.7 24 | 
Mixtral-8x22B-Instruct-v0.1,72.2 25 | Gemma-2-9B-it,53.7 26 | Qwen2-7B-Instruct,64.1 27 | InternLM2.5-7B-Chat,64.8 28 | Yi-1.5-9B-Chat,56 29 | Nanbeige2-16B-Chat,53.8 30 | Llama3.1-8B-Instruct,63.2 31 | DBRX-Instruct,66.3 32 | Yi-1.5-6B-Chat,41.3 33 | InternLM2-Chat-20B,60 34 | Mixtral-8x7B-Instruct-v0.1,50.4 35 | Mistral-7B-Instruct-v0.3,47.8 36 | DeepSeek-V2-Lite-Chat,41.3 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_language_240829_language.csv: -------------------------------------------------------------------------------- 1 | model,OC_Language 2 | Claude-3.5-Sonnet,50.9 3 | GPT-4o-20240513,55.5 4 | Mistral-Large,50.9 5 | Mistral-Large-Instruct-2407,50.3 6 | DeepSeek-V2-Chat(0618),46.3 7 | GPT-4o-mini-20240718,50.1 8 | Qwen-Max-0428,56.5 9 | Yi-Large,48.7 10 | Qwen2-72B-Instruct,45.8 11 | GLM-4,45.8 12 | Llama3.1-70B-Instruct,38.4 13 | Gemma-2-27B-it,45.2 14 | Qwen1.5-110B-Chat,53.4 15 | Doubao-pro-32k/240615,31.1 16 | Baichuan4,37.2 17 | Step-1-8K,40.6 18 | abab6.5,44.9 19 | Ernie-4.0-8K-Preview-0518,36.7 20 | Moonshot-v1-8K,46.3 21 | GLM-4-9B-Chat,44.3 22 | Yi-1.5-34B-Chat,50.5 23 | Hunyuan-Standard-256k,30.6 24 | Mixtral-8x22B-Instruct-v0.1,33 25 | Gemma-2-9B-it,40.8 26 | Qwen2-7B-Instruct,43.5 27 | InternLM2.5-7B-Chat,44.6 28 | Yi-1.5-9B-Chat,46.1 29 | Nanbeige2-16B-Chat,50.5 30 | Llama3.1-8B-Instruct,33.7 31 | DBRX-Instruct,25.6 32 | Yi-1.5-6B-Chat,43.6 33 | InternLM2-Chat-20B,36.7 34 | Mixtral-8x7B-Instruct-v0.1,36.6 35 | Mistral-7B-Instruct-v0.3,30.3 36 | DeepSeek-V2-Lite-Chat,31.4 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_math_240829_math.csv: -------------------------------------------------------------------------------- 1 | model,OC_Math 2 | Claude-3.5-Sonnet,71.1 3 | GPT-4o-20240513,71.1 4 | Mistral-Large,66.4 5 | Mistral-Large-Instruct-2407,72.8 6 | DeepSeek-V2-Chat(0618),68.2 7 | GPT-4o-mini-20240718,58.2 8 | Qwen-Max-0428,55.1 9 | Yi-Large,54.8 10 | Qwen2-72B-Instruct,57.7 11 | GLM-4,53.2 12 | Llama3.1-70B-Instruct,58 13 | Gemma-2-27B-it,50.1 14 | Qwen1.5-110B-Chat,39.6 15 | Doubao-pro-32k/240615,67.5 16 | Baichuan4,51.8 17 | Step-1-8K,51.4 18 | abab6.5,47.2 19 | Ernie-4.0-8K-Preview-0518,44.7 20 | Moonshot-v1-8K,46.6 21 | GLM-4-9B-Chat,38.7 22 | Yi-1.5-34B-Chat,38.1 23 | Hunyuan-Standard-256k,53.9 24 | Mixtral-8x22B-Instruct-v0.1,47.2 25 | Gemma-2-9B-it,40.7 26 | Qwen2-7B-Instruct,37.7 27 | InternLM2.5-7B-Chat,40.8 28 | Yi-1.5-9B-Chat,38.2 29 | Nanbeige2-16B-Chat,25.8 30 | Llama3.1-8B-Instruct,38 31 | DBRX-Instruct,35.3 32 | Yi-1.5-6B-Chat,28.4 33 | InternLM2-Chat-20B,27.4 34 | Mixtral-8x7B-Instruct-v0.1,24.8 35 | Mistral-7B-Instruct-v0.3,18.1 36 | DeepSeek-V2-Lite-Chat,22.8 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_reasoning_240829_reasoning.csv: -------------------------------------------------------------------------------- 1 | model,OC_Reasoning 2 | Claude-3.5-Sonnet,57 3 | GPT-4o-20240513,55.8 4 | Mistral-Large,50.1 5 | Mistral-Large-Instruct-2407,50 6 | DeepSeek-V2-Chat(0618),47.4 7 | GPT-4o-mini-20240718,45.4 8 | Qwen-Max-0428,47.9 9 | Yi-Large,47.6 10 | Qwen2-72B-Instruct,44.7 11 | GLM-4,46.1 12 | Llama3.1-70B-Instruct,31.6 13 | Gemma-2-27B-it,45.4 14 | Qwen1.5-110B-Chat,45.8 15 | Doubao-pro-32k/240615,27.8 16 | Baichuan4,38.5 17 | Step-1-8K,35.8 18 | abab6.5,47 19 | Ernie-4.0-8K-Preview-0518,41.3 20 | Moonshot-v1-8K,46 21 | 
GLM-4-9B-Chat,40 22 | Yi-1.5-34B-Chat,42.7 23 | Hunyuan-Standard-256k,36.8 24 | Mixtral-8x22B-Instruct-v0.1,28.6 25 | Gemma-2-9B-it,41.9 26 | Qwen2-7B-Instruct,36.2 27 | InternLM2.5-7B-Chat,39.3 28 | Yi-1.5-9B-Chat,39.8 29 | Nanbeige2-16B-Chat,40.5 30 | Llama3.1-8B-Instruct,24.9 31 | DBRX-Instruct,20.8 32 | Yi-1.5-6B-Chat,36.5 33 | InternLM2-Chat-20B,18.9 34 | Mixtral-8x7B-Instruct-v0.1,28.1 35 | Mistral-7B-Instruct-v0.3,20.7 36 | DeepSeek-V2-Lite-Chat,28.1 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/repoqa_241119_longcontext.csv: -------------------------------------------------------------------------------- 1 | model,RepoQA 2 | gpt-4o-2024-05-13,90.6 3 | gemini-1.5-pro-latest,90.6 4 | claude-3-opus-20240229,90.6 5 | gemini-1.5-flash-latest,90.0 6 | claude-3-sonnet-20240229,87.4 7 | DeepSeek-V2-Chat,83.4 8 | Meta-Llama-3-70B-Instruct,82.2 9 | claude-3-haiku-20240307,81.8 10 | c4ai-command-r-plus,78.4 11 | gpt-4-turbo-2024-04-09,76.4 12 | Mixtral-8x7B-Instruct-v0.1,68.0 13 | Mixtral-8x22B-Instruct-v0.1,67.8 14 | Qwen1.5-72B-Chat,67.0 15 | Phi-3-medium-128k-instruct,63.2 16 | CodeQwen1.5-7B-Chat,62.8 17 | Mistral-7B-Instruct-v0.3,62.0 18 | gpt-3.5-turbo-0125,60.4 19 | Meta-Llama-3-8B-Instruct,53.6 20 | deepseek-coder-33b-instruct,48.4 21 | Mistral-7B-Instruct-v0.2,47.4 22 | CodeLlama-13b-Instruct-hf,42.6 23 | DeepSeek-V2-Lite-Chat,41.6 24 | CodeLlama-34b-Instruct-hf,41.6 25 | Phi-3-small-128k-instruct,39.6 26 | Qwen1.5-32B-Chat,33.8 27 | CodeLlama-7b-Instruct-hf,28.2 28 | Qwen1.5-14B-Chat,26.0 29 | Magicoder-S-DS-6.7B,23.2 30 | Phi-3-mini-128k-instruct,22.4 31 | Mistral-7B-Instruct-v0.1,11.0 32 | deepseek-coder-6.7b-instruct,10.6 33 | Qwen1.5-7B-Chat,2.8 34 | codegemma-7b-it,2.2 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/ruler_bench_241002_longcontext.csv: -------------------------------------------------------------------------------- 1 | model,Ruler 2 | Jamba-1.5-large*,96.3 3 | Gemini-1.5-pro,96.1 4 | Jamba-1.5-mini,94.8 5 | GPT-4-1106-preview,94.1 6 | Llama-3.1-70b,93.7 7 | Command-R-plus-0824,92.4 8 | Qwen2-72b,92.3 9 | Command-R-plus,92.1 10 | Command-R-0824,91.9 11 | GLM4-9b,91.7 12 | Llama3.1-8b,91.3 13 | Command-R,91.1 14 | MegaBeam-Mistral,91.0 15 | Mistral-Large,90.4 16 | GradientAI/Llama3-70b,90.3 17 | Mixtral-8x22B,90.3 18 | Yi-34b,90.1 19 | Phi3-mini,88.7 20 | Phi3-medium,88.3 21 | Mixtral-8x7B,87.9 22 | GradientAI/Llama3-8b,86.3 23 | FILM-7B,84.7 24 | InternLM2.5-7b,83.9 25 | Mistral-7b,81.2 26 | Mistral-Nemo,77.8 27 | GLM3-6b,77.2 28 | LWM,75.7 29 | DBRX,74.7 30 | Qwen1.5-72b,74.0 31 | Together-7b,66.7 32 | LongChat-7b,65.2 33 | LongAlpaca-13b,47.9 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/tablebench_241002_tables.csv: -------------------------------------------------------------------------------- 1 | model,tablebench_overall_dp 2 | Llama2-7B-Chat,16.98 3 | CodeLlama-7B-Instruct,17.01 4 | Gemma-7B-Instruct,14.82 5 | Mistral-7B-Instruct,19.15 6 | Deepseek-Coder-7B-Instruct,13.82 7 | CodeQwen1.5-7B-Chat,16.76 8 | Qwen1.5-7B-Chat,15.84 9 | Qwen2-7B-Instruct,21.23 10 | StructLM-7B,12.06 11 | MAP-Neo-7B-Instruct,12.66 12 | Llama3-8B-Chat,27.28 13 | Llama3.1-8B-Instruct,23.47 14 | Llama2-13B-Chat,18.58 15 | StructLM-13B,11.52 16 | WizardLM-13B,20.8 17 | Qwen1.5-14B-Chat,17.76 18 | Qwen1.5-32B-Chat,20.21 19 | Deepseek-Coder-33B-Instruct,9.74 20 | CodeLlama-34B-Instruct,21.6 21 | 
StructLM-34B,0.6 22 | Mixtral-8x7B-Instruct,24.98 23 | Qwen1.5-72B-Chat,28.45 24 | Qwen2-72B-Instruct,32.52 25 | Qwen1.5-110B-Chat,29.72 26 | Llama3-70B-Chat,30.91 27 | Llama3.1-70B-Instruct,33.63 28 | GPT-3.5-Turbo,27.75 29 | Qwen-Max,29.63 30 | Yi-Large,32.43 31 | GLM-4,31.23 32 | Deepseek-Chat-V2,40.65 33 | Deepseek-Coder-V2,35.21 34 | GPT-4-Turbo,40.38 35 | GPT-4o,42.73 36 | TableLLM-CodeQwen-7B,26.08 37 | TableLLM-Deepseek-Coder-7B,27.98 38 | TableLLM-Llama3.1-8B,27.19 39 | TableLLM-Llama3-8B,26.93 40 | TableLLM-Qwen2-7B,27.14 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/toolbench_240829_tools.csv: -------------------------------------------------------------------------------- 1 | model,Toolbench 2 | gpt4,68.8 3 | text-davinci-003,67.2 4 | gpt-3.5-turbo,56.6 5 | text-curie-001,10.6 6 | Llama-2-70b,61 7 | Llama-2-13b,48.8 8 | Llama-2-7b,39.5 9 | llama-65b,55.6 10 | llama-30b,49.6 11 | llama-13b,36.8 12 | llama-13b-alpaca,26.9 13 | CodeLlama-7b-hf,48.3 14 | CodeLlama-7b-Instruct-hf,50.5 15 | CodeLlama-7b-Python-hf,52.2 16 | CodeLlama-13b-hf,56.9 17 | CodeLlama-13b-Instruct-hf,60.5 18 | CodeLlama-13b-Python-hf,56.3 19 | CodeLlama-34b-hf,62.9 20 | CodeLlama-34b-Instruct-hf,64.8 21 | CodeLlama-34b-Python-hf,59.2 22 | starcoder,49.7 23 | starcoderbase,52.2 24 | codegen-16B-nl,28.2 25 | codegen-16B-multi,28.8 26 | codegen-16B-mono,35.6 27 | bloomz,27.8 28 | opt-iml-30b,14.1 29 | opt-30b,13.4 30 | opt-iml-1.3b,7 31 | opt-1.3b,7.5 32 | neox-20b,26.4 33 | GPT-NeoXT-Chat-Base-20B,22.6 34 | pythia-12b,19.5 35 | dolly-v2-12b,5 36 | pythia-6.9b,19.4 37 | pythia-2.8b,18.6 38 | pythia-1.4b,15.9 39 | stablelm-base-alpha-7b,10.8 40 | stablelm-tuned-alpha-7b,9.2 41 | stablelm-base-alpha-3b,5.2 42 | stablelm-tuned-alpha-3b,6.6 43 | llama-30b-toolbench,50.2 44 | starcoder-toolbench,51.7 45 | codegen-16B-mono-toolbench,51.6 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/wildbench_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,WB-Score 2 | gpt-4o-2024-05-13,59.3 3 | Claude_3.5_Sonnet,54.7 4 | Gemini_1.5_Pro,53 5 | gpt-4-turbo-2024-04-09,55.2 6 | Yi-Large-Preview,55.3 7 | DeepSeek-V2-Chat_0628_API,54 8 | gpt-4-0125-preview,52.3 9 | Claude_3_Opus,51.7 10 | Gemini_1.5_Flash,48.9 11 | Llama-3-70B-Instruct,47.8 12 | DeepSeek-V2-Coder_0614_API,45.7 13 | Yi-Large,48.9 14 | Athene-70B,59.5 15 | Nemotron-4-340B-Inst,47.7 16 | Gemma-2-27B-it,48.5 17 | Mistral-Large-2,55.6 18 | Claude_3_Sonnet,45.5 19 | gpt-4o-mini-2024-07-18,57.1 20 | Qwen2-72B-Instruct,44.5 21 | Reka_Core,45.9 22 | gemma-2-9b-it-SimPO,53.3 23 | gemma-2-9b-it-DPO,53.2 24 | Yi-1.5-34B-Chat,45.6 25 | Claude_3_Haiku,38.9 26 | Mistral-Nemo-Inst_12B,44.4 27 | Mistral-Large,38.9 28 | Gemma-2-9B-it,42.7 29 | Command-R-Plus,36.8 30 | GLM-4-9B-Chat,39.1 31 | Magpie-8B-Align-v0.1,39.3 32 | Yi-1.5-9B-Chat,38.7 33 | Llama3-Inst-8B-SimPO,37 34 | Llama3-Inst-8B-SimPO-v0.2,37.2 35 | Qwen1.5-72B-Chat,39.9 36 | Llama3-Inst-8B-SimPO-ExPO,35 37 | SELM_Llama3-8B-Inst-iter3,35.3 38 | Phi-3-medium-128k,27.3 39 | Llama-3-8B-Instruct,29.2 40 | Hermes-2-Theta-Llama-3-8B,29.6 41 | Starling-LM-7B-beta-ExPO,31.6 42 | SELM_Zephyr-7B-iter3,25.1 43 | Reka_Flash,30.4 44 | Gemma-2-2B-it,27.8 45 | gpt-3.5-turbo-0125,30 46 | DBRX_Instruct,32.6 47 | Neo-7B-Instruct-ExPO,23.1 48 | Neo-7B-Instruct,25 49 | StarlingLM-7B-beta,30.2 50 | Command-R,29.5 51 | Mixtral-8x7B-Instruct,31.5 52 | 
Yi-1.5-6B-Chat,23.3 53 | Tulu-2-dpo-70b,28 54 | Reka_Edge,21.3 55 | Mistral-7B-Instruct-v0.2,25.6 56 | Llama-2-70B-chat,20.7 57 | Qwen1.5-7B-Chat,23.4 58 | Hermes-2-Mixtral-8x7B-DPO,30.7 59 | Phi-3-mini-128k,24.7 60 | Gemma-7B-it,6.6 61 | Llama-2-7B-chat,8.3 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/arena_hard_2404.csv: -------------------------------------------------------------------------------- 1 | model,score,scenario,source,aggragated_from,tag 2 | gpt_4_turbo_2024_04_09,82.6,arena_hard,arena_hard_2404,[],holistic 3 | gpt_4_0125_preview,78.0,arena_hard,arena_hard_2404,[],holistic 4 | gemini_1.5_pro_api_preview,72.0,arena_hard,arena_hard_2404,[],holistic 5 | yi_large,63.7,arena_hard,arena_hard_2404,[],holistic 6 | claude_3_opus_20240229,60.4,arena_hard,arena_hard_2404,[],holistic 7 | glm_4,55.7,arena_hard,arena_hard_2404,[],holistic 8 | gpt_4_0314,50.0,arena_hard,arena_hard_2404,[],holistic 9 | gemini_1.5_flash_api_preview,49.6,arena_hard,arena_hard_2404,[],holistic 10 | claude_3_sonnet_20240229,46.8,arena_hard,arena_hard_2404,[],holistic 11 | claude_3_haiku_20240307,41.5,arena_hard,arena_hard_2404,[],holistic 12 | llama_3_70b_chat,41.1,arena_hard,arena_hard_2404,[],holistic 13 | gpt_4_0613,37.9,arena_hard,arena_hard_2404,[],holistic 14 | mistral_large_2402,37.7,arena_hard,arena_hard_2404,[],holistic 15 | mixtral_8x22b_instruct_v0.1,36.4,arena_hard,arena_hard_2404,[],holistic 16 | qwen1.5_72b_chat,36.1,arena_hard,arena_hard_2404,[],holistic 17 | command_r_plus,33.1,arena_hard,arena_hard_2404,[],holistic 18 | mistral_medium,31.9,arena_hard,arena_hard_2404,[],holistic 19 | mistral_next,27.4,arena_hard,arena_hard_2404,[],holistic 20 | gpt_3.5_turbo_0613,24.8,arena_hard,arena_hard_2404,[],holistic 21 | claude_2.0,24.0,arena_hard,arena_hard_2404,[],holistic 22 | dbrx_instruct,23.9,arena_hard,arena_hard_2404,[],holistic 23 | mixtral_8x7b_instruct_v0.1,23.4,arena_hard,arena_hard_2404,[],holistic 24 | gpt_3.5_turbo_0125,23.3,arena_hard,arena_hard_2404,[],holistic 25 | yi_34b_chat,23.1,arena_hard,arena_hard_2404,[],holistic 26 | starling_lm_7b_beta,23.0,arena_hard,arena_hard_2404,[],holistic 27 | claude_2.1,22.8,arena_hard,arena_hard_2404,[],holistic 28 | snorkel_mistral_pairrm_dpo,20.7,arena_hard,arena_hard_2404,[],holistic 29 | llama_3_8b_chat,20.6,arena_hard,arena_hard_2404,[],holistic 30 | gpt_3.5_turbo_1106,18.9,arena_hard,arena_hard_2404,[],holistic 31 | gpt_3.5_turbo_0301,18.1,arena_hard,arena_hard_2404,[],holistic 32 | gemini_1.0_pro,17.8,arena_hard,arena_hard_2404,[],holistic 33 | snowflake_arctic_instruct,17.6,arena_hard,arena_hard_2404,[],holistic 34 | command_r,17.0,arena_hard,arena_hard_2404,[],holistic 35 | phi_3_mini_128k_instruct,15.4,arena_hard,arena_hard_2404,[],holistic 36 | tulu_2_dpo_70b,15.0,arena_hard,arena_hard_2404,[],holistic 37 | starling_lm_7b_alpha,12.8,arena_hard,arena_hard_2404,[],holistic 38 | mistral_7b_instruct,12.6,arena_hard,arena_hard_2404,[],holistic 39 | gemma_1.1_7b_it,12.1,arena_hard,arena_hard_2404,[],holistic 40 | llama_2_70b_chat,11.6,arena_hard,arena_hard_2404,[],holistic 41 | vicuna_33b_v1.3,8.6,arena_hard,arena_hard_2404,[],holistic 42 | gemma_7b_it,7.5,arena_hard,arena_hard_2404,[],holistic 43 | llama_2_7b_chat,4.6,arena_hard,arena_hard_2404,[],holistic 44 | gemma_1.1_2b_it,3.4,arena_hard,arena_hard_2404,[],holistic 45 | gemma_2b_it,3.0,arena_hard,arena_hard_2404,[],holistic 46 | --------------------------------------------------------------------------------
/src/bat/assets/benchmarks_old/chatbot_arena_240829.csv: -------------------------------------------------------------------------------- 1 | model,arena_elo 2 | claude-3-5-sonnet-20240620,79.3 3 | gpt-4o-2024-05-13,79.2 4 | gpt-4-0125-preview,78.0 5 | gpt-4o-2024-08-06,77.9 6 | athene-70b,77.6 7 | gpt-4o-mini,74.9 8 | gemini-1.5-pro-api-preview,72.0 9 | mistral-large-2407,70.4 10 | llama-3.1-405b-instruct,64.1 11 | glm-4-0520,63.8 12 | yi-large,63.7 13 | deepseek-coder-v2,62.3 14 | claude-3-opus-20240229,60.4 15 | gemma-2-27b-it,57.5 16 | llama-3.1-70b-instruct,55.7 17 | glm-4-0116,55.7 18 | glm-4-air,50.9 19 | gpt-4-0314,50.0 20 | gemini-1.5-flash-api-preview,49.6 21 | qwen2-72b-instruct,46.9 22 | claude-3-sonnet-20240229,46.8 23 | llama-3-70b-instruct,46.6 24 | claude-3-haiku-20240307,41.5 25 | gpt-4-0613,37.9 26 | mistral-large-2402,37.7 27 | mixtral-8x22b-instruct-v0.1,36.4 28 | Qwen1.5-72B-Chat,36.1 29 | phi-3-medium-4k-instruct,33.4 30 | command-r-plus,33.1 31 | mistral-medium,31.9 32 | internlm2.5-20b-chat,31.2 33 | phi-3-small-8k-instruct,29.8 34 | mistral-next,27.4 35 | gpt-3.5-turbo-0613,24.8 36 | dbrx-instruct-preview,24.6 37 | internlm2-20b-chat,24.4 38 | claude-2.0,24.0 39 | Mixtral-8x7B-Instruct-v0.1,23.4 40 | gpt-3.5-turbo-0125,23.3 41 | Yi-34B-Chat,23.1 42 | Starling-LM-7B-beta,23.0 43 | claude-2.1,22.8 44 | llama-3.1-8b-instruct,21.3 45 | Snorkel-Mistral-PairRM-DPO,20.7 46 | llama-3-8b-instruct,20.6 47 | gpt-3.5-turbo-1106,18.9 48 | gpt-3.5-turbo-0301,18.1 49 | gemini-1.0-pro,17.8 50 | snowflake-arctic-instruct,17.6 51 | command-r,17.0 52 | phi-3-mini-128k-instruct,15.4 53 | tulu-2-dpo-70b,15.0 54 | Starling-LM-7B-alpha,12.8 55 | mistral-7b-instruct,12.6 56 | gemma-1.1-7b-it,12.1 57 | Llama-2-70b-chat-hf,11.6 58 | vicuna-33b-v1.3,8.6 59 | gemma-7b-it,7.5 60 | Llama-2-7b-chat-hf,4.6 61 | gemma-1.1-2b-it,3.4 62 | gemma-2b-it,3.0 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/helm_classic_240829.csv: -------------------------------------------------------------------------------- 1 | model,helm_classic,HELM_MMLU,HELM_BoolQ,HELM_NarrativeQA,HELM_NaturalQuestions(closed),HELM_NaturalQuestions(open),HELM_QuAC,HELM_HellaSwag,HELM_OpenbookQA,HELM_TruthfulQA,HELM_MS MARCO(regular),HELM_MS MARCO(TREC),HELM_CNN/DailyMail,XSUM,HELM_IMDB,HELM_CivilComments,HELM_RAFT 2 | Llama 2 (70B),0.944,0.582,0.886,0.77,0.458,0.674,0.484,-,0.554,-,-,-,-,-,0.961,0.652,0.727 3 | LLaMA (65B),0.908,0.584,0.871,0.755,0.431,0.672,0.401,-,0.508,-,-,-,-,-,0.962,0.655,0.702 4 | text-davinci-002,0.905,0.568,0.877,0.727,0.383,0.713,0.445,0.815,0.594,0.61,0.421,0.664,0.153,0.144,0.948,0.668,0.733 5 | Mistral v0.1 (7B),0.884,0.572,0.874,0.716,0.365,0.687,0.423,-,0.422,-,-,-,-,-,0.962,0.624,0.707 6 | Cohere Command beta (52.4B),0.874,0.452,0.856,0.752,0.372,0.76,0.432,0.811,0.582,0.269,0.472,0.762,0.161,0.152,0.96,0.601,0.667 7 | text-davinci-003,0.872,0.569,0.881,0.727,0.406,0.77,0.525,0.822,0.646,0.593,0.368,0.644,0.156,0.124,0.848,0.684,0.759 8 | Jurassic-2 Jumbo (178B),0.824,0.48,0.829,0.733,0.385,0.669,0.435,0.788,0.558,0.437,0.398,0.661,0.149,0.182,0.938,0.57,0.746 9 | Llama 2 (13B),0.823,0.507,0.811,0.744,0.376,0.637,0.424,-,0.33,-,-,-,-,-,0.962,0.588,0.707 10 | TNLG v2 (530B),0.787,0.469,0.809,0.722,0.384,0.642,0.39,0.799,0.562,0.251,0.377,0.643,0.161,0.169,0.941,0.601,0.679 11 | gpt-3.5-turbo-0613,0.783,0.391,0.87,0.625,0.348,0.675,0.485,-,0.339,-,-,-,-,-,0.943,0.696,0.748 12 | LLaMA 
(30B),0.781,0.531,0.861,0.752,0.408,0.666,0.39,-,0.344,-,-,-,-,-,0.927,0.549,0.752 13 | Anthropic-LM v4-s3 (52B),0.78,0.481,0.815,0.728,0.288,0.686,0.431,0.807,0.558,0.368,-,-0.154,0.134,0.934,0.61,0.699, 14 | gpt-3.5-turbo-0301,0.76,0.59,0.74,0.663,0.39,0.624,0.512,-,0.609,-,-,-,-,-,0.899,0.674,0.768 15 | Jurassic-2 Grande (17B),0.743,0.475,0.826,0.737,0.356,0.639,0.418,0.781,0.542,0.348,0.293,0.514,0.144,0.167,0.938,0.547,0.712 16 | Palmyra X (43B),0.732,0.609,0.896,0.742,0.413,-,0.473,-,0.616,-,-,0.049,0.149,0.935,0.008,0.701, 17 | Falcon (40B),0.729,0.509,0.819,0.673,0.392,0.675,0.307,-,0.353,-,-,-,-,-,0.959,0.552,0.661 18 | Falcon-Instruct (40B),0.727,0.497,0.829,0.625,0.377,0.666,0.371,-,0.384,-,-,-,-,-,0.959,0.603,0.586 19 | MPT-Instruct (30B),0.716,0.444,0.85,0.733,0.304,0.697,0.327,-,0.234,-,-,-,-,-,0.956,0.573,0.68 20 | MPT (30B),0.714,0.437,0.704,0.732,0.347,0.673,0.393,-,0.231,-,-,-,-,-,0.959,0.599,0.723 21 | J1-Grande v2 beta (17B),0.706,0.445,0.812,0.725,0.337,0.625,0.392,0.764,0.56,0.306,0.285,0.46,0.146,0.152,0.957,0.546,0.679 22 | Vicuna v1.3 (13B),0.706,0.462,0.808,0.691,0.346,0.686,0.403,-,0.385,-,-,-,-,-,0.762,0.645,0.657 23 | Cohere Command beta (6.1B),0.675,0.406,0.798,0.709,0.229,0.717,0.375,0.752,0.55,0.203,0.434,0.709,0.153,0.122,0.961,0.54,0.634 24 | Cohere xlarge v20221108 (52.4B),0.664,0.382,0.762,0.672,0.361,0.628,0.374,0.81,0.588,0.169,0.315,0.55,0.153,0.153,0.956,0.524,0.624 25 | Luminous Supreme (70B),0.662,0.38,0.775,0.711,0.293,0.649,0.37,-,0.222,-,-,0.15,0.136,0.959,0.562,0.653, 26 | Vicuna v1.3 (7B),0.625,0.434,0.76,0.643,0.287,0.634,0.392,-,0.292,-,-,-,-,-,0.916,0.62,0.693 27 | OPT (175B),0.609,0.318,0.793,0.671,0.297,0.615,0.36,0.791,0.586,0.25,0.288,0.448,0.146,0.155,0.947,0.505,0.606 28 | Llama 2 (7B),0.607,0.431,0.762,0.691,0.337,0.611,0.406,-,0.272,-,-,-,-,-,0.907,0.562,0.643 29 | LLaMA (13B),0.595,0.422,0.714,0.711,0.346,0.614,0.347,-,0.324,-,-,-,-,-,0.928,0.6,0.643 30 | InstructPalmyra (30B),0.568,0.403,0.751,0.496,0.33,0.682,0.433,-,0.185,-,-,0.152,0.104,0.94,0.555,0.652, 31 | Cohere xlarge v20220609 (52.4B),0.56,0.353,0.718,0.65,0.312,0.595,0.361,0.811,0.55,0.198,0.273,0.459,0.144,0.129,0.956,0.532,0.633 32 | Jurassic-2 Large (7.5B),0.553,0.339,0.742,-,0.274,0.589,-,0.729,0.53,0.245,0.247,0.464,0.136,0.142,0.956,0.57,0.622 33 | davinci (175B),0.538,0.422,0.722,0.687,0.329,0.625,0.36,0.775,0.586,0.194,0.211,0.378,0.127,0.126,0.933,0.532,0.642 34 | LLaMA (7B),0.533,0.321,0.756,0.669,0.297,0.589,0.338,-,0.28,-,-,-,-,-,0.947,0.563,0.573 35 | RedPajama-INCITE-Instruct (7B),0.524,0.363,0.705,0.638,0.232,0.659,0.26,-,0.243,-,-,-,-,-,0.927,0.664,0.695 36 | J1-Jumbo v1 (178B),0.517,0.259,0.776,0.695,0.293,0.595,0.358,0.765,0.534,0.175,0.21,0.363,0.144,0.129,0.943,0.553,0.681 37 | GLM (130B),0.512,0.344,0.784,0.706,0.148,0.642,0.272,-,0.218,-,-,0.154,0.132,0.955,0.5,0.598, 38 | Luminous Extended (30B),0.485,0.321,0.767,0.665,0.254,0.609,0.349,-,0.221,-,-,0.139,0.124,0.947,0.524,0.523, 39 | OPT (66B),0.448,0.276,0.76,0.638,0.258,0.596,0.357,0.745,0.534,0.201,0.237,0.482,0.136,0.126,0.917,0.506,0.557 40 | BLOOM (176B),0.446,0.299,0.704,0.662,0.216,0.621,0.361,0.744,0.534,0.205,0.236,0.386,0.08,0.03,0.945,0.62,0.592 41 | J1-Grande v1 (17B),0.433,0.27,0.722,0.672,0.233,0.578,0.362,0.739,0.52,0.193,0.161,0.341,0.143,0.122,0.953,0.529,0.658 42 | Alpaca (7B),0.381,0.385,0.778,0.396,0.266,0.592,0.27,-,0.243,-,-,-,-,-,0.738,0.566,0.486 43 | Falcon (7B),0.378,0.286,0.753,0.621,0.285,0.579,0.332,-,0.234,-,-,-,-,-,0.836,0.514,0.602 44 | RedPajama-INCITE-Base 
(7B),0.378,0.302,0.713,0.617,0.25,0.586,0.336,-,0.205,-,-,-,-,-,0.752,0.547,0.648 45 | Cohere large v20220720 (13.1B),0.372,0.324,0.725,0.625,0.232,0.573,0.338,0.736,0.542,0.181,0.19,0.33,0.126,0.108,0.933,0.507,0.596 46 | RedPajama-INCITE-Instruct-v1 (3B),0.366,0.257,0.677,0.638,0.203,0.637,0.259,-,0.208,-,-,-,-,-,0.894,0.549,0.661 47 | text-curie-001,0.36,0.237,0.62,0.582,0.175,0.571,0.358,0.676,0.514,0.257,0.271,0.507,0.152,0.076,0.923,0.537,0.489 48 | GPT-NeoX (20B),0.351,0.276,0.683,0.599,0.193,0.596,0.326,0.718,0.524,0.216,0.184,0.398,0.123,0.102,0.948,0.516,0.505 49 | Luminous Base (13B),0.315,0.27,0.719,0.605,0.202,0.568,0.334,-,0.182,-,-,0.11,0.105,0.939,0.544,0.473, 50 | Cohere medium v20221108 (6.1B),0.312,0.254,0.7,0.61,0.199,0.517,0.314,0.726,0.538,0.215,0.175,0.373,0.121,0.099,0.935,0.5,0.591 51 | RedPajama-INCITE-Base-v1 (3B),0.311,0.263,0.685,0.555,0.207,0.52,0.309,-,0.277,-,-,-,-,-,0.907,0.549,0.502 52 | TNLG v2 (6.7B),0.309,0.242,0.698,0.631,0.21,0.561,0.345,0.704,0.478,0.167,0.158,0.332,0.146,0.11,0.927,0.532,0.525 53 | J1-Large v1 (7.5B),0.285,0.241,0.683,0.623,0.19,0.532,0.328,0.7,0.514,0.197,0.147,0.292,0.134,0.102,0.956,0.532,0.545 54 | GPT-J (6B),0.273,0.249,0.649,0.545,0.156,0.559,0.33,0.663,0.514,0.199,0.152,0.345,0.131,0.096,0.939,0.52,0.619 55 | Pythia (12B),0.257,0.274,0.662,0.596,0.175,0.581,0.313,-,0.177,-,-,-,-,-,0.931,0.531,0.514 56 | curie (6.7B),0.247,0.243,0.656,0.604,0.199,0.552,0.321,0.682,0.502,0.232,0.162,0.3,0.113,0.091,0.889,0.539,0.49 57 | Falcon-Instruct (7B),0.244,0.275,0.72,0.476,0.194,0.449,0.311,-,0.213,-,-,-,-,-,0.852,0.511,0.523 58 | Cohere medium v20220720 (6.1B),0.23,0.279,0.659,0.559,0.177,0.504,0.279,0.706,0.496,0.19,0.152,0.374,0.077,0.087,0.935,0.504,0.52 59 | text-babbage-001,0.229,0.229,0.451,0.429,0.07,0.33,0.284,0.561,0.452,0.233,0.208,0.449,0.151,0.046,0.913,0.499,0.509 60 | T0pp (11B),0.197,0.407,0.0,0.151,0.039,0.19,0.121,-,0.377,-,-,0.122,0.09,0.207,0.234,0.118, 61 | Pythia (6.9B),0.196,0.236,0.631,0.528,0.142,0.539,0.296,-,0.213,-,-,-,-,-,0.928,0.511,0.502 62 | UL2 (20B),0.167,0.291,0.746,0.083,0.204,0.349,0.144,-,0.193,-,-,0.03,0.058,0.337,0.521,0.404, 63 | T5 (11B),0.131,0.29,0.761,0.086,0.194,0.477,0.116,-,0.133,-,-,0.043,0.015,0.379,0.509,0.37, 64 | babbage (1.3B),0.114,0.235,0.574,0.491,0.119,0.451,0.273,0.555,0.438,0.188,0.122,0.317,0.079,0.045,0.597,0.519,0.455 65 | Cohere small v20220720 (410M),0.109,0.264,0.457,0.294,0.078,0.309,0.219,0.483,0.348,0.217,-,0.304,0.063,0.033,0.578,0.501,0.492 66 | ada (350M),0.108,0.243,0.581,0.326,0.082,0.365,0.242,0.435,0.38,0.215,0.102,0.29,0.09,0.022,0.849,0.517,0.423 67 | text-ada-001,0.107,0.238,0.464,0.238,0.025,0.149,0.176,0.429,0.346,0.232,0.134,0.302,0.136,0.034,0.822,0.503,0.406 68 | YaLM (100B),0.075,0.243,0.634,0.252,0.068,0.227,0.162,-,0.202,-,-,0.017,0.021,0.836,0.49,0.395, -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/helm_lite_240829.csv: -------------------------------------------------------------------------------- 1 | model,helm_lite,HELM_Lite_NarrativeQA,HELM_Lite_NaturalQuestions(open),HELM_Lite_NaturalQuestions(closed),HELM_Lite_OpenbookQA,HELM_Lite_MMLU,HELM_Lite_MATH-Equivalent(CoT),HELM_Lite_GSM8K,HELM_Lite_LegalBench,HELM_Lite_MedQA,HELM_Lite_WMT2014 2 | GPT-4o (2024-05-13),0.963,0.804,0.803,0.501,0.966,0.748,0.829,0.905,0.733,0.857,0.231 3 | Claude 3.5 Sonnet (20240620),0.915,0.746,0.749,0.502,0.972,0.799,0.813,0.949,0.707,0.825,0.229 4 | GPT-4 
(0613),0.915,0.768,0.79,0.457,0.96,0.735,0.802,0.932,0.713,0.815,0.211 5 | GPT-4 Turbo (2024-04-09),0.908,0.761,0.795,0.482,0.97,0.711,0.833,0.824,0.727,0.783,0.218 6 | Llama 3.1 Instruct Turbo (405B),0.896,0.749,0.756,0.456,0.94,0.759,0.827,0.949,0.707,0.805,0.238 7 | Llama 3.1 Instruct Turbo (70B),0.858,0.772,0.738,0.452,0.938,0.709,0.783,0.938,0.687,0.769,0.223 8 | Llama 3 (70B),0.838,0.798,0.743,0.475,0.934,0.695,0.663,0.805,0.733,0.777,0.225 9 | Qwen2 Instruct (72B),0.827,0.727,0.776,0.39,0.954,0.769,0.79,0.92,0.712,0.746,0.207 10 | Mistral Large 2 (2407),0.803,0.779,0.734,0.453,0.932,0.725,0.677,0.912,0.646,0.775,0.192 11 | Gemini 1.5 Pro (001),0.793,0.783,0.748,0.378,0.902,0.772,0.825,0.836,0.757,0.692,0.189 12 | GPT-4o mini (2024-07-18),0.776,0.768,0.746,0.386,0.92,0.668,0.802,0.843,0.653,0.748,0.206 13 | Mixtral (8x22B),0.767,0.779,0.726,0.478,0.882,0.701,0.656,0.8,0.708,0.704,0.209 14 | GPT-4 Turbo (1106 preview),0.758,0.727,0.763,0.435,0.95,0.699,0.857,0.668,0.626,0.817,0.205 15 | Palmyra X V3 (72B),0.749,0.706,0.685,0.407,0.938,0.702,0.723,0.831,0.709,0.684,0.262 16 | Gemma 2 Instruct (27B),0.742,0.79,0.731,0.353,0.918,0.664,0.746,0.812,0.7,0.684,0.214 17 | Gemini 1.5 Flash (001),0.733,0.783,0.723,0.332,0.928,0.703,0.753,0.785,0.661,0.68,0.225 18 | Claude 3 Opus (20240229),0.722,0.351,0.264,0.441,0.956,0.768,0.76,0.924,0.662,0.775,0.24 19 | PaLM-2 (Unicorn),0.703,0.583,0.674,0.435,0.938,0.702,0.674,0.831,0.677,0.684,0.26 20 | Qwen1.5 (72B),0.68,0.601,0.758,0.417,0.93,0.647,0.683,0.799,0.694,0.67,0.201 21 | Palmyra X V2 (33B),0.659,0.752,0.752,0.428,0.878,0.621,0.58,0.735,0.644,0.598,0.239 22 | Gemma 2 Instruct (9B),0.639,0.768,0.738,0.328,0.91,0.645,0.724,0.762,0.639,0.63,0.201 23 | Yi (34B),0.634,0.782,0.775,0.443,0.92,0.65,0.375,0.648,0.618,0.656,0.172 24 | Qwen1.5 Chat (110B),0.619,0.721,0.739,0.35,0.922,0.704,0.568,0.815,0.624,0.64,0.192 25 | Qwen1.5 (32B),0.615,0.589,0.777,0.353,0.932,0.628,0.733,0.773,0.636,0.656,0.193 26 | Claude v1.3,0.594,0.723,0.699,0.409,0.908,0.631,0.54,0.784,0.629,0.618,0.219 27 | PaLM-2 (Bison),0.584,0.718,0.813,0.39,0.878,0.608,0.421,0.61,0.645,0.547,0.241 28 | Mixtral (8x7B 32K seqlen),0.582,0.767,0.699,0.427,0.868,0.649,0.494,0.622,0.63,0.652,0.19 29 | Phi-3 (14B),0.579,0.724,0.729,0.278,0.916,0.675,0.611,0.878,0.593,0.696,0.17 30 | Claude 2.0,0.56,0.718,0.67,0.428,0.862,0.639,0.603,0.583,0.643,0.652,0.219 31 | DeepSeek LLM Chat (67B),0.556,0.581,0.733,0.412,0.88,0.641,0.615,0.795,0.637,0.628,0.186 32 | Phi-3 (7B),0.545,0.754,0.675,0.324,0.912,0.659,0.703,-,0.584,0.672,0.154 33 | Llama 2 (70B),0.537,0.763,0.674,0.46,0.838,0.58,0.323,0.567,0.673,0.618,0.196 34 | Yi Large (Preview),0.53,0.373,0.586,0.428,0.946,0.712,0.712,0.69,0.519,0.66,0.176 35 | Command R Plus,0.509,0.735,0.711,0.343,0.828,0.59,0.403,0.738,0.672,0.567,0.203 36 | GPT-3.5 (text-davinci-003),0.503,0.731,0.77,0.413,0.828,0.555,0.449,0.615,0.622,0.531,0.191 37 | Claude 2.1,0.503,0.677,0.611,0.375,0.872,0.643,0.632,0.604,0.643,0.644,0.204 38 | Qwen1.5 (14B),0.491,0.711,0.772,0.3,0.862,0.626,0.686,0.693,0.593,0.515,0.178 39 | Gemini 1.0 Pro (002),0.484,0.751,0.714,0.391,0.788,0.534,0.665,0.816,0.475,0.483,0.194 40 | Claude Instant 1.2,0.464,0.616,0.731,0.343,0.844,0.631,0.499,0.721,0.586,0.559,0.194 41 | Llama 3 (8B),0.441,0.754,0.681,0.378,0.766,0.602,0.391,0.499,0.637,0.581,0.183 42 | GPT-3.5 Turbo (0613),0.42,0.655,0.678,0.335,0.838,0.614,0.667,0.501,0.528,0.622,0.187 43 | Claude 3 Sonnet (20240229),0.42,0.111,0.072,0.028,0.918,0.652,0.084,0.907,0.49,0.684,0.218 44 | Mistral NeMo 
(2402),0.401,0.731,0.65,0.265,0.822,0.604,0.668,0.782,0.415,0.59,0.177 45 | Arctic Instruct,0.399,0.654,0.586,0.39,0.828,0.575,0.519,0.768,0.588,0.581,0.172 46 | Gemma (7B),0.392,0.752,0.665,0.336,0.808,0.571,0.5,0.559,0.581,0.513,0.187 47 | GPT-3.5 (text-davinci-002),0.392,0.719,0.71,0.394,0.796,0.568,0.428,0.479,0.58,0.525,0.174 48 | LLaMA (65B),0.39,0.755,0.672,0.433,0.754,0.584,0.257,0.489,0.48,0.507,0.189 49 | Mistral Large (2402),0.382,0.454,0.485,0.311,0.894,0.638,0.75,0.694,0.479,0.499,0.182 50 | Command,0.365,0.749,0.777,0.391,0.774,0.525,0.236,0.452,0.578,0.445,0.088 51 | Command R,0.35,0.742,0.72,0.352,0.782,0.567,0.266,0.551,0.507,0.555,0.149 52 | Llama 3.1 Instruct Turbo (8B),0.347,0.756,0.677,0.209,0.74,0.5,0.703,0.798,0.342,0.245,0.181 53 | Mistral Small (2402),0.342,0.519,0.587,0.304,0.862,0.593,0.621,0.734,0.389,0.616,0.169 54 | DBRX Instruct,0.341,0.488,0.55,0.284,0.91,0.643,0.358,0.671,0.426,0.694,0.131 55 | Jamba Instruct,0.339,0.658,0.636,0.384,0.796,0.582,0.38,0.67,0.54,0.519,0.164 56 | Mistral v0.1 (7B),0.338,0.716,0.687,0.367,0.776,0.584,0.297,0.377,0.58,0.525,0.16 57 | Mistral Medium (2312),0.318,0.449,0.468,0.29,0.83,0.618,0.565,0.706,0.452,0.61,0.169 58 | Qwen1.5 (7B),0.317,0.448,0.749,0.27,0.806,0.569,0.561,0.6,0.523,0.479,0.153 59 | Claude 3 Haiku (20240307),0.309,0.244,0.252,0.144,0.838,0.662,0.131,0.699,0.46,0.702,0.148 60 | Yi (6B),0.289,0.702,0.748,0.31,0.8,0.53,0.126,0.375,0.519,0.497,0.117 61 | Llama 2 (13B),0.273,0.741,0.64,0.371,0.634,0.505,0.102,0.266,0.591,0.392,0.167 62 | Jurassic-2 Jumbo (178B),0.254,0.728,0.65,0.385,0.688,0.483,0.103,0.239,0.533,0.431,0.114 63 | Falcon (40B),0.249,0.671,0.676,0.392,0.662,0.507,0.128,0.267,0.442,0.419,0.162 64 | Mistral Instruct v0.3 (7B),0.233,0.716,0.68,0.253,0.79,0.51,0.289,0.538,0.331,0.517,0.142 65 | Jurassic-2 Grande (17B),0.203,0.744,0.627,0.35,0.614,0.471,0.064,0.159,0.468,0.39,0.102 66 | Phi-2,0.202,0.703,0.68,0.155,0.798,0.518,0.255,0.581,0.334,0.41,0.038 67 | Llama 2 (7B),0.18,0.686,0.612,0.333,0.544,0.425,0.097,0.154,0.502,0.392,0.144 68 | Luminous Supreme (70B),0.172,0.743,0.656,0.299,0.284,0.316,0.078,0.137,0.452,0.276,0.102 69 | Command Light,0.125,0.629,0.686,0.195,0.398,0.386,0.098,0.149,0.397,0.312,0.023 70 | Luminous Extended (30B),0.093,0.684,0.611,0.253,0.272,0.248,0.04,0.075,0.421,0.276,0.083 71 | Falcon (7B),0.078,0.621,0.58,0.285,0.26,0.288,0.044,0.055,0.346,0.254,0.094 72 | OLMo (7B),0.063,0.597,0.603,0.259,0.222,0.305,0.029,0.044,0.341,0.229,0.097 73 | Luminous Base (13B),0.052,0.633,0.577,0.197,0.286,0.243,0.026,0.028,0.332,0.26,0.066 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/llm_trustworthy_241001_safety.csv: -------------------------------------------------------------------------------- 1 | model,trustworthy_average,trustworthy_Non-toxicity,trustworthy_Non-Stereotype,trustworthy_AdvGLUE_PP,trustworthy_OoD,trustworthy_Adv_Demo,trustworthy_Privacy,trustworthy_Ethics,trustworthy_Fairness 2 | google/gemma-2b-it,67.18,77.07,73.33,43.21,51.43,35.55,88.77,75.03,93.02 3 | google/gemma-7b-it,66.87,75.52,100,43.43,61.78,33.33,83.69,43.33,93.88 4 | lmsys/vicuna-7b-v1.3,60.62,28,81,52.16,59.1,57.99,72.96,48.22,85.53 5 | meta-llama/Llama-2-7b-chat-hf,74.72,80,97.6,51.01,75.65,55.54,97.39,40.58,100 6 | meta-llama/Meta-Llama-3-8B-Instruct,80.61,77.53,98.33,67.28,70.85,75.54,81.59,93.74,80.05 7 | mosaicml/mpt-7b-chat,62.29,40,84.6,46.2,64.26,58.25,78.93,26.11,100 8 | 
openai/gpt-3.5-turbo-0301,72.45,47,87,56.69,73.58,81.28,70.13,86.38,77.57 9 | openai/gpt-4-0314,69.24,41,77,64.04,87.55,77.94,66.11,76.6,63.67 10 | openai/gpt-4o-2024-05-13,82.96,86.46,99.67,51.36,86.59,88.1,97.04,92.02,62.47 11 | openai/gpt-4o-mini-2024-07-18,76.31,59.02,87.34,50.25,79.07,88.49,89.38,87.2,69.74 12 | tiiuae/falcon-7b-instruct,59.49,39,87,43.98,51.45,33.95,70.26,50.28,100 13 | togethercomputer/RedPajama-INCITE-7B-Instruct,56.58,18,73,44.81,54.21,58.51,76.64,27.49,100 14 | vertexai/gemini-pro-1.0,80.61,77.53,98.33,67.28,70.85,75.54,81.59,93.74,80.05 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/mixeval_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,MixEval,MixEval-Hard,MixEval_TriviaQA,MixEval_MMLU,MixEval_DROP,MixEval_HellaSwag,MixEval_CommonsenseQA,MixEval_TriviaQA-Hard,MixEval_MMLU-Hard,MixEval_DROP-Hard 2 | LLaMA-3-70B,82.2,54.0,83.1,79.8,81.5,90.9,85.4,59.1,39.8,59.5 3 | Qwen1.5-72B,79.5,41.9,78.4,78.8,64.5,91.9,87.3,41.4,42.4,26.2 4 | Yi-34B,78.3,47.2,72.1,79.3,78.2,98.0,81.1,39.4,42.4,56.5 5 | Qwen1.5-32B,77.6,41.0,71.9,77.2,68.7,93.3,89.2,28.0,37.2,36.9 6 | Mixtral-8x7B,74.0,40.7,77.3,71.6,69.8,73.7,77.4,44.1,34.6,42.0 7 | LLaMA-2-70B,73.2,41.6,78.7,70.8,73.2,63.0,77.4,53.8,29.0,46.1 8 | Qwen1.5-MoE-A2.7B,70.2,33.5,71.3,69.4,59.9,80.1,80.2,36.0,30.7,31.0 9 | Qwen1.5-7B,68.2,33.7,61.4,67.0,63.6,83.8,84.4,31.6,28.6,29.8 10 | LLaMA-3-8B,65.1,31.7,65.2,69.5,63.8,51.5,69.8,22.6,38.5,37.1 11 | Mistral-7B,64.8,27.1,67.2,68.5,61.3,54.5,67.9,24.2,27.7,34.5 12 | Gemma-7B,64.7,32.7,66.0,67.4,63.8,36.0,68.4,31.1,28.1,31.4 13 | Yi-6B,63.1,30.4,54.7,71.2,51.4,77.4,76.4,17.0,37.2,19.4 14 | Qwen1.5-4B,58.2,23.5,47.8,59.6,51.0,65.7,79.2,14.0,22.9,24.7 15 | JetMoE-8B,57.1,27.0,53.4,55.3,44.1,89.2,60.4,22.8,27.3,19.2 16 | DeepSeek-7B,52.2,21.7,58.7,53.3,43.5,35.0,51.4,21.4,26.4,21.4 17 | Phi-2,51.9,21.9,37.0,62.5,50.4,20.2,68.9,7.3,29.0,27.1 18 | DeepSeekMoE-16B,51.4,24.2,64.2,49.9,41.1,28.6,48.6,24.9,30.7,12.2 19 | LLaMA-2-7B,43.1,22.1,55.5,40.8,37.6,24.9,30.7,19.5,24.7,14.9 20 | Gemma-2B,38.9,22.6,41.5,37.4,32.6,33.3,31.6,12.1,27.3,13.2 21 | OLMo-7B,31.8,21.2,38.4,29.7,24.0,26.9,25.5,16.0,25.1,11.1 22 | MPT-7B,30.8,17.4,33.5,30.9,26.8,19.2,28.8,6.6,24.2,9.2 23 | Claude 3.5 Sonnet-0620,89.9,68.1,92.6,84.2,93.7,94.6,85.4,73.3,58.4,80.4 24 | LLaMA-3.1-405B-Instruct,-,66.2,-,-,-,-,-,72.0,57.1,69.2 25 | GPT-4o-2024-05-13,87.9,64.7,88.0,85.4,87.9,94.3,86.8,70.3,57.1,67.5 26 | Claude 3 Opus,88.1,63.5,90.4,83.2,91.5,93.3,87.7,71.4,55.0,75.2 27 | GPT-4-Turbo-2024-04-09,88.8,62.6,91.2,82.8,91.0,92.6,85.4,73.1,45.5,71.0 28 | Gemini 1.5 Pro-API-0409,84.2,58.7,85.3,79.2,84.2,89.2,84.4,67.8,44.6,64.8 29 | Gemini 1.5 Pro-API-0514,84.8,58.3,83.7,84.0,82.5,91.2,82.5,59.4,54.5,55.2 30 | Mistral Large 2,86.1,57.4,88.2,81.9,89.3,80.1,81.6,64.8,42.9,72.0 31 | Yi-Large-preview,84.4,56.8,81.7,80.9,87.0,92.6,90.1,55.4,48.5,63.1 32 | LLaMA-3-70B-Instruct,84.0,55.9,83.1,80.5,90.1,81.8,83.0,60.5,46.3,74.5 33 | Qwen-Max-0428,86.1,55.8,86.7,80.6,85.4,93.6,88.2,61.5,41.6,53.5 34 | Claude 3 Sonnet,81.7,54.0,84.2,74.7,87.7,85.9,82.5,59.1,40.7,66.9 35 | Reka Core-20240415,83.3,52.9,82.8,79.3,88.1,88.6,81.6,51.6,46.3,66.6 36 | MAmmoTH2-8x7B-Plus,81.5,51.8,83.0,74.5,85.7,82.2,82.5,52.9,41.1,65.1 37 | DeepSeek-V2,83.7,51.7,84.4,77.3,85.3,88.2,84.0,51.7,42.0,62.8 38 | GPT-4o mini,84.2,51.6,83.1,82.3,87.7,83.8,84.9,45.3,45.0,68.1 39 | Command 
R+,81.5,51.4,83.3,78.9,80.4,83.5,82.1,57.5,42.0,65.0 40 | Yi-1.5-34B-Chat,81.7,51.2,78.4,76.4,87.0,90.2,86.8,44.4,38.1,67.4 41 | Mistral-Large,84.2,50.3,88.3,80.2,88.6,65.0,83.5,55.5,42.4,61.6 42 | Qwen1.5-72B-Chat,84.1,48.3,83.9,80.1,85.1,87.9,86.3,49.9,37.7,56.5 43 | Mistral-Medium,81.9,47.8,86.8,76.3,83.2,72.4,82.5,59.8,38.5,47.1 44 | Gemini 1.0 Pro,78.9,46.4,81.0,74.9,82.6,74.7,80.2,58.2,35.5,54.1 45 | Reka Flash-20240226,79.8,46.2,76.4,75.4,86.7,90.6,80.7,42.9,34.6,65.0 46 | Mistral-Small,81.2,46.2,85.1,75.2,86.1,73.4,77.8,56.0,33.8,52.6 47 | LLaMA-3-8B-Instruct,75.0,45.6,71.7,71.9,86.4,65.7,78.3,40.2,40.7,67.6 48 | Command R,77.0,45.2,80.9,75.0,72.0,75.8,77.4,57.0,39.0,42.0 49 | Qwen1.5-32B-Chat,81.0,43.3,75.7,78.0,82.9,85.9,88.2,39.1,29.9,54.4 50 | GPT-3.5-Turbo-0125,79.7,43.0,85.2,74.5,84.8,63.0,81.6,46.4,35.1,55.4 51 | Claude 3 Haiku,79.7,42.8,79.9,76.1,85.0,75.8,78.8,42.4,30.7,51.5 52 | Yi-34B-Chat,80.1,42.6,82.7,73.6,86.1,86.9,78.8,41.5,29.9,57.1 53 | Mixtral-8x7B-Instruct-v0.1,76.4,42.5,82.5,72.0,79.5,54.2,77.4,48.5,37.2,47.7 54 | Starling-LM-7B-beta,74.8,41.8,75.1,69.0,86.4,48.5,84.9,33.4,34.2,62.9 55 | Yi-1.5-9B-Chat,74.2,40.9,61.3,72.6,83.9,86.5,82.5,23.3,36.8,61.3 56 | Gemma-1.1-7B-IT,69.6,39.1,64.3,66.9,80.6,66.3,73.6,30.3,39.0,55.1 57 | Vicuna-33B-v1.3,66.3,38.7,79.2,59.2,71.4,30.3,61.8,42.5,39.4,36.6 58 | LLaMA-2-70B-Chat,74.6,38.0,80.0,69.8,79.8,67.3,74.1,42.2,27.7,42.2 59 | MAP-Neo-Instruct-v0.1,70.0,37.8,62.1,66.7,75.5,74.4,82.1,26.5,32.5,42.4 60 | Mistral-7B-Instruct-v0.2,70.0,36.2,73.7,67.3,72.8,54.2,66.0,33.5,29.4,44.3 61 | Qwen1.5-7B-Chat,71.4,35.5,64.1,68.7,76.4,76.1,82.1,29.0,29.0,50.0 62 | Reka Edge-20240208,68.5,32.2,60.0,63.6,80.0,74.7,80.7,18.6,26.4,56.9 63 | Zephyr-7B-β,69.1,31.6,74.7,64.9,77.3,39.1,69.3,30.2,24.2,45.3 64 | LLaMA-2-7B-Chat,61.7,30.8,68.8,59.4,69.3,35.7,61.3,24.8,30.3,44.3 65 | Yi-6B-Chat,65.6,30.1,66.1,65.4,70.5,52.5,69.8,18.9,26.8,43.7 66 | Qwen1.5-MoE-A2.7B-Chat,69.1,29.1,65.9,69.5,64.6,72.7,81.1,21.9,26.8,39.5 67 | Gemma-1.1-2B-IT,51.9,28.4,53.7,51.5,59.8,26.6,57.1,31.9,30.3,27.8 68 | Vicuna-7B-v1.5,60.3,27.8,66.4,58.7,68.3,24.9,62.7,25.9,23.4,33.2 69 | OLMo-7B-Instruct,55.0,26.7,51.7,57.1,53.1,55.9,64.6,24.7,27.3,22.9 70 | Qwen1.5-4B-Chat,57.2,24.6,46.0,61.4,57.2,54.9,74.1,16.5,17.3,28.6 71 | JetMoE-8B-Chat,51.6,24.3,46.8,58.5,27.0,86.2,68.4,19.2,25.5,11.5 72 | MPT-7B-Chat,43.8,23.8,50.2,37.8,50.0,25.6,36.3,17.5,24.7,31.0 73 | -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/mmlu_pro_240610.csv: -------------------------------------------------------------------------------- 1 | model,score,scenario,source,aggragated_from,tag 2 | llama_2_70b,0.3753,mmlu_pro,mmlu_pro_240610,[],knowledge 3 | llama_3_8b,0.3536,mmlu_pro,mmlu_pro_240610,[],knowledge 4 | deepseekmath_instruct,0.353,mmlu_pro,mmlu_pro_240610,[],knowledge 5 | gemma_7b,0.3373,mmlu_pro,mmlu_pro_240610,[],knowledge 6 | mistral_7b_v0.1,0.3088,mmlu_pro,mmlu_pro_240610,[],knowledge 7 | mistral_7b_instruct_v0.2,0.3084,mmlu_pro,mmlu_pro_240610,[],knowledge 8 | mistral_7b_v0.2,0.3043,mmlu_pro,mmlu_pro_240610,[],knowledge 9 | qwen1.5_7b_chat,0.2906,mmlu_pro,mmlu_pro_240610,[],knowledge 10 | yi_6b_chat,0.2884,mmlu_pro,mmlu_pro_240610,[],knowledge 11 | yi_6b,0.2651,mmlu_pro,mmlu_pro_240610,[],knowledge 12 | mistral_7b_instruct_v0.1,0.2575,mmlu_pro,mmlu_pro_240610,[],knowledge 13 | llama_2_13b,0.2534,mmlu_pro,mmlu_pro_240610,[],knowledge 14 | llemma_7b,0.2345,mmlu_pro,mmlu_pro_240610,[],knowledge 15 | 
llama_2_7b,0.2032,mmlu_pro,mmlu_pro_240610,[],knowledge 16 | gpt_4o,0.7255,mmlu_pro,mmlu_pro_240610,[],knowledge 17 | claude_3_opus,0.6845,mmlu_pro,mmlu_pro_240610,[],knowledge 18 | gpt_4_turbo,0.6371,mmlu_pro,mmlu_pro_240610,[],knowledge 19 | gemini_1.5_flash,0.5912,mmlu_pro,mmlu_pro_240610,[],knowledge 20 | yi_large,0.5753,mmlu_pro,mmlu_pro_240610,[],knowledge 21 | claude_3_sonnet,0.568,mmlu_pro,mmlu_pro_240610,[],knowledge 22 | llama_3_70b_instruct,0.562,mmlu_pro,mmlu_pro_240610,[],knowledge 23 | deepseek_v2,0.5481,mmlu_pro,mmlu_pro_240610,[],knowledge 24 | phi_3_medium_4k_instruct,0.5348,mmlu_pro,mmlu_pro_240610,[],knowledge 25 | llama_3_70b,0.5278,mmlu_pro,mmlu_pro_240610,[],knowledge 26 | qwen1.5_72b_chat,0.5162,mmlu_pro,mmlu_pro_240610,[],knowledge 27 | mammoth2_8x7b_plus,0.504,mmlu_pro,mmlu_pro_240610,[],knowledge 28 | qwen1.5_110b,0.4993,mmlu_pro,mmlu_pro_240610,[],knowledge 29 | mammoth2_8b_plus,0.4335,mmlu_pro,mmlu_pro_240610,[],knowledge 30 | mixtral_8x7b_instruct_v0.1,0.4327,mmlu_pro,mmlu_pro_240610,[],knowledge 31 | phi_3_mini_4k_instruct,0.4317,mmlu_pro,mmlu_pro_240610,[],knowledge 32 | yi_34b,0.4303,mmlu_pro,mmlu_pro_240610,[],knowledge 33 | mixtral_8x7b_v0.1,0.4103,mmlu_pro,mmlu_pro_240610,[],knowledge 34 | llama_3_8b_instruct,0.4098,mmlu_pro,mmlu_pro_240610,[],knowledge 35 | mammoth2_7b_plus,0.4085,mmlu_pro,mmlu_pro_240610,[],knowledge 36 | qwen1.5_14b_chat,0.3802,mmlu_pro,mmlu_pro_240610,[],knowledge 37 | c4ai_command_r_v01,0.379,mmlu_pro,mmlu_pro_240610,[],knowledge 38 | -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/olmes_260624.csv: -------------------------------------------------------------------------------- 1 | model,score,scenario,source,aggragated_from,tag 2 | pythia_1b,31.4,arc_c,olmes_260624,[],reasoning 3 | olmo_1b,38.6,arc_c,olmes_260624,[],reasoning 4 | tinyllama_1.1b,38.1,arc_c,olmes_260624,[],reasoning 5 | pythia_6.7b,44.6,arc_c,olmes_260624,[],reasoning 6 | rpj_incite_7b,45.3,arc_c,olmes_260624,[],reasoning 7 | stablelm2_1.6b,50.6,arc_c,olmes_260624,[],reasoning 8 | olmo_7b,46.4,arc_c,olmes_260624,[],reasoning 9 | mpt_7b,45.7,arc_c,olmes_260624,[],reasoning 10 | falcon_7b,49.7,arc_c,olmes_260624,[],reasoning 11 | llama2_7b,54.2,arc_c,olmes_260624,[],reasoning 12 | llama2_13b,67.3,arc_c,olmes_260624,[],reasoning 13 | olmo_1.7_7b,66.9,arc_c,olmes_260624,[],reasoning 14 | llama3_8b,79.3,arc_c,olmes_260624,[],reasoning 15 | mistral_7b_v0.1,78.6,arc_c,olmes_260624,[],reasoning 16 | llama3_70b,93.7,arc_c,olmes_260624,[],reasoning 17 | pythia_1b,63.4,arc_e,olmes_260624,[],reasoning 18 | olmo_1b,68.3,arc_e,olmes_260624,[],reasoning 19 | tinyllama_1.1b,69.5,arc_e,olmes_260624,[],reasoning 20 | pythia_6.7b,72.6,arc_e,olmes_260624,[],reasoning 21 | rpj_incite_7b,78.8,arc_e,olmes_260624,[],reasoning 22 | stablelm2_1.6b,75.3,arc_e,olmes_260624,[],reasoning 23 | olmo_7b,78.9,arc_e,olmes_260624,[],reasoning 24 | mpt_7b,78.0,arc_e,olmes_260624,[],reasoning 25 | falcon_7b,80.6,arc_e,olmes_260624,[],reasoning 26 | llama2_7b,84.0,arc_e,olmes_260624,[],reasoning 27 | llama2_13b,85.9,arc_e,olmes_260624,[],reasoning 28 | olmo_1.7_7b,83.6,arc_e,olmes_260624,[],reasoning 29 | llama3_8b,92.4,arc_e,olmes_260624,[],reasoning 30 | mistral_7b_v0.1,90.8,arc_e,olmes_260624,[],reasoning 31 | llama3_70b,97.7,arc_e,olmes_260624,[],reasoning 32 | pythia_1b,56.8,boolq,olmes_260624,[],knowledge 33 | olmo_1b,51.3,boolq,olmes_260624,[],knowledge 34 | tinyllama_1.1b,63.6,boolq,olmes_260624,[],knowledge 35 | 
pythia_6.7b,68.7,boolq,olmes_260624,[],knowledge 36 | rpj_incite_7b,72.0,boolq,olmes_260624,[],knowledge 37 | stablelm2_1.6b,82.3,boolq,olmes_260624,[],knowledge 38 | olmo_7b,78.7,boolq,olmes_260624,[],knowledge 39 | mpt_7b,82.4,boolq,olmes_260624,[],knowledge 40 | falcon_7b,78.2,boolq,olmes_260624,[],knowledge 41 | llama2_7b,86.1,boolq,olmes_260624,[],knowledge 42 | llama2_13b,86.7,boolq,olmes_260624,[],knowledge 43 | olmo_1.7_7b,85.9,boolq,olmes_260624,[],knowledge 44 | llama3_8b,87.5,boolq,olmes_260624,[],knowledge 45 | mistral_7b_v0.1,89.3,boolq,olmes_260624,[],knowledge 46 | llama3_70b,91.7,boolq,olmes_260624,[],knowledge 47 | pythia_1b,50.9,csqa,olmes_260624,[],knowledge 48 | olmo_1b,62.2,csqa,olmes_260624,[],knowledge 49 | tinyllama_1.1b,61.1,csqa,olmes_260624,[],knowledge 50 | pythia_6.7b,62.1,csqa,olmes_260624,[],knowledge 51 | rpj_incite_7b,69.2,csqa,olmes_260624,[],knowledge 52 | stablelm2_1.6b,70.4,csqa,olmes_260624,[],knowledge 53 | olmo_7b,70.8,csqa,olmes_260624,[],knowledge 54 | mpt_7b,70.9,csqa,olmes_260624,[],knowledge 55 | falcon_7b,73.4,csqa,olmes_260624,[],knowledge 56 | llama2_7b,74.2,csqa,olmes_260624,[],knowledge 57 | llama2_13b,74.0,csqa,olmes_260624,[],knowledge 58 | olmo_1.7_7b,85.8,csqa,olmes_260624,[],knowledge 59 | llama3_8b,73.9,csqa,olmes_260624,[],knowledge 60 | mistral_7b_v0.1,72.4,csqa,olmes_260624,[],knowledge 61 | llama3_70b,83.2,csqa,olmes_260624,[],knowledge 62 | pythia_1b,48.0,hellaswag,olmes_260624,[],reasoning 63 | olmo_1b,65.2,hellaswag,olmes_260624,[],reasoning 64 | tinyllama_1.1b,60.8,hellaswag,olmes_260624,[],reasoning 65 | pythia_6.7b,66.1,hellaswag,olmes_260624,[],reasoning 66 | rpj_incite_7b,72.8,hellaswag,olmes_260624,[],reasoning 67 | stablelm2_1.6b,70.3,hellaswag,olmes_260624,[],reasoning 68 | olmo_7b,78.1,hellaswag,olmes_260624,[],reasoning 69 | mpt_7b,79.6,hellaswag,olmes_260624,[],reasoning 70 | falcon_7b,79.0,hellaswag,olmes_260624,[],reasoning 71 | llama2_7b,78.9,hellaswag,olmes_260624,[],reasoning 72 | llama2_13b,83.9,hellaswag,olmes_260624,[],reasoning 73 | olmo_1.7_7b,80.1,hellaswag,olmes_260624,[],reasoning 74 | llama3_8b,81.8,hellaswag,olmes_260624,[],reasoning 75 | mistral_7b_v0.1,83.0,hellaswag,olmes_260624,[],reasoning 76 | llama3_70b,89.5,hellaswag,olmes_260624,[],reasoning 77 | pythia_1b,31.1,mmlu,olmes_260624,[],knowledge 78 | olmo_1b,33.4,mmlu,olmes_260624,[],knowledge 79 | tinyllama_1.1b,33.6,mmlu,olmes_260624,[],knowledge 80 | pythia_6.7b,37.7,mmlu,olmes_260624,[],knowledge 81 | rpj_incite_7b,40.1,mmlu,olmes_260624,[],knowledge 82 | stablelm2_1.6b,40.4,mmlu,olmes_260624,[],knowledge 83 | olmo_7b,40.5,mmlu,olmes_260624,[],knowledge 84 | mpt_7b,40.6,mmlu,olmes_260624,[],knowledge 85 | falcon_7b,42.1,mmlu,olmes_260624,[],knowledge 86 | llama2_7b,46.2,mmlu,olmes_260624,[],knowledge 87 | llama2_13b,55.8,mmlu,olmes_260624,[],knowledge 88 | olmo_1.7_7b,54.4,mmlu,olmes_260624,[],knowledge 89 | llama3_8b,66.6,mmlu,olmes_260624,[],knowledge 90 | mistral_7b_v0.1,64.0,mmlu,olmes_260624,[],knowledge 91 | llama3_70b,79.8,mmlu,olmes_260624,[],knowledge 92 | pythia_1b,40.4,openbookqa,olmes_260624,[],knowledge 93 | olmo_1b,47.6,openbookqa,olmes_260624,[],knowledge 94 | tinyllama_1.1b,45.0,openbookqa,olmes_260624,[],knowledge 95 | pythia_6.7b,50.4,openbookqa,olmes_260624,[],knowledge 96 | rpj_incite_7b,49.0,openbookqa,olmes_260624,[],knowledge 97 | stablelm2_1.6b,56.6,openbookqa,olmes_260624,[],knowledge 98 | olmo_7b,55.8,openbookqa,olmes_260624,[],knowledge 99 | mpt_7b,52.4,openbookqa,olmes_260624,[],knowledge 100 | 
falcon_7b,55.2,openbookqa,olmes_260624,[],knowledge 101 | llama2_7b,57.8,openbookqa,olmes_260624,[],knowledge 102 | llama2_13b,65.4,openbookqa,olmes_260624,[],knowledge 103 | olmo_1.7_7b,68.6,openbookqa,olmes_260624,[],knowledge 104 | llama3_8b,77.2,openbookqa,olmes_260624,[],knowledge 105 | mistral_7b_v0.1,80.6,openbookqa,olmes_260624,[],knowledge 106 | llama3_70b,93.4,openbookqa,olmes_260624,[],knowledge 107 | pythia_1b,68.9,piqa,olmes_260624,[],reasoning 108 | olmo_1b,74.1,piqa,olmes_260624,[],reasoning 109 | tinyllama_1.1b,71.7,piqa,olmes_260624,[],reasoning 110 | pythia_6.7b,74.9,piqa,olmes_260624,[],reasoning 111 | rpj_incite_7b,75.9,piqa,olmes_260624,[],reasoning 112 | stablelm2_1.6b,75.6,piqa,olmes_260624,[],reasoning 113 | olmo_7b,78.5,piqa,olmes_260624,[],reasoning 114 | mpt_7b,79.2,piqa,olmes_260624,[],reasoning 115 | falcon_7b,79.0,piqa,olmes_260624,[],reasoning 116 | llama2_7b,77.5,piqa,olmes_260624,[],reasoning 117 | llama2_13b,80.2,piqa,olmes_260624,[],reasoning 118 | olmo_1.7_7b,80.3,piqa,olmes_260624,[],reasoning 119 | llama3_8b,81.6,piqa,olmes_260624,[],reasoning 120 | mistral_7b_v0.1,82.8,piqa,olmes_260624,[],reasoning 121 | llama3_70b,91.6,piqa,olmes_260624,[],reasoning 122 | pythia_1b,46.4,siqa,olmes_260624,[],other 123 | olmo_1b,51.5,siqa,olmes_260624,[],other 124 | tinyllama_1.1b,50.4,siqa,olmes_260624,[],other 125 | pythia_6.7b,51.7,siqa,olmes_260624,[],other 126 | rpj_incite_7b,56.6,siqa,olmes_260624,[],other 127 | stablelm2_1.6b,64.3,siqa,olmes_260624,[],other 128 | olmo_7b,56.5,siqa,olmes_260624,[],other 129 | mpt_7b,57.4,siqa,olmes_260624,[],other 130 | falcon_7b,60.1,siqa,olmes_260624,[],other 131 | llama2_7b,59.6,siqa,olmes_260624,[],other 132 | llama2_13b,65.9,siqa,olmes_260624,[],other 133 | olmo_1.7_7b,76.1,siqa,olmes_260624,[],other 134 | llama3_8b,70.2,siqa,olmes_260624,[],other 135 | mistral_7b_v0.1,71.3,siqa,olmes_260624,[],other 136 | llama3_70b,78.9,siqa,olmes_260624,[],other 137 | pythia_1b,52.7,winogrande,olmes_260624,[],reasoning 138 | olmo_1b,59.3,winogrande,olmes_260624,[],reasoning 139 | tinyllama_1.1b,60.1,winogrande,olmes_260624,[],reasoning 140 | pythia_6.7b,62.3,winogrande,olmes_260624,[],reasoning 141 | rpj_incite_7b,68.0,winogrande,olmes_260624,[],reasoning 142 | stablelm2_1.6b,65.7,winogrande,olmes_260624,[],reasoning 143 | olmo_7b,68.5,winogrande,olmes_260624,[],reasoning 144 | mpt_7b,70.2,winogrande,olmes_260624,[],reasoning 145 | falcon_7b,71.3,winogrande,olmes_260624,[],reasoning 146 | llama2_7b,71.7,winogrande,olmes_260624,[],reasoning 147 | llama2_13b,74.9,winogrande,olmes_260624,[],reasoning 148 | olmo_1.7_7b,73.6,winogrande,olmes_260624,[],reasoning 149 | llama3_8b,76.2,winogrande,olmes_260624,[],reasoning 150 | mistral_7b_v0.1,77.9,winogrande,olmes_260624,[],reasoning 151 | llama3_70b,84.1,winogrande,olmes_260624,[],reasoning 152 | pythia_1b,49.0,olmes_average,olmes_260624,[],holistic 153 | olmo_1b,55.1,olmes_average,olmes_260624,[],holistic 154 | tinyllama_1.1b,55.4,olmes_average,olmes_260624,[],holistic 155 | pythia_6.7b,59.1,olmes_average,olmes_260624,[],holistic 156 | rpj_incite_7b,62.8,olmes_average,olmes_260624,[],holistic 157 | stablelm2_1.6b,65.1,olmes_average,olmes_260624,[],holistic 158 | olmo_7b,65.3,olmes_average,olmes_260624,[],holistic 159 | mpt_7b,65.6,olmes_average,olmes_260624,[],holistic 160 | falcon_7b,66.9,olmes_average,olmes_260624,[],holistic 161 | llama2_7b,69.0,olmes_average,olmes_260624,[],holistic 162 | llama2_13b,74.0,olmes_average,olmes_260624,[],holistic 163 | 
olmo_1.7_7b,75.5,olmes_average,olmes_260624,[],holistic 164 | llama3_8b,78.7,olmes_average,olmes_260624,[],holistic 165 | mistral_7b_v0.1,79.1,olmes_average,olmes_260624,[],holistic 166 | llama3_70b,88.4,olmes_average,olmes_260624,[],holistic 167 | -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/olmes_260624_frozen.csv: -------------------------------------------------------------------------------- 1 | model,score,scenario,source,aggragated_from,tag 2 | pythia_1b,31.4,arc_c,olmes_260624,[],reasoning 3 | olmo_1b,38.6,arc_c,olmes_260624,[],reasoning 4 | tinyllama_1.1b,38.1,arc_c,olmes_260624,[],reasoning 5 | pythia_6.7b,44.6,arc_c,olmes_260624,[],reasoning 6 | rpj_incite_7b,45.3,arc_c,olmes_260624,[],reasoning 7 | stablelm2_1.6b,50.6,arc_c,olmes_260624,[],reasoning 8 | olmo_7b,46.4,arc_c,olmes_260624,[],reasoning 9 | mpt_7b,45.7,arc_c,olmes_260624,[],reasoning 10 | falcon_7b,49.7,arc_c,olmes_260624,[],reasoning 11 | llama2_7b,54.2,arc_c,olmes_260624,[],reasoning 12 | llama2_13b,67.3,arc_c,olmes_260624,[],reasoning 13 | olmo_1.7_7b,66.9,arc_c,olmes_260624,[],reasoning 14 | llama3_8b,79.3,arc_c,olmes_260624,[],reasoning 15 | mistral_7b_v0.1,78.6,arc_c,olmes_260624,[],reasoning 16 | llama3_70b,93.7,arc_c,olmes_260624,[],reasoning 17 | pythia_1b,63.4,arc_e,olmes_260624,[],reasoning 18 | olmo_1b,68.3,arc_e,olmes_260624,[],reasoning 19 | tinyllama_1.1b,69.5,arc_e,olmes_260624,[],reasoning 20 | pythia_6.7b,72.6,arc_e,olmes_260624,[],reasoning 21 | rpj_incite_7b,78.8,arc_e,olmes_260624,[],reasoning 22 | stablelm2_1.6b,75.3,arc_e,olmes_260624,[],reasoning 23 | olmo_7b,78.9,arc_e,olmes_260624,[],reasoning 24 | mpt_7b,78.0,arc_e,olmes_260624,[],reasoning 25 | falcon_7b,80.6,arc_e,olmes_260624,[],reasoning 26 | llama2_7b,84.0,arc_e,olmes_260624,[],reasoning 27 | llama2_13b,85.9,arc_e,olmes_260624,[],reasoning 28 | olmo_1.7_7b,83.6,arc_e,olmes_260624,[],reasoning 29 | llama3_8b,92.4,arc_e,olmes_260624,[],reasoning 30 | mistral_7b_v0.1,90.8,arc_e,olmes_260624,[],reasoning 31 | llama3_70b,97.7,arc_e,olmes_260624,[],reasoning 32 | pythia_1b,56.8,boolq,olmes_260624,[],knowledge 33 | olmo_1b,51.3,boolq,olmes_260624,[],knowledge 34 | tinyllama_1.1b,63.6,boolq,olmes_260624,[],knowledge 35 | pythia_6.7b,68.7,boolq,olmes_260624,[],knowledge 36 | rpj_incite_7b,72.0,boolq,olmes_260624,[],knowledge 37 | stablelm2_1.6b,82.3,boolq,olmes_260624,[],knowledge 38 | olmo_7b,78.7,boolq,olmes_260624,[],knowledge 39 | mpt_7b,82.4,boolq,olmes_260624,[],knowledge 40 | falcon_7b,78.2,boolq,olmes_260624,[],knowledge 41 | llama2_7b,86.1,boolq,olmes_260624,[],knowledge 42 | llama2_13b,86.7,boolq,olmes_260624,[],knowledge 43 | olmo_1.7_7b,85.9,boolq,olmes_260624,[],knowledge 44 | llama3_8b,87.5,boolq,olmes_260624,[],knowledge 45 | mistral_7b_v0.1,89.3,boolq,olmes_260624,[],knowledge 46 | llama3_70b,91.7,boolq,olmes_260624,[],knowledge 47 | pythia_1b,50.9,csqa,olmes_260624,[],knowledge 48 | olmo_1b,62.2,csqa,olmes_260624,[],knowledge 49 | tinyllama_1.1b,61.1,csqa,olmes_260624,[],knowledge 50 | pythia_6.7b,62.1,csqa,olmes_260624,[],knowledge 51 | rpj_incite_7b,69.2,csqa,olmes_260624,[],knowledge 52 | stablelm2_1.6b,70.4,csqa,olmes_260624,[],knowledge 53 | olmo_7b,70.8,csqa,olmes_260624,[],knowledge 54 | mpt_7b,70.9,csqa,olmes_260624,[],knowledge 55 | falcon_7b,73.4,csqa,olmes_260624,[],knowledge 56 | llama2_7b,74.2,csqa,olmes_260624,[],knowledge 57 | llama2_13b,74.0,csqa,olmes_260624,[],knowledge 58 | olmo_1.7_7b,85.8,csqa,olmes_260624,[],knowledge 59 | 
llama3_8b,73.9,csqa,olmes_260624,[],knowledge 60 | mistral_7b_v0.1,72.4,csqa,olmes_260624,[],knowledge 61 | llama3_70b,83.2,csqa,olmes_260624,[],knowledge 62 | pythia_1b,48.0,hellaswag,olmes_260624,[],reasoning 63 | olmo_1b,65.2,hellaswag,olmes_260624,[],reasoning 64 | tinyllama_1.1b,60.8,hellaswag,olmes_260624,[],reasoning 65 | pythia_6.7b,66.1,hellaswag,olmes_260624,[],reasoning 66 | rpj_incite_7b,72.8,hellaswag,olmes_260624,[],reasoning 67 | stablelm2_1.6b,70.3,hellaswag,olmes_260624,[],reasoning 68 | olmo_7b,78.1,hellaswag,olmes_260624,[],reasoning 69 | mpt_7b,79.6,hellaswag,olmes_260624,[],reasoning 70 | falcon_7b,79.0,hellaswag,olmes_260624,[],reasoning 71 | llama2_7b,78.9,hellaswag,olmes_260624,[],reasoning 72 | llama2_13b,83.9,hellaswag,olmes_260624,[],reasoning 73 | olmo_1.7_7b,80.1,hellaswag,olmes_260624,[],reasoning 74 | llama3_8b,81.8,hellaswag,olmes_260624,[],reasoning 75 | mistral_7b_v0.1,83.0,hellaswag,olmes_260624,[],reasoning 76 | llama3_70b,89.5,hellaswag,olmes_260624,[],reasoning 77 | pythia_1b,31.1,mmlu,olmes_260624,[],knowledge 78 | olmo_1b,33.4,mmlu,olmes_260624,[],knowledge 79 | tinyllama_1.1b,33.6,mmlu,olmes_260624,[],knowledge 80 | pythia_6.7b,37.7,mmlu,olmes_260624,[],knowledge 81 | rpj_incite_7b,40.1,mmlu,olmes_260624,[],knowledge 82 | stablelm2_1.6b,40.4,mmlu,olmes_260624,[],knowledge 83 | olmo_7b,40.5,mmlu,olmes_260624,[],knowledge 84 | mpt_7b,40.6,mmlu,olmes_260624,[],knowledge 85 | falcon_7b,42.1,mmlu,olmes_260624,[],knowledge 86 | llama2_7b,46.2,mmlu,olmes_260624,[],knowledge 87 | llama2_13b,55.8,mmlu,olmes_260624,[],knowledge 88 | olmo_1.7_7b,54.4,mmlu,olmes_260624,[],knowledge 89 | llama3_8b,66.6,mmlu,olmes_260624,[],knowledge 90 | mistral_7b_v0.1,64.0,mmlu,olmes_260624,[],knowledge 91 | llama3_70b,79.8,mmlu,olmes_260624,[],knowledge 92 | pythia_1b,40.4,openbookqa,olmes_260624,[],knowledge 93 | olmo_1b,47.6,openbookqa,olmes_260624,[],knowledge 94 | tinyllama_1.1b,45.0,openbookqa,olmes_260624,[],knowledge 95 | pythia_6.7b,50.4,openbookqa,olmes_260624,[],knowledge 96 | rpj_incite_7b,49.0,openbookqa,olmes_260624,[],knowledge 97 | stablelm2_1.6b,56.6,openbookqa,olmes_260624,[],knowledge 98 | olmo_7b,55.8,openbookqa,olmes_260624,[],knowledge 99 | mpt_7b,52.4,openbookqa,olmes_260624,[],knowledge 100 | falcon_7b,55.2,openbookqa,olmes_260624,[],knowledge 101 | llama2_7b,57.8,openbookqa,olmes_260624,[],knowledge 102 | llama2_13b,65.4,openbookqa,olmes_260624,[],knowledge 103 | olmo_1.7_7b,68.6,openbookqa,olmes_260624,[],knowledge 104 | llama3_8b,77.2,openbookqa,olmes_260624,[],knowledge 105 | mistral_7b_v0.1,80.6,openbookqa,olmes_260624,[],knowledge 106 | llama3_70b,93.4,openbookqa,olmes_260624,[],knowledge 107 | pythia_1b,68.9,piqa,olmes_260624,[],reasoning 108 | olmo_1b,74.1,piqa,olmes_260624,[],reasoning 109 | tinyllama_1.1b,71.7,piqa,olmes_260624,[],reasoning 110 | pythia_6.7b,74.9,piqa,olmes_260624,[],reasoning 111 | rpj_incite_7b,75.9,piqa,olmes_260624,[],reasoning 112 | stablelm2_1.6b,75.6,piqa,olmes_260624,[],reasoning 113 | olmo_7b,78.5,piqa,olmes_260624,[],reasoning 114 | mpt_7b,79.2,piqa,olmes_260624,[],reasoning 115 | falcon_7b,79.0,piqa,olmes_260624,[],reasoning 116 | llama2_7b,77.5,piqa,olmes_260624,[],reasoning 117 | llama2_13b,80.2,piqa,olmes_260624,[],reasoning 118 | olmo_1.7_7b,80.3,piqa,olmes_260624,[],reasoning 119 | llama3_8b,81.6,piqa,olmes_260624,[],reasoning 120 | mistral_7b_v0.1,82.8,piqa,olmes_260624,[],reasoning 121 | llama3_70b,91.6,piqa,olmes_260624,[],reasoning 122 | pythia_1b,46.4,siqa,olmes_260624,[],other 123 | 
olmo_1b,51.5,siqa,olmes_260624,[],other 124 | tinyllama_1.1b,50.4,siqa,olmes_260624,[],other 125 | pythia_6.7b,51.7,siqa,olmes_260624,[],other 126 | rpj_incite_7b,56.6,siqa,olmes_260624,[],other 127 | stablelm2_1.6b,64.3,siqa,olmes_260624,[],other 128 | olmo_7b,56.5,siqa,olmes_260624,[],other 129 | mpt_7b,57.4,siqa,olmes_260624,[],other 130 | falcon_7b,60.1,siqa,olmes_260624,[],other 131 | llama2_7b,59.6,siqa,olmes_260624,[],other 132 | llama2_13b,65.9,siqa,olmes_260624,[],other 133 | olmo_1.7_7b,76.1,siqa,olmes_260624,[],other 134 | llama3_8b,70.2,siqa,olmes_260624,[],other 135 | mistral_7b_v0.1,71.3,siqa,olmes_260624,[],other 136 | llama3_70b,78.9,siqa,olmes_260624,[],other 137 | pythia_1b,52.7,winogrande,olmes_260624,[],reasoning 138 | olmo_1b,59.3,winogrande,olmes_260624,[],reasoning 139 | tinyllama_1.1b,60.1,winogrande,olmes_260624,[],reasoning 140 | pythia_6.7b,62.3,winogrande,olmes_260624,[],reasoning 141 | rpj_incite_7b,68.0,winogrande,olmes_260624,[],reasoning 142 | stablelm2_1.6b,65.7,winogrande,olmes_260624,[],reasoning 143 | olmo_7b,68.5,winogrande,olmes_260624,[],reasoning 144 | mpt_7b,70.2,winogrande,olmes_260624,[],reasoning 145 | falcon_7b,71.3,winogrande,olmes_260624,[],reasoning 146 | llama2_7b,71.7,winogrande,olmes_260624,[],reasoning 147 | llama2_13b,74.9,winogrande,olmes_260624,[],reasoning 148 | olmo_1.7_7b,73.6,winogrande,olmes_260624,[],reasoning 149 | llama3_8b,76.2,winogrande,olmes_260624,[],reasoning 150 | mistral_7b_v0.1,77.9,winogrande,olmes_260624,[],reasoning 151 | llama3_70b,84.1,winogrande,olmes_260624,[],reasoning 152 | pythia_1b,49.0,olmes_average,olmes_260624,[],holistic 153 | olmo_1b,55.1,olmes_average,olmes_260624,[],holistic 154 | tinyllama_1.1b,55.4,olmes_average,olmes_260624,[],holistic 155 | pythia_6.7b,59.1,olmes_average,olmes_260624,[],holistic 156 | rpj_incite_7b,62.8,olmes_average,olmes_260624,[],holistic 157 | stablelm2_1.6b,65.1,olmes_average,olmes_260624,[],holistic 158 | olmo_7b,65.3,olmes_average,olmes_260624,[],holistic 159 | mpt_7b,65.6,olmes_average,olmes_260624,[],holistic 160 | falcon_7b,66.9,olmes_average,olmes_260624,[],holistic 161 | llama2_7b,69.0,olmes_average,olmes_260624,[],holistic 162 | llama2_13b,74.0,olmes_average,olmes_260624,[],holistic 163 | olmo_1.7_7b,75.5,olmes_average,olmes_260624,[],holistic 164 | llama3_8b,78.7,olmes_average,olmes_260624,[],holistic 165 | mistral_7b_v0.1,79.1,olmes_average,olmes_260624,[],holistic 166 | llama3_70b,88.4,olmes_average,olmes_260624,[],holistic 167 | -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/opencompass_240829.csv: -------------------------------------------------------------------------------- 1 | model,opencompass,OC_Language,OC_Knowledge,OC_Reasoning,OC_Math,OC_Code,OC_Instruct,OC_Agent 2 | Claude-3.5-Sonnet,67.9,50.9,85,57,71.1,69.6,66.2,81.7 3 | GPT-4o-20240513,67.7,55.5,85.2,55.8,71.1,69.1,60.3,84.4 4 | Mistral-Large,63.2,50.9,83.4,50.1,66.4,65.1,51.1,83.5 5 | Mistral-Large-Instruct-2407,62.5,50.3,83.3,50,72.8,55.6,50.3,84.5 6 | DeepSeek-V2-Chat(0618),61.7,46.3,78.8,47.4,68.2,66.2,44.1,83.7 7 | GPT-4o-mini-20240718,60.4,50.1,78.7,45.4,58.2,63.3,56,85.7 8 | Qwen-Max-0428,57.8,56.5,79,47.9,55.1,52.4,47.4,83.8 9 | Yi-Large,56.3,48.7,75.3,47.6,54.8,54.3,40,86.1 10 | Qwen2-72B-Instruct,55.4,45.8,84,44.7,57.7,49.5,34,85.9 11 | GLM-4,55.2,45.8,77.7,46.1,53.2,56.3,36.9,80.4 12 | Llama3.1-70B-Instruct,53.9,38.4,81.4,31.6,58,53.7,46.2,86.5 13 | Gemma-2-27B-it,53.5,45.2,58.5,45.4,50.1,54.6,45.2,85.5 14 | 
Qwen1.5-110B-Chat,51.9,53.4,79.3,45.8,39.6,49.5,36.8,79.6 15 | Doubao-pro-32k/240615,51,31.1,78.3,27.8,67.5,50.2,30.6,79.3 16 | Baichuan4,50.4,37.2,74.2,38.5,51.8,44.1,39.4,84.5 17 | Step-1-8K,49.9,40.6,72,35.8,51.4,44.2,38.9,84.2 18 | abab6.5,49.9,44.9,69.8,47,47.2,50.5,32,62.5 19 | Ernie-4.0-8K-Preview-0518,48.8,36.7,76.4,41.3,44.7,50.6,28.5,72.7 20 | Moonshot-v1-8K,48.6,46.3,61,46,46.6,47,35.9,63.5 21 | GLM-4-9B-Chat,47.9,44.3,68.9,40,38.7,45.1,36,81.9 22 | Yi-1.5-34B-Chat,46.9,50.5,65,42.7,38.1,44.8,38.8,63.5 23 | Hunyuan-Standard-256k,46.9,30.6,69.7,36.8,53.9,46.1,29.2,65.6 24 | Mixtral-8x22B-Instruct-v0.1,46.3,33,72.2,28.6,47.2,44.7,31.2,86 25 | Gemma-2-9B-it,45.5,40.8,53.7,41.9,40.7,42.2,40.9,69.9 26 | Qwen2-7B-Instruct,45.1,43.5,64.1,36.2,37.7,44,27.5,79.7 27 | InternLM2.5-7B-Chat,44.5,44.6,64.8,39.3,40.8,34.8,26.5,79 28 | Yi-1.5-9B-Chat,42.6,46.1,56,39.8,38.2,41.8,29.8,54.3 29 | Nanbeige2-16B-Chat,42.3,50.5,53.8,40.5,25.8,33.3,33.2,85.8 30 | Llama3.1-8B-Instruct,42.1,33.7,63.2,24.9,38,39.3,39.1,80.1 31 | DBRX-Instruct,37.6,25.6,66.3,20.8,35.3,32.2,32.5,75.3 32 | Yi-1.5-6B-Chat,36.5,43.6,41.3,36.5,28.4,34.4,26.3,55.4 33 | InternLM2-Chat-20B,36,36.7,60,18.9,27.4,36.2,18.5,80.3 34 | Mixtral-8x7B-Instruct-v0.1,34.5,36.6,50.4,28.1,24.8,26.7,28.2,71 35 | Mistral-7B-Instruct-v0.3,30.7,30.3,47.8,20.7,18.1,23.6,28.5,75.4 36 | DeepSeek-V2-Lite-Chat,30,31.4,41.3,28.1,22.8,16.3,20.6,72.4 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/opencompass_academic_240829.csv: -------------------------------------------------------------------------------- 1 | model,opencompass_academic,OC_MMLU,OC_MMLU-Pro,OC_CMMLU,OC_BBH,OC_GQPA-Dimand,OC_MATH,OC_HumanEval,OC_IFEval 2 | GPT-4o-20240513,77,88,73.8,78.3,87.6,49.5,73.7,86,79 3 | Qwen2-72B-Instruct,73.1,83.1,65.1,79.8,85.2,42.9,67.7,84.2,76.5 4 | GPT-4o-mini-20240718,72.5,82.9,63.2,65.6,81.9,47.5,69.9,87.8,81 5 | Llama3-70B-Instruct,66.6,80.7,61.8,66.2,83.2,39.4,47.8,76.2,77.5 6 | Qwen1.5-110B-Chat,61.7,74,51.8,79.4,74.2,28.3,54.3,77.4,54.3 7 | Yi-1.5-34B-Chat,60.4,71.3,50.9,63.4,73.8,32.8,53.7,77.4,59.5 8 | InternLM2.5-Chat-7B,60.3,70.6,44.9,73.8,74.5,29.3,61.4,73.2,54.5 9 | GLM-4-9B-Chat,59.5,72.9,48.3,71.6,60.6,26.8,51.3,75.6,69.1 10 | Qwen1.5-32B-Chat,57.1,72.5,49.8,76.3,68.2,31.3,42.9,67.7,48.4 11 | Qwen1.5-72B-Chat,56.9,70.9,47.1,67.8,72.8,28.3,46.9,67.7,53.8 12 | Yi-1.5-9B-Chat,56.1,67.8,45.9,65,67.9,25.2,51.1,68.9,56.8 13 | Qwen2-7B-Instruct,52,51.1,38.8,59.3,65.4,25.8,49.2,76.8,49.7 14 | Llama3-8B-Instruct,50.6,66.7,42.3,51.5,54.4,33.8,27.7,59.8,68.4 15 | Qwen1.5-14B-Chat,49.7,67,40.5,73.3,58.3,26.3,29.5,60.4,42 16 | InternLM2-Chat-20B,45.2,55.8,36.3,44.7,65.6,21.7,34.2,67.7,35.5 17 | Yi-1.5-6B-Chat,43.5,48.4,30.1,53.9,56.5,23.2,42.6,45.7,47.3 18 | Mixtral-8x7B-Instruct-v0.1,42.6,67.2,42.7,33.9,55.7,29.3,26.8,34.8,50.8 19 | InternLM2-Chat-7B,42.1,58.8,32.4,47.8,60.3,26.3,28.1,50.6,32.4 20 | Qwen1.5-7B-Chat,35.4,41.9,25.6,42.3,41,21.2,22.2,50,38.6 21 | Mistral-7B-Instruct-v0.3,31.2,30.9,22.1,35.5 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/wildbench_240829.csv: -------------------------------------------------------------------------------- 1 | model,WB-Elo_LC,WB-Info_Seek,WB-Creative,WB-Code_Debug,WB-Math_Data,WB-Reason_Plan,WB-Score 2 | gpt-4o-2024-05-13,1227.1,58.6,59.1,60.5,57.3,60.2,59.3 3 | Claude_3.5_Sonnet,1215.4,55.5,55.6,56.5,50.2,55.6,54.7 4 | Gemini_1.5_Pro,1214.6,52.2,55.1,55.2,48.6,53.7,53 5 | 
gpt-4-turbo-2024-04-09,1209.6,57.2,58.7,55.1,51,56.2,55.2 6 | Yi-Large-Preview,1208.9,57.7,57.6,54.3,51.9,56.6,55.3 7 | DeepSeek-V2-Chat_0628_API,1199.1,52.7,56.4,55,51.4,54.8,54 8 | gpt-4-0125-preview,1197.3,54.4,57.6,52.9,45.8,53.5,52.3 9 | Claude_3_Opus,1196.3,53.5,53,53.3,46.7,52.5,51.7 10 | Gemini_1.5_Flash,1192,48.7,51.7,48.7,45.3,50.8,48.9 11 | Llama-3-70B-Instruct,1187.5,52.3,54.3,44.7,42.1,50.1,47.8 12 | DeepSeek-V2-Coder_0614_API,1184.9,40,40.8,48.9,46.4,47.2,45.7 13 | Yi-Large,1181.8,51,51.8,47.7,44.5,51.3,48.9 14 | Athene-70B,1180.7,60.8,60.4,59,57.1,61,59.5 15 | Nemotron-4-340B-Inst,1178.6,53,53.3,46.3,40.8,49.1,47.7 16 | Gemma-2-27B-it,1176.4,50.5,53.6,47,43.9,50.6,48.5 17 | Mistral-Large-2,1176.3,57.4,58.9,53.8,52.7,57.2,55.6 18 | Claude_3_Sonnet,1174.7,47.1,46.3,46.1,40.6,47.4,45.5 19 | gpt-4o-mini-2024-07-18,1173.5,57.4,60.1,57.2,54,58.2,57.1 20 | Qwen2-72B-Instruct,1172.3,49.5,49.9,39.8,41,46.8,44.5 21 | Reka_Core,1170.4,52.3,55.5,40.6,40.3,48,45.9 22 | gemma-2-9b-it-SimPO,1166.6,56.5,58,50.9,48.6,55.6,53.3 23 | gemma-2-9b-it-DPO,1166.6,58.2,59.1,50.5,47.1,55.5,53.2 24 | Yi-1.5-34B-Chat,1159.6,50.3,53.5,42.1,39.4,48.1,45.6 25 | Claude_3_Haiku,1159.1,45.3,42.9,37,31.4,41.3,38.9 26 | Mistral-Nemo-Inst_12B,1158.6,51.9,54.6,39.7,35.6,47.4,44.4 27 | Mistral-Large,1157,46.1,49.7,33.7,30.9,41.8,38.9 28 | Gemma-2-9B-it,1156.4,49,51,36.7,36.4,46.7,42.7 29 | Command-R-Plus,1151.4,49.2,52.6,28.4,23.5,41.9,36.8 30 | GLM-4-9B-Chat,1148.5,46.3,47.8,35.4,29.8,42.5,39.1 31 | Magpie-8B-Align-v0.1,1148.4,48.9,49.2,33.7,29.8,42.7,39.3 32 | Yi-1.5-9B-Chat,1148,42.6,45.6,35,32.2,42.4,38.7 33 | Llama3-Inst-8B-SimPO,1147.5,47.9,50.6,31.8,24,40.9,37 34 | Llama3-Inst-8B-SimPO-v0.2,1147.4,47.9,51.8,31.5,24.4,40.7,37.2 35 | Qwen1.5-72B-Chat,1147.4,48.2,50.4,35.4,29.8,43.5,39.9 36 | Llama3-Inst-8B-SimPO-ExPO,1145.5,47.3,49.1,28.6,21.2,39.5,35 37 | SELM_Llama3-8B-Inst-iter3,1144,46.1,51.1,27.3,23.5,39.8,35.3 38 | Phi-3-medium-128k,1139.5,35.7,33.2,18.2,23,32.3,27.3 39 | Llama-3-8B-Instruct,1139.5,39.3,43.6,22,17,34.4,29.2 40 | Hermes-2-Theta-Llama-3-8B,1137.4,41.6,39.8,23.1,18.7,33.7,29.6 41 | Starling-LM-7B-beta-ExPO,1136,42.9,44.3,25.3,18.6,36.3,31.6 42 | SELM_Zephyr-7B-iter3,1134.3,41,44.7,11,12.7,31.6,25.1 43 | Reka_Flash,1132.7,41.5,42.4,22.1,20.5,35,30.4 44 | Gemma-2-2B-it,1129.7,39.9,43.6,17.9,15.8,33.8,27.8 45 | gpt-3.5-turbo-0125,1129.2,36.5,37.4,26.5,21.6,33.4,30 46 | DBRX_Instruct,1128.5,41.1,42.3,26.4,24.5,36.2,32.6 47 | Neo-7B-Instruct-ExPO,1126.6,34.9,38.5,12.8,12.6,28.7,23.1 48 | Neo-7B-Instruct,1126.2,36.3,39.5,14,15,31.4,25 49 | StarlingLM-7B-beta,1126.2,41.9,43.8,24.4,17,34.1,30.2 50 | Command-R,1125.6,44.1,47.4,19.3,16,34.6,29.5 51 | Mixtral-8x7B-Instruct,1124.7,41.9,42.8,25,22.1,34.6,31.5 52 | Yi-1.5-6B-Chat,1122.7,31.4,31.1,16.6,16.8,27.3,23.3 53 | Tulu-2-dpo-70b,1121,40.7,42.7,20.7,14.8,32.3,28 54 | Reka_Edge,1120.8,34.4,36.2,13.5,8.9,25,21.3 55 | Mistral-7B-Instruct-v0.2,1105,40.1,42.1,18.4,10.1,30.1,25.6 56 | Llama-2-70B-chat,1101.9,38.3,40,9.3,4.2,26.8,20.7 57 | Qwen1.5-7B-Chat,1092.7,34,38.3,14.9,11.9,28.9,23.4 58 | Hermes-2-Mixtral-8x7B-DPO,1085.8,39.8,37.9,26,21.8,34.2,30.7 59 | Phi-3-mini-128k,1082.1,28.6,30.6,21.6,18.6,28.1,24.7 60 | Gemma-7B-it,1079.2,12.7,21.2,1.8,-3.7,10.2,6.6 61 | Llama-2-7B-chat,1052.5,27.7,29.8,-6.8,-7.2,15.4,8.3 -------------------------------------------------------------------------------- /src/bat/assets/lower_is_better_benchmarks.txt: -------------------------------------------------------------------------------- 1 | helm_airbench_240916 2 | 
llm_trustworthy_241001
--------------------------------------------------------------------------------
/src/bat/assets/prettified_bencmark_names.json:
--------------------------------------------------------------------------------
1 | {
2 |     "holmes": "Holmes",
3 |     "helm_lite_narrativeqa": "Helm Lite NarrativeQA",
4 |     "helm_lite_naturalquestionsopen": "Helm Lite NaturalQuestionsOpen",
5 |     "helm_lite_naturalquestionsclosed": "Helm Lite NaturalQuestionsClosed",
6 |     "helm_lite_openbookqa": "Helm Lite OpenBookQA",
7 |     "helm_lite_mmlu": "Helm Lite MMLU",
8 |     "helm_lite_math_equivalentcot": "Helm Lite MathEquivalentCOT",
9 |     "helm_lite_gsm8k": "Helm Lite GSM8K",
10 |     "helm_lite_legalbench": "Helm Lite LegalBench",
11 |     "helm_lite_medqa": "Helm Lite MedQA",
12 |     "helm_lite_wmt2014": "Helm Lite WMT2014",
13 |     "hfv2_bbh": "HFv2 BBH",
14 |     "hfv2_bbh_raw": "HFv2 BBH Raw",
15 |     "hfv2_gpqa": "HFv2 GPQA",
16 |     "hfv2_ifeval": "HFv2 IFEval",
17 |     "hfv2_math_lvl_5": "HFv2 Math Level 5",
18 |     "hfv2_mmlu_pro": "HFv2 MMLU Pro",
19 |     "hfv2_musr": "HFv2 MuSR",
20 |     "oc_mmlu": "OpenCompass MMLU",
21 |     "oc_mmlu_pro": "OpenCompass MMLU Pro",
22 |     "oc_cmmlu": "OpenCompass CMMLU",
23 |     "oc_bbh": "OpenCompass BBH",
24 |     "oc_gqpa_dimand": "OpenCompass GPQA Diamond",
25 |     "oc_humaneval": "OpenCompass HumanEval",
26 |     "oc_ifeval": "OpenCompass IFEval",
27 |     "helm_mmlu": "Helm MMLU",
28 |     "helm_boolq": "Helm BoolQ",
29 |     "helm_narrativeqa": "Helm NarrativeQA",
30 |     "helm_naturalquestionsclosed": "Helm NaturalQuestionsClosed",
31 |     "helm_naturalquestionsopen": "Helm NaturalQuestionsOpen",
32 |     "helm_quac": "Helm QuAC",
33 |     "helm_openbookqa": "Helm OpenBookQA",
34 |     "helm_imdb": "Helm IMDB",
35 |     "helm_civilcomments": "Helm CivilComments",
36 |     "helm_raft": "Helm RAFT",
37 |     "helm_ms_marcoregular": "Helm MSMARCO Regular",
38 |     "helm_ms_marcotrec": "Helm MSMARCO Trec",
39 |     "xsum": "Helm XSUM",
40 |     "mmlu_pro": "MMLU Pro",
41 |     "mixeval_triviaqa": "MixEval TriviaQA",
42 |     "mixeval_mmlu": "MixEval MMLU",
43 |     "mixeval_drop": "MixEval DROP",
44 |     "mixeval_hellaswag": "MixEval HellaSwag",
45 |     "mixeval_commonsenseqa": "MixEval CommonsenseQA",
46 |     "mixeval_triviaqa_hard": "MixEval TriviaQA Hard",
47 |     "mixeval_mmlu_hard": "MixEval MMLU Hard",
48 |     "mixeval_drop_hard": "MixEval DROP Hard",
49 |     "oc_language": "OpenCompass Language",
50 |     "oc_knowledge": "OpenCompass Knowledge",
51 |     "oc_reasoning": "OpenCompass Reasoning",
52 |     "oc_math": "OpenCompass Math",
53 |     "oc_code": "OpenCompass Code",
54 |     "oc_instruct": "OpenCompass Instruction",
55 |     "oc_agent": "OpenCompass Agent",
56 |     "oc_arena": "OpenCompass Arena",
57 |     "lb_reasoning": "LiveBench Reasoning",
58 |     "lb_coding": "LiveBench Coding",
59 |     "lb_mathematics": "LiveBench Mathematics",
60 |     "lb_data_analysis": "LiveBench Data Analysis",
61 |     "lb_language": "LiveBench Language",
62 |     "lb_if": "LiveBench Instruction Following",
63 |     "wb_info_seek": "WildBench Information Seeking",
64 |     "wb_creative": "WildBench Creative",
65 |     "wb_code_debug": "WildBench Code Debugging",
66 |     "wb_math_data": "WildBench Math & Data",
67 |     "wb_reason_plan": "WildBench Reasoning & Planning",
68 |     "wb_score": "WildBench Score",
69 |     "hfv1_arc": "HFv1 ARC",
70 |     "hfv1_gsm8k": "HFv1 GSM8K",
71 |     "hfv1_hellaswag": "HFv1 HellaSwag",
72 |     "hfv1_mmlu": "HFv1 MMLU",
73 |     "hfv1_truthfulqa": "HFv1 TruthfulQA",
74 |     "hfv1_winogrande": "HFv1 Winogrande",
75 |     "biggen_grounding": "BIGGEN Grounding",
76 |     "biggen_instruction_following": "BIGGEN Instruction Following",
77 |     "biggen_planning": "BIGGEN Planning",
78 |     "biggen_reasoning": "BIGGEN Reasoning",
79 |     "biggen_refinement": "BIGGEN Refinement",
80 |     "biggen_safety": "BIGGEN Safety",
81 |     "biggen_theory_of_mind": "BIGGEN Theory of Mind",
82 |     "biggen_tool_usage": "BIGGEN Tool Usage",
83 |     "biggen_multilingual": "BIGGEN Multilingual",
84 |     "lb_reasoning_average": "LiveBench Reasoning Average",
85 |     "lb_coding_average": "LiveBench Coding Average",
86 |     "lb_mathematics_average": "LiveBench Mathematics Average",
87 |     "lb_data_analysis_average": "LiveBench Data Analysis Average",
88 |     "lb_language_average": "LiveBench Language Average",
89 |     "lb_if_average": "LiveBench Instruction Following Average",
90 |     "helm_lite": "Helm Lite",
91 |     "hf_open_llm_v2": "HF OpenLLM v2",
92 |     "opencompass_academic": "OpenCompass Academic",
93 |     "arena_elo": "LMSys Arena",
94 |     "helm_classic": "Helm Classic",
95 |     "mixeval": "MixEval",
96 |     "mixeval_hard": "MixEval Hard",
97 |     "opencompass": "OpenCompass",
98 |     "alphacaeval_v2lc": "AlpacaEval v2 LC",
99 |     "livebench_240725": "LiveBench 240725",
100 |     "wb_elo_lc": "WildBench Elo LC",
101 |     "arena_hard": "Arena Hard",
102 |     "agentbench": "AgentBench",
103 |     "hf_open_llm_v1": "HF OpenLLM v1",
104 |     "biggen": "BIGGEN",
105 |     "livebench_240624": "LiveBench 240624",
106 |     "mt_bench": "MT-Bench",
107 |     "bfcl": "BFCL",
108 |     "helm_airbench_security_risks": "HELM AirBench Security Risks",
109 |     "helm_airbench_operational_misuses": "HELM AirBench Operational Misuses",
110 |     "helm_airbench_violence_&_extremism": "HELM AirBench Violence & Extremism",
111 |     "helm_airbench_hate/toxicity": "HELM AirBench Hate/Toxicity",
112 |     "helm_airbench_sexual_content": "HELM AirBench Sexual Content",
113 |     "helm_airbench_child_harm": "HELM AirBench Child Harm",
114 |     "helm_airbench_self_harm": "HELM AirBench Self Harm",
115 |     "helm_airbench_political_usage": "HELM AirBench Political Usage",
116 |     "helm_airbench_economic_harm": "HELM AirBench Economic Harm",
117 |     "helm_airbench_deception": "HELM AirBench Deception",
118 |     "helm_airbench_manipulation": "HELM AirBench Manipulation",
119 |     "helm_airbench_defamation": "HELM AirBench Defamation",
120 |     "helm_airbench_fundamental_rights": "HELM AirBench Fundamental Rights",
121 |     "helm_airbench_discrimination/bias": "HELM AirBench Discrimination/Bias",
122 |     "helm_airbench_privacy": "HELM AirBench Privacy",
123 |     "helm_airbench_criminal_activities": "HELM AirBench Criminal Activities",
124 |     "helm_airbench_air_score": "HELM AirBench AIR Score",
125 |     "enkrypt_ai_safety": "Enkrypt AI Safety",
126 |     "decentralized_arena": "Decentralized Arena (0-1 Normalized)",
127 |     "hydrox_safety": "Hydrox Safety",
128 |     "hydrox_privacy": "Hydrox Privacy",
129 |     "hydrox_security": "Hydrox Security",
130 |     "hydrox_integrity": "Hydrox Integrity",
131 |     "hydrox_overall_score": "Hydrox Overall Score",
132 |     "ruler": "RULER",
133 |     "trustworthy_average": "Trustworthy Average",
134 |     "trustworthy_Non-toxicity": "Trustworthy Non-Toxicity",
135 |     "trustworthy_Non-Stereotype": "Trustworthy Non-Stereotype",
136 |     "trustworthy_AdvGLUE_PP": "Trustworthy AdvGLUE PP",
137 |     "trustworthy_OoD": "Trustworthy Out-of-Distribution",
138 |     "trustworthy_Adv_Demo": "Trustworthy Adversarial Demos",
139 |     "trustworthy_Privacy": "Trustworthy Privacy",
140 |     "trustworthy_Ethics": "Trustworthy Ethics",
141 |     "trustworthy_Fairness": "Trustworthy Fairness"
142 | }
--------------------------------------------------------------------------------
/src/bat/configs.py: 
-------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List 3 | 4 | 5 | @dataclass 6 | class Config: 7 | exp_to_run: str 8 | n_models_taken_list: List[int] = field(default_factory=lambda: []) 9 | model_select_strategy_list: List[str] = field(default_factory=lambda: []) 10 | n_exps: int = 10 11 | corr_types: List[str] = field(default_factory=lambda: ["kendall"]) 12 | include_aggregate_as_scenario: bool = False 13 | scenario_blacklist: List[str] = field(default_factory=lambda: []) 14 | aggregate_scenarios: List[str] = field(default_factory=lambda: []) 15 | reference_data_path: str = "src/bat/assets/combined_holistic.csv" 16 | external_benchmarks_tested: List[str] = field(default_factory=lambda: []) 17 | min_n_models_intersect: int = 5 18 | 19 | def __post_init__(self): 20 | self.validate_n_models_taken_list() 21 | self.validate_model_select_strategy_list() 22 | self.validate_corr_types() 23 | 24 | def validate_n_models_taken_list(self): 25 | if not all(isinstance(x, int) for x in self.n_models_taken_list): 26 | raise ValueError("All items in n_models_taken_list must be integers") 27 | 28 | def validate_model_select_strategy_list(self): 29 | valid_strategies = { 30 | "somewhere_aggregate", 31 | "middle_aggregate", 32 | "top_aggregate", 33 | "bottom_aggregate", 34 | "random", 35 | } 36 | if not all( 37 | item in valid_strategies for item in self.model_select_strategy_list 38 | ): 39 | raise ValueError( 40 | f"Invalid strategy in model_select_strategy_list. Valid options are: {valid_strategies}" 41 | ) 42 | 43 | def validate_corr_types(self): 44 | valid_types = {"kendall", "pearson"} 45 | if not all(item in valid_types for item in self.corr_types): 46 | raise ValueError( 47 | f"Invalid correlation type. Valid options are: {valid_types}" 48 | ) 49 | 50 | def update_or_add_fields(self, **kwargs): 51 | """ 52 | Add or update fields dynamically. All new fields are validated. 
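
        Example (a minimal sketch, not from the original file; "example_exp"
        is an arbitrary experiment name, and both fields shown already exist
        on Config):

            cfg = Config(exp_to_run="example_exp")
            cfg.update_or_add_fields(n_exps=3, corr_types=["pearson"])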
53 | """ 54 | for key, value in kwargs.items(): 55 | setattr(self, key, value) 56 | # Re-validate the fields if necessary 57 | if "n_models_taken_list" in kwargs: 58 | self.validate_n_models_taken_list() 59 | if "model_select_strategy_list" in kwargs: 60 | self.validate_model_select_strategy_list() 61 | if "corr_types" in kwargs: 62 | self.validate_corr_types() 63 | 64 | 65 | # if __name__ == "__main__": 66 | # manager = ConfigurationManager() 67 | # # Example access to configurations: 68 | # print(manager.configs["resolution_matters"].n_exps) 69 | -------------------------------------------------------------------------------- /src/bat/logic.py: -------------------------------------------------------------------------------- 1 | import random 2 | import pandas as pd 3 | from scipy.stats import pearsonr, kendalltau 4 | import numpy as np 5 | 6 | 7 | def get_pair_agreement(pair_scen_res, res_to_sort_by, cfg, models_intersect): 8 | # how many models occur in both 9 | 10 | model_subset_size_taken = ( 11 | min(cfg["model_subset_size_requested"], len(models_intersect)) 12 | if cfg["model_subset_size_requested"] != 0 13 | else len(models_intersect) 14 | ) 15 | 16 | if any( 17 | [ 18 | x in cfg["model_select_strategy"] 19 | for x in ["top", "bottom", "middle", "somewhere"] 20 | ] 21 | ): 22 | if cfg["exp_n"] != 0 and "somewhere" not in cfg["model_select_strategy"]: 23 | return None, None # skipping experimentation since deterministic 24 | 25 | models_taken = sample_models_directed( 26 | res_to_sort_by, 27 | cfg["model_select_strategy"], 28 | models_intersect, 29 | model_subset_size_taken, 30 | ) 31 | 32 | elif "random" in cfg["model_select_strategy"]: 33 | random.seed(cfg["exp_n"]) 34 | models_taken = random.sample( 35 | models_intersect, 36 | k=model_subset_size_taken, 37 | ) 38 | 39 | else: 40 | raise NotImplementedError 41 | 42 | agreement, p_value = get_agreement( 43 | pair_scen_res[pair_scen_res["model"].isin(models_taken)][ 44 | ["model", "scenario", "score"] 45 | ], 46 | cfg["corr_type"], 47 | ) 48 | 49 | return agreement, p_value 50 | 51 | 52 | def get_df_of_scenario_to_order_by(df, model_select_strategy): 53 | if "aggregate" in model_select_strategy: 54 | order_by = "Aggregate" 55 | 56 | elif "arena" in model_select_strategy: 57 | order_by = "Arena Elo" 58 | 59 | else: 60 | raise NotImplementedError 61 | 62 | return df[df["scenario"] == order_by] 63 | 64 | 65 | def sample_models_directed( 66 | res_to_sort_by, 67 | model_select_strategy, 68 | models_intersect, 69 | n_models_really_taken, 70 | ): 71 | df_of_scenario_to_order_by = res_to_sort_by.query("model in @models_intersect") 72 | # get_df_of_scenario_to_order_by( 73 | # bench_res, model_select_strategy 74 | # ) 75 | 76 | if "top" in model_select_strategy: 77 | models_taken = df_of_scenario_to_order_by.nlargest( 78 | n_models_really_taken, 79 | "score", 80 | )["model"].tolist() 81 | elif "bottom" in model_select_strategy: 82 | models_taken = df_of_scenario_to_order_by.nsmallest( 83 | n_models_really_taken, 84 | "score", 85 | )["model"].tolist() 86 | 87 | elif "middle" in model_select_strategy: 88 | df_sorted = df_of_scenario_to_order_by.sort_values("score", ascending=False) 89 | middle_idx = len(df_sorted) // 2 90 | half_n = n_models_really_taken // 2 91 | 92 | if n_models_really_taken % 2 == 0: 93 | sampled_df = df_sorted.iloc[middle_idx - half_n : middle_idx + half_n] 94 | else: 95 | sampled_df = df_sorted.iloc[middle_idx - half_n : middle_idx + half_n + 1] 96 | 97 | models_taken = sampled_df["model"].unique().tolist() 98 | 99 | elif 
"somewhere": 100 | df_sorted = df_of_scenario_to_order_by.sort_values("score", ascending=False) 101 | 102 | idx = random.randrange(len(df_sorted) - n_models_really_taken + 1) 103 | models_taken = ( 104 | df_sorted.iloc[idx : idx + n_models_really_taken]["model"].unique().tolist() 105 | ) 106 | 107 | else: 108 | raise NotImplementedError 109 | 110 | return models_taken 111 | 112 | 113 | def sample_sublists_for_list( 114 | all_models_sorted, sublists_size=1, n_sublists=0, drop_from_top=False 115 | ): 116 | # assert not ( 117 | # drop_from_top and sublists_size != len(all_models_sorted) 118 | # ), "drop from top defines the length of resulting" 119 | 120 | if drop_from_top: 121 | sublists = [] 122 | top_models_to_remove = [] 123 | for window_num, model in enumerate(all_models_sorted): 124 | sublists.append( 125 | [ 126 | model 127 | for model in all_models_sorted[: sublists_size + window_num] 128 | if model not in top_models_to_remove 129 | ] 130 | ) 131 | top_models_to_remove.append(sublists[-1][0]) # drop the first model 132 | if len(sublists) == n_sublists: 133 | break 134 | 135 | else: 136 | random.seed(0) 137 | sublists = [] 138 | for _ in range(n_sublists): 139 | sublists.append(random.sample(all_models_sorted, sublists_size)) 140 | 141 | return sublists 142 | 143 | 144 | def calculate_win_rate(series): 145 | assert len(series) > 1, "no meaning for a win rate with only one object" 146 | 147 | def win_rate(x): 148 | win_count = sum(1 for value in series if x > value) 149 | return win_count / (len(series) - 1) 150 | 151 | return series.transform(win_rate) 152 | 153 | 154 | def add_aggragete_with_mwr(df, scenarios_for_aggragate): 155 | if "wr" not in df.columns: 156 | df["wr"] = df.groupby(["scenario"])["score"].transform(calculate_win_rate) 157 | 158 | mean_df = pd.DataFrame(columns=df.columns) 159 | mean_df = ( 160 | df.query("scenario in @scenarios_for_aggragate") 161 | .groupby(["model"]) 162 | .agg({"score": "mean", "wr": "mean"}) 163 | .reset_index() 164 | ) 165 | mean_df["score"] = mean_df["wr"] 166 | mean_df["scenario"] = "Aggregate" 167 | df = pd.concat([df, mean_df]).drop(columns=["wr"]) 168 | return df 169 | 170 | 171 | def get_agreement(df, corr_type): 172 | if corr_type == "pearson": 173 | corr_func = pearsonr 174 | elif corr_type == "kendall": 175 | corr_func = kendalltau 176 | else: 177 | raise IOError(f"corr_type {corr_type} is not supported") 178 | 179 | pivot_df = df.pivot( 180 | index="model", 181 | columns="scenario", 182 | values="score", 183 | ) 184 | 185 | similarity = pivot_df.corr(method=lambda x, y: corr_func(x, y)[0]).iloc[0, 1] 186 | p_value = ( 187 | pivot_df.corr(method=lambda x, y: corr_func(x, y)[1]) 188 | - np.eye(len(pivot_df.columns)) 189 | ).iloc[0, 1] 190 | 191 | return similarity, p_value 192 | -------------------------------------------------------------------------------- /src/bat/reporting.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import matplotlib.pyplot as plt 3 | 4 | import os 5 | import pandas as pd 6 | 7 | 8 | # def plot_experiments_results(agreement_df, cfg): 9 | # sns.set() 10 | 11 | # exp_to_run = cfg.exp_to_run 12 | 13 | # if exp_to_run == "resolution_matters": 14 | # sns.set(font_scale=1.2, style="white") 15 | 16 | # fig, ax = plt.subplots(width_ratios=[1.5]) 17 | 18 | # # correlation as a function of model_subset_size_requested 19 | # sns.pointplot( 20 | # ax=ax, 21 | # # kind="point", 22 | # data=agreement_df.query('corr_type=="kendall"').replace( 23 | # { 24 | # 
"somewhere_aggregate": "Adjacent sampling", 25 | # "random": "Random sampling", 26 | # } 27 | # ), 28 | # y="correlation", 29 | # x="model_subset_size_requested", 30 | # hue="model_select_strategy", 31 | # markersize=10, 32 | # linewidth=4, 33 | # # legend=False, 34 | # # errorbar="se", 35 | # # linestyle="", 36 | # # col="corr_type", 37 | # # sharey=False, 38 | # # aspect=1.5, 39 | # ) 40 | # # scneario-wise agreement (lines) 41 | # sns.pointplot( 42 | # ax=ax, 43 | # # kind="point", 44 | # data=agreement_df.query( 45 | # 'corr_type=="kendall"' 46 | # # " and scenario not in @scenarios_not_to_show and ref_scenario not in @scenarios_not_to_show" 47 | # " and model_select_strategy=='somewhere_aggregate'" 48 | # ), 49 | # y="correlation", 50 | # x="model_subset_size_requested", 51 | # hue="scenario", 52 | # errorbar=None, 53 | # alpha=0.2, 54 | # legend=False, 55 | # # aspect=1.5, 56 | # # col="corr_type", 57 | # # aspect=1.5, 58 | # ) 59 | # plt.xlabel("Granularity (Number of models)") 60 | # plt.ylabel("Mean Benchmark Agreement\n(Kendall-tau correlation)") 61 | # ax.invert_xaxis() 62 | # handles, labels = ax.get_legend_handles_labels() 63 | # handles, labels = ax.get_legend_handles_labels() 64 | # ax.legend(handles=handles, labels=labels, frameon=False) 65 | # # sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1)) 66 | # plt.tight_layout() 67 | # plt.savefig("figures/final_for_paper/pointplot_granularity_matters.pdf") 68 | 69 | 70 | class Reporter: 71 | def __init__(self) -> None: 72 | os.makedirs("figures", exist_ok=True) 73 | 74 | @staticmethod 75 | def draw_agreements_for_one_source( 76 | agreements, source_of_interest, ref_sources=None 77 | ): 78 | filtered_agreements = Reporter.filter_with_sources( 79 | agreements, ref_sources, source_of_interest 80 | ) 81 | 82 | # Grouping and calculating mean for 'correlation' and 'p_value' 83 | grouped = ( 84 | filtered_agreements.groupby(["scenario", "ref_scenario"]) 85 | .agg( 86 | correlation_mean=("correlation", "mean"), 87 | p_value_mean=("p_value", "mean"), 88 | ) 89 | .reset_index() 90 | ).dropna() 91 | 92 | sns.set_theme(font_scale=1.2) 93 | 94 | g = sns.catplot( 95 | kind="bar", 96 | data=grouped.sort_values("correlation_mean"), 97 | x="ref_scenario", 98 | y="correlation_mean", 99 | # palette="viridis", # or any other Seaborn palette 100 | edgecolor=".2", # Add edge color for better visibility 101 | linewidth=1, # Adjust line width 102 | # width=2, 103 | aspect=1.8, 104 | # legend=True, 105 | ) 106 | plt.xticks(rotation=90, fontsize=10) # Adjust fontsize 107 | plt.xlabel("Reference Scenario", fontsize=12) # Add labels with fontsize 108 | plt.ylabel("Mean Correlation", fontsize=12) 109 | plt.title( 110 | f"Mean Agreement Between {source_of_interest} and All other Benchmark", 111 | fontsize=14, 112 | ) # Add title 113 | 114 | plt.tight_layout() 115 | plt.show(block=True) 116 | # plt.savefig("figures/temp.png") 117 | 118 | @staticmethod 119 | def draw_agreement_matrix(agreements, sources_hide=None): 120 | filtered_agreements = Reporter.filter_with_sources( 121 | agreements, sources_hide, sources_hide 122 | ) 123 | 124 | # Grouping and calculating mean for 'correlation' and 'p_value' 125 | grouped = ( 126 | filtered_agreements.groupby(["scenario", "ref_scenario"]) 127 | .agg( 128 | correlation_mean=("correlation", "mean"), 129 | p_value_mean=("p_value", "mean"), 130 | ) 131 | .reset_index() 132 | ).dropna() 133 | 134 | # Pivoting the data 135 | correlation_pivot = grouped[ 136 | ["scenario", "ref_scenario", "correlation_mean"] 137 | 
].pivot(index="scenario", columns="ref_scenario") 138 | p_value_pivot = grouped[["scenario", "ref_scenario", "p_value_mean"]].pivot( 139 | index="scenario", columns="ref_scenario" 140 | ) 141 | 142 | plt.figure(figsize=(10, 8)) # Increase figure size for better visualization 143 | 144 | sns.heatmap( 145 | correlation_pivot["correlation_mean"].round(2), 146 | annot=True, # combined_annotations, 147 | fmt=".2f", # Format annotations to two decimal places 148 | cmap="coolwarm", # Adjust color map as needed 149 | center=0, # Center the colormap around 0 for better contrast 150 | linewidths=0.5, # Add lines between cells for better separation 151 | linecolor="lightgray", # Set line color to light gray 152 | ) 153 | plt.xticks( 154 | rotation=90, fontsize=10 155 | ) # Rotate x-axis labels for better readability 156 | plt.yticks(fontsize=10) # Adjust y-axis label font size 157 | plt.xlabel("Reference Scenario", fontsize=12) # Add labels with fontsize 158 | plt.ylabel("Scenario", fontsize=12) # Add y-axis label 159 | plt.title("Mean Benchmark Agreement Across Scenarios", fontsize=14) # Add title 160 | plt.tight_layout() 161 | plt.show(block=True) 162 | 163 | @staticmethod 164 | def filter_with_sources(agreements, ref_sources_to_keep, scenario_sources_to_keep): 165 | if not scenario_sources_to_keep and not ref_sources_to_keep: # use all 166 | scenario_sources_to_keep = agreements["scenario_source"].unique().tolist() 167 | ref_sources_to_keep = agreements["ref_source"].unique().tolist() 168 | 169 | elif scenario_sources_to_keep and not ref_sources_to_keep: 170 | ref_sources_to_keep = [ 171 | scen 172 | for scen in agreements["ref_source"].unique().tolist() 173 | if scen not in scenario_sources_to_keep 174 | ] 175 | 176 | elif scenario_sources_to_keep and ref_sources_to_keep: 177 | pass 178 | 179 | else: 180 | raise NotImplementedError 181 | 182 | filtered_agreements = agreements.query( 183 | "scenario_source in @scenario_sources_to_keep and ref_source in @ref_sources_to_keep" 184 | ) 185 | 186 | return filtered_agreements 187 | # plt.tight_layout() 188 | # plt.savefig("figures/newbench_cluster_within.png") 189 | # print("figure saved to figures/newbench_heatmap_within.png") 190 | # plt.clf() 191 | 192 | @staticmethod 193 | def get_all_z_scores(agreements, aggragate_name="aggregate"): 194 | z_scores = [] 195 | for observed_scenario in agreements["scenario"].unique(): 196 | if ( 197 | observed_scenario == aggragate_name 198 | or len( 199 | agreements.dropna().query( 200 | "scenario==@observed_scenario" 201 | " and ref_scenario==@aggragate_name" 202 | ) 203 | ) 204 | == 0 205 | ): 206 | continue 207 | 208 | ( 209 | z_score, 210 | corr_with_agg, 211 | p_value_of_corr_with_agg, 212 | n_models_of_corr_with_agg, 213 | ) = Reporter.get_z_score( 214 | agreements=agreements, 215 | observed_scenario=observed_scenario, 216 | aggragate_name="aggregate", 217 | ) 218 | 219 | z_scores.append( 220 | { 221 | "scenario": observed_scenario, 222 | "z_score": z_score, 223 | "corr_with_agg": corr_with_agg, 224 | "p_value_of_corr_with_agg": p_value_of_corr_with_agg, 225 | "n_models_of_corr_with_agg": n_models_of_corr_with_agg, 226 | "source": agreements.query("scenario==@observed_scenario")[ 227 | "scenario_source" 228 | ].iloc[0], 229 | } 230 | ) 231 | 232 | return pd.DataFrame(z_scores).sort_values('z_score') 233 | 234 | @staticmethod 235 | def get_z_score( 236 | agreements, 237 | observed_scenario, 238 | aggragate_name="aggregate", 239 | blacklist_sources=[], 240 | ): 241 | if ( 242 | not len( 243 | 
agreements.dropna().query( 244 | "scenario==@observed_scenario" " and ref_scenario==@aggragate_name" 245 | ) 246 | ) 247 | > 0 248 | ): 249 | raise IOError 250 | 251 | ref_agreements_with_agg = ( 252 | agreements.dropna() 253 | .query( 254 | "scenario_source not in @blacklist_sources" 255 | " and ref_scenario==@aggragate_name" 256 | ) 257 | .groupby(["scenario"]) 258 | .agg( 259 | correlation_mean=("correlation", "mean"), 260 | p_value_mean=("p_value", "mean"), 261 | n_models_mean=("model_subset_size_requested", "mean"), 262 | ) 263 | ) 264 | 265 | obs_with_agg = agreements.query( 266 | "scenario==@observed_scenario" " and ref_scenario==@aggragate_name" 267 | ).agg( 268 | correlation_mean=("correlation", "mean"), 269 | p_value_mean=("p_value", "mean"), 270 | n_models_mean=("model_subset_size_requested", "mean"), 271 | ) 272 | 273 | obs_agreements_with_agg = float(obs_with_agg.iloc[0, 0]) 274 | obs_agreements_with_agg_p_value = float(obs_with_agg.iloc[1, 1]) 275 | obs_agreements_with_agg_n_models = float(obs_with_agg.iloc[2, 2]) 276 | 277 | ref_mean = ref_agreements_with_agg["correlation_mean"].mean() 278 | ref_std = ref_agreements_with_agg["correlation_mean"].std() 279 | z_score = float((obs_agreements_with_agg - ref_mean) / ref_std) 280 | 281 | return ( 282 | z_score, 283 | obs_agreements_with_agg, 284 | obs_agreements_with_agg_p_value, 285 | obs_agreements_with_agg_n_models, 286 | ) 287 | -------------------------------------------------------------------------------- /src/bat/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | from bat import benchmark 4 | from bat.benchmark import Benchmark 5 | 6 | 7 | def get_holistic_benchmark(file_name="assets/combined_holistic_20240708.csv"): 8 | if os.path.exists(file_name): 9 | df = pd.read_csv(file_name) 10 | else: 11 | df = pd.read_csv(f"src/bat/{file_name}") 12 | 13 | return Benchmark(df) 14 | 15 | if __name__ == "__main__": 16 | 17 | csv_path = 'src/bat/assets/combined_20240704.csv' 18 | from bat.benchmark import Benchmark 19 | benchmark = Benchmark(pd.read_csv(csv_path)) 20 | for source, df in benchmark.df.groupby('source'): 21 | df.to_csv(f'src/bat/assets/benchmarks/{source}.csv', index=False) 22 | 23 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit test package for bat.""" 2 | -------------------------------------------------------------------------------- /tests/test_benchmark.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | from bat import Benchmark # Replace your_module with the actual module name 4 | 5 | 6 | class TestBenchmark(unittest.TestCase): 7 | def setUp(self): 8 | # Create a sample DataFrame for testing 9 | data = { 10 | "model": ["model_a", "model_b", "model_a", "model_b"], 11 | "scenario": ["scenario_1", "scenario_1", "scenario_2", "scenario_2"], 12 | "score": [0.8, 0.7, 0.9, 0.6], 13 | } 14 | self.df = pd.DataFrame(data) 15 | self.benchmark = Benchmark(self.df, "test_source") 16 | 17 | def test_assign_df(self): 18 | # Check if DataFrame is assigned correctly 19 | self.assertEqual(self.benchmark.df.shape, (4, 5)) 20 | self.assertEqual(self.benchmark.df["source"].unique()[0], "test_source") 21 | 22 | def test_normalize_scores_per_scenario(self): 23 | # Test score normalization 24 | normalized_df = 
self.benchmark.normalize_scores_per_scenario() 25 | scenario_1_scores = normalized_df[normalized_df["scenario"] == "scenario_1"][ 26 | "score" 27 | ] 28 | scenario_2_scores = normalized_df[normalized_df["scenario"] == "scenario_2"][ 29 | "score" 30 | ] 31 | self.assertEqual(scenario_1_scores.min(), 0.0) 32 | self.assertEqual(scenario_1_scores.max(), 1.0) 33 | self.assertEqual(scenario_2_scores.min(), 0.0) 34 | self.assertEqual(scenario_2_scores.max(), 1.0) 35 | 36 | def test_add_aggragete(self): 37 | # Test aggregate column addition 38 | self.benchmark.add_aggregate( 39 | new_col_name="aggregate", agg_source_name="aggregated_source" 40 | ) 41 | self.assertIn("aggregate", self.benchmark.df["scenario"].unique()) 42 | aggregate_rows = self.benchmark.df[self.benchmark.df["scenario"] == "aggregate"] 43 | self.assertEqual(len(aggregate_rows), 2) # Two models, so two aggregate rows 44 | 45 | def test_validate_dataframe(self): 46 | # Test DataFrame validation (should pass with the sample DataFrame) 47 | self.benchmark.validate_dataframe_post_formatting() 48 | 49 | def test_extend(self): 50 | # Test extending the Benchmark object 51 | new_data = { 52 | "model": ["model_c"], 53 | "scenario": ["scenario_3"], 54 | "score": [0.5], 55 | "source": ["new_source"], 56 | "aggragated_from": [[]], 57 | } 58 | new_df = pd.DataFrame(new_data) 59 | new_benchmark = Benchmark(new_df, "new_source") 60 | self.benchmark.extend(new_benchmark) 61 | self.assertEqual(len(self.benchmark.df), 5) # Original 4 rows + 1 new row 62 | 63 | def test_get_models(self): 64 | # Test getting unique model names 65 | models = self.benchmark.get_models() 66 | self.assertEqual(set(models), {"model_a", "model_b"}) 67 | 68 | def test_get_scenarios(self): 69 | # Test getting unique scenario names 70 | scenarios = self.benchmark.get_scenarios() 71 | self.assertEqual(set(scenarios), {"scenario_1", "scenario_2"}) 72 | 73 | def test_get_model_appearences_count(self): 74 | # Test counting model appearances 75 | counts = self.benchmark.get_model_appearences_count() 76 | self.assertEqual(counts["model_a"], 2) 77 | self.assertEqual(counts["model_b"], 2) 78 | 79 | def test_get_scenario_appearences_count(self): 80 | # Test counting scenario appearances 81 | counts = self.benchmark.get_scenario_appearences_count() 82 | self.assertEqual(counts["scenario_1"], 2) 83 | self.assertEqual(counts["scenario_2"], 2) 84 | 85 | # Tests for show_overlapping_model_counts and clear_repeated_scenarios are more 86 | # complex and might require mocking or specific data setups to test effectively. 87 | # Consider adding these tests based on your specific needs and how you 88 | # handle plotting and data cleaning in those methods. 
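
    # A sketch of one such test, kept commented out because it assumes (rather
    # than verifies) the semantics of clear_repeated_scenarios() -- namely that
    # after cleaning, each scenario is kept under a single source. Check the
    # method in benchmark.py before enabling it:
    #
    # def test_clear_repeated_scenarios(self):
    #     other = Benchmark(self.df.copy(), "other_source")
    #     self.benchmark.extend(other)
    #     self.benchmark.clear_repeated_scenarios()
    #     sources_per_scenario = self.benchmark.df.groupby("scenario")["source"].nunique()
    #     self.assertTrue((sources_per_scenario == 1).all())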
89 | 90 | def test_validate_df_pre_formatting_unnamed_0(self): 91 | # Test DataFrame validation with 'Unnamed: 0' column 92 | bad_data = { 93 | "Unnamed: 0": [0, 1], 94 | "model": ["model_a", "model_b"], 95 | "scenario_1": [0.8, 0.7], 96 | "scenario_2": [0.9, 0.6], 97 | } 98 | bad_df = pd.DataFrame(bad_data) 99 | with self.assertRaises(ValueError) as context: 100 | Benchmark(bad_df, "test_source") 101 | self.assertIn( 102 | "DataFrame should not contain 'Unnamed: 0' column", str(context.exception) 103 | ) 104 | 105 | def test_validate_df_pre_formatting_missing_model(self): 106 | # Test DataFrame validation with missing 'model' column 107 | bad_data = { 108 | "scenario": ["scenario_1", "scenario_1", "scenario_2", "scenario_2"], 109 | "score": [0.8, 0.7, 0.9, 0.6], 110 | } 111 | bad_df = pd.DataFrame(bad_data) 112 | with self.assertRaises(ValueError) as context: 113 | Benchmark(bad_df, "test_source") 114 | self.assertIn("DataFrame must contain a 'model' column", str(context.exception)) 115 | 116 | def test_validate_df_pre_formatting_missing_scenario(self): 117 | # Test DataFrame validation with missing 'scenario' (and only 'model') 118 | bad_data = { 119 | "model": ["model_a", "model_b"], 120 | } 121 | bad_df = pd.DataFrame(bad_data) 122 | with self.assertRaises(ValueError) as context: 123 | Benchmark(bad_df, "test_source") 124 | self.assertIn( 125 | "DataFrame must contain at least 'model' and one scenario column", 126 | str(context.exception), 127 | ) 128 | 129 | def test_validate_df_pre_formatting_duplicate_model_scenario(self): 130 | # Test DataFrame validation with duplicate model-scenario pairs 131 | bad_data = { 132 | "model": ["model_a", "model_a", "model_b"], 133 | "scenario_1": [0.8, 0.9, 0.7], # Duplicate model_a for scenario_1 134 | "scenario_2": [0.7, 0.6, 0.8], 135 | } 136 | bad_df = pd.DataFrame(bad_data) 137 | with self.assertRaises(ValueError) as context: 138 | Benchmark(bad_df, "test_source") 139 | self.assertIn( 140 | "DataFrame contains duplicate model-scenario pairs", str(context.exception) 141 | ) 142 | 143 | def test_validate_df_pre_formatting_non_numeric_score(self): 144 | # Test DataFrame validation with non-numeric score 145 | bad_data = { 146 | "model": ["model_a", "model_b"], 147 | "scenario": ["scenario_1", "scenario_2"], 148 | "score": ["not_a_number", "also_not_a_number"], 149 | } 150 | bad_df = pd.DataFrame(bad_data) 151 | with self.assertRaises(ValueError) as context: 152 | Benchmark(bad_df, "test_source") 153 | self.assertIn("score must be numeric", str(context.exception)) 154 | 155 | def test_validate_dataframe_post_formatting_missing_columns(self): 156 | # Test with missing required columns after formatting 157 | data = {"model": ["model_a"], "scenario": ["scenario_1"], "score": [0.8]} 158 | df = pd.DataFrame(data) 159 | benchmark = Benchmark(df, "test_source") 160 | 161 | # Remove required columns and check if ValueError is raised 162 | benchmark.df.drop(columns=["source", "aggragated_from"], inplace=True) 163 | with self.assertRaises(ValueError) as context: 164 | benchmark.validate_dataframe_post_formatting() 165 | self.assertIn( 166 | "DataFrame must contain the following columns", str(context.exception) 167 | ) 168 | 169 | def test_validate_dataframe_post_formatting_non_numeric_score_after_formatting( 170 | self, 171 | ): 172 | # Test with non-numeric score after formatting 173 | data = { 174 | "model": ["model_a"], 175 | "scenario": ["scenario_1"], 176 | "score": [0.8], 177 | "source": ["test_source"], 178 | "aggragated_from": [[]], 179 | } 180 | df = 
pd.DataFrame(data) 181 | benchmark = Benchmark(df, "test_source") 182 | 183 | # Change score to non-numeric and check if ValueError is raised 184 | benchmark.df["score"] = "not_a_number" 185 | with self.assertRaises(ValueError) as context: 186 | benchmark.validate_dataframe_post_formatting() 187 | self.assertIn("score must be numeric", str(context.exception)) 188 | 189 | 190 | if __name__ == "__main__": 191 | unittest.main() 192 | --------------------------------------------------------------------------------
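
A minimal end-to-end sketch of how the pieces above compose (an illustration,
not part of the repository: it assumes the package is installed so that
bat.logic is importable, and it sticks to get_agreement, whose source appears
in logic.py above; the higher-level agreement_tester.py entry point is not
shown in this dump):

    import pandas as pd
    from bat.logic import get_agreement

    # one of the shipped result files; schema: model,score,scenario,source,aggragated_from,tag
    df = pd.read_csv("src/bat/assets/benchmarks_old/olmes_260624.csv")

    # Kendall-tau agreement between two OLMES scenarios over their shared models
    pair = df[df["scenario"].isin(["arc_c", "mmlu"])][["model", "scenario", "score"]]
    tau, p_value = get_agreement(pair, corr_type="kendall")
    print(f"arc_c vs. mmlu: tau={tau:.2f}, p={p_value:.3f}")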