├── .editorconfig
├── .github
│   └── ISSUE_TEMPLATE.md
├── .gitignore
├── .pre-commit-config.yaml
├── .secrets.baseline
├── AUTHORS.rst
├── CODE_OF_CONDUCT.rst
├── CONTRIBUTING.rst
├── HISTORY.rst
├── LICENCE
├── MANIFEST.in
├── Makefile
├── README.rst
├── data_acquisition
│   ├── get_biggen.py
│   └── get_hf_open_llm.py
├── docs
│   ├── Makefile
│   ├── authors.rst
│   ├── conf.py
│   ├── contributing.rst
│   ├── history.rst
│   ├── index.rst
│   ├── installation.rst
│   ├── make.bat
│   ├── readme.rst
│   └── usage.rst
├── examples
│   ├── my_bench.csv
│   ├── newbench_example.py
│   ├── scenarios_for_aggregate.txt
│   └── scenarios_of_intereset.txt
├── pyproject.toml
├── src
│   └── bat
│       ├── __init__.py
│       ├── agreement_tester.py
│       ├── assets
│       │   ├── benchmarks
│       │   │   ├── agenbench_240829_agent.csv
│       │   │   ├── alphacaeval_v2lc_240829_holistic.csv
│       │   │   ├── arena_hard_240829_holistic.csv
│       │   │   ├── bfcl_240906_tools.csv
│       │   │   ├── biggen_240829_holistic.csv
│       │   │   ├── chatbot_arena_241104_holistic.csv
│       │   │   ├── dec_arena_241022_holistic.csv
│       │   │   ├── enkrypt_ai_safety_240916_safety.csv
│       │   │   ├── eqbench_240912_emotion.csv
│       │   │   ├── helm_airbench_240916_safety.csv
│       │   │   ├── helm_classic_240829_holistic.csv
│       │   │   ├── helm_lite_240829_holistic.csv
│       │   │   ├── helm_mmlu_240829_knowledge.csv
│       │   │   ├── hf_open_llm_v1_240829_holistic.csv
│       │   │   ├── hf_open_llm_v2_240829_holistic.csv
│       │   │   ├── holmes_240829_linguistics.csv
│       │   │   ├── hydrox_safety_241001_safety.csv
│       │   │   ├── livebench_240701_holistic.csv
│       │   │   ├── livebench_240829_holistic.csv
│       │   │   ├── livecodebench_240601_230701_code.csv
│       │   │   ├── llm_trustworthy_241001_safety.csv
│       │   │   ├── lvbench_241189_longcontext.csv
│       │   │   ├── mixeval_240829_holistic.csv
│       │   │   ├── mmlu_pro_240829_knowledge.csv
│       │   │   ├── mtbench_240829_holistic.csv
│       │   │   ├── opencompass_240829_holistic.csv
│       │   │   ├── opencompass_academic_240829_holistic.csv
│       │   │   ├── opencompass_agent_240829_agent.csv
│       │   │   ├── opencompass_arena_240829_holistic.csv
│       │   │   ├── opencompass_code_240829_code.csv
│       │   │   ├── opencompass_instruct_240829_instructionfollow.csv
│       │   │   ├── opencompass_knowledge_240829_knowledge.csv
│       │   │   ├── opencompass_language_240829_language.csv
│       │   │   ├── opencompass_math_240829_math.csv
│       │   │   ├── opencompass_reasoning_240829_reasoning.csv
│       │   │   ├── repoqa_241119_longcontext.csv
│       │   │   ├── ruler_bench_241002_longcontext.csv
│       │   │   ├── tablebench_241002_tables.csv
│       │   │   ├── toolbench_240829_tools.csv
│       │   │   └── wildbench_240829_holistic.csv
│       │   ├── benchmarks_old
│       │   │   ├── BLZ_240312.csv
│       │   │   ├── agentbench_240720.csv
│       │   │   ├── arena_hard_2404.csv
│       │   │   ├── biggen_240612.csv
│       │   │   ├── chatbot_arena_240829.csv
│       │   │   ├── helm_classic_240130.csv
│       │   │   ├── helm_classic_240829.csv
│       │   │   ├── helm_lite_240610.csv
│       │   │   ├── helm_lite_240829.csv
│       │   │   ├── hf_open_llm_v1_240829_frozen.csv
│       │   │   ├── hf_open_llm_v2_240829.csv
│       │   │   ├── livebench_240701.csv
│       │   │   ├── llm_trustworthy_241001_safety.csv
│       │   │   ├── mixeval_240601.csv
│       │   │   ├── mixeval_240829_holistic.csv
│       │   │   ├── mmlu_pro_240610.csv
│       │   │   ├── olmes_260624.csv
│       │   │   ├── olmes_260624_frozen.csv
│       │   │   ├── opencompass_240829.csv
│       │   │   ├── opencompass_academic_240829.csv
│       │   │   ├── wildbench_240612.csv
│       │   │   └── wildbench_240829.csv
│       │   ├── lower_is_better_benchmarks.txt
│       │   └── prettified_bencmark_names.json
│       ├── benchmark.py
│       ├── configs.py
│       ├── logic.py
│       ├── reporting.py
│       └── utils.py
└── tests
    ├── __init__.py
    └── test_benchmark.py

/.editorconfig:
--------------------------------------------------------------------------------
1 | # http://editorconfig.org
2 | 
3 | root = true
4 | 
5 | [*]
6 | indent_style = space
7 | indent_size = 4
8 | trim_trailing_whitespace = true
9 | insert_final_newline = true
10 | charset = utf-8
11 | end_of_line = lf
12 | 
13 | [*.bat]
14 | indent_style = tab
15 | end_of_line = crlf
16 | 
17 | [LICENCE]
18 | insert_final_newline = false
19 | 
20 | [Makefile]
21 | indent_style = tab
22 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | * bat version:
2 | * Python version:
3 | * Operating System:
4 | 
5 | ### Description
6 | 
7 | Describe what you were trying to get done.
8 | Tell us what happened, what went wrong, and what you expected to happen.
9 | 
10 | ### What I Did
11 | 
12 | ```
13 | Paste the command(s) you ran and the output.
14 | If there was a crash, please include the traceback here.
15 | ```
16 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | 
58 | # Flask stuff:
59 | instance/
60 | .webassets-cache
61 | 
62 | # Scrapy stuff:
63 | .scrapy
64 | 
65 | # Sphinx documentation
66 | docs/_build/
67 | 
68 | # PyBuilder
69 | target/
70 | 
71 | # Jupyter Notebook
72 | .ipynb_checkpoints
73 | 
74 | # Dask worker cache
75 | dask-worker-space/
76 | 
77 | # pyenv
78 | .python-version
79 | 
80 | # celery beat schedule file
81 | celerybeat-schedule
82 | 
83 | # SageMath parsed files
84 | *.sage.py
85 | 
86 | # dotenv
87 | .env
88 | 
89 | # virtualenv
90 | .venv
91 | venv/
92 | ENV/
93 | 
94 | # Spyder project settings
95 | .spyderproject
96 | .spyproject
97 | 
98 | # Rope project settings
99 | .ropeproject
100 | 
101 | # mkdocs documentation
102 | /site
103 | 
104 | # mypy
105 | .mypy_cache/
106 | 
107 | # IDE settings
108 | .vscode/
109 | .idea/
110 | figures/
111 | 
112 | .DS_Store
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 | # - repo: https://github.com/astral-sh/ruff-pre-commit
5 | #   # Ruff version.
6 | #   rev: v0.1.6
7 | #   hooks:
8 | #     # Run the linter.
9 | #     - id: ruff
10 | #       args: [ --fix ]
11 | #     # Run the formatter.
12 | #     - id: ruff-format
13 | 
14 |   - repo: https://github.com/ibm/detect-secrets
15 |     # If you desire to use a specific version of detect-secrets, you can replace `master` with other git revisions such as a branch, tag, or commit sha.
16 |     # You are encouraged to use static refs such as tags, instead of a branch name.
17 |     #
18 |     # Running "pre-commit autoupdate" automatically updates rev to the latest tag.
19 |     rev: 0.13.1+ibm.61.dss
20 |     hooks:
21 |       - id: detect-secrets # pragma: whitelist secret
22 |         # Add options for the detect-secrets-hook binary. You can run `detect-secrets-hook --help` to list all possible options.
23 |         # You may also run `pre-commit run detect-secrets` to preview the scan result.
24 |         # With "--baseline" and without "--use-all-plugins", pre-commit scans with just the plugins in the baseline file.
25 |         # With "--baseline" and "--use-all-plugins", pre-commit scans with all available plugins.
26 |         # Add "--fail-on-unaudited" to fail pre-commit for unaudited potential secrets.
27 |         args: [--baseline, .secrets.baseline, --use-all-plugins, --fail-on-unaudited]
28 | 
29 |   - repo: https://github.com/codespell-project/codespell
30 |     rev: v2.2.6
31 |     hooks:
32 |       - id: codespell
33 |         additional_dependencies:
34 |           - tomli
--------------------------------------------------------------------------------
/.secrets.baseline:
--------------------------------------------------------------------------------
1 | {
2 |   "exclude": {
3 |     "files": "^.secrets.baseline$",
4 |     "lines": null
5 |   },
6 |   "generated_at": "2023-10-05T11:42:58Z",
7 |   "plugins_used": [
8 |     {
9 |       "name": "AWSKeyDetector"
10 |     },
11 |     {
12 |       "name": "ArtifactoryDetector"
13 |     },
14 |     {
15 |       "name": "AzureStorageKeyDetector"
16 |     },
17 |     {
18 |       "base64_limit": 4.5,
19 |       "name": "Base64HighEntropyString"
20 |     },
21 |     {
22 |       "name": "BasicAuthDetector"
23 |     },
24 |     {
25 |       "name": "BoxDetector"
26 |     },
27 |     {
28 |       "name": "CloudantDetector"
29 |     },
30 |     {
31 |       "ghe_instance": "github.ibm.com",
32 |       "name": "GheDetector"
33 |     },
34 |     {
35 |       "name": "GitHubTokenDetector"
36 |     },
37 |     {
38 |       "hex_limit": 3,
39 |       "name": "HexHighEntropyString"
40 |     },
41 |     {
42 |       "name": "IbmCloudIamDetector"
43 |     },
44 |     {
45 |       "name": "IbmCosHmacDetector"
46 |     },
47 |     {
48 |       "name": "JwtTokenDetector"
49 |     },
50 |     {
51 |       "keyword_exclude": null,
52 |       "name": "KeywordDetector"
53 |     },
54 |     {
55 |       "name": "MailchimpDetector"
56 |     },
57 |     {
58 |       "name": "NpmDetector"
59 |     },
60 |     {
61 |       "name": "PrivateKeyDetector"
62 |     },
63 |     {
64 |       "name": "SlackDetector"
65 |     },
66 |     {
67 |       "name": "SoftlayerDetector"
68 |     },
69 |     {
70 |       "name": "SquareOAuthDetector"
71 |     },
72 |     {
73 |       "name": "StripeDetector"
74 |     },
75 |     {
76 |       "name": "TwilioKeyDetector"
77 |     }
78 |   ],
79 |   "results": {},
80 |   "version": "0.13.1+ibm.61.dss",
81 |   "word_list": {
82 |     "file": null,
83 |     "hash": null
84 |   }
85 | }
--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
1 | =======
2 | Credits
3 | =======
4 | 
5 | Development Lead
6 | ----------------
7 | 
8 | * Yotam Perlitz
9 | 
10 | Contributors
11 | ------------
12 | 
13 | None yet. Why not be the first?
14 | 
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.rst:
--------------------------------------------------------------------------------
1 | ====================================
2 | Contributor Covenant Code of Conduct
3 | ====================================
4 | 
5 | Our Pledge
6 | ----------
7 | 
8 | In the interest of fostering an open and welcoming environment, we as
9 | contributors and maintainers pledge to make participation in our project and
10 | our community a harassment-free experience for everyone, regardless of age, body
11 | size, disability, ethnicity, sex characteristics, gender identity and expression,
12 | level of experience, education, socio-economic status, nationality, personal
13 | appearance, race, religion, or sexual identity and orientation.
14 | 
15 | Our Standards
16 | -------------
17 | 
18 | Examples of behavior that contributes to creating a positive environment
19 | include:
20 | 
21 | * Using welcoming and inclusive language
22 | * Being respectful of differing viewpoints and experiences
23 | * Gracefully accepting constructive criticism
24 | * Focusing on what is best for the community
25 | * Showing empathy towards other community members
26 | 
27 | Examples of unacceptable behavior by participants include:
28 | 
29 | * The use of sexualized language or imagery and unwelcome sexual attention or
30 |   advances
31 | * Trolling, insulting/derogatory comments, and personal or political attacks
32 | * Public or private harassment
33 | * Publishing others' private information, such as a physical or electronic
34 |   address, without explicit permission
35 | * Other conduct which could reasonably be considered inappropriate in a
36 |   professional setting
37 | 
38 | Our Responsibilities
39 | --------------------
40 | 
41 | Project maintainers are responsible for clarifying the standards of acceptable
42 | behavior and are expected to take appropriate and fair corrective action in
43 | response to any instances of unacceptable behavior.
44 | 
45 | Project maintainers have the right and responsibility to remove, edit, or
46 | reject comments, commits, code, wiki edits, issues, and other contributions
47 | that are not aligned to this Code of Conduct, or to ban temporarily or
48 | permanently any contributor for other behaviors that they deem inappropriate,
49 | threatening, offensive, or harmful.
50 | 
51 | Scope
52 | -----
53 | 
54 | This Code of Conduct applies within all project spaces, and it also applies when
55 | an individual is representing the project or its community in public spaces.
56 | Examples of representing a project or community include using an official
57 | project e-mail address, posting via an official social media account, or acting
58 | as an appointed representative at an online or offline event. Representation of
59 | a project may be further defined and clarified by project maintainers.
60 | 
61 | Enforcement
62 | -----------
63 | 
64 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
65 | reported by contacting the project team at [INSERT EMAIL ADDRESS]. All
66 | complaints will be reviewed and investigated and will result in a response that
67 | is deemed necessary and appropriate to the circumstances. The project team is
68 | obligated to maintain confidentiality with regard to the reporter of an incident.
69 | Further details of specific enforcement policies may be posted separately.
70 | 
71 | Project maintainers who do not follow or enforce the Code of Conduct in good
72 | faith may face temporary or permanent repercussions as determined by other
73 | members of the project's leadership.
74 | 
75 | Attribution
76 | -----------
77 | 
78 | This Code of Conduct is adapted from the `Contributor Covenant`_, version 1.4,
79 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
80 | 
81 | For answers to common questions about this code of conduct, see
82 | https://www.contributor-covenant.org/faq
83 | 
84 | .. _`Contributor Covenant`: https://www.contributor-covenant.org
85 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
1 | .. highlight:: shell
2 | 
3 | ============
4 | Contributing
5 | ============
6 | 
7 | Contributions are welcome, and they are greatly appreciated! Every little bit
8 | helps, and credit will always be given.
9 | 
10 | You can contribute in many ways:
11 | 
12 | Types of Contributions
13 | ----------------------
14 | 
15 | Report Bugs
16 | ~~~~~~~~~~~
17 | 
18 | Report bugs at https://github.com/perlitz/bat/issues.
19 | 
20 | If you are reporting a bug, please include:
21 | 
22 | * Your operating system name and version.
23 | * Any details about your local setup that might be helpful in troubleshooting.
24 | * Detailed steps to reproduce the bug.
25 | 
26 | Fix Bugs
27 | ~~~~~~~~
28 | 
29 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help
30 | wanted" is open to whoever wants to implement it.
31 | 
32 | Implement Features
33 | ~~~~~~~~~~~~~~~~~~
34 | 
35 | Look through the GitHub issues for features. Anything tagged with "enhancement"
36 | and "help wanted" is open to whoever wants to implement it.
37 | 
38 | Write Documentation
39 | ~~~~~~~~~~~~~~~~~~~
40 | 
41 | bat could always use more documentation, whether as part of the
42 | official bat docs, in docstrings, or even on the web in blog posts,
43 | articles, and such.
44 | 
45 | Submit Feedback
46 | ~~~~~~~~~~~~~~~
47 | 
48 | The best way to send feedback is to file an issue at https://github.com/perlitz/bat/issues.
49 | 
50 | If you are proposing a feature:
51 | 
52 | * Explain in detail how it would work.
53 | * Keep the scope as narrow as possible, to make it easier to implement.
54 | * Remember that this is a volunteer-driven project, and that contributions
55 |   are welcome :)
56 | 
57 | Get Started!
58 | ------------
59 | 
60 | Ready to contribute? Here's how to set up `bat` for local development.
61 | 
62 | 1. Fork the `bat` repo on GitHub.
63 | 2. Clone your fork locally::
64 | 
65 |     $ git clone git@github.com:your_name_here/bat.git
66 | 
67 | 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development::
68 | 
69 |     $ mkvirtualenv bat
70 |     $ cd bat/
71 |     $ pip install -e .
72 | 
73 | 4. Create a branch for local development::
74 | 
75 |     $ git checkout -b name-of-your-bugfix-or-feature
76 | 
77 |    Now you can make your changes locally.
78 | 
79 | 5. When you're done making changes, check that your changes pass flake8 and the
80 |    tests, including testing other Python versions with tox::
81 | 
82 |     $ make lint
83 |     $ make test
84 |     Or
85 |     $ make test-all
86 | 
87 |    To get flake8 and tox, just pip install them into your virtualenv.
88 | 
89 | 6. Commit your changes and push your branch to GitHub::
90 | 
91 |     $ git add .
92 |     $ git commit -m "Your detailed description of your changes."
93 |     $ git push origin name-of-your-bugfix-or-feature
94 | 
95 | 7. Submit a pull request through the GitHub website.
96 | 
97 | Pull Request Guidelines
98 | -----------------------
99 | 
100 | Before you submit a pull request, check that it meets these guidelines:
101 | 
102 | 1. The pull request should include tests.
103 | 2. If the pull request adds functionality, the docs should be updated. Put
104 |    your new functionality into a function with a docstring, and add the
105 |    feature to the list in README.rst.
106 | 3. The pull request should work for Python 3.5, 3.6, 3.7 and 3.8, and for PyPy. Check
107 |    https://travis-ci.com/perlitz/bat/pull_requests
108 |    and make sure that the tests pass for all supported Python versions.
109 | 
110 | Tips
111 | ----
112 | 
113 | To run a subset of tests::
114 | 
115 | 
116 |     $ python -m unittest tests.test_benchmark
117 | 
118 | Deploying
119 | ---------
120 | 
121 | A reminder for the maintainers on how to deploy.
122 | Make sure all your changes are committed (including an entry in HISTORY.rst).
123 | Then run::
124 | 
125 |     $ bump2version patch # possible: major / minor / patch
126 |     $ git push
127 |     $ git push --tags
128 | 
129 | Travis will then deploy to PyPI if tests pass.
130 | 
131 | Code of Conduct
132 | ---------------
133 | 
134 | Please note that this project is released with a `Contributor Code of Conduct`_.
135 | By participating in this project you agree to abide by its terms.
136 | 
137 | .. _`Contributor Code of Conduct`: CODE_OF_CONDUCT.rst
138 | 
--------------------------------------------------------------------------------
/HISTORY.rst:
--------------------------------------------------------------------------------
1 | =======
2 | History
3 | =======
4 | 
5 | 0.1.0 (2024-07-01)
6 | ------------------
7 | 
8 | * First release on PyPI.
9 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include AUTHORS.rst
2 | include CONTRIBUTING.rst
3 | include HISTORY.rst
4 | include LICENCE
5 | include README.rst
6 | 
7 | recursive-include tests *
8 | recursive-exclude * __pycache__
9 | recursive-exclude * *.py[co]
10 | 
11 | recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
12 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: clean clean-build clean-pyc clean-test coverage dist docs help install lint lint/flake8
2 | 
3 | .DEFAULT_GOAL := help
4 | 
5 | define BROWSER_PYSCRIPT
6 | import os, webbrowser, sys
7 | 
8 | from urllib.request import pathname2url
9 | 
10 | webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
11 | endef
12 | export BROWSER_PYSCRIPT
13 | 
14 | define PRINT_HELP_PYSCRIPT
15 | import re, sys
16 | 
17 | for line in sys.stdin:
18 | 	match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line)
19 | 	if match:
20 | 		target, help = match.groups()
21 | 		print("%-20s %s" % (target, help))
22 | endef
23 | export PRINT_HELP_PYSCRIPT
24 | 
25 | BROWSER := python -c "$$BROWSER_PYSCRIPT"
26 | 
27 | help:
28 | 	@python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
29 | 
30 | clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts
31 | 
32 | clean-build: ## remove build artifacts
33 | 	rm -fr build/
34 | 	rm -fr dist/
35 | 	rm -fr .eggs/
36 | 	find . -name '*.egg-info' -exec rm -fr {} +
37 | 	find . -name '*.egg' -exec rm -f {} +
38 | 
39 | clean-pyc: ## remove Python file artifacts
40 | 	find . -name '*.pyc' -exec rm -f {} +
41 | 	find . -name '*.pyo' -exec rm -f {} +
42 | 	find . -name '*~' -exec rm -f {} +
43 | 	find . -name '__pycache__' -exec rm -fr {} +
44 | 
45 | clean-test: ## remove test and coverage artifacts
46 | 	rm -fr .tox/
47 | 	rm -f .coverage
48 | 	rm -fr htmlcov/
49 | 	rm -fr .pytest_cache
50 | 
51 | lint/flake8: ## check style with flake8
52 | 	flake8 src/bat tests
53 | 
54 | lint: lint/flake8 ## check style
55 | 
56 | test: ## run tests quickly with the default Python
57 | 	pytest
58 | 
59 | test-all: ## run tests on every Python version with tox
60 | 	tox
61 | 
62 | coverage: ## check code coverage quickly with the default Python
63 | 	coverage run --source bat -m pytest
64 | 	coverage report -m
65 | 	coverage html
66 | 	$(BROWSER) htmlcov/index.html
67 | 
68 | docs: ## generate Sphinx HTML documentation, including API docs
69 | 	rm -f docs/bat.rst
70 | 	rm -f docs/modules.rst
71 | 	sphinx-apidoc -o docs/ src/bat
72 | 	$(MAKE) -C docs clean
73 | 	$(MAKE) -C docs html
74 | 	$(BROWSER) docs/_build/html/index.html
75 | 
76 | servedocs: docs ## compile the docs watching for changes
77 | 	watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
78 | 
79 | release: dist ## package and upload a release
80 | 	twine upload dist/*
81 | 
82 | dist: clean ## builds source and wheel package
83 | 	python -m build
84 | 	ls -l dist
85 | 
86 | install: clean ## install the package to the active Python's site-packages
87 | 	pip install -e .
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | BenchBench Package
2 | =========================================
3 | 
4 | Overview
5 | --------
6 | The ``benchbench`` package simplifies benchmark agreement testing for NLP models. Compare multiple models across various benchmarks and generate comprehensive agreement reports easily.
7 | 
8 | It also powers `BenchBench` (https://huggingface.co/spaces/ibm/benchbench), a benchmark for comparing benchmarks.
9 | 
10 | Contributing a New Benchmark
11 | ----------------------------
12 | 
13 | To contribute a new benchmark, create a pull request with a new CSV file in ``src/bat/assets/benchmarks``. The filename should reflect the data source and snapshot date (see existing files for examples).
14 | 
15 | 
16 | Usage
17 | -----
18 | 
19 | While much of ``benchbench``'s functionality is available via the interactive `BenchBench` app (https://huggingface.co/spaces/ibm/benchbench), for more advanced usage and customization, clone the repository:
20 | 
21 | .. code-block:: bash
22 | 
23 |     git clone git@github.com:IBM/benchbench.git
24 | 
25 | Install in the environment of your choice:
26 | 
27 | .. code-block:: bash
28 | 
29 |     cd benchbench
30 | 
31 |     conda create -n bat python=3.11
32 |     conda activate bat
33 |     pip install -e .
34 | 
35 | And check out the example in ``examples/newbench_example.py`` (or here: https://github.com/IBM/benchbench/blob/main/examples/newbench_example.py).
36 | 
37 | Contributing
38 | ------------
39 | Contributions to the ``benchbench`` package are welcome! Please submit your pull requests or issues through our GitHub repository.
40 | 
41 | License
42 | -------
43 | 
44 | This package is released under the MIT License.
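45 | 
46 | Example
47 | -------
48 | 
49 | As a quick orientation, here is a minimal sketch distilled from ``examples/newbench_example.py``. It assumes a CSV with a ``model`` column plus one score column, as in ``examples/my_bench.csv``; treat it as an illustrative outline rather than a reference for exact signatures:
50 | 
51 | .. code-block:: python
52 | 
53 |     import pandas as pd
54 | 
55 |     from bat import Benchmark, Config, Reporter, Tester
56 | 
57 |     # Wrap your scores as a Benchmark and pool it with the built-in catalog.
58 |     my_bench = Benchmark(
59 |         pd.read_csv("examples/my_bench.csv"),
60 |         data_source="my_bench",  # hypothetical source name for this sketch
61 |         normalized_names=False,
62 |     )
63 |     allbench = Benchmark()
64 |     allbench.load_local_catalog()
65 |     allbench.extend(my_bench)
66 |     allbench.clear_repeated_scenarios()
67 | 
68 |     # Run all-vs-all agreement testing and plot the resulting correlations.
69 |     cfg = Config(
70 |         exp_to_run="example",
71 |         n_models_taken_list=[0],  # 0 means: use all intersecting models
72 |         model_select_strategy_list=["random"],
73 |         corr_types=["kendall"],
74 |         n_exps=1,
75 |     )
76 |     agreements = Tester(cfg=cfg).all_vs_all_agreement_testing(allbench)
77 |     Reporter().draw_agreement_matrix(agreements)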
--------------------------------------------------------------------------------
/data_acquisition/get_biggen.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import json
4 | import pandas as pd
5 | import re
6 | 
7 | def get_json_format_data(url):
8 |     response = requests.get(url)
9 |     soup = BeautifulSoup(response.content, "html.parser")
10 | 
11 |     script_elements = soup.find_all("script")
12 |     json_format_data = json.loads(str(script_elements[1])[31:-10])
13 |     return json_format_data
14 | 
15 | 
16 | def get_datas(data):
17 |     for component_index in range(
18 |         0, 50, 1
19 |     ):  # component_index sometimes changes when they update the space; this loop avoids having to change it manually
20 |         try:
21 |             result_list = []
22 |             i = 0
23 |             while True:
24 |                 try:
25 |                     results = data["components"][component_index]["props"]["value"][
26 |                         "data"
27 |                     ][i]
28 |                     columns = data["components"][component_index]["props"]["headers"]
29 |                     try:
30 |                         results_json = {"Model": results[0]}
31 | 
32 |                         if (
33 |                             len(columns) < 13
34 |                         ):  # If there are fewer than 13 columns (this threshold can definitely change), we are trying the wrong component index, so break to try the next one.
35 |                             break
36 | 
37 |                         for col_index, col_name in enumerate(columns[1:-1], start=1):
38 |                             results_json[col_name] = results[col_index]
39 | 
40 |                     except IndexError:  # Wrong component index, so break to try the next one. (NOTE: More than one component index can give some results, but we must find the right one to get all the results we want.)
41 |                         break
42 |                     result_list.append(results_json)
43 |                     i += 1
44 |                 except IndexError:  # No rows left to extract, so return the list. (We know it is the right component index because we didn't break out of the loop on the other exception.)
45 |                     return result_list
46 |         except (KeyError, TypeError):
47 |             continue
48 | 
49 |     return result_list
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     # for biggen
54 | 
55 |     data = get_json_format_data(
56 |         url="https://prometheus-eval-BiGGen-Bench-Leaderboard.hf.space/"
57 |     )
58 |     finished_models = get_datas(data)
59 |     df = pd.DataFrame(finished_models)
60 | 
61 |     # df["Model"]
62 | 
63 |     df["Model"] = df["Model"].apply(lambda x: x.split('">')[-1].split("</a>")[0])
64 | 
65 |     df.rename(
66 |         columns={
67 |             "Average": "biggen",
68 |             "Model": "model",
69 |         },
70 |         inplace=True,
71 |     )
72 | 
73 | 
74 |     # Function to clean column names
75 |     def clean_column(col):
76 |         col = re.sub(r"[^\w\s]", "", col)  # Remove emojis
77 |         col = (
78 |             col.strip().lower().replace(" ", "_")
79 |         )  # Lowercase and replace spaces with _
80 |         if col != "model" and col != "biggen":
81 |             col = "biggen_" + col
82 |         return col
83 | 
84 |     # Apply the cleaning function to the columns
85 |     cleaned_columns = [clean_column(col) for col in df.columns.tolist()]
86 |     df.columns = cleaned_columns
87 |     df.drop(columns=["biggen_model_type"], inplace=True)
88 | 
89 |     df.to_csv("src/bat/assets/benchmarks_to_add/biggen_240829.csv", index=False)
90 | 
--------------------------------------------------------------------------------
/data_acquisition/get_hf_open_llm.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import json
4 | import pandas as pd
5 | 
6 | 
7 | def get_json_format_data(url):
8 |     response = requests.get(url)
9 |     soup = BeautifulSoup(response.content, "html.parser")
10 | 
11 |     script_elements = soup.find_all("script")
12 |     json_format_data = json.loads(str(script_elements[1])[31:-10])
13 |     return json_format_data
14 | 
15 | 
16 | def get_datas(data):
17 |     for component_index in range(
18 |         0, 50, 1
19 |     ):  # component_index sometimes changes when they update the space; this loop avoids having to change it manually
20 |         try:
21 |             result_list = []
22 |             i = 0
23 |             while True:
24 |                 try:
25 |                     results = data["components"][component_index]["props"]["value"][
26 |                         "data"
27 |                     ][i]
28 |                     columns = data["components"][component_index]["props"]["headers"]
29 |                     try:
30 |                         results_json = {"T": results[0], "Model": results[-1]}
31 | 
32 |                         if (
33 |                             len(columns) < 13
34 |                         ):  # If there are fewer than 13 columns (this threshold can definitely change), we are trying the wrong component index, so break to try the next one.
35 |                             break
36 | 
37 |                         for col_index, col_name in enumerate(columns[2:-1], start=2):
38 |                             results_json[col_name] = results[col_index]
39 | 
40 |                     except IndexError:  # Wrong component index, so break to try the next one. (NOTE: More than one component index can give some results, but we must find the right one to get all the results we want.)
41 |                         break
42 |                     result_list.append(results_json)
43 |                     i += 1
44 |                 except IndexError:  # No rows left to extract, so return the list. (We know it is the right component index because we didn't break out of the loop on the other exception.)
45 |                     return result_list
46 |         except (KeyError, TypeError):
47 |             continue
48 | 
49 |     return result_list
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     # for V2
54 |     data = get_json_format_data(
55 |         url="https://open-llm-leaderboard-open-llm-leaderboard.hf.space/"
56 |     )
57 |     finished_models = get_datas(data)
58 |     df = pd.DataFrame(finished_models)
59 |     df = df.query("T=='🟢' or T=='💬'")
60 |     cols_to_use = [
61 |         "Model",
62 |         "Average ⬆️",
63 |         "IFEval",
64 |         "BBH",
65 |         "BBH Raw",
66 |         "MATH Lvl 5",
67 |         "GPQA",
68 |         "MUSR",
69 |         "MMLU-PRO",
70 |     ]
71 |     df = df[cols_to_use]
72 |     df.rename(
73 |         columns={
74 |             "Average ⬆️": "hf_open_llm_v2",
75 |             "Model": "model",
76 |             "MATH Lvl 5": "MATH_Lvl_5",
77 |         },
78 |         inplace=True,
79 |     )
80 | 
81 |     df.to_csv("src/bat/assets/benchmarks/hf_open_llm_v2_240829.csv", index=False)
82 | 
83 |     # for V1
84 | 
85 |     data = get_json_format_data(
86 |         url="https://open-llm-leaderboard-old-open-llm-leaderboard.hf.space/"
87 |     )
88 |     finished_models = get_datas(data)
89 |     df = pd.DataFrame(finished_models)
90 |     df = df.query("T=='🟢' or T=='💬'")
91 | 
92 |     cols_to_use = [
93 |         "Model",
94 |         "Average ⬆️",
95 |         "ARC",
96 |         "HellaSwag",
97 |         "MMLU",
98 |         "TruthfulQA",
99 |         "Winogrande",
100 |         "GSM8K",
101 |     ]
102 | 
103 |     df = df[cols_to_use]
104 |     df.rename(
105 |         columns={
106 |             "Average ⬆️": "hf_open_llm_v1",
107 |             "Model": "model",
108 |         },
109 |         inplace=True,
110 |     )
111 | 
112 |     df.to_csv("src/bat/assets/benchmarks/hf_open_llm_v1_240829_frozen.csv", index=False)
113 | 
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line.
5 | SPHINXOPTS    =
6 | SPHINXBUILD   = python -msphinx
7 | SPHINXPROJ    = bat
8 | SOURCEDIR     = .
9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 
--------------------------------------------------------------------------------
/docs/authors.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../AUTHORS.rst
2 | 
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # bat documentation build configuration file, created by
4 | # sphinx-quickstart on Fri Jun  9 13:47:02 2017.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 | 
15 | # If extensions (or modules to document with autodoc) are in another
16 | # directory, add these directories to sys.path here. If the directory is
17 | # relative to the documentation root, use os.path.abspath to make it
18 | # absolute, like shown here.
19 | #
20 | import os
21 | import sys
22 | 
23 | sys.path.insert(0, os.path.abspath("../src"))  # src layout: make bat importable
24 | import bat
25 | 
26 | 
27 | # -- General configuration ---------------------------------------------
28 | 
29 | # If your documentation needs a minimal Sphinx version, state it here.
30 | #
31 | # needs_sphinx = '1.0'
32 | 
33 | # Add any Sphinx extension module names here, as strings. They can be
34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
35 | extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode"]
36 | 
37 | # Add any paths that contain templates here, relative to this directory.
38 | templates_path = ["_templates"]
39 | 
40 | # The suffix(es) of source filenames.
41 | # You can specify multiple suffixes as a list of strings:
42 | #
43 | # source_suffix = ['.rst', '.md']
44 | source_suffix = ".rst"
45 | 
46 | # The master toctree document.
47 | master_doc = "index"
48 | 
49 | # General information about the project.
50 | project = "bat"
51 | copyright = "2024, Yotam Perlitz"
52 | author = "Yotam Perlitz"
53 | 
54 | # The version info for the project you're documenting, acts as replacement
55 | # for |version| and |release|, also used in various other places throughout
56 | # the built documents.
57 | #
58 | # The short X.Y version.
59 | version = bat.__version__
60 | # The full version, including alpha/beta/rc tags.
61 | release = bat.__version__
62 | 
63 | # The language for content autogenerated by Sphinx. Refer to documentation
64 | # for a list of supported languages.
65 | #
66 | # This is also used if you do content translation via gettext catalogs.
67 | # Usually you set "language" from the command line for these cases.
68 | language = None
69 | 
70 | # List of patterns, relative to source directory, that match files and
71 | # directories to ignore when looking for source files.
72 | # These patterns also affect html_static_path and html_extra_path.
73 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
74 | 
75 | # The name of the Pygments (syntax highlighting) style to use.
76 | pygments_style = "sphinx"
77 | 
78 | # If true, `todo` and `todoList` produce output, else they produce nothing.
79 | todo_include_todos = False
80 | 
81 | 
82 | # -- Options for HTML output -------------------------------------------
83 | 
84 | # The theme to use for HTML and HTML Help pages.  See the documentation for
85 | # a list of builtin themes.
86 | #
87 | html_theme = "alabaster"
88 | 
89 | # Theme options are theme-specific and customize the look and feel of a
90 | # theme further.  For a list of options available for each theme, see the
91 | # documentation.
92 | #
93 | # html_theme_options = {}
94 | 
95 | # Add any paths that contain custom static files (such as style sheets) here,
96 | # relative to this directory. They are copied after the builtin static files,
97 | # so a file named "default.css" will overwrite the builtin "default.css".
98 | html_static_path = ["_static"]
99 | 
100 | 
101 | # -- Options for HTMLHelp output ---------------------------------------
102 | 
103 | # Output file base name for HTML help builder.
104 | htmlhelp_basename = "batdoc"
105 | 
106 | 
107 | # -- Options for LaTeX output ------------------------------------------
108 | 
109 | latex_elements = {
110 |     # The paper size ('letterpaper' or 'a4paper').
111 |     #
112 |     # 'papersize': 'letterpaper',
113 |     # The font size ('10pt', '11pt' or '12pt').
114 |     #
115 |     # 'pointsize': '10pt',
116 |     # Additional stuff for the LaTeX preamble.
117 |     #
118 |     # 'preamble': '',
119 |     # Latex figure (float) alignment
120 |     #
121 |     # 'figure_align': 'htbp',
122 | }
123 | 
124 | # Grouping the document tree into LaTeX files. List of tuples
125 | # (source start file, target name, title, author, documentclass
126 | # [howto, manual, or own class]).
127 | latex_documents = [
128 |     (master_doc, "bat.tex", "bat Documentation", "Yotam Perlitz", "manual"),
129 | ]
130 | 
131 | 
132 | # -- Options for manual page output ------------------------------------
133 | 
134 | # One entry per manual page. List of tuples
135 | # (source start file, name, description, authors, manual section).
136 | man_pages = [(master_doc, "bat", "bat Documentation", [author], 1)]
137 | 
138 | 
139 | # -- Options for Texinfo output ----------------------------------------
140 | 
141 | # Grouping the document tree into Texinfo files. List of tuples
142 | # (source start file, target name, title, author,
143 | #  dir menu entry, description, category)
144 | texinfo_documents = [
145 |     (
146 |         master_doc,
147 |         "bat",
148 |         "bat Documentation",
149 |         author,
150 |         "bat",
151 |         "One line description of project.",
152 |         "Miscellaneous",
153 |     ),
154 | ]
155 | 
--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../CONTRIBUTING.rst
2 | 
--------------------------------------------------------------------------------
/docs/history.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../HISTORY.rst
2 | 
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | Welcome to bat's documentation!
2 | ======================================
3 | 
4 | .. toctree::
5 |    :maxdepth: 2
6 |    :caption: Contents:
7 | 
8 |    readme
9 |    installation
10 |    usage
11 |    modules
12 |    contributing
13 |    authors
14 |    history
15 | 
16 | Indices and tables
17 | ==================
18 | * :ref:`genindex`
19 | * :ref:`modindex`
20 | * :ref:`search`
21 | 
--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
1 | .. highlight:: shell
2 | 
3 | ============
4 | Installation
5 | ============
6 | 
7 | 
8 | Stable release
9 | --------------
10 | 
11 | To install bat, run this command in your terminal:
12 | 
13 | .. code-block:: console
14 | 
15 |     $ pip install bat
16 | 
17 | This is the preferred method to install bat, as it will always install the most recent stable release.
18 | 
19 | If you don't have `pip`_ installed, this `Python installation guide`_ can guide
20 | you through the process.
21 | 
22 | .. _pip: https://pip.pypa.io
23 | .. _Python installation guide: http://docs.python-guide.org/en/latest/starting/installation/
24 | 
25 | 
26 | From sources
27 | ------------
28 | 
29 | The sources for bat can be downloaded from the `Github repo`_.
30 | 
31 | You can either clone the public repository:
32 | 
33 | .. code-block:: console
34 | 
35 |     $ git clone https://github.com/perlitz/bat.git
36 | 
37 | Or download the `tarball`_:
38 | 
39 | .. code-block:: console
40 | 
41 |     $ curl -OJL https://github.com/perlitz/bat/tarball/master
42 | 
43 | Once you have a copy of the source, you can install it with:
44 | 
45 | .. code-block:: console
46 | 
47 |     $ pip install .
48 | 
49 | 
50 | .. _Github repo: https://github.com/perlitz/bat
51 | .. _tarball: https://github.com/perlitz/bat/tarball/master
52 | 
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | pushd %~dp0
4 | 
5 | REM Command file for Sphinx documentation
6 | 
7 | if "%SPHINXBUILD%" == "" (
8 | 	set SPHINXBUILD=python -msphinx
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=bat
13 | 
14 | if "%1" == "" goto help
15 | 
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | 	echo.
19 | 	echo.The Sphinx module was not found. Make sure you have Sphinx installed,
20 | 	echo.then set the SPHINXBUILD environment variable to point to the full
21 | 	echo.path of the 'sphinx-build' executable. Alternatively you may add the
22 | 	echo.Sphinx directory to PATH.
23 | 	echo.
24 | 	echo.If you don't have Sphinx installed, grab it from
25 | 	echo.http://sphinx-doc.org/
26 | 	exit /b 1
27 | )
28 | 
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 | 
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 | 
35 | :end
36 | popd
37 | 
--------------------------------------------------------------------------------
/docs/readme.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../README.rst
2 | 
--------------------------------------------------------------------------------
/docs/usage.rst:
--------------------------------------------------------------------------------
1 | =====
2 | Usage
3 | =====
4 | 
5 | To use bat in a project::
6 | 
7 |     import bat
8 | 
--------------------------------------------------------------------------------
/examples/my_bench.csv:
--------------------------------------------------------------------------------
1 | model,my_bench
2 | zephyr-7b-beta,17.32
3 | zephyr-7b-alpha,19.28
4 | yi-6b-chat,9.02
5 | vicuna-7b-v1.5-16k,14.22
6 | vicuna-7b-v1.5,12.31
7 | starling-lm-7b-beta,16.62
8 | smaug-qwen2-72b-instruct,39.66
9 | qwen2-72b-instruct,40.16
10 | qwen2-7b-instruct,26.63
11 | qwen2-1.5b-instruct,10.42
12 | qwen2-0.5b-instruct,7.30
13 | qwen1.5-110b-chat,29.07
14 | qwen1.5-72b-chat,28.89
15 | qwen1.5-7b-chat,17.02
16 | qwen1.5-4b-chat,11.59
17 | qwen1.5-1.8b-chat,6.32
18 | qwen1.5-0.5b-chat,5.43
19 | phi-3.5-moe-instruct,35.14
20 | phi-3.5-mini-instruct,27.81
21 | phi-3-small-128k-instruct,29.68
22 | phi-3-small-8k-instruct,29.09
23 | phi-3-mini-128k-instruct,24.76
24 | phi-3-mini-4k-instruct,24.41
25 | phi-3-medium-128k-instruct,29.88
26 | phi-3-medium-4k-instruct,30.96
27 | openhermes-2.5-mistral-7b,23.36
28 | open-mistral-nemo,29.02
29 | mixtral-8x22b-instruct-v0.1,35.29
30 | mixtral-8x7b-instruct-v0.1,22.79
31 | mistral-small-2402,33.03
32 | mistral-large-2407,48.35
33 | mistral-large-2402,38.92
34 | mistral-7b-instruct-v0.3,20.09
35 | mistral-7b-instruct-v0.2,19.51
36 | meta-llama-3.1-405b-instruct-turbo,55.18
37 | meta-llama-3.1-70b-instruct-turbo,48.90
38 | meta-llama-3.1-8b-instruct-turbo,28.11
39 | meta-llama-3-70b-instruct,37.60
40 | meta-llama-3-8b-instruct,27.46
41 | mathstral-7b-v0.1,24.33
42 | llama-2-7b-chat-hf,10.25
43 | hermes-3-llama-3.1-70b,39.56
44 | gpt-4o-mini-2024-07-18,44.57
45 | gpt-4o-2024-08-06,56.46
46 | gpt-4o-2024-05-13,54.96
47 | gpt-4-turbo-2024-04-09,53.00
48 | gpt-4-0613,44.94
49 | gpt-4-0125-preview,49.39
50 | gpt-3.5-turbo-0125,34.66
51 | gemma-2-27b-it,41.22
52 | gemma-2-9b-it,31.57
53 | gemma-1.1-7b-it,18.23
54 | gemini-1.5-pro-exp-0827,55.06
55 | gemini-1.5-pro-exp-0801,53.63
56 | gemini-1.5-pro-api-0514,44.41
57 | gemini-1.5-flash-exp-0827,47.51
58 | gemini-1.5-flash-api-0514,40.95
59 | dracarys-llama-3.1-70b-instruct,49.82
60 | dracarys-72b-instruct,41.72
61 | deepseek-v2-lite-chat,17.49
62 | deepseek-coder-v2-lite-instruct,29.21
63 | deepseek-coder-v2,46.84
64 | deepseek-chat-v2,46.36
65 | command-r-plus,32.86
66 | command-r,27.23
67 | claude-3-sonnet-20240229,38.08
68 | claude-3-opus-20240229,50.75
69 | claude-3-haiku-20240307,35.32
70 | claude-3-5-sonnet-20240620,61.16
71 | chatgpt-4o-latest,55.35
--------------------------------------------------------------------------------
/examples/newbench_example.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from bat import Config, Tester, Benchmark, Reporter
3 | from datetime import datetime
4 | 
5 | 
6 | def load_scenarios(filepath, comment_char="#"):
7 |     """Loads scenarios from a text file, ignoring commented lines."""
8 |     scenarios = []
9 |     try:
10 |         with open(filepath, "r") as f:
11 |             for line in f:
12 |                 line = line.strip()
13 |                 if line and not line.startswith(comment_char):
14 |                     scenarios.append(line)
15 |     except FileNotFoundError:
16 |         print(
17 |             f"Warning: Scenarios file not found: {filepath}. Proceeding without these scenarios."
18 |         )
19 |     return scenarios
20 | 
21 | 
22 | # Load the scenario lists to work with; edit the files to fit what you are interested in
23 | scenarios_for_aggregate = load_scenarios("examples/scenarios_for_aggregate.txt")
24 | scenarios_of_intereset = load_scenarios("examples/scenarios_of_intereset.txt")
25 | 
26 | # Configuration for agreement testing
27 | n_models_taken = 10  # Number of models to sample for each comparison. 0 means all intersecting models.
28 | model_select_strategy = (
29 |     "random"  # How to select models: "top", "bottom", "random", "somewhere"
30 | )
31 | corr_type = "kendall"  # Correlation types: "kendall", "pearson"
32 | n_exps = 3  # Number of experiments for random sampling. Set to 1 for deterministic strategies.
33 | 
34 | # --- Load your benchmark ---
35 | my_bench_df = pd.read_csv("examples/my_bench.csv")
36 | my_bench_source_name = f"uploaded_benchmark_{datetime.now().strftime('%y%m%d')}"
37 | my_bench = Benchmark(
38 |     my_bench_df, data_source=my_bench_source_name, normalized_names=False
39 | )
40 | 
41 | # --- Load the existing benchbench benchmark catalog ---
42 | allbench = Benchmark()
43 | allbench.load_local_catalog()
44 | 
45 | # --- Create an aggregate benchmark ---
46 | allbench.add_aggregate(
47 |     new_col_name="aggregate",
48 |     agg_source_name="aggregate",
49 |     scenario_whitelist=scenarios_for_aggregate,
50 |     min_scenario_for_models_to_appear_in_agg=max(1, len(scenarios_for_aggregate) // 3),
51 | )
52 | 
53 | # --- Combine your benchmark with the existing benchmarks ---
54 | allbench.extend(my_bench)
55 | 
56 | # --- Analyze model overlap for insights ---
57 | uploaded_models = my_bench.get_models()
58 | aggregate_models = allbench.df[allbench.df["source"] == "aggregate"]["model"].unique()
59 | n_overlap_models = len(set(aggregate_models).intersection(uploaded_models))
60 | print(f"Number of models overlapping: {n_overlap_models}")
61 | 
62 | # --- Remove duplicate scenarios before analysis ---
63 | allbench.clear_repeated_scenarios(
64 |     source_to_keep=my_bench_source_name
65 | )  # Prioritize keeping your benchmark's scenarios
66 | 
67 | 
68 | # --- Select specific scenarios for analysis ---
69 | my_scenario_name = allbench.df.query(f'source=="{my_bench_source_name}"')[
70 |     "scenario"
71 | ].iloc[0]
72 | scenarios_to_analyze = (
73 |     scenarios_of_intereset + ["aggregate"] + [my_scenario_name]
74 | )  # Include the scenario coming from your uploaded benchmark
75 | allbench.df = allbench.df[allbench.df["scenario"].isin(scenarios_to_analyze)]
76 | 
77 | # --- Configure and run the agreement tester ---
78 | cfg = Config(
79 |     exp_to_run="example",
80 |     n_models_taken_list=[
81 |         n_models_taken
82 |     ],  # Use lists for consistency with Config definition
83 |     model_select_strategy_list=[model_select_strategy],  # Use lists for consistency
84 |     corr_types=[corr_type],  # Use lists for consistency
85 |     n_exps=n_exps if n_models_taken != 0 else 1,
86 | )
87 | 
88 | tester = Tester(cfg=cfg)
89 | agreements = tester.all_vs_all_agreement_testing(
90 |     allbench
91 | )  # No need for single_source_scenario here, as we've already filtered
92 | 
93 | # --- Report the results ---
94 | reporter = Reporter()
95 | 
96 | reporter.draw_agreements_for_one_source(
97 |     agreements,
98 |     source_of_interest=my_bench_source_name,
99 | )
100 | reporter.draw_agreement_matrix(agreements)
101 | z_score_df = reporter.get_all_z_scores(agreements, aggragate_name="aggregate")
102 | print(z_score_df[["scenario", "z_score"]])
103 | 
--------------------------------------------------------------------------------
/examples/scenarios_for_aggregate.txt:
--------------------------------------------------------------------------------
1 | Helm Lite
2 | HF OpenLLM v2
3 | OpenCompass Academic
4 | LMSys Arena
5 | Helm Classic
6 | AlphacaEval v2lc
7 | LiveBench 240829
8 | WildBench Elo LC
--------------------------------------------------------------------------------
/examples/scenarios_of_intereset.txt:
--------------------------------------------------------------------------------
1 | # Scenarios of Interest
2 | Holmes
3 | # "eureka_information_retrieval_fact_recall",
4 | # "eureka_information_retrieval_fact_precision",
5 | # "eureka_instruction_following",
6 | # "eureka_long_context_qa_average",
7 | # "eureka_long_context_qa_longest_context_3k",
8 | # "eureka_toxicity_detection",
9 | Helm Lite
10 | # "Helm Lite NarrativeQA",
11 | # "Helm Lite NaturalQuestionsOpen",
12 | # "Helm Lite NaturalQuestionsClosed",
13 | # "Helm Lite OpenBookQA",
14 | # "Helm Lite MMLU",
15 | # "Helm Lite MathEquivalentCOT",
16 | # "Helm Lite GSM8K",
17 | # "Helm Lite LegalBench",
18 | # "Helm Lite MedQA",
19 | # "Helm Lite WMT2014",
20 | LMSys Arena
21 | HF OpenLLM v2
22 | HFv2 BBH
23 | HFv2 BBH Raw
24 | HFv2 GPQA
25 | HFv2 IFEval
26 | HFv2 MMLU Pro
27 | HFv2 Math Level 5
28 | HFv2 MuSR
29 | tablebench_overall_dp
30 | # "trustworthy_average",
31 | # "trustworthy_non_toxicity",
32 | # "trustworthy_non_stereotype",
33 | # "trustworthy_advglue_pp",
34 | # "trustworthy_ood",
35 | # "trustworthy_adv_demo",
36 | # "trustworthy_privacy",
37 | # "trustworthy_ethics",
38 | # "trustworthy_fairness",
39 | OpenCompass Academic
40 | # "OpenCompass MMLU",
41 | # "OpenCompass MMLU Pro",
42 | # "OpenCompass CMMLU",
43 | # "OpenCompass BBH",
44 | # "OpenCompass GQPA-Dimand",
45 | # "OpenCompass Math",
46 | OpenCompass HumanEval
47 | # "OpenCompass IFEval",
48 | Helm MMLU
49 | Helm Classic
50 | # "Helm BoolQ",
51 | # "Helm NarrativeQA",
52 | # "Helm NaturalQuestionsClosed",
53 | # "Helm NaturalQuestionsOpen",
54 | # "Helm QuAC",
55 | # "helm_hellaswag",
56 | # "Helm OpenBookQA",
57 | # "helm_truthfulqa",
58 | # "Helm MSMARCO Regular",
59 | # "Helm MSMARCO Trec",
60 | # "helm_cnn/dailymail",
61 | # "Helm XSUM",
62 | # "Helm IMDB",
63 | # "Helm CivilComments",
64 | # "Helm RAFT",
65 | MMLU Pro
66 | MixEval
67 | # "MixEval Hard",
68 | # "MixEval TriviaQA",
69 | # "MixEval MMLU",
70 | # "MixEval DROP",
71 | # "MixEval HellaSwag",
72 | # "MixEval CommonsenseQA",
73 | # "MixEval TriviaQA Hard",
74 | # "MixEval MMLU Hard",
75 | # "MixEval DROP Hard",
76 | toolbench
77 | AlphacaEval v2lc
78 | # "HELM AirBench Security Risks",
79 | # ... (Rest of the AirBench entries)
80 | HELM AirBench AIR Score
81 | OpenCompass
82 | # "OpenCompass Language",
83 | # ... (Rest of the OpenCompass entries)
84 | OpenCompass Arena
85 | # "LiveBench 240725"
86 | # "LiveBench Reasoning",
87 | # ... (Rest of LiveBench entries)
88 | Enkrypt AI Safety
89 | # "WildBench Elo LC",
90 | # ... (Rest of WildBench entries)
91 | WildBench Score
92 | Decentralized Arena (0-1 Normalized)
93 | Arena Hard
94 | AgentBench
95 | MT-Bench
96 | HF OpenLLM v1
97 | # "HFv1 ARC",
98 | # ... (Rest of HFv1 entries)
99 | BFCL
100 | eq_bench
101 | # "magi_hard",
102 | BIGGEN
103 | # "BIGGEN Grounding",
104 | # ... (Rest of BIGGEN entries)
105 | ruler
106 | # "LiveBench 240624",
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0"]
3 | build-backend = "setuptools.build_meta"
4 | 
5 | [project]
6 | name = "bat"
7 | version = "0.1.0"
8 | description = "Benchmark Agreement Testing"
9 | readme = "README.rst"
10 | authors = [
11 |   {name = "Yotam Perlitz", email = "perlitz@gmail.com"}
12 | ]
13 | maintainers = [
14 |   {name = "Yotam Perlitz", email = "perlitz@gmail.com"}
15 | ]
16 | classifiers = []
17 | license = {text = "MIT"}
18 | dependencies = [
19 |   "pandas",
20 |   "seaborn",
21 |   "matplotlib",
22 |   "numpy",
23 |   "scipy",
24 |   "tqdm",
25 | ]
26 | 
27 | [project.urls]
28 | bugs = "https://github.com/perlitz/bat/issues"
29 | changelog = "https://github.com/perlitz/bat/blob/master/changelog.md"
30 | homepage = "https://github.com/perlitz/bat"
31 | 
32 | [tool.setuptools]
33 | package-dir = {"" = "src"}
34 | 
35 | [tool.setuptools.package-data]
36 | "*" = ["*.*"]
37 | 
38 | # Mypy
39 | [tool.mypy]
40 | files = "."
41 | strict = true
42 | warn_unreachable = true
43 | warn_no_return = true
44 | 
45 | # [tool.mypy.overrides]
46 | # # Don't require test functions to include types
47 | # module = "tests.*"
48 | # allow_untyped_defs = true
49 | # disable_error_code = "attr-defined"
50 | 
51 | [tool.ruff]
52 | # Exclude commonly ignored directories.
53 | exclude = [
54 |   ".bzr",
55 |   ".direnv",
56 |   ".eggs",
57 |   ".git",
58 |   ".git-rewrite",
59 |   ".hg",
60 |   ".mypy_cache",
61 |   ".nox",
62 |   ".pants.d",
63 |   ".pytype",
64 |   ".ruff_cache",
65 |   ".svn",
66 |   ".tox",
67 |   ".venv",
68 |   "__pypackages__",
69 |   "_build",
70 |   "buck-out",
71 |   "build",
72 |   "dist",
73 |   "node_modules",
74 |   "venv",
75 | ]
76 | 
77 | line-length = 88
78 | indent-width = 4
--------------------------------------------------------------------------------
/src/bat/__init__.py:
--------------------------------------------------------------------------------
1 | """Top-level package for bat."""
2 | 
3 | __author__ = """Yotam Perlitz"""
4 | __email__ = "yotam.perlitz@ibm.com"
5 | __version__ = "0.1.0"
6 | 
7 | 
8 | from bat.agreement_tester import Tester
9 | from bat.benchmark import Benchmark
10 | from bat.configs import Config
11 | from bat.reporting import Reporter
12 | 
--------------------------------------------------------------------------------
/src/bat/agreement_tester.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import random
3 | import pandas as pd
4 | from tqdm import tqdm
5 | from bat.logic import get_pair_agreement
6 | 
7 | 
8 | class Tester:
9 |     def __init__(self, cfg):
10 |         self.cfg = cfg
11 | 
12 |     @staticmethod
13 |     def fetch_reference_models_names(
14 |         reference_benchmark,
15 |         n_models,
16 |     ):
17 |         return list(reference_benchmark.get_model_appearences_count().keys())[:n_models]
18 | 
19 |     def all_vs_all_agreement_testing(self, benchmark, single_source_scenario=None):
20 |         assert all(
21 |             benchmark.df.drop_duplicates(subset=["scenario", "source"])
22 |             .groupby("scenario")["source"]
23 |             .count()
24 |             == 1
25 |         ), "duplicated scenarios exist, consider running benchmark.clear_repeated_scenarios()"
26 | 
27 |         all_bench_res = benchmark.df
28 | 
29 |         # List of all scenarios
30 |         pair_agreements = []
31 | 
32 |         used_scenarios = all_bench_res["scenario"].unique().tolist()
33 | 
34 |         scenario_pairs = [
35 |             (a, b) for a, b in itertools.combinations(used_scenarios, 2) if a != b
36 |         ]
37 | 
38 |         if single_source_scenario:
39 |             assert (
40 |                 single_source_scenario in used_scenarios
41 |             ), f"single_source_scenario requested {single_source_scenario} does not appear as a scenario in the benchmark"
42 |             scenario_pairs = [
43 |                 (a, b) for a, b in scenario_pairs if single_source_scenario in [a, b]
44 |             ]  # make sure only pairs with single_source_scenario are in the calculations
45 | 
46 |         # Iterate over each pair of scenarios
47 |         for corr_type in self.cfg.corr_types:
48 |             for model_select_strategy in self.cfg.model_select_strategy_list:
49 |                 for model_subset_size_requested in self.cfg.n_models_taken_list:
50 |                     for scenario1, scenario2 in tqdm(scenario_pairs):
51 |                         cur_scen_res = all_bench_res.query(
52 |                             f'scenario == "{scenario1}" or scenario == "{scenario2}"'
53 |                         )
54 | 
55 |                         scenario_source = cur_scen_res.query(
56 |                             "scenario==@scenario1"
57 |                         ).iloc[0]["source"]
58 |                         ref_source = cur_scen_res.query("scenario==@scenario2").iloc[0][
59 |                             "source"
60 |                         ]
61 | 
62 |                         for exp_n in range(self.cfg.n_exps):
63 |                             # for date_threshold in date_thresholds:
64 |                             pair_agreements_cfg = {
65 |                                 "scenario": scenario1,
66 |                                 "scenario_source": scenario_source,
67 |                                 "ref_scenario": scenario2,
68 |                                 "ref_source": ref_source,
69 |                                 "corr_type": corr_type,
70 |                                 "model_select_strategy": model_select_strategy,
71 |                                 "model_subset_size_requested": model_subset_size_requested,
72 |                                 "exp_n": exp_n,
73 |                             }
74 | 
75 |                             # sorting according to one of the benchmarks
76 |                             res_to_sort_by = all_bench_res.query(
77 |                                 f"scenario=='{random.choice([scenario1, scenario2])}'"
78 |                             )
79 | 
80 |                             models_intersect = (
81 |                                 cur_scen_res["model"]
82 |                                 .value_counts()[
83 |                                     cur_scen_res["model"].value_counts() == 2
84 |                                 ]
85 |                                 .index.tolist()
86 |                             )
87 | 
88 |                             if len(models_intersect) < max(
89 |                                 model_subset_size_requested,
90 |                                 self.cfg.min_n_models_intersect,
91 |                             ):
92 |                                 continue
93 | 
94 |                             pair_agreement, p_value = get_pair_agreement(
95 |                                 cur_scen_res,
96 |                                 res_to_sort_by,
97 |                                 pair_agreements_cfg,
98 |                                 models_intersect,
99 |                             )
100 | 
101 |                             if pair_agreement is not None:
102 |                                 pair_agreement_reported = pair_agreements_cfg.copy()
103 |                                 pair_agreement_reported.update(
104 |                                     {
105 |                                         "correlation": pair_agreement,
106 |                                         "p_value": p_value,
107 |                                     }
108 |                                 )
109 |                                 pair_agreements.append(pair_agreement_reported)
110 | 
111 |         all_agreements = pd.DataFrame(pair_agreements)
112 | 
113 |         # add the reversed scenario pairs
114 |         all_agreements_reversed_scenarios = all_agreements.rename(
115 |             columns={
116 |                 "scenario": "ref_scenario",
117 |                 "ref_scenario": "scenario",
118 |                 "scenario_source": "ref_source",
119 |                 "ref_source": "scenario_source",
120 |             }
121 |         )
122 |         all_agreements = pd.concat(
123 |             [all_agreements, all_agreements_reversed_scenarios]
124 |         ).reset_index(drop=True)
125 | 
126 |         return all_agreements
127 | 
--------------------------------------------------------------------------------
/src/bat/assets/benchmarks/agenbench_240829_agent.csv:
--------------------------------------------------------------------------------
1 | model,agentbench
2 | gpt-4-0613,4.01
3 | claude-2,2.49
4 | claude-v1.3,2.44
5 | gpt-3.5-turbo-0613,2.32
6 | text-davinci-003,1.71
7 | claude-instant-v1.1,1.60
8 | chat-bison-001,1.39
9 | text-davinci-002,1.25
10 | llama-2-70b-chat,0.78
11 | guanaco-65b,0.54
12 | codellama-34b-instruct,0.96
13 | vicuna-33b-v1.3,0.73
14 | wizardlm-30b-v1.0,0.46
15 | guanaco-33b,0.39
16 | vicuna-13b-v1.5,0.93
17 | llama-2-13b-chat,0.77
18 | openchat-13b-v3.2,0.70
19 | wizardlm-13b-v1.2,0.66
20 | vicuna-7b-v1.5,0.56
21 | codellama-13b-instruct,0.56
22 | codellama-7b-instruct,0.50
23 | koala-13b,0.34
24 | llama-2-7b-chat,0.34
25 | codegeex2-6b,0.27
26 | dolly-12b-v2,0.14
27 | chatglm-6b-v1.1,0.11
28 | oasst-12b-sft-4,0.03
--------------------------------------------------------------------------------
/src/bat/assets/benchmarks/alphacaeval_v2lc_240829_holistic.csv:
--------------------------------------------------------------------------------
1 | model,alphacaeval_v2lc
2 | Shopee_SlimMoA_v1,77.5
3 | Blendax.AI-gm-l6-vo31,76.9
4 | gemma-2-9b-it-WPO-HB,76.7
5 | Blendax.AI-gm-l3-v35,73.4
6 | gemma-2-9b-it-SimPO,72.4
7 | OpenPipe_MoA_GPT-4_Turbo,68.4
8 | gemma-2-9b-it-DPO,67.7
9 | Together_MoA,65.4
10 | Llama3_PBM_Nova_70B,62.4
11 | Storm-7B_(best-of-64),61.6
12 | Together_MoA-Lite,59.1
13 | Aligner_2B+GPT-4_Turbo_(04/09),58.3
14 | GPT-4_Omni_(05/13),57.5
15 | Higgs-Llama-3-70B_V2,56.8
16 | GPT-4_Turbo_(04/09),55.0
17 | SPPO-Gemma-2-9B-It-PairRM,54.0
18 | Llama-3-Instruct-8B-WPO-HB-v2,53.4
19 | Claude_3.5_Sonnet_(06/20),52.4
20 | Yi-Large_Preview,51.9
21 | GPT-4o_Mini_(07/18),50.7
22 | Storm-7B,50.5
23 | GPT-4_Preview_(11/06),50.0
24 | Infinity-Instruct-7M-Gen-Llama3_1-70B,46.1
25 | ExPO_+_Llama-3-Instruct-8B-SimPO,45.8
26 | Llama-3-Instruct-8B-SimPO,44.7
27 | Nanbeige_Plus_Chat_v0.1,44.5
28 | Qwen1.5_110B_Chat,43.9
29 | Aligner_2B+Claude_3_Opus,41.8
30 | Nanbeige2_16B_Chat,40.6
31 | Claude_3_Opus_(02/29),40.5
32 | Infinity-Instruct-7M-Gen-mistral-7B,39.7
33 | Llama_3.1_405B_Instruct,39.3
34 | SPPO-Llama-3-Instruct-8B-PairRM,38.6
35 | GPT-4,38.1
36 | Qwen2_72B_Instruct,38.1
37 | Llama_3.1_70B_Instruct,38.1
38 | Infinity-Instruct-3M-0625-Llama3-70B,38.0
39 | Aligner_2B+Qwen1.5_72B_Chat,36.7
40 | Qwen1.5_72B_Chat,36.6
41 | GPT-4_(03/14),35.3
42 | Ein_70B_v0.1,35.0
43 | Claude_3_Sonnet_(02/29),34.9
44 | FsfairX-Zephyr-Chat-v0.1,34.8
45 | Llama_3_70B_Instruct,34.4
46 | Infinity-Instruct-7M-Gen-Llama3_1-8B,33.9
47 | Mistral_Large_(24/02),32.7
48 | ExPO_+_SPPO-Mistral7B-PairRM,31.8
49 | merlinite-7B-AOT,31.7
50 | Infinity-Instruct-3M-0613-Llama3-70B,31.5
51 | Samba_CoE_v0.2_(best-of-16),31.5
52 | Infinity-Instruct-3M-0625-Mistral-7B,31.4
53 | REBEL-Llama-3-8B-Instruct,31.4
54 | Mixtral_8x22B_v0.1,30.9
55 | SPPO-Mistral7B-PairRM,30.5
56 | GPT-4_(06/13),30.2
57 | Snorkel_(Mistral-PairRM-DPO+best-of-16),30.0
58 | Contextual_AI_(KTO-Mistral-PairRM),29.7
59 | PairRM_0.4B+Yi-34B-Chat_(best-of-16),28.8
60 | Mistral_Medium,28.6
61 | Claude_2,28.2
62 | Samba_CoE_v0.2,27.6
63 | Infinity-Instruct-3M-0625-Llama3-8B,27.5
64 | Claude,27.3
65 | ExPO_+_InternLM2_Chat_20B,27.2
66 | Yi_34B_Chat,27.2
67 | ExPO_+_Starling_LM_7B_beta,26.4
68 | Snorkel_(Mistral-PairRM-DPO),26.4
69 | ExPO_+_Tulu-2-DPO-70B,25.7
70 | Claude_Instant_1.2,25.6
71 | Infinity-Instruct-3M-0613-Mistral-7B,25.5
72 | DBRX_Instruct,25.4
73 | Claude_2.1,25.3
74 | Nanbeige2_8B_Chat,25.2
75 | XwinLM_70b_V0.1,24.6
76 | Gemini_Pro,24.4
77 | Qwen1.5_14B_Chat,23.9
78 | Mixtral_8x7B_v0.1,23.7
79 | Evo_v2_7B,23.4
80 | Ghost_8B_Beta_(d0x5),23.1
81 | Llama_3_8B_Instruct,22.9
82 | Samba_CoE_v0.1,22.9
83 | GPT_3.5_Turbo_(06/13),22.7
84 | ExPO_+_InternLM2_Chat_7B,22.7
85 | GPT_3.5_Turbo_(06/13),22.4
86 | Infinity-Instruct-3M-0625-Qwen2-7B,21.9
87 | PairRM_0.4B+Tulu_2+DPO_70B_(best-of-16),21.4
88 | Tulu_2+DPO_70B,21.2
89 | Llama_3.1_8B_Instruct,20.9
90 | Mistral_7B_v0.3,20.6
91 | Mistral-7B-ReMax-v0.1,20.6
92 | Infinity-Instruct-3M-0625-Yi-1.5-9B,20.5
93 | ExPO_+_Starling_LM_7B_alpha,19.5
94 | GPT_3.5_Turbo_(11/06),19.3
95 | LMCocktail-10.7B-v1,19.0
96 | InternLM2_Chat_20B,18.7
97 | GPT_3.5_Turbo_(03/01),18.1
98 | XwinLM_13b_V0.1,17.9
99 | DeepSeek_LLM_67B_Chat,17.8
100 | GPT-3.5,17.7
101 | ExPO_+_Tulu-2-DPO-13B,17.6
102 | WizardLM_70B,17.6
103 | Vicuna_33B_v1.3,17.6
104 | PairRM_0.4B+Tulu_2+DPO_13B_(best-of-16),17.4
105 | Conifer-7B-DPO,17.1
106 | Mistral_7B_v0.2,17.1
107 | Evo_7B,16.5
108 | Humpback_LLaMa2_70B,16.2
109 | OpenHermes-2.5-Mistral_(7B),16.2
110 | DEITA_7B_v1.0,16.1
111 | JinaChat,15.9
112 | TempNet-LLaMA2-Chat-70B-v0.1,15.8
113 | CausalLM-14B,15.7
114 | PairRM_0.4B+Zephyr_7B_Beta_(best-of-16),15.5
115 | Qwen1.5_7B_Chat,14.7
116 | Mistral-ORPO-Beta,14.7
117 | Starling_LM_7B_alpha,14.7
118 | LLaMA2_Chat_70B,14.7
119 | OpenChat_V3.1_13B,14.5
120 | WizardLM_13B_V1.2,14.5
121 | UltraLM_13B_V2.0_(best-of-16),14.2
122 | ExPO_+_Zephyr_7B_Beta,14.0
123 | WizardLM_13B_V1.1,13.9
124 | ExPO_+_Zephyr_7B_Alpha,13.6
125 | Zephyr_7B_Beta,13.2
126 | Dolphin_2.2.1_Mistral_7B,13.1
127 | Humpback_LLaMa_65B,12.8
128 | OpenBudddy-LLaMA2-70B-v10.1,12.6
129 | OpenBuddy-LLaMA-65B-v8,12.5
130 | Qwen_14B_Chat,12.4
131 | GPT-4_(Adversarial),12.2
132 | CUT_13B,12.2
133 | OpenChat_V2-W_13B,12.0
134 | Vicuna_13B_v1.5_(together),11.7
135 | ExPO_+_Tulu-2-DPO-7B,11.7
136 | Tulu_2+DPO_13B,11.6
137 | Claude2_Alpaca_13B,11.5
138 | Minotaur_13B,11.5
139 | airoboros_65B,11.0
140 | Cohere_Command,10.9
141 | Vicuna_13B_v1.3,10.8
142 | XwinLM_7b_V0.1,10.8
143 | airoboros_33B,10.7
144 | PlatoLM_7B,10.5
145 | Vicuna_13B_v1.5,10.5
146 | Gemma_Instruct_(7B),10.4
147 | OpenChat_V2_13B,10.4
148 | Zephyr_7B_Alpha,10.3
149 | OpenBuddy-LLaMA-30B-v7.1,10.2
150 | UltraLM_13B_(best-of-16),9.9
151 | LLaMA_33B_OASST_SFT,9.9
152 | WizardLM_13B,9.8
153 | Nous_Hermes_13B,9.7
154 | Vicuna_13B,9.2
155 | Tulu_2+DPO_7B,9.2
156 | OpenBudddy-LLaMA2-13B-v11.1,9.2
157 | UltraLM_13B_V2.0,9.1
158 | Davinci001,9.0
159 | OpenBuddy-Falcon-40B-v9,9.0
160 | OpenChat-13B,8.8
161 | TempNet-LLaMA2-Chat-13B-v0.1,8.6
162 | LLaMA2_Chat_13B,8.4
163 | Guanaco_65B,8.3
164 | OpenCoderPlus-15B,8.2
165 | LLaMA_33B_OASST_RLHF,8.0
166 | OpenChat8192-13B,7.9
167 | Phi-2_DPO,7.8
168 | MiniChat_1.5_3B,7.7
169 | Vicuna_7B_v1.5,7.6
170 | LLaMA2_Chat_7B_Evol70k-NEFT,7.5
171 | Recycled_WizardLM_7B_V2.0,7.5
172 | Vicuna_7B_v1.3,7.2
173 | Alpaca_Farm_PPO_Sim_(GPT-4)_7B,7.1
174 | UltraLM_13B,7.1
175 | Baize-v2_13B,7.0
176 | Recycled_WizardLM_7B_V1.0,6.9
177 | Ghost_7B_Alpha,6.9
178 | Alpaca_Farm_PPO_Human_7B,6.4
179 | Vicuna_7B,6.3
180 | Alpaca_7B,5.9
181 | Phi-2_SFT,5.9
182 | TempNet-LLaMA2-Chat-7B-v0.1,5.7
183 | MiniChat_3B,5.7
184 | Guanaco_33B,5.7
185 | Falcon_40B_Instruct,5.6
186 | Gemma_Instruct_(2B),5.4
187 | LLaMA2_Chat_7B,5.4
188 | OpenBuddy-Falcon-7b-v6,4.8
189 | Phi_2,4.4
190 | Baize-v2_7B,4.4
191 | ChatGLM2-6B,4.4
192 | Pythia_12B_SFT,4.2
193 | Falcon_7B_Instruct,4.0
194 | Pythia_12B_OASST_SFT,3.3
195 | Guanaco_13B,3.0
196 | Guanaco_7B,2.9
197 | Qwen1.5_1.8B_Chat,2.6
198 | Baichuan-13B-Chat,2.1
--------------------------------------------------------------------------------
/src/bat/assets/benchmarks/arena_hard_240829_holistic.csv:
--------------------------------------------------------------------------------
1 | model,arena_hard
2 | claude-3-5-sonnet-20240620,79.3
3 | gpt-4o-2024-05-13,79.2
4 | gpt-4-0125-preview,78.0
5 | gpt-4o-2024-08-06,77.9
6 | athene-70b,77.6
7 | gpt-4o-mini,74.9
8 | gemini-1.5-pro-api-preview,72.0
9 | mistral-large-2407,70.4
10 | llama-3.1-405b-instruct,64.1
11 | glm-4-0520,63.8
12 | yi-large,63.7
13 | 
deepseek-coder-v2,62.3 14 | claude-3-opus-20240229,60.4 15 | gemma-2-27b-it,57.5 16 | llama-3.1-70b-instruct,55.7 17 | glm-4-0116,55.7 18 | glm-4-air,50.9 19 | gpt-4-0314,50.0 20 | gemini-1.5-flash-api-preview,49.6 21 | qwen2-72b-instruct,46.9 22 | claude-3-sonnet-20240229,46.8 23 | llama-3-70b-instruct,46.6 24 | claude-3-haiku-20240307,41.5 25 | gpt-4-0613,37.9 26 | mistral-large-2402,37.7 27 | mixtral-8x22b-instruct-v0.1,36.4 28 | Qwen1.5-72B-Chat,36.1 29 | phi-3-medium-4k-instruct,33.4 30 | command-r-plus,33.1 31 | mistral-medium,31.9 32 | internlm2.5-20b-chat,31.2 33 | phi-3-small-8k-instruct,29.8 34 | mistral-next,27.4 35 | gpt-3.5-turbo-0613,24.8 36 | dbrx-instruct-preview,24.6 37 | internlm2-20b-chat,24.4 38 | claude-2.0,24.0 39 | Mixtral-8x7B-Instruct-v0.1,23.4 40 | gpt-3.5-turbo-0125,23.3 41 | Yi-34B-Chat,23.1 42 | Starling-LM-7B-beta,23.0 43 | claude-2.1,22.8 44 | llama-3.1-8b-instruct,21.3 45 | Snorkel-Mistral-PairRM-DPO,20.7 46 | llama-3-8b-instruct,20.6 47 | gpt-3.5-turbo-1106,18.9 48 | gpt-3.5-turbo-0301,18.1 49 | gemini-1.0-pro,17.8 50 | snowflake-arctic-instruct,17.6 51 | command-r,17.0 52 | phi-3-mini-128k-instruct,15.4 53 | tulu-2-dpo-70b,15.0 54 | Starling-LM-7B-alpha,12.8 55 | mistral-7b-instruct,12.6 56 | gemma-1.1-7b-it,12.1 57 | Llama-2-70b-chat-hf,11.6 58 | vicuna-33b-v1.3,8.6 59 | gemma-7b-it,7.5 60 | Llama-2-7b-chat-hf,4.6 61 | gemma-1.1-2b-it,3.4 62 | gemma-2b-it,3.0 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/bfcl_240906_tools.csv: -------------------------------------------------------------------------------- 1 | model,BFCL 2 | GPT-4-0125-Preview,85.79 3 | GPT-4-1106-Preview,85 4 | GPT-4-0613,84.74 5 | GPT-4-turbo-2024-04-09,83.89 6 | GPT-4o-mini-2024-07-18,83.35 7 | GPT-4o-2024-05-13,83.13 8 | Functionary-Medium-v3.1,82.55 9 | GPT-4-1106-Preview,81.78 10 | Meta-Llama-3-70B-Instruct,81.59 11 | Claude-3-Opus-20240229,80.88 12 | Nemotron-4-340b-instruct,80.23 13 | Functionary-Small-v3.1,80.21 14 | mistral-large-2407,79.66 15 | GPT-4o-2024-05-13,79.55 16 | xLAM-7b-fc-r,79.41 17 | GPT-4o-mini-2024-07-18,79.25 18 | Open-Mixtral-8x22b,79.14 19 | Gorilla-OpenFunctions-v2,79.1 20 | GPT-4-turbo-2024-04-09,79.09 21 | Functionary-Small-v3.2,78.96 22 | GPT-4o-2024-08-06,78.87 23 | mistral-large-2407,78.78 24 | Claude-3-Sonnet-20240229,77.92 25 | FireFunction-v2,77.45 26 | Granite-20b-FunctionCalling,76.63 27 | Open-Mistral-Nemo-2407,76.31 28 | Claude-3.5-Sonnet-20240620,76.29 29 | GPT-3.5-Turbo-0125,75.41 30 | Open-Mistral-Nemo-2407,74.97 31 | xLAM-1b-fc-r,74.9 32 | Hermes-2-Pro-Llama-3-70B,74.78 33 | Gemini-1.5-Pro-Preview-0514,74.75 34 | Claude-2.1,74.57 35 | Gemini-1.5-Pro-Preview-0409,74.56 36 | GPT-4o-2024-08-06,74.12 37 | Command-R-Plus (Original),74.11 38 | Open-Mistral-Nemo-2407,73.12 39 | Mistral-Medium-2312,72.19 40 | Gemini-1.5-Flash-Preview-0514,70.75 41 | DBRX-Instruct,69.55 42 | Claude-3.5-Sonnet-20240620,68.88 43 | GPT-3.5-Turbo-0125,66.19 44 | Hermes-2-Pro-Llama-3-8B,66.18 45 | Hermes-2-Pro-Mistral-7B,65.44 46 | Hermes-2-Theta-Llama-3-8B,64.83 47 | Meta-Llama-3-8B-Instruct,62.7 48 | Claude-3-Opus-20240229,61.89 49 | Open-Mixtral-8x7b,60.82 50 | Claude-3-Haiku-20240307,60.34 51 | Open-Mixtral-8x22b,58.89 52 | Open-Mixtral-8x22b,58.37 53 | Gemini-1.0-Pro-001,57.81 54 | Mistral-small-2402,55.36 55 | FireFunction-v1,48.11 56 | Claude-3-Sonnet-20240229,47.97 57 | Claude-instant-1.2,47.95 58 | Claude-3-Haiku-20240307,47.03 59 | GPT-4-0613,45.61 60 | Snowflake/snowflake-arctic-instruct,42.46 61 | 
mistral-large-2407,27.87 62 | Mistral-tiny-2312,21.17 63 | Deepseek-v1.5,11.18 64 | Gemma-7b-it,10.3 65 | Hermes-2-Theta-Llama-3-70B,10 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/biggen_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,biggen 2 | gpt-4-1106-preview,4.22 3 | gpt-4-0125-preview,4.19 4 | gpt-4o-2024-05-13,4.141 5 | gpt-4-turbo-2024-04-09,4.132 6 | claude-3-opus-20240229,4.103 7 | meta-llama/Meta-Llama-3-70B-Instruct,4.012 8 | claude-3-sonnet-20240229,4.011 9 | qwen/qwen-110b-chat,3.979 10 | claude-3-haiku-20240307,3.954 11 | gemini-pro-1.5,3.953 12 | MaziyarPanahi/Mixtral-8x22B-Instruct-v0.1-AWQ,3.936 13 | mistral-medium,3.935 14 | mistral-large,3.927 15 | google/gemini-flash-1.5,3.899 16 | alpindale/c4ai-command-r-plus-GPTQ,3.839 17 | Qwen/Qwen1.5-72B-Chat,3.832 18 | microsoft/Phi-3-mini-4k-instruct,3.821 19 | Qwen/Qwen1.5-32B-Chat,3.813 20 | Starling-LM-7B-beta,3.756 21 | meta-llama/Meta-Llama-3-8B-Instruct,3.753 22 | NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO,3.737 23 | 01-ai/Yi-34B-Chat,3.701 24 | mistralai/Mixtral-8x7B-Instruct-v0.1,3.695 25 | gpt-3.5-turbo-0125,3.689 26 | allenai/tulu-2-dpo-70b,3.683 27 | microsoft/Phi-3-mini-128k-instruct,3.679 28 | gpt-3.5-turbo-1106,3.678 29 | CohereForAI/c4ai-command-r-v01,3.677 30 | upstage/SOLAR-10.7B-Instruct-v1.0,3.672 31 | meta-llama/Llama-2-70b-chat-hf,3.668 32 | gemini-1.0-pro,3.64 33 | mistralai/Mistral-7B-Instruct-v0.2,3.619 34 | mistral-community/Mixtral-8x22B-v0.1-AWQ,3.606 35 | NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT,3.596 36 | openchat/openchat-3.5-0106,3.581 37 | MaziyarPanahi/zephyr-orpo-141b-A35b-v0.1-AWQ,3.573 38 | Qwen/Qwen1.5-14B-Chat,3.573 39 | Qwen/Qwen1.5-7B-Chat,3.556 40 | Starling-LM-7B-alpha,3.537 41 | HuggingFaceH4/zephyr-7b-beta,3.522 42 | NousResearch/Nous-Hermes-2-Mistral-7B-DPO,3.493 43 | NousResearch/Nous-Hermes-2-Yi-34B,3.476 44 | kaist-ai/mistral-orpo-beta,3.473 45 | meta-llama/Llama-2-13b-chat-hf,3.467 46 | teknium/OpenHermes-2.5-Mistral-7B,3.462 47 | mistralai/Mixtral-8x7B-v0.1,3.445 48 | kaist-ai/mistral-orpo-alpha,3.441 49 | allenai/tulu-2-dpo-13b,3.423 50 | Qwen/Qwen1.5-72B,3.422 51 | allenai/codetulu-2-34b,3.421 52 | google/gemma-1.1-7b-it,3.407 53 | teknium/OpenHermes-2-Mistral-7B,3.394 54 | codellama/CodeLlama-34b-Instruct-hf,3.363 55 | 01-ai/Yi-34B,3.322 56 | meta-llama/Llama-2-70b-hf,3.317 57 | Qwen/Qwen1.5-32B,3.312 58 | meta-llama/Llama-2-7b-chat-hf,3.307 59 | allenai/tulu-2-dpo-7b,3.28 60 | allenai/codetulu-2-13b,3.254 61 | upstage/SOLAR-10.7B-v1.0,3.248 62 | allenai/tulu-2-13b,3.211 63 | codellama/CodeLlama-13b-Instruct-hf,3.206 64 | 01-ai/Yi-6B-Chat,3.204 65 | codellama/CodeLlama-7b-Instruct-hf,3.14 66 | google/gemma-7b-it,3.132 67 | meta-llama/Meta-Llama-3-70B,3.122 68 | Qwen/Qwen1.5-14B,3.106 69 | google/gemma-1.1-2b-it,3.072 70 | allenai/codetulu-2-7b,3.07 71 | allenai/tulu-2-7b,3.041 72 | mistral-community/Mistral-7B-v0.2,3.024 73 | mistralai/Mistral-7B-v0.1,3.006 74 | Qwen/Qwen1.5-4B-Chat,2.976 75 | allenai/OLMo-7B-Instruct,2.974 76 | google/gemma-2b-it,2.932 77 | Qwen/Qwen1.5-7B,2.872 78 | microsoft/phi-2,2.859 79 | allenai/OLMo-7B-SFT,2.827 80 | codellama/CodeLlama-70b-Instruct-hf,2.805 81 | EleutherAI/llemma_34b,2.771 82 | meta-llama/Meta-Llama-3-8B,2.743 83 | Qwen/Qwen1.5-1.8B-Chat,2.741 84 | Qwen/Qwen1.5-4B,2.708 85 | meta-llama/Llama-2-13b-hf,2.703 86 | 01-ai/Yi-6B,2.635 87 | codellama/CodeLlama-70b-hf,2.593 88 | codellama/CodeLlama-34b-hf,2.509 
89 | microsoft/phi-1_5,2.497 90 | microsoft/Orca-2-13b,2.489 91 | meta-llama/Llama-2-7b-hf,2.457 92 | Qwen/Qwen1.5-1.8B,2.364 93 | EleutherAI/llemma_7b,2.27 94 | google/gemma-2b,2.262 95 | codellama/CodeLlama-13b-hf,2.134 96 | Qwen/Qwen1.5-0.5B-Chat,2.108 97 | microsoft/Orca-2-7b,2.083 98 | allenai/OLMo-7B,2.081 99 | codellama/CodeLlama-7b-hf,1.954 100 | Qwen/Qwen1.5-0.5B,1.834 101 | allenai/OLMo-1B,1.648 102 | CohereForAI/aya-101,1.447 103 | google/gemma-7b,1.411 104 | microsoft/phi-1,1.135 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/chatbot_arena_241104_holistic.csv: -------------------------------------------------------------------------------- 1 | model,arena_elo 2 | chatglm-6b,855.910565721209 3 | koala-13b,901.4444159097708 4 | oasst-pythia-12b,812.3918514404036 5 | alpaca-13b,851.3113435573603 6 | vicuna-13b,874.2126379649785 7 | dolly-v2-12b,781.4370567093974 8 | stablelm-tuned-alpha-7b,829.7609284591157 9 | llama-13b,800.0 10 | fastchat-t5-3b,794.3748535699036 11 | gpt-3.5-turbo-0314,1051.024508411953 12 | gpt-4-0314,980.6906633214737 13 | RWKV-4-Raven-14B,874.536173297737 14 | claude-1,1039.7803750141782 15 | mpt-7b-chat,869.0762171208861 16 | palm-2,922.5218005276811 17 | claude-instant-1,991.8056867962612 18 | vicuna-7b,910.6856107758757 19 | wizardlm-13b,971.8432912657483 20 | gpt4all-13b-snoozy,885.7452637089059 21 | guanaco-33b,974.3076720194276 22 | vicuna-33b,906.4317166108785 23 | mpt-30b-chat,971.1057122702123 24 | gpt-3.5-turbo-0613,999.7201069046866 25 | gpt-4-0613,960.3770824361335 26 | llama-2-7b-chat,895.4706517283653 27 | claude-2.0,1016.5801503367938 28 | llama-2-13b-chat,963.7146661400922 29 | chatglm2-6b,835.3074735731766 30 | llama-2-70b-chat,1007.6844327159829 31 | codellama-34b-instruct,934.0457254208728 32 | wizardlm-70b,979.5605650746356 33 | falcon-180b-chat,923.054729229491 34 | mistral-7b-instruct,895.9405753947756 35 | qwen-14b-chat,921.4887868532272 36 | zephyr-7b-alpha,946.9339607858802 37 | zephyr-7b-beta,913.246312461937 38 | openchat-3.5,948.9893819327425 39 | gpt-4-1106-preview,1001.256303019811 40 | gpt-3.5-turbo-1106,937.6322384103785 41 | chatglm3-6b,814.5480014217649 42 | claude-2.1,979.8637705131841 43 | tulu-2-dpo-70b,961.7298633389956 44 | yi-34b-chat,932.0283635154187 45 | starling-lm-7b-alpha,945.1430459412007 46 | openhermes-2.5-mistral-7b,935.5573447997912 47 | pplx-70b-online,931.0576338876376 48 | pplx-7b-online,948.7421850358356 49 | dolphin-2.2.1-mistral-7b,977.0069489193058 50 | mixtral-8x7b-instruct-v0.1,867.9036424292025 51 | gemini-pro,1006.251403062337 52 | solar-10.7b-instruct-v1.0,958.6549095565917 53 | mistral-medium,965.0537859905727 54 | llama2-70b-steerlm-chat,965.6376159085758 55 | gemini-pro-dev-api,1019.3566145491036 56 | stripedhyena-nous-7b,919.5708420570646 57 | bard-jan-24-gemini-pro,1041.261256012453 58 | deepseek-llm-67b-chat,958.7276958964317 59 | gpt-4-0125-preview,997.1712467949897 60 | gpt-3.5-turbo-0125,898.9675086846296 61 | nous-hermes-2-mixtral-8x7b-dpo,972.2639217501226 62 | mistral-7b-instruct-v0.2,892.8914241485261 63 | qwen1.5-72b-chat,947.9919390672214 64 | openchat-3.5-0106,956.5639851579056 65 | qwen1.5-4b-chat,857.8615305194531 66 | qwen1.5-7b-chat,937.5784150291832 67 | codellama-70b-instruct,873.7635218944325 68 | mistral-next,969.0249137331155 69 | gemma-2b-it,865.630898513726 70 | gemma-7b-it,913.3020846629596 71 | mistral-large-2402,939.5529442890696 72 | olmo-7b-instruct,875.880001693062 73 | 
claude-3-sonnet-20240229,970.6832692453123 74 | claude-3-opus-20240229,1021.9572137608475 75 | claude-3-haiku-20240307,946.756591266114 76 | starling-lm-7b-beta,967.1740802373936 77 | command-r,915.3923710382185 78 | dbrx-instruct-preview,930.1149113654316 79 | qwen1.5-14b-chat,932.8461519507623 80 | qwen1.5-32b-chat,917.6067239158654 81 | command-r-plus,981.9316261444285 82 | gemma-1.1-7b-it,888.863535227059 83 | gpt-4-turbo-2024-04-09,1001.9508367594701 84 | zephyr-orpo-141b-A35b-v0.1,992.2709969445073 85 | gemma-1.1-2b-it,839.3449619004468 86 | gemini-1.5-pro-api-0409-preview,1106.8697777575628 87 | reka-flash-21b-20240226-online,967.873277488609 88 | reka-flash-21b-20240226,939.8601363871353 89 | mixtral-8x22b-instruct-v0.1,911.463562145636 90 | llama-3-8b-instruct,925.300077951389 91 | llama-3-70b-instruct,987.92132812523 92 | phi-3-mini-128k-instruct,875.3830177408651 93 | snowflake-arctic-instruct,908.9578096804898 94 | reka-core-20240501,960.871641047353 95 | qwen1.5-110b-chat,970.825546150876 96 | qwen-max-0428,991.8829133949346 97 | gpt-4o-2024-05-13,1033.7736651812086 98 | yi-large-preview,1007.9055342457846 99 | glm-4-0116,996.2388680185245 100 | phi-3-mini-4k-instruct,875.486575120554 101 | gemini-advanced-0514,1034.5901919978594 102 | gemini-1.5-pro-api-0514,1006.938590226684 103 | gemini-1.5-flash-api-0514,988.4260721445921 104 | yi-1.5-34b-chat,935.8573439301474 105 | phi-3-small-8k-instruct,877.7438151636035 106 | phi-3-medium-4k-instruct,866.7539620360035 107 | qwen2-72b-instruct,930.7722721046769 108 | yi-large,991.7868427711801 109 | nemotron-4-340b-instruct,1011.0291063554423 110 | reka-flash-preview-20240611,937.4782906143831 111 | glm-4-0520,1012.3461462160476 112 | deepseek-coder-v2,968.7272337322494 113 | claude-3-5-sonnet-20240620,1026.059060767346 114 | gemma-2-9b-it,950.0755523266928 115 | gemma-2-27b-it,977.8470656596851 116 | phi-3-mini-4k-instruct-june-2024,860.4379813139254 117 | deepseek-v2-api-0628,989.5345921181047 118 | athene-70b-0725,1020.8101504540734 119 | gemini-1.5-pro-exp-0801,1074.9371768117894 120 | gpt-4o-mini-2024-07-18,1026.236414405759 121 | deepseek-coder-v2-0724,990.94288841608 122 | gemma-2-2b-it,906.320768087545 123 | llama-3.1-8b-instruct,949.3125757952853 124 | llama-3.1-405b-instruct,1005.4497444176718 125 | llama-3.1-70b-instruct,1034.402372751568 126 | mistral-large-2407,1005.1771608005986 127 | reka-core-20240722,1006.9821508042021 128 | reka-flash-20240722,950.5542647646221 129 | chatgpt-4o-latest,1073.7429047571106 130 | gpt-4o-2024-08-06,1032.650635133711 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/dec_arena_241022_holistic.csv: -------------------------------------------------------------------------------- 1 | model,decentralized_arena 2 | o1-mini,1.000000 3 | o1-preview,0.988296 4 | ChatGPT-4o-latest (2024-09-03),0.971391 5 | yi-lightning,0.955415 6 | glm-4-plus,0.910273 7 | claude-3.5-sonnet,0.897083 8 | gpt-4o-2024-05-13,0.894297 9 | gpt-4o-2024-08-06,0.889095 10 | nemotron-70b,0.881107 11 | gpt-4o-mini-2024-07-18,0.873119 12 | gpt-4-turbo-2024-04-09,0.865131 13 | gemini-1.5-pro-001,0.854542 14 | qwen2-72b-instruct,0.814787 15 | claude-3-opus,0.804198 16 | gpt4-1106,0.761657 17 | gemini-1.5-flash-001,0.761657 18 | meta-llama-3.1-70b-instruct,0.759056 19 | gemma-2-9b-it-simpo,0.736950 20 | gemma-2-27b-it,0.716515 21 | google-gemma-2-9b-it,0.687349 22 | yi-1.5-34b-chat,0.671373 23 | llama-3-70b-instruct,0.658183 24 | claude-3-haiku,0.591863 25 | 
qwen1.5-72b-chat,0.583875 26 | meta-llama-3.1-8b-instruct,0.533346 27 | qwen1.5-32b-chat,0.533346 28 | claude-2.1,0.509567 29 | claude-2.0,0.501579 30 | starling-lm-7b-beta,0.464425 31 | qwen1.5-14b-chat,0.437860 32 | mistral-8x7b-instruct-v0.1,0.437860 33 | llama3-8b-instruct,0.421884 34 | gemma-2-2b-it,0.414081 35 | gpt3.5-turbo-0125,0.411295 36 | command-r-(08-2024),0.392718 37 | openchat-3.5-0106,0.387516 38 | openchat-3.5,0.374141 39 | command-r-(04-2024),0.339773 40 | gemma-1.1-7b-it,0.336987 41 | starling-lm-7b-alpha,0.331785 42 | gemini-1.0-pro-001,0.326398 43 | mistral-7b-instruct-2,0.260078 44 | llama-3.2-3b-it,0.252090 45 | vicuna-33b,0.238900 46 | gemma-7b-it,0.228311 47 | qwen1.5-4b-chat,0.146015 48 | mistral-7b-instruct-1,0.143229 49 | vicuna-13b,0.140628 50 | gemma-1.1-2b-it,0.135426 51 | llama2-7b-chat,0.127438 52 | llama2-13b-chat,0.116849 53 | gemma-2b-it,0.087498 54 | vicuna-7b,0.071707 55 | zephyr-7b-beta,0.058332 56 | koala-13b,0.026565 57 | openassistant-pythia-12b,0.000000 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/enkrypt_ai_safety_240916_safety.csv: -------------------------------------------------------------------------------- 1 | model,Enkrypt_AI_Safety 2 | gemini-1.5-pro-exp-0801,84 3 | gemini-1.5-pro-latest,81 4 | gemma-2-27b-it,79 5 | Reflection-Llama-3.1-70B,81 6 | Llama-2-7B-Chat-GGUF-8bit,80 7 | Llama-2-7B-Chat-GGUF-4bit,80 8 | SmolLM-360M-Instruct,80 9 | Llama-2-7b-chat-hf,78 10 | flan-ul2,76 11 | o1-preview,76 12 | Llama-3-8B-Instruct-RR,81 13 | claude-3-opus-20240229,75 14 | gpt-4-0125-preview,79 15 | sarvam-2b-v0.5,75 16 | Llama-3-8B-Instruct-MopeyMule,73 17 | claude-3-5-sonnet-20240620,71 18 | sea-lion-7b-instruct,73 19 | claude-instant-1.2,76 20 | gpt-4-turbo-2024-04-09,75 21 | Meta-Llama-3.1-8B-Instruct-Turbo,70 22 | RakutenAI-7B-chat,68 23 | gemma-2-2b-it,67 24 | Meta-Llama-3-8B-Instruct,72 25 | o1-mini,71 26 | Mistral-7B-v0.1,70 27 | Llama-2-13b-chat-hf,72 28 | h2o-danube3-500m-chat,68 29 | Llama-2-70b-chat-hf,68 30 | gemma-2-9b-it,67 31 | internlm2-chat-20b,59 32 | gemma-2-9b,64 33 | NexusRaven-V2-13B,63 34 | komodo-7b-base,61 35 | gpt-4o,64 36 | phi-2,58 37 | phi3-medium-128K,61 38 | gemma-7b-it,61 39 | claude-3-haiku-20240307,67 40 | Meta-Llama-3.1-405B-Instruct-Turbo,61 41 | SmolLM-1.7B-Instruct,60 42 | gpt-4o-2024-08-06,60 43 | PowerLM-3b,53 44 | Meta-Llama-3-70B-Instruct,62 45 | Starling-LM-7B-beta-GGUF-4bit,54 46 | Smaug-72B-v0.1,61 47 | gpt-3.5-turbo,62 48 | CodeLlama-7b-Instruct-hf,56 49 | Smaug-Llama-3-70B-Instruct,56 50 | Mixtral-8x7B-Instruct-v0.1,54 51 | jamba-instruct-preview,51 52 | Mixtral-8x22B-Instruct-v0.1,53 53 | SeaLLM-7B-v2,58 54 | Qwen2-72B-Instruct,55 55 | OLMo-7B-Instruct,47 56 | Phi-3-mini-128k-instruct,55 57 | dbrx-instruct,51 58 | falcon-mamba-7b-instruct,49 59 | gpt-4o-mini,55 60 | Phi-3.5-MoE-instruct,54 61 | Qwen1.5-14B-Chat,51 62 | c4ai-command-r-plus,48 63 | Smaug-34B-v0.1,56 64 | Qwen2-7B-Instruct,50 65 | Mistral-7B-Instruct-v0.2-GGUF-4bit,48 66 | Meta-Llama-3.1-70B-Instruct-Turbo,48 67 | K2-Chat,50 68 | Phi-3-mini-4k-instruct,50 69 | Starling-LM-7B-beta,51 70 | OLMoE-1B-7B-0924-Instruct,49 71 | Mistral-7B-Instruct-v0.2-GGUF-8bit,48 72 | h2o-danube3-4b-chat,47 73 | RakutenAI-7B-instruct,44 74 | Mistral-7B-Instruct-v0.2,46 75 | jamba-1.5-mini,48 76 | aya-23-35B,47 77 | jamba-1.5-large,47 78 | Phi-3-small-8k-instruct,48 79 | Phi-3-small-128k-instruct,46 80 | zephyr-7b-beta,43 81 | PowerMoE-3b,47 82 | LongWriter-glm4-9b,46 83 | 
Mistral-7B-Instruct-v0.1-GGUF-4bit,39 84 | snowflake-arctic-instruct,45 85 | Qwen2-57B-A14B-Instruct,45 86 | palm-2-chat-bison,40 87 | Mistral-7B-Instruct-v0.1-GGUF-8bit,40 88 | glm-4-9b-chat,43 89 | Phi-3-medium-4k-instruct,43 90 | aya-23-8B,40 91 | Mistral-7B-Instruct-v0.3,39 92 | Phi-3.5-mini-instruct,37 93 | dolphin-2.5-mixtral-8x7b,32 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/eqbench_240912_emotion.csv: -------------------------------------------------------------------------------- 1 | model,EQ-Bench 2 | Meta-Llama-3.1-405B-Instruct,83.0 3 | claude-3-5-sonnet-20240620,86.36 4 | gpt-4o,83.51 5 | gpt-4-turbo-2024-04-09,86.35 6 | RYS-XLarge-Base,85.05 7 | gpt-4-0613,84.79 8 | gpt-4-0314,85.73 9 | RYS-XLarge,84.55 10 | gpt-4-1106-preview,86.05 11 | gpt-4-0125-preview,83.87 12 | claude-3-opus-20240229,82.19 13 | mistral-large-2407,85.05 14 | Qwen2-72B-Instruct,81.35 15 | mistral-large-2402,85.17 16 | Meta-Llama-3-70B-Instruct,82.13 17 | Qwen1.5-110B-Chat,83.68 18 | solar-pro-preview-instruct,78.52 19 | Senku-70B-Full,84.89 20 | Smaug-Llama-3-70B-Instruct,80.69 21 | ECE-TW3-JRGL-V1,83.07 22 | miiqu-f16,83.17 23 | Qwen1.5-72B-Chat,82.81 24 | miqu-1-70b,82.91 25 | mistral-medium,82.57 26 | gemma-2-27b-it,80.55 27 | gpt-4o-mini,76.93 28 | 🆕Phi-3.5-MoE-instruct,76.97 29 | DeepSeek-V2-Chat-0628,83.18 30 | miquella-120b,82.15 31 | Phi-3-medium-4k-instruct,76.34 32 | claude-3-sonnet-20240229,80.45 33 | Tess-72B-v1.5b,81.78 34 | Mixtral-8x22B-Instruct-v0.1,78.79 35 | Qwen-72B-Chat,80.7 36 | Smaug-72B-v0.1,79.75 37 | gemma-2-9b-it,80.46 38 | Yi-1.5-34B-Chat,72.93 39 | Mixtral_34Bx2_MoE_60B,72.69 40 | Phi-3-small-8k-instruct,73.49 41 | WizardLM-2-8x22B,77.91 42 | miquliz-120b-v2.0,82.21 43 | Quyen-Pro-Max-v0.1,77.16 44 | Qwen1.5-32B-Chat,75.59 45 | 🆕gemma-2-Ifable-9B,79.93 46 | dolphin-2_2-yi-34b,75.52 47 | Nous-Hermes-2-Yi-34B,72.68 48 | MegaDolphin-120b,80.21 49 | dbrx-instruct,76.82 50 | Meta-Llama-3-8B-Instruct,68.88 51 | DiscoLM-120b,78.48 52 | mistral-small-2402,80.36 53 | dolphin-2.2-70b,79.6 54 | Yi-34B-Chat,71.62 55 | tulu-2-dpo-70b,76.63 56 | Tess-XL-v1.0,78.46 57 | Yi-1.5-9B-Chat,70.37 58 | goliath-120b,76.09 59 | c4ai-command-r-plus,76.11 60 | Samantha-120b,76.44 61 | Nous-Hermes-2-Mixtral-8x7B-SFT,72.91 62 | Qwen1.5-14B-Chat,74.99 63 | SynthIA-70B-v1.5,73.71 64 | gemini-pro,75.08 65 | Mistral-Nemo-Instruct-2407,77.13 66 | Mixtral-8x7B-Instruct-v0.1,72.37 67 | Quyen-Pro-v0.1,70.75 68 | gpt-3.5-turbo-0301,70.67 69 | Midnight-Miqu-70B-v1.0,75.9 70 | meow,73.94 71 | LMCocktail-10.7B-v1,73.67 72 | Experiment26-7B,77.21 73 | Beyonder-4x7B-v3,77.01 74 | SauerkrautLM-UNA-SOLAR-Instruct,73.56 75 | NeuralBeagle14-7B,74.79 76 | NeuralMonarch-7B,76.26 77 | SOLAR-10.7b-Instruct-dpo,73.21 78 | Beagle14-7B,74.45 79 | Monarch-7B,75.8 80 | WestLake-7B-v2,78.7 81 | AlphaMonarch-7B,76.08 82 | GML-Mistral-merged-v1,74.01 83 | gpt-3.5-turbo-1106,71.74 84 | Starling-LM-7B-beta,73.82 85 | SOLAR-10.7B-Instruct-v1.0,73.53 86 | Phi-3-mini-4k-instruct,58.15 87 | claude-3-haiku-20240307,63.65 88 | openchat-3.5-1210,72.52 89 | NeuralMarcoro14-7B,74.15 90 | WizardLM-70B-V1.0,71.28 91 | Starling-LM-7B-alpha,73.9 92 | gpt-3.5-turbo-0613,69.35 93 | openchat_3.5,72.18 94 | 🆕EXAONE-3.0-7.8B-Instruct,66.72 95 | laserxtral,71.96 96 | Llama-2-70b-chat-hf,73.59 97 | marcoroni-7b-v3-safetensor,71.68 98 | 🆕Trillama-8B,66.63 99 | 🆕Phi-3.5-mini-instruct,54.74 100 | gpt-3.5-turbo-0125,64.97 101 | Beyonder-4x7B-v2,69.23 102 | firefly-mixtral-8x7b,64.36 103 | 
Yi-1.5-6B-Chat,59.45 104 | Marcoroni-neural-chat-7B-v2,68.54 105 | WizardLM-2-7B,69.31 106 | OpenHermes-2.5-Mistral-7B,66.89 107 | NeuralHermes-2.5-Mistral-7B,65.86 108 | Snorkel-Mistral-PairRM-DPO,65.83 109 | Qwen-14B-Chat,63.47 110 | dolphin-2.2.1-mistral-7b,69.92 111 | Mistral-7B-Instruct-v0.2,68.18 112 | Mistral-7B-OpenOrca,66.55 113 | neural-chat-7b-v3-1,64.77 114 | internlm2-chat-7b,62.61 115 | Yi-6B-Chat,61.79 116 | Orion-14B-Chat,59.71 117 | una-cybertron-7b-v2-bf16,62.83 118 | c4ai-command-r-v01,56.05 119 | Mistral-7B-Instruct-v0.3,63.15 120 | vicuna-33b-v1.3,67.07 121 | Nanbeige2-8B-Chat,65.17 122 | gemma-1.1-7b-it,59.17 123 | Qwen1.5-MoE-A2.7B-Chat,58.07 124 | vicuna-13b-v1.5,67.39 125 | gemma-2-2b-it,60.86 126 | Qwen1.5-7B-Chat,54.41 127 | sparsetral-16x7B-v2,59.9 128 | zephyr-7b-beta,58.33 129 | WizardLM-13B-V1.2,63.71 130 | zephyr-7b-alpha,56.82 131 | phi-2-orange,56.94 132 | phi-2-psy,56.44 133 | gemma-7b-it,61.72 134 | phi-2-dpo,54.42 135 | phixtral-2x2_8,54.58 136 | Qwen-7B-Chat,50.11 137 | mistral-7b-instruct-v0.1,52.15 138 | Llama-2-13b-chat-hf,49.12 139 | guanaco-33b-merged,36.11 140 | Nous-Capybara-7B-V1,34.37 141 | Llama-2-7b-chat-hf,36.32 142 | Qwen1.5-4B-Chat,28.75 143 | Qwen-1_8B-Chat,30 144 | phi-2,27.6 145 | Qwen1.5-1.8B-Chat,24.12 146 | vicuna-7b-v1.1,26.12 147 | gemma-2b-it,23.26 148 | koala-7B-HF,21.54 149 | stablelm-2-zephyr-1_6b,15.04 150 | random-baseline,0.0 151 | falcon-180B-chat,56.82 152 | claude-instant-1.2,69.04 153 | claude-2.1,73.96 154 | claude-1,76.83 155 | claude-2.0,72.89 156 | pplx-70b-online,62.79 157 | pplx-7b-online,48.91 158 | TheProfessor-155b,78.82 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/helm_airbench_240916_safety.csv: -------------------------------------------------------------------------------- 1 | model,HELM_AIRBENCH_Security_Risks,HELM_AIRBENCH_Operational_Misuses,HELM_AIRBENCH_Violence_&_Extremism,HELM_AIRBENCH_Hate/Toxicity,HELM_AIRBENCH_Sexual_Content,HELM_AIRBENCH_Child_Harm,HELM_AIRBENCH_Self_harm,HELM_AIRBENCH_Political_Usage,HELM_AIRBENCH_Economic_Harm,HELM_AIRBENCH_Deception,HELM_AIRBENCH_Manipulation,HELM_AIRBENCH_Defamation,HELM_AIRBENCH_Fundamental_Rights,HELM_AIRBENCH_Discrimination/Bias,HELM_AIRBENCH_Privacy,HELM_AIRBENCH_Criminal_Activities,HELM_AIRBENCH_AIR_Score 2 | anthropic/claude-3-haiku-20240307,0.005,0.572,0.159,0.057,0.219,0.152,0.022,0.041,0.04,0.089,0.053,0.037,0,0.382,0.086,0.008,0.198 3 | anthropic/claude-3-sonnet-20240229,0.009,0.473,0.156,0.071,0.184,0.133,0,0.031,0.02,0.096,0.127,0.074,0,0.332,0.089,0,0.177 4 | meta/llama-3-8b-chat,0.065,0.694,0.214,0.188,0.208,0.276,0.089,0.177,0.14,0.259,0.353,0.38,0.027,0.521,0.225,0,0.386 5 | anthropic/claude-3-opus-20240229,0.065,0.477,0.187,0.091,0.34,0.19,0.022,0.063,0.06,0.126,0.1,0.074,0,0.27,0.096,0.017,0.177 6 | google/gemini-1.5-pro-001-safety-default,0.097,0.338,0.253,0.135,0.288,0.233,0.078,0.161,0.09,0.215,0.22,0.194,0.06,0.24,0.123,0.042,0.189 7 | google/gemini-1.5-flash-001-safety-default,0.124,0.371,0.289,0.164,0.302,0.286,0.022,0.195,0.153,0.3,0.293,0.278,0.053,0.325,0.14,0.033,0.233 8 | openai/gpt-3.5-turbo-0613,0.137,0.551,0.455,0.274,0.549,0.429,0.089,0.463,0.433,0.522,0.433,0.463,0.213,0.516,0.316,0.108,0.407 9 | openai/gpt-4-turbo-2024-04-09,0.142,0.636,0.329,0.144,0.378,0.381,0.156,0.323,0.293,0.304,0.34,0.167,0.08,0.461,0.207,0.058,0.322 10 | meta/llama-3-70b-chat,0.158,0.726,0.351,0.329,0.49,0.267,0.078,0.339,0.34,0.385,0.427,0.574,0.147,0.502,0.274,0.025,0.386 11 | 
openai/gpt-3.5-turbo-1106,0.275,0.636,0.589,0.433,0.559,0.629,0.322,0.609,0.623,0.659,0.573,0.481,0.333,0.589,0.39,0.267,0.511 12 | openai/gpt-4o-2024-05-13,0.297,0.813,0.527,0.327,0.524,0.552,0.189,0.601,0.587,0.504,0.54,0.426,0.267,0.575,0.45,0.233,0.506 13 | openai/gpt-3.5-turbo-0125,0.405,0.768,0.664,0.51,0.667,0.752,0.422,0.725,0.71,0.748,0.7,0.593,0.52,0.624,0.471,0.45,0.593 14 | qwen/qwen1.5-72b-chat,0.453,0.772,0.579,0.371,0.635,0.686,0.356,0.616,0.623,0.733,0.633,0.63,0.467,0.571,0.546,0.35,0.558 15 | deepseek-ai/deepseek-llm-67b-chat,0.457,0.709,0.541,0.365,0.622,0.643,0.344,0.532,0.567,0.648,0.573,0.407,0.373,0.584,0.515,0.3,0.533 16 | 01-ai/yi-34b-chat,0.509,0.691,0.558,0.377,0.576,0.624,0.289,0.52,0.503,0.681,0.533,0.491,0.227,0.559,0.436,0.275,0.507 17 | mistralai/mixtral-8x22b-instruct-v0.1,0.671,0.744,0.726,0.417,0.569,0.767,0.322,0.747,0.647,0.726,0.66,0.463,0.573,0.593,0.593,0.646,0.611 18 | mistralai/mixtral-8x7b-instruct-v0.1,0.777,0.818,0.733,0.504,0.632,0.848,0.533,0.808,0.74,0.822,0.687,0.602,0.627,0.592,0.579,0.742,0.645 19 | cohere/command-r,0.782,0.878,0.775,0.586,0.712,0.824,0.578,0.861,0.82,0.822,0.813,0.648,0.773,0.678,0.699,0.717,0.722 20 | cohere/command-r-plus,0.829,0.881,0.816,0.653,0.729,0.819,0.578,0.895,0.897,0.867,0.853,0.815,0.8,0.68,0.709,0.817,0.747 21 | mistralai/mistral-7b-instruct-v0.3,0.932,0.841,0.806,0.501,0.597,0.924,0.522,0.909,0.91,0.889,0.853,0.648,0.893,0.624,0.717,0.942,0.718 22 | databricks/dbrx-instruct,0.955,0.874,0.841,0.624,0.684,0.924,0.722,0.963,0.953,0.926,0.953,0.75,0.947,0.675,0.817,0.967,0.786 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/helm_classic_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,helm_classic 2 | Llama 2 (70B),0.944 3 | LLaMA (65B),0.908 4 | text-davinci-002,0.905 5 | Mistral v0.1 (7B),0.884 6 | Cohere Command beta (52.4B),0.874 7 | text-davinci-003,0.872 8 | Jurassic-2 Jumbo (178B),0.824 9 | Llama 2 (13B),0.823 10 | TNLG v2 (530B),0.787 11 | gpt-3.5-turbo-0613,0.783 12 | LLaMA (30B),0.781 13 | Anthropic-LM v4-s3 (52B),0.78 14 | gpt-3.5-turbo-0301,0.76 15 | Jurassic-2 Grande (17B),0.743 16 | Palmyra X (43B),0.732 17 | Falcon (40B),0.729 18 | Falcon-Instruct (40B),0.727 19 | MPT-Instruct (30B),0.716 20 | MPT (30B),0.714 21 | J1-Grande v2 beta (17B),0.706 22 | Vicuna v1.3 (13B),0.706 23 | Cohere Command beta (6.1B),0.675 24 | Cohere xlarge v20221108 (52.4B),0.664 25 | Luminous Supreme (70B),0.662 26 | Vicuna v1.3 (7B),0.625 27 | OPT (175B),0.609 28 | Llama 2 (7B),0.607 29 | LLaMA (13B),0.595 30 | InstructPalmyra (30B),0.568 31 | Cohere xlarge v20220609 (52.4B),0.56 32 | Jurassic-2 Large (7.5B),0.553 33 | davinci (175B),0.538 34 | LLaMA (7B),0.533 35 | RedPajama-INCITE-Instruct (7B),0.524 36 | J1-Jumbo v1 (178B),0.517 37 | GLM (130B),0.512 38 | Luminous Extended (30B),0.485 39 | OPT (66B),0.448 40 | BLOOM (176B),0.446 41 | J1-Grande v1 (17B),0.433 42 | Alpaca (7B),0.381 43 | Falcon (7B),0.378 44 | RedPajama-INCITE-Base (7B),0.378 45 | Cohere large v20220720 (13.1B),0.372 46 | RedPajama-INCITE-Instruct-v1 (3B),0.366 47 | text-curie-001,0.36 48 | GPT-NeoX (20B),0.351 49 | Luminous Base (13B),0.315 50 | Cohere medium v20221108 (6.1B),0.312 51 | RedPajama-INCITE-Base-v1 (3B),0.311 52 | TNLG v2 (6.7B),0.309 53 | J1-Large v1 (7.5B),0.285 54 | GPT-J (6B),0.273 55 | Pythia (12B),0.257 56 | curie (6.7B),0.247 57 | Falcon-Instruct (7B),0.244 58 | Cohere medium v20220720 (6.1B),0.23 59 | 
text-babbage-001,0.229 60 | T0pp (11B),0.197 61 | Pythia (6.9B),0.196 62 | UL2 (20B),0.167 63 | T5 (11B),0.131 64 | babbage (1.3B),0.114 65 | Cohere small v20220720 (410M),0.109 66 | ada (350M),0.108 67 | text-ada-001,0.107 68 | YaLM (100B),0.075 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/helm_lite_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,helm_lite 2 | GPT-4o (2024-05-13),0.963 3 | Claude 3.5 Sonnet (20240620),0.915 4 | GPT-4 (0613),0.915 5 | GPT-4 Turbo (2024-04-09),0.908 6 | Llama 3.1 Instruct Turbo (405B),0.896 7 | Llama 3.1 Instruct Turbo (70B),0.858 8 | Llama 3 (70B),0.838 9 | Qwen2 Instruct (72B),0.827 10 | Mistral Large 2 (2407),0.803 11 | Gemini 1.5 Pro (001),0.793 12 | GPT-4o mini (2024-07-18),0.776 13 | Mixtral (8x22B),0.767 14 | GPT-4 Turbo (1106 preview),0.758 15 | Palmyra X V3 (72B),0.749 16 | Gemma 2 Instruct (27B),0.742 17 | Gemini 1.5 Flash (001),0.733 18 | Claude 3 Opus (20240229),0.722 19 | PaLM-2 (Unicorn),0.703 20 | Qwen1.5 (72B),0.68 21 | Palmyra X V2 (33B),0.659 22 | Gemma 2 Instruct (9B),0.639 23 | Yi (34B),0.634 24 | Qwen1.5 Chat (110B),0.619 25 | Qwen1.5 (32B),0.615 26 | Claude v1.3,0.594 27 | PaLM-2 (Bison),0.584 28 | Mixtral (8x7B 32K seqlen),0.582 29 | Phi-3 (14B),0.579 30 | Claude 2.0,0.56 31 | DeepSeek LLM Chat (67B),0.556 32 | Phi-3 (7B),0.545 33 | Llama 2 (70B),0.537 34 | Yi Large (Preview),0.53 35 | Command R Plus,0.509 36 | GPT-3.5 (text-davinci-003),0.503 37 | Claude 2.1,0.503 38 | Qwen1.5 (14B),0.491 39 | Gemini 1.0 Pro (002),0.484 40 | Claude Instant 1.2,0.464 41 | Llama 3 (8B),0.441 42 | GPT-3.5 Turbo (0613),0.42 43 | Claude 3 Sonnet (20240229),0.42 44 | Mistral NeMo (2402),0.401 45 | Arctic Instruct,0.399 46 | Gemma (7B),0.392 47 | GPT-3.5 (text-davinci-002),0.392 48 | LLaMA (65B),0.39 49 | Mistral Large (2402),0.382 50 | Command,0.365 51 | Command R,0.35 52 | Llama 3.1 Instruct Turbo (8B),0.347 53 | Mistral Small (2402),0.342 54 | DBRX Instruct,0.341 55 | Jamba Instruct,0.339 56 | Mistral v0.1 (7B),0.338 57 | Mistral Medium (2312),0.318 58 | Qwen1.5 (7B),0.317 59 | Claude 3 Haiku (20240307),0.309 60 | Yi (6B),0.289 61 | Llama 2 (13B),0.273 62 | Jurassic-2 Jumbo (178B),0.254 63 | Falcon (40B),0.249 64 | Mistral Instruct v0.3 (7B),0.233 65 | Jurassic-2 Grande (17B),0.203 66 | Phi-2,0.202 67 | Llama 2 (7B),0.18 68 | Luminous Supreme (70B),0.172 69 | Command Light,0.125 70 | Luminous Extended (30B),0.093 71 | Falcon (7B),0.078 72 | OLMo (7B),0.063 73 | Luminous Base (13B),0.052 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/helm_mmlu_240829_knowledge.csv: -------------------------------------------------------------------------------- 1 | model,helm_mmlu 2 | Claude 3.5 Sonnet (20240620),0.865 3 | Claude 3 Opus (20240229),0.846 4 | Llama 3.1 Instruct Turbo (405B),0.845 5 | GPT-4o (2024-05-13),0.842 6 | Gemini 1.5 Pro (001),0.827 7 | GPT-4 (0613),0.824 8 | Qwen2 Instruct (72B),0.824 9 | GPT-4 Turbo (2024-04-09),0.813 10 | Gemini 1.5 Pro (0409 preview),0.81 11 | Llama 3.1 Instruct Turbo (70B),0.801 12 | Mistral Large 2 (2407),0.8 13 | GPT-4 Turbo (1106 preview),0.796 14 | Llama 3 (70B),0.793 15 | Yi Large (Preview),0.793 16 | Palmyra X V3 (72B),0.786 17 | PaLM-2 (Unicorn),0.786 18 | Gemini 1.5 Flash (001),0.779 19 | Mixtral (8x22B),0.778 20 | Gemini 1.5 Flash (0514 preview),0.778 21 | Phi-3 (14B),0.775 22 | Qwen1.5 (72B),0.774 23 | Qwen1.5 Chat (110B),0.768 24 | 
GPT-4o mini (2024-07-18),0.767 25 | Yi (34B),0.762 26 | Claude 3 Sonnet (20240229),0.759 27 | Gemma 2 (27B),0.757 28 | Phi-3 (7B),0.757 29 | Qwen1.5 (32B),0.744 30 | DBRX Instruct,0.741 31 | Claude 3 Haiku (20240307),0.738 32 | Claude 2.1,0.735 33 | DeepSeek LLM Chat (67B),0.725 34 | Gemma 2 (9B),0.721 35 | Mixtral (8x7B 32K seqlen),0.717 36 | Gemini 1.0 Pro (001),0.7 37 | Llama 2 (70B),0.695 38 | Command R Plus,0.694 39 | PaLM-2 (Bison),0.692 40 | GPT-3.5 Turbo (0613),0.689 41 | Claude Instant 1.2,0.688 42 | Mistral Large (2402),0.688 43 | Mistral Small (2402),0.687 44 | Qwen1.5 (14B),0.686 45 | Arctic Instruct,0.677 46 | Llama 3 (8B),0.668 47 | Gemma (7B),0.661 48 | Jamba Instruct,0.659 49 | Mistral NeMo (2402),0.653 50 | Command R,0.652 51 | Yi (6B),0.64 52 | Qwen1.5 (7B),0.626 53 | Mistral Instruct v0.3 (7B),0.599 54 | Phi-2,0.584 55 | Mistral v0.1 (7B),0.566 56 | Llama 3.1 Instruct Turbo (8B),0.561 57 | Llama 2 (13B),0.554 58 | OLMo 1.7 (7B),0.538 59 | Llama 2 (7B),0.458 60 | OLMo (7B),0.295 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/holmes_240829_linguistics.csv: -------------------------------------------------------------------------------- 1 | model,holmes 2 | google/flan-ul2,72.2 3 | google/flan-t5-xxl,70.5 4 | google/t5-xxl-lm-adapt,70.2 5 | lmsys/vicuna-13b-v1.5,68.6 6 | meta-llama/Llama-2-70b-chat-hf,66.3 7 | ibm/labradorite-13b,66.1 8 | meta-llama/Llama-2-13b-hf,65.0 9 | meta-llama/Llama-2-13b-chat-hf,64.1 10 | EleutherAI/pythia-12b-deduped,63.1 11 | facebook/bart-base,63.0 12 | microsoft/Orca-2-13b,62.7 13 | EleutherAI/pythia-6.9b-deduped,62.3 14 | google/ul2,60.5 15 | google/flan-t5-xl,60.0 16 | google/t5-xl-lm-adapt,59.5 17 | google/electra-base-discriminator,58.3 18 | databricks/dolly-v2-12b,58.2 19 | EleutherAI/pythia-12b,58.0 20 | allenai/tulu-2-13b,57.6 21 | EleutherAI/pythia-6.9b,56.6 22 | microsoft/deberta-v3-base,56.0 23 | EleutherAI/pythia-2.8b-deduped,56.0 24 | meta-llama/Llama-2-70b-hf,55.9 25 | allenai/tulu-2-dpo-13b,55.5 26 | WizardLM/WizardLM-13B-V1.2,55.4 27 | microsoft/deberta-base,55.3 28 | EleutherAI/pythia-1.4b,54.2 29 | EleutherAI/pythia-2.8b,54.0 30 | allenai/tulu-2-70b,53.5 31 | mistralai/Mistral-7B-Instruct-v0.1,52.9 32 | albert-base-v2,52.3 33 | allenai/tk-instruct-11b-def,51.7 34 | allenai/tulu-2-dpo-70b,51.2 35 | google/flan-t5-large,50.9 36 | google/t5-base-lm-adapt,48.7 37 | google/flan-t5-base,48.7 38 | EleutherAI/pythia-1b-deduped,47.5 39 | meta-llama/Llama-2-7b-hf,47.2 40 | EleutherAI/pythia-1.4b-deduped,47.2 41 | mistralai/Mixtral-8x7B-Instruct-v0.1,46.5 42 | bert-base-uncased,45.3 43 | mistralai/Mistral-7B-v0.1,45.2 44 | meta-llama/Llama-2-7b-chat-hf,45.0 45 | ibm/merlinite-7b,44.1 46 | roberta-base,43.2 47 | google/t5-large-lm-adapt,42.4 48 | mistralai/Mixtral-8x7B-v0.1,42.2 49 | gpt2,40.6 50 | EleutherAI/pythia-410m,40.0 51 | google/flan-t5-small,38.9 52 | google/t5-small-lm-adapt,36.0 53 | EleutherAI/pythia-410m-deduped,31.3 54 | Glove.840B,26.6 55 | EleutherAI/pythia-160m-deduped,17.2 56 | EleutherAI/pythia-160m,16.3 57 | EleutherAI/pythia-70m,15.6 58 | EleutherAI/pythia-70m-deduped,14.4 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/hydrox_safety_241001_safety.csv: -------------------------------------------------------------------------------- 1 | model,hydrox_safety,hydrox_privacy,hydrox_security,hydrox_integrity,hydrox_overall_score 2 | hydroxai/hydro-safe-Mistral-7B-v0.1-dpo-full,89.41,99.62,96.66,98.16,94.44 
3 | anthropic/claude-3-5-sonnet,94.75,93.83,92.61,95.56,94.18 4 | anthropic/claude-3-sonnet,92.33,94.36,94.62,94.14,93.62 5 | anthropic/claude-3-opus,92.50,91.26,90.47,94.08,92.02 6 | google/gemma-2-2b-it,92.15,92.43,89.22,93.14,91.66 7 | hydroxai/hydro-safe-Mistral-7B-Instruct-v0.1-dpo-full-1-epoch,86.56,96.21,91.35,97.74,91.60 8 | anthropic/claude-3-haiku,91.52,93.69,91.39,89.53,91.59 9 | OpenAI/gpt-4-0613,79.94,91.79,92.00,96.04,85.43 10 | hydroxai/hydro-safe-llama2-7b-chat-dpo-full-3-epoch,79.83,90.63,84.68,84.27,83.93 11 | meta-llama/Meta-Llama-3-8B-Instruct,83.32,88.61,82.51,80.86,83.72 12 | OpenAI/gpt-4o-mini-2024-07-18,80.87,82.32,77.55,81.38,80.43 13 | google/gemini-1.0-pro-latest,69.20,87.82,77.91,88.61,78.29 14 | hydroxai/hydro-safe-zephyr-td-full,78.18,0.00,0.00,0.00,78.18 15 | meta-llama/Llama-3.2-3B-Instruct,79.46,77.90,72.51,79.24,77.42 16 | google/gemini-1.0-pro,65.18,90.39,79.93,87.11,77.20 17 | meta-llama/Llama-3.2-1B-Instruct,76.25,75.71,74.20,76.98,75.78 18 | meta-llama/Meta-Llama-3-70B-Instruct,74.65,80.65,70.21,73.55,74.44 19 | google/gemini-1.5-flash,77.61,83.33,72.05,60.00,74.43 20 | google/gemini-pro,63.56,90.60,67.49,84.42,73.04 21 | OpenAI/gpt-3.5-turbo-0613,56.94,90.00,93.43,80.84,72.04 22 | Qwen/Qwen2-72B-Instruct,77.10,73.40,65.19,70.13,71.86 23 | OpenAI/gpt-4o-2024-05-13,67.11,68.46,60.89,63.54,65.26 24 | hydroxai/hydro-safe-zephyr-td-full,69.64,49.70,66.63,71.25,65.23 25 | h2oai/h2ogpt-4096-llama2-70b-chat,60.65,73.46,59.38,65.75,63.67 26 | h2oai/h2ogpt-4096-llama2-70b-chat,63.64,65.15,63.34,59.50,63.19 27 | OpenAI/gpt-4-0314,56.36,76.67,72.79,54.00,62.51 28 | meta-llama/Llama-2-70b-chat-hf,61.00,68.87,59.58,63.00,62.50 29 | meta-llama/Llama-2-13b-chat-hf,58.60,63.37,57.85,62.67,60.00 30 | meta-llama/Llama-2-7b-chat-hf,52.30,55.30,46.71,51.63,51.26 31 | deepseek-ai/DeepSeek-V2-Chat-0628,50.00,0.00,0.00,0.00,50.00 32 | deepseek-ai/DeepSeek-V2-Lite-Chat,44.26,48.84,41.91,45.93,44.91 33 | google/gemini-1.5-pro,46.99,40.63,41.65,40.84,43.27 34 | 01-ai/Yi-6B-Chat,37.35,45.36,31.49,36.02,37.00 35 | mistralai/Mistral-7B-Instruct-v0.2,41.71,37.18,32.24,32.52,36.82 36 | lmsys/vicuna-13b-v1.5,38.46,29.78,30.71,36.08,34.07 37 | meta-llama/Llama-2-7b-chat-hf,30.37,38.61,26.57,45.52,33.47 38 | hydroxai/hydro-safe-Sheared-LLaMA-1.3B-dpo-full,26.44,45.30,27.07,35.98,31.87 39 | hydroxai/hydro-safe-dolly-v2-7b-dpo-full-3-epoch,22.95,32.34,25.64,35.51,27.81 40 | tiiuae/falcon-40b-instruct,28.10,30.83,22.97,30.32,27.55 41 | google/gemma-2-2b,25.61,27.04,24.50,24.88,25.50 42 | upstage/SOLAR-0-70b-16bit,22.40,33.80,17.55,30.25,24.50 43 | HuggingFaceH4/zephyr-7b-beta,21.20,30.60,22.40,24.95,23.80 44 | mistralai/Mixtral-8x7B-Instruct-v0.1,27.70,25.04,18.24,21.23,23.75 45 | hydroxai/zephyr-reproduction-dpo-full,19.35,21.65,21.22,26.05,21.38 46 | argilla/notus-7b-v1,26.55,22.05,15.53,19.50,21.30 47 | microsoft/Orca-2-7b,18.30,18.31,20.52,22.09,19.53 48 | lmsys/vicuna-13b-v1.5-16k,21.14,17.01,16.99,22.25,19.31 49 | minimax/abab5-5s-chat,22.54,20.63,14.17,19.46,19.12 50 | Intel/neural-chat-7b-v3-1,15.86,22.28,14.72,22.84,17.86 51 | Intel/neural-chat-7b-v3-2,19.68,14.36,18.62,15.33,17.82 52 | lmsys/vicuna-33b-v1.3,18.42,21.34,13.89,18.64,17.64 53 | microsoft/Orca-2-13b,33.06,27.78,0.00,0.00,17.48 54 | mistralai/Mistral-7B-Instruct-v0.1,26.91,12.08,10.86,12.39,16.74 55 | lmsys/vicuna-7b-v1.5,22.47,10.91,12.61,11.74,15.37 56 | tiiuae/falcon-7b-instruct,14.64,11.30,14.01,15.76,14.01 57 | hydroxai/zephyr-reproduction-sft-full,14.92,14.94,9.50,13.61,13.10 58 | 
mistralai/Mistral-7B-Instruct-v0.1,13.92,8.63,6.29,8.04,9.68 59 | google/gemma-2-27b-it,8.10,11.11,10.00,10.94,9.67 60 | mistralai/Mixtral-8x7B-v0.1,10.61,8.81,6.73,8.16,8.81 61 | google/gemma-2b,8.55,8.27,8.09,6.39,7.99 62 | databricks/dolly-v2-7b,9.92,8.33,4.96,8.33,7.79 63 | hydroxai/hydro-safe-dolly-v2-7b-dpo-full,11.03,6.16,5.10,5.96,7.64 64 | LumiOpen/Viking-13B,7.75,8.32,5.76,7.68,7.33 65 | mistralai/Mistral-7B-v0.1,11.38,4.18,2.86,8.53,7.32 66 | LumiOpen/Viking-33B,6.87,6.48,6.92,6.38,6.73 67 | minimax/abab5-5-chat,8.32,5.13,4.85,8.09,6.60 68 | WizardLM/WizardLM-30B-V1.0,8.00,3.88,6.49,5.58,6.41 69 | databricks/dolly-v2-12b,11.46,3.48,3.39,3.72,6.21 70 | LumiOpen/Viking-7B,5.37,3.91,7.60,9.05,6.15 71 | databricks/dolly-v2-7b,8.81,2.86,2.89,5.94,5.41 72 | TinyLlama/TinyLlama-1.1B-Chat-v1.0,6.87,3.30,4.57,5.65,5.38 73 | mistralai/Mistral-7B-v0.1,7.63,2.78,4.44,3.53,5.00 74 | Nexusflow/NexusRaven-V2-13B,3.95,3.13,4.77,4.50,4.16 75 | hydroxai/hydro-safe-Mistral-7B-Instruct-v0.1-dpo-full-1-epoch,8.26,0.00,8.16,0.00,4.04 76 | databricks/dolly-v2-3b,4.08,1.08,0.55,0.18,1.81 77 | tiiuae/falcon-40b,2.08,0.25,0.40,0.64,0.90 78 | tiiuae/falcon-7b,1.05,0.11,0.43,0.23,0.51 79 | minimax/abab5-5-chat,1.27,0.21,0.04,0.20,0.44 80 | princeton-nlp/Sheared-LLaMA-1.3B,1.14,0.05,0.03,0.04,0.29 81 | davidkim205/komt-mistral-7b-v1,0.65,0.02,0.00,0.00,0.13 82 | EleutherAI/pythia-70m-deduped,0.00,0.00,0.00,0.00,0.00 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/livebench_240701_holistic.csv: -------------------------------------------------------------------------------- 1 | model,livebench_240624,LB_Reasoning_Average,LB_Coding_Average,LB_Mathematics_Average,LB_Data_Analysis_Average,LB_Language_Average,LB_IF_Average 2 | zephyr-7b-beta,17.32,16.00,8.32,11.23,15.75,4.28,48.32 3 | zephyr-7b-alpha,19.28,17.00,11.32,9.96,17.40,7.20,52.79 4 | yi-6b-chat,9.02,8.00,1.32,8.53,4.38,4.69,27.22 5 | vicuna-7b-v1.5-16k,14.22,15.00,1.32,9.04,9.93,7.92,42.12 6 | vicuna-7b-v1.5,12.31,12.00,1.00,7.10,3.33,8.66,41.75 7 | starling-lm-7b-beta,16.62,19.00,18.26,14.86,2.00,7.26,38.32 8 | smaug-qwen2-72b-instruct,39.66,37.00,39.05,40.67,26.19,30.03,65.00 9 | qwen2-72b-instruct,40.16,42.00,31.79,43.44,26.24,29.21,68.27 10 | qwen2-7b-instruct,26.63,20.00,29.21,26.87,28.75,10.21,44.74 11 | qwen2-1.5b-instruct,10.42,8.00,5.63,9.94,10.01,3.05,25.90 12 | qwen2-0.5b-instruct,7.30,3.00,2.00,7.35,2.00,2.80,26.63 13 | qwen1.5-110b-chat,29.07,26.00,22.21,26.28,31.45,13.22,55.26 14 | qwen1.5-72b-chat,28.89,21.00,22.89,26.82,32.98,11.37,58.25 15 | qwen1.5-7b-chat,17.02,13.00,6.63,15.29,16.90,6.18,44.12 16 | qwen1.5-4b-chat,11.59,13.00,4.00,9.86,9.13,5.80,27.75 17 | qwen1.5-1.8b-chat,6.32,5.00,0.00,3.53,3.33,3.16,22.90 18 | qwen1.5-0.5b-chat,5.43,4.00,0.00,4.43,0.00,2.88,21.30 19 | phi-3.5-moe-instruct,35.14,41.00,19.26,33.30,40.46,17.07,59.73 20 | phi-3.5-mini-instruct,27.81,31.00,15.26,22.20,30.43,9.67,58.30 21 | phi-3-small-128k-instruct,29.68,28.00,24.87,28.97,27.26,15.53,53.47 22 | phi-3-small-8k-instruct,29.09,29.00,21.24,23.73,29.62,15.13,55.81 23 | phi-3-mini-128k-instruct,24.76,24.00,14.29,17.06,34.02,7.76,51.40 24 | phi-3-mini-4k-instruct,24.41,22.00,14.79,20.84,29.55,8.06,51.25 25 | phi-3-medium-128k-instruct,29.88,31.00,21.58,25.64,32.12,12.76,56.15 26 | phi-3-medium-4k-instruct,30.96,35.00,20.58,31.36,31.63,13.91,53.30 27 | openhermes-2.5-mistral-7b,23.36,17.00,11.63,20.45,26.92,11.37,52.78 28 | open-mistral-nemo,29.02,25.00,28.16,21.66,33.35,14.15,51.80 29 | 
mixtral-8x22b-instruct-v0.1,35.29,29.00,33.11,28.33,31.67,26.48,63.17 30 | mixtral-8x7b-instruct-v0.1,22.79,18.00,11.32,20.71,28.13,13.76,44.81 31 | mistral-small-2402,33.03,28.00,24.21,28.15,31.88,22.06,63.91 32 | mistral-large-2407,48.35,45.00,46.37,40.48,46.61,39.79,71.85 33 | mistral-large-2402,38.92,35.00,26.84,32.20,42.55,28.74,68.19 34 | mistral-7b-instruct-v0.3,20.09,11.00,9.00,14.56,21.77,11.85,52.37 35 | mistral-7b-instruct-v0.2,19.51,13.00,11.63,17.08,14.62,9.05,51.65 36 | meta-llama-3.1-405b-instruct-turbo,55.18,57.00,45.68,46.55,53.51,49.85,78.47 37 | meta-llama-3.1-70b-instruct-turbo,48.90,43.00,33.11,45.58,50.29,42.36,79.08 38 | meta-llama-3.1-8b-instruct-turbo,28.11,14.00,21.58,24.37,32.15,20.05,56.53 39 | meta-llama-3-70b-instruct,37.60,31.00,20.95,32.31,43.75,34.11,63.50 40 | meta-llama-3-8b-instruct,27.46,25.00,18.26,19.66,26.00,18.72,57.14 41 | mathstral-7b-v0.1,24.33,16.00,15.63,17.84,27.89,15.37,53.25 42 | llama-2-7b-chat-hf,10.25,5.00,0.00,4.78,0.00,6.86,44.88 43 | hermes-3-llama-3.1-70b,39.56,32.00,29.79,28.32,48.11,43.77,55.37 44 | gpt-4o-mini-2024-07-18,44.57,37.00,43.37,41.58,44.52,35.28,65.68 45 | gpt-4o-2024-08-06,56.46,54.00,50.63,52.29,52.89,54.37,74.58 46 | gpt-4o-2024-05-13,54.96,55.00,46.37,49.88,52.41,53.94,72.17 47 | gpt-4-turbo-2024-04-09,53.00,54.00,47.05,48.99,51.32,45.26,71.39 48 | gpt-4-0613,44.94,31.00,37.05,36.22,44.03,49.57,71.79 49 | gpt-4-0125-preview,49.39,48.00,44.05,42.75,54.06,43.55,63.92 50 | gpt-3.5-turbo-0125,34.66,26.00,29.16,26.93,41.21,24.22,60.47 51 | gemma-2-27b-it,41.22,31.00,36.74,36.23,43.58,32.40,67.37 52 | gemma-2-9b-it,31.57,19.00,22.21,23.98,35.06,27.64,61.55 53 | gemma-1.1-7b-it,18.23,10.00,11.00,15.21,18.17,10.65,44.34 54 | gemini-1.5-pro-exp-0827,55.06,56.00,42.00,56.28,50.83,49.31,75.95 55 | gemini-1.5-pro-exp-0801,53.63,55.00,43.37,47.46,50.15,46.96,78.84 56 | gemini-1.5-pro-api-0514,44.41,33.00,32.79,42.42,52.81,38.25,67.20 57 | gemini-1.5-flash-exp-0827,47.51,52.00,39.74,36.29,47.87,31.04,78.11 58 | gemini-1.5-flash-api-0514,40.95,30.00,39.05,38.89,44.03,30.69,63.01 59 | dracarys-llama-3.1-70b-instruct,49.82,50.00,36.11,45.68,47.99,41.77,77.37 60 | dracarys-72b-instruct,41.72,41.00,41.05,42.77,26.24,31.17,68.08 61 | deepseek-v2-lite-chat,17.49,13.00,8.63,14.08,18.19,9.20,41.83 62 | deepseek-coder-v2-lite-instruct,29.21,22.00,26.84,34.44,33.00,10.64,48.34 63 | deepseek-coder-v2,46.84,49.00,41.05,52.54,38.25,33.04,67.18 64 | deepseek-chat-v2,46.36,41.00,42.05,52.11,45.59,32.77,64.61 65 | command-r-plus,32.86,32.00,20.26,24.85,24.60,23.92,71.51 66 | command-r,27.23,28.00,14.95,16.92,31.69,14.64,57.16 67 | claude-3-sonnet-20240229,38.08,26.00,25.21,29.65,44.56,38.08,65.00 68 | claude-3-opus-20240229,50.75,41.00,40.05,46.54,54.32,51.72,70.87 69 | claude-3-haiku-20240307,35.32,26.00,24.53,25.72,41.54,30.07,64.03 70 | claude-3-5-sonnet-20240620,61.16,64.00,63.21,53.75,56.74,56.94,72.30 71 | chatgpt-4o-latest,55.35,57.00,46.00,52.19,54.43,49.95,72.52 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/livebench_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,livebench_240725,LB_Reasoning,LB_Coding,LB_Mathematics,LB_Data_Analysis,LB_Language,LB_IF 2 | claude-3-5-sonnet-20240620,59.87,58.67,60.85,53.75,56.74,56.94,72.30 3 | gpt-4o-2024-08-06,56.71,54.67,51.44,52.29,52.89,54.37,74.58 4 | chatgpt-4o-latest,54.71,52.00,47.15,52.19,54.43,49.95,72.52 5 | gpt-4o-2024-05-13,54.63,50.00,49.36,49.88,52.41,53.94,72.17 6 
| meta-llama-3.1-405b-instruct-turbo,54.25,53.33,43.80,46.55,53.51,49.85,78.47 7 | gemini-1.5-pro-exp-0827,53.78,49.33,40.95,56.28,50.83,49.31,75.95 8 | gpt-4-turbo-2024-04-09,52.88,51.33,49.00,48.99,51.32,45.26,71.39 9 | gemini-1.5-pro-exp-0801,52.22,48.67,41.23,47.46,50.15,46.96,78.84 10 | claude-3-opus-20240229,50.56,41.33,38.59,46.54,54.32,51.72,70.87 11 | gpt-4-0125-preview,48.90,47.33,41.80,42.75,54.06,43.55,63.92 12 | dracarys-llama-3.1-70b-instruct,48.67,44.00,35.23,45.68,47.99,41.77,77.37 13 | meta-llama-3.1-70b-instruct-turbo,48.44,40.67,32.67,45.58,50.29,42.36,79.08 14 | mistral-large-2407,47.97,42.00,47.08,40.48,46.61,39.79,71.85 15 | gemini-1.5-flash-exp-0827,46.87,47.33,40.59,36.29,47.87,31.04,78.11 16 | deepseek-coder-v2,46.31,45.33,41.51,52.54,38.25,33.04,67.18 17 | deepseek-chat-v2,46.04,40.00,41.15,52.11,45.59,32.77,64.61 18 | gpt-4-0613,45.60,34.67,37.31,36.22,44.03,49.57,71.79 19 | gemini-1.5-pro-api-0514,44.72,35.33,32.31,42.42,52.81,38.25,67.20 20 | gpt-4o-mini-2024-07-18,44.26,35.33,43.15,41.58,44.52,35.28,65.68 21 | gemma-2-27b-it,41.26,32.00,35.95,36.23,43.58,32.40,67.37 22 | dracarys-72b-instruct,41.20,40.00,38.95,42.77,26.24,31.17,68.08 23 | qwen2-72b-instruct,40.15,41.33,32.38,43.44,26.24,29.21,68.27 24 | hermes-3-llama-3.1-70b,40.05,33.33,31.38,28.32,48.11,43.77,55.37 25 | gemini-1.5-flash-api-0514,40.04,29.33,34.31,38.89,44.03,30.69,63.01 26 | smaug-qwen2-72b-instruct,39.32,36.00,38.03,40.67,26.19,30.03,65.00 27 | mistral-large-2402,39.18,36.00,27.38,32.20,42.55,28.74,68.19 28 | claude-3-sonnet-20240229,38.72,28.67,26.38,29.65,44.56,38.08,65.00 29 | meta-llama-3-70b-instruct,37.73,30.67,22.03,32.31,43.75,34.11,63.50 30 | claude-3-haiku-20240307,35.86,29.33,24.46,25.72,41.54,30.07,64.03 31 | mixtral-8x22b-instruct-v0.1,35.17,29.33,32.03,28.33,31.67,26.48,63.17 32 | phi-3.5-moe-instruct,35.16,38.67,21.74,33.30,40.46,17.07,59.73 33 | gpt-3.5-turbo-0125,34.54,26.67,27.74,26.93,41.21,24.22,60.47 34 | mistral-small-2402,32.19,26.00,21.18,28.15,31.88,22.06,63.91 35 | command-r-plus,32.17,28.67,19.46,24.85,24.60,23.92,71.51 36 | gemma-2-9b-it,31.34,17.33,22.46,23.98,35.06,27.64,61.55 37 | phi-3-medium-4k-instruct,31.22,36.67,20.46,31.36,31.63,13.91,53.30 38 | phi-3-medium-128k-instruct,30.30,34.00,21.10,25.64,32.12,12.76,56.15 39 | phi-3-small-128k-instruct,29.97,30.00,24.57,28.97,27.26,15.53,53.47 40 | qwen1.5-110b-chat,29.78,30.67,21.82,26.28,31.45,13.22,55.26 41 | deepseek-coder-v2-lite-instruct,29.53,26.00,24.74,34.44,33.00,10.64,48.34 42 | qwen1.5-72b-chat,29.26,23.33,22.82,26.82,32.98,11.37,58.25 43 | open-mistral-nemo,29.17,25.33,28.74,21.66,33.35,14.15,51.80 44 | phi-3.5-mini-instruct,28.30,33.33,15.90,22.20,30.43,9.67,58.30 45 | meta-llama-3.1-8b-instruct-turbo,28.03,15.33,19.74,24.37,32.15,20.05,56.53 46 | phi-3-small-8k-instruct,27.98,23.33,20.26,23.73,29.62,15.13,55.81 47 | meta-llama-3-8b-instruct,27.56,24.00,19.82,19.66,26.00,18.72,57.14 48 | command-r,26.83,25.33,15.26,16.92,31.69,14.64,57.16 49 | qwen2-7b-instruct,26.58,20.00,28.95,26.87,28.75,10.21,44.74 50 | phi-3-mini-128k-instruct,25.55,28.00,15.04,17.06,34.02,7.76,51.40 51 | phi-3-mini-4k-instruct,25.46,28.00,15.04,20.84,29.55,8.06,51.25 52 | mathstral-7b-v0.1,24.48,18.00,14.54,17.84,27.89,15.37,53.25 53 | openhermes-2.5-mistral-7b,24.13,20.00,13.26,20.45,26.92,11.37,52.78 54 | mixtral-8x7b-instruct-v0.1,22.73,17.33,11.62,20.71,28.13,13.76,44.81 55 | mistral-7b-instruct-v0.3,21.25,16.00,10.97,14.56,21.77,11.85,52.37 56 | mistral-7b-instruct-v0.2,20.05,14.00,13.90,17.08,14.62,9.05,51.65 57 | 
gemma-1.1-7b-it,18.78,14.67,9.62,15.21,18.17,10.65,44.34 58 | zephyr-7b-alpha,18.60,12.00,12.26,9.96,17.40,7.20,52.79 59 | qwen1.5-7b-chat,17.98,16.00,9.41,15.29,16.90,6.18,44.12 60 | deepseek-v2-lite-chat,17.74,16.00,7.13,14.08,18.19,9.20,41.83 61 | zephyr-7b-beta,16.72,12.67,8.05,11.23,15.75,4.28,48.32 62 | starling-lm-7b-beta,16.60,18.67,18.46,14.86,2.00,7.26,38.32 63 | vicuna-7b-v1.5-16k,14.50,15.33,2.64,9.04,9.93,7.92,42.12 64 | vicuna-7b-v1.5,12.57,12.67,1.92,7.10,3.33,8.66,41.75 65 | llama-2-7b-chat-hf,11.63,12.00,1.28,4.78,0.00,6.86,44.88 66 | qwen1.5-4b-chat,11.28,10.67,4.49,9.86,9.13,5.80,27.75 67 | qwen2-1.5b-instruct,10.35,8.00,5.21,9.94,10.01,3.05,25.90 68 | yi-6b-chat,9.58,10.67,2.00,8.53,4.38,4.69,27.22 69 | qwen2-0.5b-instruct,7.68,6.00,1.28,7.35,2.00,2.80,26.63 70 | qwen1.5-1.8b-chat,6.04,3.33,0.00,3.53,3.33,3.16,22.90 71 | qwen1.5-0.5b-chat,5.21,2.67,0.00,4.43,0.00,2.88,21.30 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/livecodebench_240601_230701_code.csv: -------------------------------------------------------------------------------- 1 | model,LiveCodeBench_Pass@1 2 | GPT-4-0-2024-05-13,45.6 3 | GPT-4-Turbo-2024-04-09,44.7 4 | GPT-4-Turbo-0106,39.7 5 | GPT-4-0613,36.9 6 | GeminiPro-1.5-May,35.7 7 | Claude-3-Opus,35.4 8 | Codestral-Latest,32.2 9 | Gemini-Flash-1.5-May,30 10 | LLama3-70b-Ins,28.3 11 | Claude-3-Sonnet,26.9 12 | GeminiPro-1.5-April (n=1),26.9 13 | Mixtral-8x22B-Ins,26.4 14 | Mistral-Large,26 15 | GPT-3.5-Turbo-0125,24.6 16 | Claude-3-Haiku,24.5 17 | Claude-2,24.1 18 | Claude-Instant-1,23.7 19 | DSCoder-33b-Ins,23.6 20 | Command-R+,22.9 21 | GPT-3.5-Turbo-0301,22.6 22 | OC-DS-33B,22.3 23 | PHPhind-34B-V2,21 24 | MagiCoders-DS-6.7B,20.5 25 | LLama3-70b-Base,20.1 26 | CodeGen15-7B-Chat,19.3 27 | DSCoder-6.7b-Ins,19.1 28 | OC-DS-6.7B,18.3 29 | CodeGen15-7B,16.3 30 | Command-R,15.4 31 | LLama3-8b-Ins,15.3 32 | DSCoder-33b-Base,15 33 | StarCoder2-15b,14.2 34 | CodeLlama-13b-Ins,14 35 | CodeGenma-7b-Base,13.8 36 | CodeLlama-34b-Ins,13.3 37 | DSCoder-6.7b-Base,12.9 38 | MagiCoders-CL-7B,12.7 39 | CodeLlama-34b-Base,12.3 40 | LLama3-8b-Base,12.3 41 | Mixtral-8x7B-Ins,12.3 42 | CodeLlama-7b-Ins,11.2 43 | StarCoder2-7b,11.2 44 | StarCoder2-3b,10.4 45 | Gemma-7b-Base,10 46 | CodeLlama-13b-Base,9.2 47 | DSCoder-1.3b-Ins,8.8 48 | CodeLlama-7b-Base,7 49 | CodeGenma-2b-Base,6.7 50 | DSCoder-1.3b-Base,6.4 51 | CodeLlama-70b-Base,6.2 52 | OC-DS-1.3B,4.1 53 | CodeLlama-70b-Ins,3 54 | Gemma-2b-Base,2.2 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/llm_trustworthy_241001_safety.csv: -------------------------------------------------------------------------------- 1 | model,trustworthy 2 | google/gemma-2b-it,67.18 3 | google/gemma-7b-it,66.87 4 | lmsys/vicuna-7b-v1.3,60.62 5 | meta-llama/Llama-2-7b-chat-hf,74.72 6 | meta-llama/Meta-Llama-3-8B-Instruct,80.61 7 | mosaicml/mpt-7b-chat,62.29 8 | openai/gpt-3.5-turbo-0301,72.45 9 | openai/gpt-4-0314,69.24 10 | openai/gpt-4o-2024-05-13,82.96 11 | openai/gpt-4o-mini-2024-07-18,76.31 12 | tiiuae/falcon-7b-instruct,59.49 13 | togethercomputer/RedPajama-INCITE-7B-Instruct,56.58 14 | vertexai/gemini-pro-1.0,80.61 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/lvbench_241189_longcontext.csv: -------------------------------------------------------------------------------- 1 | 
llm_model,average,factrecall-zh,loogle-cr-mixup,factrecall-en,loogle-sd-mixup,loogle-mr-mixup,lic-mixup,cmrc-mixup,multifieldqa-en-mixup,dureader-mixup,multifieldqa-zh-mixup,hotpotwikiqa-mixup 2 | ChatGLM3-6B,18.73509091,6.1,10.168,52.6,22.29,9.102,15.024,28.16,12.926,19.574,18.992,11.15 3 | BlueLM-7B,12.27709091,18.8,5.036,24.034,13.02,2.874,9.114,17.53,7.322,14.608,11.486,11.224 4 | Yi-6B,9.805636364,13.95,5.818,22.282,29.17,4.412,6.122,1.272,7.75,2.83,1.836,12.42 5 | LongChat-7B,8.119818182,4.28,8.59,9.144,14.56,6.028,6.924,9.65,6.954,10.342,5.864,6.982 6 | Llama2-7B,7.412181818,0.92,2.512,38.09,7.63,1.918,5.268,6.124,4.628,9.574,2.564,2.306 7 | Qwen-7B,4.470727273,5.45,3.136,0.8,4.78,2.702,4.772,5.806,4.516,10.416,4.574,2.226 8 | Vicuna-7B,3.258545455,0,3.256,0.09,4.68,2.314,4.004,6.044,3.438,7.178,2.888,1.952 9 | Llama2-7B-Chat,2.145090909,0,2.622,0.446,3.04,1.798,1.02,1.966,3.994,5.49,1.482,1.738 10 | GPT-3.5,6.530363636,5.28,6.092,2.874,13.988,5.868,3.532,5.162,9.782,4.866,8.506,5.884 11 | GPT-4,8.467636364,11.386,7.26,9.254,11.128,5.906,5.276,5.96,10.156,12.068,7.292,7.458 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/mixeval_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,MixEval 2 | LLaMA-3-70B,82.2 3 | Qwen1.5-72B,79.5 4 | Yi-34B,78.3 5 | Qwen1.5-32B,77.6 6 | Mixtral-8x7B,74.0 7 | LLaMA-2-70B,73.2 8 | Qwen1.5-MoE-A2.7B,70.2 9 | Qwen1.5-7B,68.2 10 | LLaMA-3-8B,65.1 11 | Mistral-7B,64.8 12 | Gemma-7B,64.7 13 | Yi-6B,63.1 14 | Qwen1.5-4B,58.2 15 | JetMoE-8B,57.1 16 | DeepSeek-7B,52.2 17 | Phi-2,51.9 18 | DeepSeekMoE-16B,51.4 19 | LLaMA-2-7B,43.1 20 | Gemma-2B,38.9 21 | OLMo-7B,31.8 22 | MPT-7B,30.8 23 | Claude 3.5 Sonnet-0620,89.9 24 | LLaMA-3.1-405B-Instruct,- 25 | GPT-4o-2024-05-13,87.9 26 | Claude 3 Opus,88.1 27 | GPT-4-Turbo-2024-04-09,88.8 28 | Gemini 1.5 Pro-API-0409,84.2 29 | Gemini 1.5 Pro-API-0514,84.8 30 | Mistral Large 2,86.1 31 | Yi-Large-preview,84.4 32 | LLaMA-3-70B-Instruct,84.0 33 | Qwen-Max-0428,86.1 34 | Claude 3 Sonnet,81.7 35 | Reka Core-20240415,83.3 36 | MAmmoTH2-8x7B-Plus,81.5 37 | DeepSeek-V2,83.7 38 | GPT-4o mini,84.2 39 | Command R+,81.5 40 | Yi-1.5-34B-Chat,81.7 41 | Mistral-Large,84.2 42 | Qwen1.5-72B-Chat,84.1 43 | Mistral-Medium,81.9 44 | Gemini 1.0 Pro,78.9 45 | Reka Flash-20240226,79.8 46 | Mistral-Small,81.2 47 | LLaMA-3-8B-Instruct,75.0 48 | Command R,77.0 49 | Qwen1.5-32B-Chat,81.0 50 | GPT-3.5-Turbo-0125,79.7 51 | Claude 3 Haiku,79.7 52 | Yi-34B-Chat,80.1 53 | Mixtral-8x7B-Instruct-v0.1,76.4 54 | Starling-LM-7B-beta,74.8 55 | Yi-1.5-9B-Chat,74.2 56 | Gemma-1.1-7B-IT,69.6 57 | Vicuna-33B-v1.3,66.3 58 | LLaMA-2-70B-Chat,74.6 59 | MAP-Neo-Instruct-v0.1,70.0 60 | Mistral-7B-Instruct-v0.2,70.0 61 | Qwen1.5-7B-Chat,71.4 62 | Reka Edge-20240208,68.5 63 | Zephyr-7B-β,69.1 64 | LLaMA-2-7B-Chat,61.7 65 | Yi-6B-Chat,65.6 66 | Qwen1.5-MoE-A2.7B-Chat,69.1 67 | Gemma-1.1-2B-IT,51.9 68 | Vicuna-7B-v1.5,60.3 69 | OLMo-7B-Instruct,55.0 70 | Qwen1.5-4B-Chat,57.2 71 | JetMoE-8B-Chat,51.6 72 | MPT-7B-Chat,43.8 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/mmlu_pro_240829_knowledge.csv: -------------------------------------------------------------------------------- 1 | model,mmlu_pro 2 | Arx-0.3,0.7824 3 | Claude-3.5-Sonnet,0.7612 4 | Grok-2,0.7546 5 | GPT-4o (2024-05-13),0.7255 6 | Grok-2-mini,0.7185 7 | Gemini-1.5-Pro,0.6903 8 | Claude-3-Opus,0.6845 9 | Qwen2-72B-Chat,0.6438 10 | 
magnum-72b-v1,0.6393 11 | GPT-4-Turbo,0.6371 12 | DeepSeek-Coder-V2-Instruct,0.6363 13 | Higgs-Llama-3-70B,0.6316 14 | GPT-4o-mini,0.6309 15 | Llama-3.1-70B-Instruct,0.6284 16 | Gemini-1.5-Flash,0.5912 17 | Yi-large,0.5809 18 | Claude-3-Sonnet,0.568 19 | Llama-3-70B-Instruct,0.562 20 | Phi3-medium-4k,0.557 21 | Qwen2-72B-32k,0.5559 22 | Deepseek-V2-Chat,0.5481 23 | Llama-3-70B,0.5278 24 | Qwen1.5-72B-Chat,0.5264 25 | Llama-3.1-70B,0.5247 26 | Yi-1.5-34B-Chat,0.5229 27 | Gemma-2-9B-it,0.5208 28 | Phi3-medium-128k,0.5191 29 | MAmmoTH2-8x7B-Plus,0.504 30 | Qwen1.5-110B,0.4993 31 | GLM-4-9B-Chat,0.4801 32 | GLM-4-9B,0.4792 33 | Phi-3.5-mini-instruct,0.4787 34 | Qwen2-7B-Instruct,0.4724 35 | Yi-1.5-9B-Chat,0.4595 36 | Phi3-mini-4k,0.4566 37 | Gemma-2-9B,0.451 38 | Mistral-Nemo-Instruct-2407,0.4481 39 | Llama-3.1-8B-Instruct,0.4425 40 | Phi3-mini-128k,0.4386 41 | MAmmoTH2-8B-Plus,0.4335 42 | Mixtral-8x7B-Instruct-v0.1,0.4327 43 | Yi-34B,0.4303 44 | Mathstral-7B-v0.1,0.42 45 | DeepSeek-Coder-V2-Lite-Instruct,0.4157 46 | Mixtral-8x7B-v0.1,0.4103 47 | Llama-3-8B-Instruct,0.4098 48 | MAmmoTH2-7B-Plus,0.4085 49 | Qwen2-7B,0.4073 50 | Mistral-Nemo-Base-2407,0.3977 51 | WizardLM-2-8x22B,0.3924 52 | Yi-1.5-6B-Chat,0.3823 53 | Qwen1.5-14B-Chat,0.3802 54 | c4ai-command-r-v01,0.379 55 | Staring-7B,0.379 56 | Llama-2-70B,0.3753 57 | OpenChat-3.5-8B,0.3724 58 | InternMath-20B-Plus,0.371 59 | Llama3-Smaug-8B,0.3693 60 | Llama-3.1-8B,0.366 61 | Llama-3-8B,0.3536 62 | DeepseekMath-7B-Instruct,0.353 63 | DeepSeek-Coder-V2-Lite-Base,0.3437 64 | Gemma-7B,0.3373 65 | InternMath-7B-Plus,0.335 66 | Zephyr-7B-Beta,0.3297 67 | Mistral-7B-v0.1,0.3088 68 | Mistral-7B-Instruct-v0.2,0.3084 69 | Mistral-7B-v0.2,0.3043 70 | Qwen1.5-7B-Chat,0.2906 71 | Yi-6B-Chat,0.2884 72 | Neo-7B-Instruct,0.2874 73 | Yi-6B,0.2651 74 | Neo-7B,0.2585 75 | Mistral-7B-Instruct-v0.1,0.2575 76 | Llama-2-13B,0.2534 77 | Llemma-7B,0.2345 78 | Qwen2-1.5B-Instruct,0.2262 79 | Qwen2-1.5B,0.2256 80 | Llama-2-7B,0.2032 81 | Qwen2-0.5B-Instruct,0.1593 82 | Gemma-2B,0.1585 83 | Qwen2-0.5B,0.1497 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/mtbench_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,MT-bench 2 | GPT-4,8.99 3 | GPT-3.5-turbo,7.94 4 | Claude-v1,7.9 5 | Claude-instant-v1,7.85 6 | Vicuna-33B,7.12 7 | WizardLM-30B,7.01 8 | Guanaco-33B,6.53 9 | Tulu-30B,6.43 10 | Guanaco-65B,6.41 11 | OpenAssistant-LLaMA-30B,6.41 12 | PaLM-Chat-Bison-001,6.4 13 | Vicuna-13B,6.39 14 | MPT-30B-chat,6.39 15 | WizardLM-13B,6.35 16 | Vicuna-7B,6.0 17 | Baize-v2-13B,5.75 18 | Nous-Hermes-13B,5.51 19 | MPT-7B-Chat,5.42 20 | GPT4All-13B-Snoozy,5.41 21 | Koala-13B,5.35 22 | MPT-30B-Instruct,5.22 23 | Falcon-40B-Instruct,5.17 24 | H2O-Oasst-OpenLLaMA-13B,4.63 25 | Alpaca-13B,4.53 26 | ChatGLM-6B,4.5 27 | OpenAssistant-Pythia-12B,4.32 28 | RWKV-4-Raven-14B,3.98 29 | Dolly-V2-12B,3.28 30 | FastChat-T5-3B,3.04 31 | StableLM-Tuned-Alpha-7B,2.75 32 | LLaMA-13B,2.61 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,opencompass 2 | Claude-3.5-Sonnet,67.9 3 | GPT-4o-20240513,67.7 4 | Mistral-Large,63.2 5 | Mistral-Large-Instruct-2407,62.5 6 | DeepSeek-V2-Chat(0618),61.7 7 | GPT-4o-mini-20240718,60.4 8 | Qwen-Max-0428,57.8 9 | Yi-Large,56.3 10 | Qwen2-72B-Instruct,55.4 11 | 
GLM-4,55.2 12 | Llama3.1-70B-Instruct,53.9 13 | Gemma-2-27B-it,53.5 14 | Qwen1.5-110B-Chat,51.9 15 | Doubao-pro-32k/240615,51 16 | Baichuan4,50.4 17 | Step-1-8K,49.9 18 | abab6.5,49.9 19 | Ernie-4.0-8K-Preview-0518,48.8 20 | Moonshot-v1-8K,48.6 21 | GLM-4-9B-Chat,47.9 22 | Yi-1.5-34B-Chat,46.9 23 | Hunyuan-Standard-256k,46.9 24 | Mixtral-8x22B-Instruct-v0.1,46.3 25 | Gemma-2-9B-it,45.5 26 | Qwen2-7B-Instruct,45.1 27 | InternLM2.5-7B-Chat,44.5 28 | Yi-1.5-9B-Chat,42.6 29 | Nanbeige2-16B-Chat,42.3 30 | Llama3.1-8B-Instruct,42.1 31 | DBRX-Instruct,37.6 32 | Yi-1.5-6B-Chat,36.5 33 | InternLM2-Chat-20B,36 34 | Mixtral-8x7B-Instruct-v0.1,34.5 35 | Mistral-7B-Instruct-v0.3,30.7 36 | DeepSeek-V2-Lite-Chat,30 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_academic_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,opencompass_academic 2 | GPT-4o-20240513,77 3 | Qwen2-72B-Instruct,73.1 4 | GPT-4o-mini-20240718,72.5 5 | Llama3-70B-Instruct,66.6 6 | Qwen1.5-110B-Chat,61.7 7 | Yi-1.5-34B-Chat,60.4 8 | InternLM2.5-Chat-7B,60.3 9 | GLM-4-9B-Chat,59.5 10 | Qwen1.5-32B-Chat,57.1 11 | Qwen1.5-72B-Chat,56.9 12 | Yi-1.5-9B-Chat,56.1 13 | Qwen2-7B-Instruct,52 14 | Llama3-8B-Instruct,50.6 15 | Qwen1.5-14B-Chat,49.7 16 | InternLM2-Chat-20B,45.2 17 | Yi-1.5-6B-Chat,43.5 18 | Mixtral-8x7B-Instruct-v0.1,42.6 19 | InternLM2-Chat-7B,42.1 20 | Qwen1.5-7B-Chat,35.4 21 | Mistral-7B-Instruct-v0.3 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_agent_240829_agent.csv: -------------------------------------------------------------------------------- 1 | model,OC_Agent 2 | Claude-3.5-Sonnet,81.7 3 | GPT-4o-20240513,84.4 4 | Mistral-Large,83.5 5 | Mistral-Large-Instruct-2407,84.5 6 | DeepSeek-V2-Chat(0618),83.7 7 | GPT-4o-mini-20240718,85.7 8 | Qwen-Max-0428,83.8 9 | Yi-Large,86.1 10 | Qwen2-72B-Instruct,85.9 11 | GLM-4,80.4 12 | Llama3.1-70B-Instruct,86.5 13 | Gemma-2-27B-it,85.5 14 | Qwen1.5-110B-Chat,79.6 15 | Doubao-pro-32k/240615,79.3 16 | Baichuan4,84.5 17 | Step-1-8K,84.2 18 | abab6.5,62.5 19 | Ernie-4.0-8K-Preview-0518,72.7 20 | Moonshot-v1-8K,63.5 21 | GLM-4-9B-Chat,81.9 22 | Yi-1.5-34B-Chat,63.5 23 | Hunyuan-Standard-256k,65.6 24 | Mixtral-8x22B-Instruct-v0.1,86 25 | Gemma-2-9B-it,69.9 26 | Qwen2-7B-Instruct,79.7 27 | InternLM2.5-7B-Chat,79 28 | Yi-1.5-9B-Chat,54.3 29 | Nanbeige2-16B-Chat,85.8 30 | Llama3.1-8B-Instruct,80.1 31 | DBRX-Instruct,75.3 32 | Yi-1.5-6B-Chat,55.4 33 | InternLM2-Chat-20B,80.3 34 | Mixtral-8x7B-Instruct-v0.1,71 35 | Mistral-7B-Instruct-v0.3,75.4 36 | DeepSeek-V2-Lite-Chat,72.4 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_arena_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,OC_arena 2 | GPT-4o-20240513,1090 3 | Qwen2-72B-Instruct,1085 4 | Qwen-Max-0428,1071 5 | Hunyuan-pro,1069 6 | Claude 3.5 Sonnet 20240620,1055 7 | ERNIE-4.0-8K-Preview-0518,1051 8 | DeepSeek-V2-Chat,1048 9 | Yi-Large,1051 10 | GPT-4-turbo-20240409,1044 11 | GLM-4-0520,1033 12 | DeepSeek-V2,1027 13 | abab6.5-chat,1027 14 | Yi-1.5-34B-Chat,1016 15 | Doubao-pro-32k/240615,1011 16 | Baichuan4,1007 17 | Qwen1.5-32B-Chat,1007 18 | Baichuan4,1007 19 | Qwen1.5-32B-Chat,1007 20 | Qwen1.5-72B-Chat,1007 21 | MoonShot-v1-32K,994 22 | InternLM2-Chat-20B,992 23 | Yi-34B-Chat,983 24 | Command-R+,977
25 | Qwen1.5-7B-Chat,970 26 | InternLM2-Chat-7B,968 27 | Qwen1.5-14B-Chat,968 28 | InternLM2.5-7B-Chat,958 29 | DeepSeek LLM 67B Chat,937 30 | Mixtral-8x22B-Instruct-v0.1,933 31 | Llama3-70B-Instruct,926 32 | Llama3-8B-Instruct,920 33 | DeepSeek MoE 16B Chat,895 34 | DBRX-Instruct,879 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_code_240829_code.csv: -------------------------------------------------------------------------------- 1 | model,OC_Code 2 | Claude-3.5-Sonnet,69.6 3 | GPT-4o-20240513,69.1 4 | Mistral-Large,65.1 5 | Mistral-Large-Instruct-2407,55.6 6 | DeepSeek-V2-Chat(0618),66.2 7 | GPT-4o-mini-20240718,63.3 8 | Qwen-Max-0428,52.4 9 | Yi-Large,54.3 10 | Qwen2-72B-Instruct,49.5 11 | GLM-4,56.3 12 | Llama3.1-70B-Instruct,53.7 13 | Gemma-2-27B-it,54.6 14 | Qwen1.5-110B-Chat,49.5 15 | Doubao-pro-32k/240615,50.2 16 | Baichuan4,44.1 17 | Step-1-8K,44.2 18 | abab6.5,50.5 19 | Ernie-4.0-8K-Preview-0518,50.6 20 | Moonshot-v1-8K,47 21 | GLM-4-9B-Chat,45.1 22 | Yi-1.5-34B-Chat,44.8 23 | Hunyuan-Standard-256k,46.1 24 | Mixtral-8x22B-Instruct-v0.1,44.7 25 | Gemma-2-9B-it,42.2 26 | Qwen2-7B-Instruct,44 27 | InternLM2.5-7B-Chat,34.8 28 | Yi-1.5-9B-Chat,41.8 29 | Nanbeige2-16B-Chat,33.3 30 | Llama3.1-8B-Instruct,39.3 31 | DBRX-Instruct,32.2 32 | Yi-1.5-6B-Chat,34.4 33 | InternLM2-Chat-20B,36.2 34 | Mixtral-8x7B-Instruct-v0.1,26.7 35 | Mistral-7B-Instruct-v0.3,23.6 36 | DeepSeek-V2-Lite-Chat,16.3 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_instruct_240829_instructionfollow.csv: -------------------------------------------------------------------------------- 1 | model,OC_Instruct 2 | Claude-3.5-Sonnet,66.2 3 | GPT-4o-20240513,60.3 4 | Mistral-Large,51.1 5 | Mistral-Large-Instruct-2407,50.3 6 | DeepSeek-V2-Chat(0618),44.1 7 | GPT-4o-mini-20240718,56 8 | Qwen-Max-0428,47.4 9 | Yi-Large,40 10 | Qwen2-72B-Instruct,34 11 | GLM-4,36.9 12 | Llama3.1-70B-Instruct,46.2 13 | Gemma-2-27B-it,45.2 14 | Qwen1.5-110B-Chat,36.8 15 | Doubao-pro-32k/240615,30.6 16 | Baichuan4,39.4 17 | Step-1-8K,38.9 18 | abab6.5,32 19 | Ernie-4.0-8K-Preview-0518,28.5 20 | Moonshot-v1-8K,35.9 21 | GLM-4-9B-Chat,36 22 | Yi-1.5-34B-Chat,38.8 23 | Hunyuan-Standard-256k,29.2 24 | Mixtral-8x22B-Instruct-v0.1,31.2 25 | Gemma-2-9B-it,40.9 26 | Qwen2-7B-Instruct,27.5 27 | InternLM2.5-7B-Chat,26.5 28 | Yi-1.5-9B-Chat,29.8 29 | Nanbeige2-16B-Chat,33.2 30 | Llama3.1-8B-Instruct,39.1 31 | DBRX-Instruct,32.5 32 | Yi-1.5-6B-Chat,26.3 33 | InternLM2-Chat-20B,18.5 34 | Mixtral-8x7B-Instruct-v0.1,28.2 35 | Mistral-7B-Instruct-v0.3,28.5 36 | DeepSeek-V2-Lite-Chat,20.6 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_knowledge_240829_knowledge.csv: -------------------------------------------------------------------------------- 1 | model,OC_Knowledge 2 | Claude-3.5-Sonnet,85 3 | GPT-4o-20240513,85.2 4 | Mistral-Large,83.4 5 | Mistral-Large-Instruct-2407,83.3 6 | DeepSeek-V2-Chat(0618),78.8 7 | GPT-4o-mini-20240718,78.7 8 | Qwen-Max-0428,79 9 | Yi-Large,75.3 10 | Qwen2-72B-Instruct,84 11 | GLM-4,77.7 12 | Llama3.1-70B-Instruct,81.4 13 | Gemma-2-27B-it,58.5 14 | Qwen1.5-110B-Chat,79.3 15 | Doubao-pro-32k/240615,78.3 16 | Baichuan4,74.2 17 | Step-1-8K,72 18 | abab6.5,69.8 19 | Ernie-4.0-8K-Preview-0518,76.4 20 | Moonshot-v1-8K,61 21 | GLM-4-9B-Chat,68.9 22 | Yi-1.5-34B-Chat,65 23 | Hunyuan-Standard-256k,69.7 24 | 
Mixtral-8x22B-Instruct-v0.1,72.2 25 | Gemma-2-9B-it,53.7 26 | Qwen2-7B-Instruct,64.1 27 | InternLM2.5-7B-Chat,64.8 28 | Yi-1.5-9B-Chat,56 29 | Nanbeige2-16B-Chat,53.8 30 | Llama3.1-8B-Instruct,63.2 31 | DBRX-Instruct,66.3 32 | Yi-1.5-6B-Chat,41.3 33 | InternLM2-Chat-20B,60 34 | Mixtral-8x7B-Instruct-v0.1,50.4 35 | Mistral-7B-Instruct-v0.3,47.8 36 | DeepSeek-V2-Lite-Chat,41.3 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_language_240829_language.csv: -------------------------------------------------------------------------------- 1 | model,OC_Language 2 | Claude-3.5-Sonnet,50.9 3 | GPT-4o-20240513,55.5 4 | Mistral-Large,50.9 5 | Mistral-Large-Instruct-2407,50.3 6 | DeepSeek-V2-Chat(0618),46.3 7 | GPT-4o-mini-20240718,50.1 8 | Qwen-Max-0428,56.5 9 | Yi-Large,48.7 10 | Qwen2-72B-Instruct,45.8 11 | GLM-4,45.8 12 | Llama3.1-70B-Instruct,38.4 13 | Gemma-2-27B-it,45.2 14 | Qwen1.5-110B-Chat,53.4 15 | Doubao-pro-32k/240615,31.1 16 | Baichuan4,37.2 17 | Step-1-8K,40.6 18 | abab6.5,44.9 19 | Ernie-4.0-8K-Preview-0518,36.7 20 | Moonshot-v1-8K,46.3 21 | GLM-4-9B-Chat,44.3 22 | Yi-1.5-34B-Chat,50.5 23 | Hunyuan-Standard-256k,30.6 24 | Mixtral-8x22B-Instruct-v0.1,33 25 | Gemma-2-9B-it,40.8 26 | Qwen2-7B-Instruct,43.5 27 | InternLM2.5-7B-Chat,44.6 28 | Yi-1.5-9B-Chat,46.1 29 | Nanbeige2-16B-Chat,50.5 30 | Llama3.1-8B-Instruct,33.7 31 | DBRX-Instruct,25.6 32 | Yi-1.5-6B-Chat,43.6 33 | InternLM2-Chat-20B,36.7 34 | Mixtral-8x7B-Instruct-v0.1,36.6 35 | Mistral-7B-Instruct-v0.3,30.3 36 | DeepSeek-V2-Lite-Chat,31.4 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_math_240829_math.csv: -------------------------------------------------------------------------------- 1 | model,OC_Math 2 | Claude-3.5-Sonnet,71.1 3 | GPT-4o-20240513,71.1 4 | Mistral-Large,66.4 5 | Mistral-Large-Instruct-2407,72.8 6 | DeepSeek-V2-Chat(0618),68.2 7 | GPT-4o-mini-20240718,58.2 8 | Qwen-Max-0428,55.1 9 | Yi-Large,54.8 10 | Qwen2-72B-Instruct,57.7 11 | GLM-4,53.2 12 | Llama3.1-70B-Instruct,58 13 | Gemma-2-27B-it,50.1 14 | Qwen1.5-110B-Chat,39.6 15 | Doubao-pro-32k/240615,67.5 16 | Baichuan4,51.8 17 | Step-1-8K,51.4 18 | abab6.5,47.2 19 | Ernie-4.0-8K-Preview-0518,44.7 20 | Moonshot-v1-8K,46.6 21 | GLM-4-9B-Chat,38.7 22 | Yi-1.5-34B-Chat,38.1 23 | Hunyuan-Standard-256k,53.9 24 | Mixtral-8x22B-Instruct-v0.1,47.2 25 | Gemma-2-9B-it,40.7 26 | Qwen2-7B-Instruct,37.7 27 | InternLM2.5-7B-Chat,40.8 28 | Yi-1.5-9B-Chat,38.2 29 | Nanbeige2-16B-Chat,25.8 30 | Llama3.1-8B-Instruct,38 31 | DBRX-Instruct,35.3 32 | Yi-1.5-6B-Chat,28.4 33 | InternLM2-Chat-20B,27.4 34 | Mixtral-8x7B-Instruct-v0.1,24.8 35 | Mistral-7B-Instruct-v0.3,18.1 36 | DeepSeek-V2-Lite-Chat,22.8 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/opencompass_reasoning_240829_reasoning.csv: -------------------------------------------------------------------------------- 1 | model,OC_Reasoning 2 | Claude-3.5-Sonnet,57 3 | GPT-4o-20240513,55.8 4 | Mistral-Large,50.1 5 | Mistral-Large-Instruct-2407,50 6 | DeepSeek-V2-Chat(0618),47.4 7 | GPT-4o-mini-20240718,45.4 8 | Qwen-Max-0428,47.9 9 | Yi-Large,47.6 10 | Qwen2-72B-Instruct,44.7 11 | GLM-4,46.1 12 | Llama3.1-70B-Instruct,31.6 13 | Gemma-2-27B-it,45.4 14 | Qwen1.5-110B-Chat,45.8 15 | Doubao-pro-32k/240615,27.8 16 | Baichuan4,38.5 17 | Step-1-8K,35.8 18 | abab6.5,47 19 | Ernie-4.0-8K-Preview-0518,41.3 20 | Moonshot-v1-8K,46 21 | 
GLM-4-9B-Chat,40 22 | Yi-1.5-34B-Chat,42.7 23 | Hunyuan-Standard-256k,36.8 24 | Mixtral-8x22B-Instruct-v0.1,28.6 25 | Gemma-2-9B-it,41.9 26 | Qwen2-7B-Instruct,36.2 27 | InternLM2.5-7B-Chat,39.3 28 | Yi-1.5-9B-Chat,39.8 29 | Nanbeige2-16B-Chat,40.5 30 | Llama3.1-8B-Instruct,24.9 31 | DBRX-Instruct,20.8 32 | Yi-1.5-6B-Chat,36.5 33 | InternLM2-Chat-20B,18.9 34 | Mixtral-8x7B-Instruct-v0.1,28.1 35 | Mistral-7B-Instruct-v0.3,20.7 36 | DeepSeek-V2-Lite-Chat,28.1 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/repoqa_241119_longcontext.csv: -------------------------------------------------------------------------------- 1 | model,RepoQA 2 | gpt-4o-2024-05-13,90.6 3 | gemini-1.5-pro-latest,90.6 4 | claude-3-opus-20240229,90.6 5 | gemini-1.5-flash-latest,90.0 6 | claude-3-sonnet-20240229,87.4 7 | DeepSeek-V2-Chat,83.4 8 | Meta-Llama-3-70B-Instruct,82.2 9 | claude-3-haiku-20240307,81.8 10 | c4ai-command-r-plus,78.4 11 | gpt-4-turbo-2024-04-09,76.4 12 | Mixtral-8x7B-Instruct-v0.1,68.0 13 | Mixtral-8x22B-Instruct-v0.1,67.8 14 | Qwen1.5-72B-Chat,67.0 15 | Phi-3-medium-128k-instruct,63.2 16 | CodeQwen1.5-7B-Chat,62.8 17 | Mistral-7B-Instruct-v0.3,62.0 18 | gpt-3.5-turbo-0125,60.4 19 | Meta-Llama-3-8B-Instruct,53.6 20 | deepseek-coder-33b-instruct,48.4 21 | Mistral-7B-Instruct-v0.2,47.4 22 | CodeLlama-13b-Instruct-hf,42.6 23 | DeepSeek-V2-Lite-Chat,41.6 24 | CodeLlama-34b-Instruct-hf,41.6 25 | Phi-3-small-128k-instruct,39.6 26 | Qwen1.5-32B-Chat,33.8 27 | CodeLlama-7b-Instruct-hf,28.2 28 | Qwen1.5-14B-Chat,26.0 29 | Magicoder-S-DS-6.7B,23.2 30 | Phi-3-mini-128k-instruct,22.4 31 | Mistral-7B-Instruct-v0.1,11.0 32 | deepseek-coder-6.7b-instruct,10.6 33 | Qwen1.5-7B-Chat,2.8 34 | codegemma-7b-it,2.2 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/ruler_bench_241002_longcontext.csv: -------------------------------------------------------------------------------- 1 | model,Ruler 2 | Jamba-1.5-large*,96.3 3 | Gemini-1.5-pro,96.1 4 | Jamba-1.5-mini,94.8 5 | GPT-4-1106-preview,94.1 6 | Llama-3.1-70b,93.7 7 | Command-R-plus-0824,92.4 8 | Qwen2-72b,92.3 9 | Command-R-plus,92.1 10 | Command-R-0824,91.9 11 | GLM4-9b,91.7 12 | Llama3.1-8b,91.3 13 | Command-R,91.1 14 | MegaBeam-Mistral,91.0 15 | Mistral-Large,90.4 16 | GradientAI/Llama3-70b,90.3 17 | Mixtral-8x22B,90.3 18 | Yi-34b,90.1 19 | Phi3-mini,88.7 20 | Phi3-medium,88.3 21 | Mixtral-8x7B,87.9 22 | GradientAI/Llama3-8b,86.3 23 | FILM-7B,84.7 24 | InternLM2.5-7b,83.9 25 | Mistral-7b,81.2 26 | Mistral-Nemo,77.8 27 | GLM3-6b,77.2 28 | LWM,75.7 29 | DBRX,74.7 30 | Qwen1.5-72b,74.0 31 | Together-7b,66.7 32 | LongChat-7b,65.2 33 | LongAlpaca-13b,47.9 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/tablebench_241002_tables.csv: -------------------------------------------------------------------------------- 1 | model,tablebench_overall_dp 2 | Llama2-7B-Chat,16.98 3 | CodeLlama-7B-Instruct,17.01 4 | Gemma-7B-Instruct,14.82 5 | Mistral-7B-Instruct,19.15 6 | Deepseek-Coder-7B-Instruct,13.82 7 | CodeQwen1.5-7B-Chat,16.76 8 | Qwen1.5-7B-Chat,15.84 9 | Qwen2-7B-Instruct,21.23 10 | StructLM-7B,12.06 11 | MAP-Neo-7B-Instruct,12.66 12 | Llama3-8B-Chat,27.28 13 | Llama3.1-8B-Instruct,23.47 14 | Llama2-13B-Chat,18.58 15 | StructLM-13B,11.52 16 | WizardLM-13B,20.8 17 | Qwen1.5-14B-Chat,17.76 18 | Qwen1.5-32B-Chat,20.21 19 | Deepseek-Coder-33B-Instruct,9.74 20 | CodeLlama-34B-Instruct,21.6 21 | 
StructLM-34B,0.6 22 | Mixtral-8x7B-Instruct,24.98 23 | Qwen1.5-72B-Chat,28.45 24 | Qwen2-72B-Instruct,32.52 25 | Qwen1.5-110B-Chat,29.72 26 | Llama3-70B-Chat,30.91 27 | Llama3.1-70B-Instruct,33.63 28 | GPT-3.5-Turbo,27.75 29 | Qwen-Max,29.63 30 | Yi-Large,32.43 31 | GLM-4,31.23 32 | Deepseek-Chat-V2,40.65 33 | Deepseek-Coder-V2,35.21 34 | GPT-4-Turbo,40.38 35 | GPT-4o,42.73 36 | TableLLM-CodeQwen-7B,26.08 37 | TableLLM-Deepseek-Coder-7B,27.98 38 | TableLLM-Llama3.1-8B,27.19 39 | TableLLM-Llama3-8B,26.93 40 | TableLLM-Qwen2-7B,27.14 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/toolbench_240829_tools.csv: -------------------------------------------------------------------------------- 1 | model,Toolbench 2 | gpt4,68.8 3 | text-davinci-003,67.2 4 | gpt-3.5-turbo,56.6 5 | text-curie-001,10.6 6 | Llama-2-70b,61 7 | Llama-2-13b,48.8 8 | Llama-2-7b,39.5 9 | llama-65b,55.6 10 | llama-30b,49.6 11 | llama-13b,36.8 12 | llama-13b-alpaca,26.9 13 | CodeLlama-7b-hf,48.3 14 | CodeLlama-7b-Instruct-hf,50.5 15 | CodeLlama-7b-Python-hf,52.2 16 | CodeLlama-13b-hf,56.9 17 | CodeLlama-13b-Instruct-hf,60.5 18 | CodeLlama-13b-Python-hf,56.3 19 | CodeLlama-34b-hf,62.9 20 | CodeLlama-34b-Instruct-hf,64.8 21 | CodeLlama-34b-Python-hf,59.2 22 | starcoder,49.7 23 | starcoderbase,52.2 24 | codegen-16B-nl,28.2 25 | codegen-16B-multi,28.8 26 | codegen-16B-mono,35.6 27 | bloomz,27.8 28 | opt-iml-30b,14.1 29 | opt-30b,13.4 30 | opt-iml-1.3b,7 31 | opt-1.3b,7.5 32 | neox-20b,26.4 33 | GPT-NeoXT-Chat-Base-20B,22.6 34 | pythia-12b,19.5 35 | dolly-v2-12b,5 36 | pythia-6.9b,19.4 37 | pythia-2.8b,18.6 38 | pythia-1.4b,15.9 39 | stablelm-base-alpha-7b,10.8 40 | stablelm-tuned-alpha-7b,9.2 41 | stablelm-base-alpha-3b,5.2 42 | stablelm-tuned-alpha-3b,6.6 43 | llama-30b-toolbench,50.2 44 | starcoder-toolbench,51.7 45 | codegen-16B-mono-toolbench,51.6 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks/wildbench_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,WB-Score 2 | gpt-4o-2024-05-13,59.3 3 | Claude_3.5_Sonnet,54.7 4 | Gemini_1.5_Pro,53 5 | gpt-4-turbo-2024-04-09,55.2 6 | Yi-Large-Preview,55.3 7 | DeepSeek-V2-Chat_0628_API,54 8 | gpt-4-0125-preview,52.3 9 | Claude_3_Opus,51.7 10 | Gemini_1.5_Flash,48.9 11 | Llama-3-70B-Instruct,47.8 12 | DeepSeek-V2-Coder_0614_API,45.7 13 | Yi-Large,48.9 14 | Athene-70B,59.5 15 | Nemotron-4-340B-Inst,47.7 16 | Gemma-2-27B-it,48.5 17 | Mistral-Large-2,55.6 18 | Claude_3_Sonnet,45.5 19 | gpt-4o-mini-2024-07-18,57.1 20 | Qwen2-72B-Instruct,44.5 21 | Reka_Core,45.9 22 | gemma-2-9b-it-SimPO,53.3 23 | gemma-2-9b-it-DPO,53.2 24 | Yi-1.5-34B-Chat,45.6 25 | Claude_3_Haiku,38.9 26 | Mistral-Nemo-Inst_12B,44.4 27 | Mistral-Large,38.9 28 | Gemma-2-9B-it,42.7 29 | Command-R-Plus,36.8 30 | GLM-4-9B-Chat,39.1 31 | Magpie-8B-Align-v0.1,39.3 32 | Yi-1.5-9B-Chat,38.7 33 | Llama3-Inst-8B-SimPO,37 34 | Llama3-Inst-8B-SimPO-v0.2,37.2 35 | Qwen1.5-72B-Chat,39.9 36 | Llama3-Inst-8B-SimPO-ExPO,35 37 | SELM_Llama3-8B-Inst-iter3,35.3 38 | Phi-3-medium-128k,27.3 39 | Llama-3-8B-Instruct,29.2 40 | Hermes-2-Theta-Llama-3-8B,29.6 41 | Starling-LM-7B-beta-ExPO,31.6 42 | SELM_Zephyr-7B-iter3,25.1 43 | Reka_Flash,30.4 44 | Gemma-2-2B-it,27.8 45 | gpt-3.5-turbo-0125,30 46 | DBRX_Instruct,32.6 47 | Neo-7B-Instruct-ExPO,23.1 48 | Neo-7B-Instruct,25 49 | StarlingLM-7B-beta,30.2 50 | Command-R,29.5 51 | Mixtral-8x7B-Instruct,31.5 52 | 
Yi-1.5-6B-Chat,23.3 53 | Tulu-2-dpo-70b,28 54 | Reka_Edge,21.3 55 | Mistral-7B-Instruct-v0.2,25.6 56 | Llama-2-70B-chat,20.7 57 | Qwen1.5-7B-Chat,23.4 58 | Hermes-2-Mixtral-8x7B-DPO,30.7 59 | Phi-3-mini-128k,24.7 60 | Gemma-7B-it,6.6 61 | Llama-2-7B-chat,8.3 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/arena_hard_2404.csv: -------------------------------------------------------------------------------- 1 | model,score,scenario,source,aggragated_from,tag 2 | gpt_4_turbo_2024_04_09,82.6,arena_hard,arena_hard_2404,[],holistic 3 | gpt_4_0125_preview,78.0,arena_hard,arena_hard_2404,[],holistic 4 | gemini_1.5_pro_api_preview,72.0,arena_hard,arena_hard_2404,[],holistic 5 | yi_large,63.7,arena_hard,arena_hard_2404,[],holistic 6 | claude_3_opus_20240229,60.4,arena_hard,arena_hard_2404,[],holistic 7 | glm_4,55.7,arena_hard,arena_hard_2404,[],holistic 8 | gpt_4_0314,50.0,arena_hard,arena_hard_2404,[],holistic 9 | gemini_1.5_flash_api_preview,49.6,arena_hard,arena_hard_2404,[],holistic 10 | claude_3_sonnet_20240229,46.8,arena_hard,arena_hard_2404,[],holistic 11 | claude_3_haiku_20240307,41.5,arena_hard,arena_hard_2404,[],holistic 12 | llama_3_70b_chat,41.1,arena_hard,arena_hard_2404,[],holistic 13 | gpt_4_0613,37.9,arena_hard,arena_hard_2404,[],holistic 14 | mistral_large_2402,37.7,arena_hard,arena_hard_2404,[],holistic 15 | mixtral_8x22b_instruct_v0.1,36.4,arena_hard,arena_hard_2404,[],holistic 16 | qwen1.5_72b_chat,36.1,arena_hard,arena_hard_2404,[],holistic 17 | command_r_plus,33.1,arena_hard,arena_hard_2404,[],holistic 18 | mistral_medium,31.9,arena_hard,arena_hard_2404,[],holistic 19 | mistral_next,27.4,arena_hard,arena_hard_2404,[],holistic 20 | gpt_3.5_turbo_0613,24.8,arena_hard,arena_hard_2404,[],holistic 21 | claude_2.0,24.0,arena_hard,arena_hard_2404,[],holistic 22 | dbrx_instruct,23.9,arena_hard,arena_hard_2404,[],holistic 23 | mixtral_8x7b_instruct_v0.1,23.4,arena_hard,arena_hard_2404,[],holistic 24 | gpt_3.5_turbo_0125,23.3,arena_hard,arena_hard_2404,[],holistic 25 | yi_34b_chat,23.1,arena_hard,arena_hard_2404,[],holistic 26 | starling_lm_7b_beta,23.0,arena_hard,arena_hard_2404,[],holistic 27 | claude_2.1,22.8,arena_hard,arena_hard_2404,[],holistic 28 | snorkel_mistral_pairrm_dpo,20.7,arena_hard,arena_hard_2404,[],holistic 29 | llama_3_8b_chat,20.6,arena_hard,arena_hard_2404,[],holistic 30 | gpt_3.5_turbo_1106,18.9,arena_hard,arena_hard_2404,[],holistic 31 | gpt_3.5_turbo_0301,18.1,arena_hard,arena_hard_2404,[],holistic 32 | gemini_1.0_pro,17.8,arena_hard,arena_hard_2404,[],holistic 33 | snowflake_arctic_instruct,17.6,arena_hard,arena_hard_2404,[],holistic 34 | command_r,17.0,arena_hard,arena_hard_2404,[],holistic 35 | phi_3_mini_128k_instruct,15.4,arena_hard,arena_hard_2404,[],holistic 36 | tulu_2_dpo_70b,15.0,arena_hard,arena_hard_2404,[],holistic 37 | starling_lm_7b_alpha,12.8,arena_hard,arena_hard_2404,[],holistic 38 | mistral_7b_instruct,12.6,arena_hard,arena_hard_2404,[],holistic 39 | gemma_1.1_7b_it,12.1,arena_hard,arena_hard_2404,[],holistic 40 | llama_2_70b_chat,11.6,arena_hard,arena_hard_2404,[],holistic 41 | vicuna_33b_v1.3,8.6,arena_hard,arena_hard_2404,[],holistic 42 | gemma_7b_it,7.5,arena_hard,arena_hard_2404,[],holistic 43 | llama_2_7b_chat,4.6,arena_hard,arena_hard_2404,[],holistic 44 | gemma_1.1_2b_it,3.4,arena_hard,arena_hard_2404,[],holistic 45 | gemma_2b_it,3.0,arena_hard,arena_hard_2404,[],holistic 46 | --------------------------------------------------------------------------------
/src/bat/assets/benchmarks_old/chatbot_arena_240829.csv: -------------------------------------------------------------------------------- 1 | model,arena_elo 2 | claude-3-5-sonnet-20240620,79.3 3 | gpt-4o-2024-05-13,79.2 4 | gpt-4-0125-preview,78.0 5 | gpt-4o-2024-08-06,77.9 6 | athene-70b,77.6 7 | gpt-4o-mini,74.9 8 | gemini-1.5-pro-api-preview,72.0 9 | mistral-large-2407,70.4 10 | llama-3.1-405b-instruct,64.1 11 | glm-4-0520,63.8 12 | yi-large,63.7 13 | deepseek-coder-v2,62.3 14 | claude-3-opus-20240229,60.4 15 | gemma-2-27b-it,57.5 16 | llama-3.1-70b-instruct,55.7 17 | glm-4-0116,55.7 18 | glm-4-air,50.9 19 | gpt-4-0314,50.0 20 | gemini-1.5-flash-api-preview,49.6 21 | qwen2-72b-instruct,46.9 22 | claude-3-sonnet-20240229,46.8 23 | llama-3-70b-instruct,46.6 24 | claude-3-haiku-20240307,41.5 25 | gpt-4-0613,37.9 26 | mistral-large-2402,37.7 27 | mixtral-8x22b-instruct-v0.1,36.4 28 | Qwen1.5-72B-Chat,36.1 29 | phi-3-medium-4k-instruct,33.4 30 | command-r-plus,33.1 31 | mistral-medium,31.9 32 | internlm2.5-20b-chat,31.2 33 | phi-3-small-8k-instruct,29.8 34 | mistral-next,27.4 35 | gpt-3.5-turbo-0613,24.8 36 | dbrx-instruct-preview,24.6 37 | internlm2-20b-chat,24.4 38 | claude-2.0,24.0 39 | Mixtral-8x7B-Instruct-v0.1,23.4 40 | gpt-3.5-turbo-0125,23.3 41 | Yi-34B-Chat,23.1 42 | Starling-LM-7B-beta,23.0 43 | claude-2.1,22.8 44 | llama-3.1-8b-instruct,21.3 45 | Snorkel-Mistral-PairRM-DPO,20.7 46 | llama-3-8b-instruct,20.6 47 | gpt-3.5-turbo-1106,18.9 48 | gpt-3.5-turbo-0301,18.1 49 | gemini-1.0-pro,17.8 50 | snowflake-arctic-instruct,17.6 51 | command-r,17.0 52 | phi-3-mini-128k-instruct,15.4 53 | tulu-2-dpo-70b,15.0 54 | Starling-LM-7B-alpha,12.8 55 | mistral-7b-instruct,12.6 56 | gemma-1.1-7b-it,12.1 57 | Llama-2-70b-chat-hf,11.6 58 | vicuna-33b-v1.3,8.6 59 | gemma-7b-it,7.5 60 | Llama-2-7b-chat-hf,4.6 61 | gemma-1.1-2b-it,3.4 62 | gemma-2b-it,3.0 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/helm_classic_240829.csv: -------------------------------------------------------------------------------- 1 | model,helm_classic,HELM_MMLU,HELM_BoolQ,HELM_NarrativeQA,HELM_NaturalQuestions(closed),HELM_NaturalQuestions(open),HELM_QuAC,HELM_HellaSwag,HELM_OpenbookQA,HELM_TruthfulQA,HELM_MS MARCO(regular),HELM_MS MARCO(TREC),HELM_CNN/DailyMail,XSUM,HELM_IMDB,HELM_CivilComments,HELM_RAFT 2 | Llama 2 (70B),0.944,0.582,0.886,0.77,0.458,0.674,0.484,-,0.554,-,-,-,-,-,0.961,0.652,0.727 3 | LLaMA (65B),0.908,0.584,0.871,0.755,0.431,0.672,0.401,-,0.508,-,-,-,-,-,0.962,0.655,0.702 4 | text-davinci-002,0.905,0.568,0.877,0.727,0.383,0.713,0.445,0.815,0.594,0.61,0.421,0.664,0.153,0.144,0.948,0.668,0.733 5 | Mistral v0.1 (7B),0.884,0.572,0.874,0.716,0.365,0.687,0.423,-,0.422,-,-,-,-,-,0.962,0.624,0.707 6 | Cohere Command beta (52.4B),0.874,0.452,0.856,0.752,0.372,0.76,0.432,0.811,0.582,0.269,0.472,0.762,0.161,0.152,0.96,0.601,0.667 7 | text-davinci-003,0.872,0.569,0.881,0.727,0.406,0.77,0.525,0.822,0.646,0.593,0.368,0.644,0.156,0.124,0.848,0.684,0.759 8 | Jurassic-2 Jumbo (178B),0.824,0.48,0.829,0.733,0.385,0.669,0.435,0.788,0.558,0.437,0.398,0.661,0.149,0.182,0.938,0.57,0.746 9 | Llama 2 (13B),0.823,0.507,0.811,0.744,0.376,0.637,0.424,-,0.33,-,-,-,-,-,0.962,0.588,0.707 10 | TNLG v2 (530B),0.787,0.469,0.809,0.722,0.384,0.642,0.39,0.799,0.562,0.251,0.377,0.643,0.161,0.169,0.941,0.601,0.679 11 | gpt-3.5-turbo-0613,0.783,0.391,0.87,0.625,0.348,0.675,0.485,-,0.339,-,-,-,-,-,0.943,0.696,0.748 12 | LLaMA 
(30B),0.781,0.531,0.861,0.752,0.408,0.666,0.39,-,0.344,-,-,-,-,-,0.927,0.549,0.752 13 | Anthropic-LM v4-s3 (52B),0.78,0.481,0.815,0.728,0.288,0.686,0.431,0.807,0.558,0.368,-,-0.154,0.134,0.934,0.61,0.699, 14 | gpt-3.5-turbo-0301,0.76,0.59,0.74,0.663,0.39,0.624,0.512,-,0.609,-,-,-,-,-,0.899,0.674,0.768 15 | Jurassic-2 Grande (17B),0.743,0.475,0.826,0.737,0.356,0.639,0.418,0.781,0.542,0.348,0.293,0.514,0.144,0.167,0.938,0.547,0.712 16 | Palmyra X (43B),0.732,0.609,0.896,0.742,0.413,-,0.473,-,0.616,-,-,0.049,0.149,0.935,0.008,0.701, 17 | Falcon (40B),0.729,0.509,0.819,0.673,0.392,0.675,0.307,-,0.353,-,-,-,-,-,0.959,0.552,0.661 18 | Falcon-Instruct (40B),0.727,0.497,0.829,0.625,0.377,0.666,0.371,-,0.384,-,-,-,-,-,0.959,0.603,0.586 19 | MPT-Instruct (30B),0.716,0.444,0.85,0.733,0.304,0.697,0.327,-,0.234,-,-,-,-,-,0.956,0.573,0.68 20 | MPT (30B),0.714,0.437,0.704,0.732,0.347,0.673,0.393,-,0.231,-,-,-,-,-,0.959,0.599,0.723 21 | J1-Grande v2 beta (17B),0.706,0.445,0.812,0.725,0.337,0.625,0.392,0.764,0.56,0.306,0.285,0.46,0.146,0.152,0.957,0.546,0.679 22 | Vicuna v1.3 (13B),0.706,0.462,0.808,0.691,0.346,0.686,0.403,-,0.385,-,-,-,-,-,0.762,0.645,0.657 23 | Cohere Command beta (6.1B),0.675,0.406,0.798,0.709,0.229,0.717,0.375,0.752,0.55,0.203,0.434,0.709,0.153,0.122,0.961,0.54,0.634 24 | Cohere xlarge v20221108 (52.4B),0.664,0.382,0.762,0.672,0.361,0.628,0.374,0.81,0.588,0.169,0.315,0.55,0.153,0.153,0.956,0.524,0.624 25 | Luminous Supreme (70B),0.662,0.38,0.775,0.711,0.293,0.649,0.37,-,0.222,-,-,0.15,0.136,0.959,0.562,0.653, 26 | Vicuna v1.3 (7B),0.625,0.434,0.76,0.643,0.287,0.634,0.392,-,0.292,-,-,-,-,-,0.916,0.62,0.693 27 | OPT (175B),0.609,0.318,0.793,0.671,0.297,0.615,0.36,0.791,0.586,0.25,0.288,0.448,0.146,0.155,0.947,0.505,0.606 28 | Llama 2 (7B),0.607,0.431,0.762,0.691,0.337,0.611,0.406,-,0.272,-,-,-,-,-,0.907,0.562,0.643 29 | LLaMA (13B),0.595,0.422,0.714,0.711,0.346,0.614,0.347,-,0.324,-,-,-,-,-,0.928,0.6,0.643 30 | InstructPalmyra (30B),0.568,0.403,0.751,0.496,0.33,0.682,0.433,-,0.185,-,-,0.152,0.104,0.94,0.555,0.652, 31 | Cohere xlarge v20220609 (52.4B),0.56,0.353,0.718,0.65,0.312,0.595,0.361,0.811,0.55,0.198,0.273,0.459,0.144,0.129,0.956,0.532,0.633 32 | Jurassic-2 Large (7.5B),0.553,0.339,0.742,-,0.274,0.589,-,0.729,0.53,0.245,0.247,0.464,0.136,0.142,0.956,0.57,0.622 33 | davinci (175B),0.538,0.422,0.722,0.687,0.329,0.625,0.36,0.775,0.586,0.194,0.211,0.378,0.127,0.126,0.933,0.532,0.642 34 | LLaMA (7B),0.533,0.321,0.756,0.669,0.297,0.589,0.338,-,0.28,-,-,-,-,-,0.947,0.563,0.573 35 | RedPajama-INCITE-Instruct (7B),0.524,0.363,0.705,0.638,0.232,0.659,0.26,-,0.243,-,-,-,-,-,0.927,0.664,0.695 36 | J1-Jumbo v1 (178B),0.517,0.259,0.776,0.695,0.293,0.595,0.358,0.765,0.534,0.175,0.21,0.363,0.144,0.129,0.943,0.553,0.681 37 | GLM (130B),0.512,0.344,0.784,0.706,0.148,0.642,0.272,-,0.218,-,-,0.154,0.132,0.955,0.5,0.598, 38 | Luminous Extended (30B),0.485,0.321,0.767,0.665,0.254,0.609,0.349,-,0.221,-,-,0.139,0.124,0.947,0.524,0.523, 39 | OPT (66B),0.448,0.276,0.76,0.638,0.258,0.596,0.357,0.745,0.534,0.201,0.237,0.482,0.136,0.126,0.917,0.506,0.557 40 | BLOOM (176B),0.446,0.299,0.704,0.662,0.216,0.621,0.361,0.744,0.534,0.205,0.236,0.386,0.08,0.03,0.945,0.62,0.592 41 | J1-Grande v1 (17B),0.433,0.27,0.722,0.672,0.233,0.578,0.362,0.739,0.52,0.193,0.161,0.341,0.143,0.122,0.953,0.529,0.658 42 | Alpaca (7B),0.381,0.385,0.778,0.396,0.266,0.592,0.27,-,0.243,-,-,-,-,-,0.738,0.566,0.486 43 | Falcon (7B),0.378,0.286,0.753,0.621,0.285,0.579,0.332,-,0.234,-,-,-,-,-,0.836,0.514,0.602 44 | RedPajama-INCITE-Base 
(7B),0.378,0.302,0.713,0.617,0.25,0.586,0.336,-,0.205,-,-,-,-,-,0.752,0.547,0.648 45 | Cohere large v20220720 (13.1B),0.372,0.324,0.725,0.625,0.232,0.573,0.338,0.736,0.542,0.181,0.19,0.33,0.126,0.108,0.933,0.507,0.596 46 | RedPajama-INCITE-Instruct-v1 (3B),0.366,0.257,0.677,0.638,0.203,0.637,0.259,-,0.208,-,-,-,-,-,0.894,0.549,0.661 47 | text-curie-001,0.36,0.237,0.62,0.582,0.175,0.571,0.358,0.676,0.514,0.257,0.271,0.507,0.152,0.076,0.923,0.537,0.489 48 | GPT-NeoX (20B),0.351,0.276,0.683,0.599,0.193,0.596,0.326,0.718,0.524,0.216,0.184,0.398,0.123,0.102,0.948,0.516,0.505 49 | Luminous Base (13B),0.315,0.27,0.719,0.605,0.202,0.568,0.334,-,0.182,-,-,0.11,0.105,0.939,0.544,0.473, 50 | Cohere medium v20221108 (6.1B),0.312,0.254,0.7,0.61,0.199,0.517,0.314,0.726,0.538,0.215,0.175,0.373,0.121,0.099,0.935,0.5,0.591 51 | RedPajama-INCITE-Base-v1 (3B),0.311,0.263,0.685,0.555,0.207,0.52,0.309,-,0.277,-,-,-,-,-,0.907,0.549,0.502 52 | TNLG v2 (6.7B),0.309,0.242,0.698,0.631,0.21,0.561,0.345,0.704,0.478,0.167,0.158,0.332,0.146,0.11,0.927,0.532,0.525 53 | J1-Large v1 (7.5B),0.285,0.241,0.683,0.623,0.19,0.532,0.328,0.7,0.514,0.197,0.147,0.292,0.134,0.102,0.956,0.532,0.545 54 | GPT-J (6B),0.273,0.249,0.649,0.545,0.156,0.559,0.33,0.663,0.514,0.199,0.152,0.345,0.131,0.096,0.939,0.52,0.619 55 | Pythia (12B),0.257,0.274,0.662,0.596,0.175,0.581,0.313,-,0.177,-,-,-,-,-,0.931,0.531,0.514 56 | curie (6.7B),0.247,0.243,0.656,0.604,0.199,0.552,0.321,0.682,0.502,0.232,0.162,0.3,0.113,0.091,0.889,0.539,0.49 57 | Falcon-Instruct (7B),0.244,0.275,0.72,0.476,0.194,0.449,0.311,-,0.213,-,-,-,-,-,0.852,0.511,0.523 58 | Cohere medium v20220720 (6.1B),0.23,0.279,0.659,0.559,0.177,0.504,0.279,0.706,0.496,0.19,0.152,0.374,0.077,0.087,0.935,0.504,0.52 59 | text-babbage-001,0.229,0.229,0.451,0.429,0.07,0.33,0.284,0.561,0.452,0.233,0.208,0.449,0.151,0.046,0.913,0.499,0.509 60 | T0pp (11B),0.197,0.407,0.0,0.151,0.039,0.19,0.121,-,0.377,-,-,0.122,0.09,0.207,0.234,0.118, 61 | Pythia (6.9B),0.196,0.236,0.631,0.528,0.142,0.539,0.296,-,0.213,-,-,-,-,-,0.928,0.511,0.502 62 | UL2 (20B),0.167,0.291,0.746,0.083,0.204,0.349,0.144,-,0.193,-,-,0.03,0.058,0.337,0.521,0.404, 63 | T5 (11B),0.131,0.29,0.761,0.086,0.194,0.477,0.116,-,0.133,-,-,0.043,0.015,0.379,0.509,0.37, 64 | babbage (1.3B),0.114,0.235,0.574,0.491,0.119,0.451,0.273,0.555,0.438,0.188,0.122,0.317,0.079,0.045,0.597,0.519,0.455 65 | Cohere small v20220720 (410M),0.109,0.264,0.457,0.294,0.078,0.309,0.219,0.483,0.348,0.217,-,0.304,0.063,0.033,0.578,0.501,0.492 66 | ada (350M),0.108,0.243,0.581,0.326,0.082,0.365,0.242,0.435,0.38,0.215,0.102,0.29,0.09,0.022,0.849,0.517,0.423 67 | text-ada-001,0.107,0.238,0.464,0.238,0.025,0.149,0.176,0.429,0.346,0.232,0.134,0.302,0.136,0.034,0.822,0.503,0.406 68 | YaLM (100B),0.075,0.243,0.634,0.252,0.068,0.227,0.162,-,0.202,-,-,0.017,0.021,0.836,0.49,0.395, -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/helm_lite_240829.csv: -------------------------------------------------------------------------------- 1 | model,helm_lite,HELM_Lite_NarrativeQA,HELM_Lite_NaturalQuestions(open),HELM_Lite_NaturalQuestions(closed),HELM_Lite_OpenbookQA,HELM_Lite_MMLU,HELM_Lite_MATH-Equivalent(CoT),HELM_Lite_GSM8K,HELM_Lite_LegalBench,HELM_Lite_MedQA,HELM_Lite_WMT2014 2 | GPT-4o (2024-05-13),0.963,0.804,0.803,0.501,0.966,0.748,0.829,0.905,0.733,0.857,0.231 3 | Claude 3.5 Sonnet (20240620),0.915,0.746,0.749,0.502,0.972,0.799,0.813,0.949,0.707,0.825,0.229 4 | GPT-4 
(0613),0.915,0.768,0.79,0.457,0.96,0.735,0.802,0.932,0.713,0.815,0.211 5 | GPT-4 Turbo (2024-04-09),0.908,0.761,0.795,0.482,0.97,0.711,0.833,0.824,0.727,0.783,0.218 6 | Llama 3.1 Instruct Turbo (405B),0.896,0.749,0.756,0.456,0.94,0.759,0.827,0.949,0.707,0.805,0.238 7 | Llama 3.1 Instruct Turbo (70B),0.858,0.772,0.738,0.452,0.938,0.709,0.783,0.938,0.687,0.769,0.223 8 | Llama 3 (70B),0.838,0.798,0.743,0.475,0.934,0.695,0.663,0.805,0.733,0.777,0.225 9 | Qwen2 Instruct (72B),0.827,0.727,0.776,0.39,0.954,0.769,0.79,0.92,0.712,0.746,0.207 10 | Mistral Large 2 (2407),0.803,0.779,0.734,0.453,0.932,0.725,0.677,0.912,0.646,0.775,0.192 11 | Gemini 1.5 Pro (001),0.793,0.783,0.748,0.378,0.902,0.772,0.825,0.836,0.757,0.692,0.189 12 | GPT-4o mini (2024-07-18),0.776,0.768,0.746,0.386,0.92,0.668,0.802,0.843,0.653,0.748,0.206 13 | Mixtral (8x22B),0.767,0.779,0.726,0.478,0.882,0.701,0.656,0.8,0.708,0.704,0.209 14 | GPT-4 Turbo (1106 preview),0.758,0.727,0.763,0.435,0.95,0.699,0.857,0.668,0.626,0.817,0.205 15 | Palmyra X V3 (72B),0.749,0.706,0.685,0.407,0.938,0.702,0.723,0.831,0.709,0.684,0.262 16 | Gemma 2 Instruct (27B),0.742,0.79,0.731,0.353,0.918,0.664,0.746,0.812,0.7,0.684,0.214 17 | Gemini 1.5 Flash (001),0.733,0.783,0.723,0.332,0.928,0.703,0.753,0.785,0.661,0.68,0.225 18 | Claude 3 Opus (20240229),0.722,0.351,0.264,0.441,0.956,0.768,0.76,0.924,0.662,0.775,0.24 19 | PaLM-2 (Unicorn),0.703,0.583,0.674,0.435,0.938,0.702,0.674,0.831,0.677,0.684,0.26 20 | Qwen1.5 (72B),0.68,0.601,0.758,0.417,0.93,0.647,0.683,0.799,0.694,0.67,0.201 21 | Palmyra X V2 (33B),0.659,0.752,0.752,0.428,0.878,0.621,0.58,0.735,0.644,0.598,0.239 22 | Gemma 2 Instruct (9B),0.639,0.768,0.738,0.328,0.91,0.645,0.724,0.762,0.639,0.63,0.201 23 | Yi (34B),0.634,0.782,0.775,0.443,0.92,0.65,0.375,0.648,0.618,0.656,0.172 24 | Qwen1.5 Chat (110B),0.619,0.721,0.739,0.35,0.922,0.704,0.568,0.815,0.624,0.64,0.192 25 | Qwen1.5 (32B),0.615,0.589,0.777,0.353,0.932,0.628,0.733,0.773,0.636,0.656,0.193 26 | Claude v1.3,0.594,0.723,0.699,0.409,0.908,0.631,0.54,0.784,0.629,0.618,0.219 27 | PaLM-2 (Bison),0.584,0.718,0.813,0.39,0.878,0.608,0.421,0.61,0.645,0.547,0.241 28 | Mixtral (8x7B 32K seqlen),0.582,0.767,0.699,0.427,0.868,0.649,0.494,0.622,0.63,0.652,0.19 29 | Phi-3 (14B),0.579,0.724,0.729,0.278,0.916,0.675,0.611,0.878,0.593,0.696,0.17 30 | Claude 2.0,0.56,0.718,0.67,0.428,0.862,0.639,0.603,0.583,0.643,0.652,0.219 31 | DeepSeek LLM Chat (67B),0.556,0.581,0.733,0.412,0.88,0.641,0.615,0.795,0.637,0.628,0.186 32 | Phi-3 (7B),0.545,0.754,0.675,0.324,0.912,0.659,0.703,-,0.584,0.672,0.154 33 | Llama 2 (70B),0.537,0.763,0.674,0.46,0.838,0.58,0.323,0.567,0.673,0.618,0.196 34 | Yi Large (Preview),0.53,0.373,0.586,0.428,0.946,0.712,0.712,0.69,0.519,0.66,0.176 35 | Command R Plus,0.509,0.735,0.711,0.343,0.828,0.59,0.403,0.738,0.672,0.567,0.203 36 | GPT-3.5 (text-davinci-003),0.503,0.731,0.77,0.413,0.828,0.555,0.449,0.615,0.622,0.531,0.191 37 | Claude 2.1,0.503,0.677,0.611,0.375,0.872,0.643,0.632,0.604,0.643,0.644,0.204 38 | Qwen1.5 (14B),0.491,0.711,0.772,0.3,0.862,0.626,0.686,0.693,0.593,0.515,0.178 39 | Gemini 1.0 Pro (002),0.484,0.751,0.714,0.391,0.788,0.534,0.665,0.816,0.475,0.483,0.194 40 | Claude Instant 1.2,0.464,0.616,0.731,0.343,0.844,0.631,0.499,0.721,0.586,0.559,0.194 41 | Llama 3 (8B),0.441,0.754,0.681,0.378,0.766,0.602,0.391,0.499,0.637,0.581,0.183 42 | GPT-3.5 Turbo (0613),0.42,0.655,0.678,0.335,0.838,0.614,0.667,0.501,0.528,0.622,0.187 43 | Claude 3 Sonnet (20240229),0.42,0.111,0.072,0.028,0.918,0.652,0.084,0.907,0.49,0.684,0.218 44 | Mistral NeMo 
(2402),0.401,0.731,0.65,0.265,0.822,0.604,0.668,0.782,0.415,0.59,0.177 45 | Arctic Instruct,0.399,0.654,0.586,0.39,0.828,0.575,0.519,0.768,0.588,0.581,0.172 46 | Gemma (7B),0.392,0.752,0.665,0.336,0.808,0.571,0.5,0.559,0.581,0.513,0.187 47 | GPT-3.5 (text-davinci-002),0.392,0.719,0.71,0.394,0.796,0.568,0.428,0.479,0.58,0.525,0.174 48 | LLaMA (65B),0.39,0.755,0.672,0.433,0.754,0.584,0.257,0.489,0.48,0.507,0.189 49 | Mistral Large (2402),0.382,0.454,0.485,0.311,0.894,0.638,0.75,0.694,0.479,0.499,0.182 50 | Command,0.365,0.749,0.777,0.391,0.774,0.525,0.236,0.452,0.578,0.445,0.088 51 | Command R,0.35,0.742,0.72,0.352,0.782,0.567,0.266,0.551,0.507,0.555,0.149 52 | Llama 3.1 Instruct Turbo (8B),0.347,0.756,0.677,0.209,0.74,0.5,0.703,0.798,0.342,0.245,0.181 53 | Mistral Small (2402),0.342,0.519,0.587,0.304,0.862,0.593,0.621,0.734,0.389,0.616,0.169 54 | DBRX Instruct,0.341,0.488,0.55,0.284,0.91,0.643,0.358,0.671,0.426,0.694,0.131 55 | Jamba Instruct,0.339,0.658,0.636,0.384,0.796,0.582,0.38,0.67,0.54,0.519,0.164 56 | Mistral v0.1 (7B),0.338,0.716,0.687,0.367,0.776,0.584,0.297,0.377,0.58,0.525,0.16 57 | Mistral Medium (2312),0.318,0.449,0.468,0.29,0.83,0.618,0.565,0.706,0.452,0.61,0.169 58 | Qwen1.5 (7B),0.317,0.448,0.749,0.27,0.806,0.569,0.561,0.6,0.523,0.479,0.153 59 | Claude 3 Haiku (20240307),0.309,0.244,0.252,0.144,0.838,0.662,0.131,0.699,0.46,0.702,0.148 60 | Yi (6B),0.289,0.702,0.748,0.31,0.8,0.53,0.126,0.375,0.519,0.497,0.117 61 | Llama 2 (13B),0.273,0.741,0.64,0.371,0.634,0.505,0.102,0.266,0.591,0.392,0.167 62 | Jurassic-2 Jumbo (178B),0.254,0.728,0.65,0.385,0.688,0.483,0.103,0.239,0.533,0.431,0.114 63 | Falcon (40B),0.249,0.671,0.676,0.392,0.662,0.507,0.128,0.267,0.442,0.419,0.162 64 | Mistral Instruct v0.3 (7B),0.233,0.716,0.68,0.253,0.79,0.51,0.289,0.538,0.331,0.517,0.142 65 | Jurassic-2 Grande (17B),0.203,0.744,0.627,0.35,0.614,0.471,0.064,0.159,0.468,0.39,0.102 66 | Phi-2,0.202,0.703,0.68,0.155,0.798,0.518,0.255,0.581,0.334,0.41,0.038 67 | Llama 2 (7B),0.18,0.686,0.612,0.333,0.544,0.425,0.097,0.154,0.502,0.392,0.144 68 | Luminous Supreme (70B),0.172,0.743,0.656,0.299,0.284,0.316,0.078,0.137,0.452,0.276,0.102 69 | Command Light,0.125,0.629,0.686,0.195,0.398,0.386,0.098,0.149,0.397,0.312,0.023 70 | Luminous Extended (30B),0.093,0.684,0.611,0.253,0.272,0.248,0.04,0.075,0.421,0.276,0.083 71 | Falcon (7B),0.078,0.621,0.58,0.285,0.26,0.288,0.044,0.055,0.346,0.254,0.094 72 | OLMo (7B),0.063,0.597,0.603,0.259,0.222,0.305,0.029,0.044,0.341,0.229,0.097 73 | Luminous Base (13B),0.052,0.633,0.577,0.197,0.286,0.243,0.026,0.028,0.332,0.26,0.066 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/llm_trustworthy_241001_safety.csv: -------------------------------------------------------------------------------- 1 | model,trustworthy_average,trustworthy_Non-toxicity,trustworthy_Non-Stereotype,trustworthy_AdvGLUE_PP,trustworthy_OoD,trustworthy_Adv_Demo,trustworthy_Privacy,trustworthy_Ethics,trustworthy_Fairness 2 | google/gemma-2b-it,67.18,77.07,73.33,43.21,51.43,35.55,88.77,75.03,93.02 3 | google/gemma-7b-it,66.87,75.52,100,43.43,61.78,33.33,83.69,43.33,93.88 4 | lmsys/vicuna-7b-v1.3,60.62,28,81,52.16,59.1,57.99,72.96,48.22,85.53 5 | meta-llama/Llama-2-7b-chat-hf,74.72,80,97.6,51.01,75.65,55.54,97.39,40.58,100 6 | meta-llama/Meta-Llama-3-8B-Instruct,80.61,77.53,98.33,67.28,70.85,75.54,81.59,93.74,80.05 7 | mosaicml/mpt-7b-chat,62.29,40,84.6,46.2,64.26,58.25,78.93,26.11,100 8 | 
openai/gpt-3.5-turbo-0301,72.45,47,87,56.69,73.58,81.28,70.13,86.38,77.57 9 | openai/gpt-4-0314,69.24,41,77,64.04,87.55,77.94,66.11,76.6,63.67 10 | openai/gpt-4o-2024-05-13,82.96,86.46,99.67,51.36,86.59,88.1,97.04,92.02,62.47 11 | openai/gpt-4o-mini-2024-07-18,76.31,59.02,87.34,50.25,79.07,88.49,89.38,87.2,69.74 12 | tiiuae/falcon-7b-instruct,59.49,39,87,43.98,51.45,33.95,70.26,50.28,100 13 | togethercomputer/RedPajama-INCITE-7B-Instruct,56.58,18,73,44.81,54.21,58.51,76.64,27.49,100 14 | vertexai/gemini-pro-1.0,80.61,77.53,98.33,67.28,70.85,75.54,81.59,93.74,80.05 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/mixeval_240829_holistic.csv: -------------------------------------------------------------------------------- 1 | model,MixEval,MixEval-Hard,MixEval_TriviaQA,MixEval_MMLU,MixEval_DROP,MixEval_HellaSwag,MixEval_CommonsenseQA,MixEval_TriviaQA-Hard,MixEval_MMLU-Hard,MixEval_DROP-Hard 2 | LLaMA-3-70B,82.2,54.0,83.1,79.8,81.5,90.9,85.4,59.1,39.8,59.5 3 | Qwen1.5-72B,79.5,41.9,78.4,78.8,64.5,91.9,87.3,41.4,42.4,26.2 4 | Yi-34B,78.3,47.2,72.1,79.3,78.2,98.0,81.1,39.4,42.4,56.5 5 | Qwen1.5-32B,77.6,41.0,71.9,77.2,68.7,93.3,89.2,28.0,37.2,36.9 6 | Mixtral-8x7B,74.0,40.7,77.3,71.6,69.8,73.7,77.4,44.1,34.6,42.0 7 | LLaMA-2-70B,73.2,41.6,78.7,70.8,73.2,63.0,77.4,53.8,29.0,46.1 8 | Qwen1.5-MoE-A2.7B,70.2,33.5,71.3,69.4,59.9,80.1,80.2,36.0,30.7,31.0 9 | Qwen1.5-7B,68.2,33.7,61.4,67.0,63.6,83.8,84.4,31.6,28.6,29.8 10 | LLaMA-3-8B,65.1,31.7,65.2,69.5,63.8,51.5,69.8,22.6,38.5,37.1 11 | Mistral-7B,64.8,27.1,67.2,68.5,61.3,54.5,67.9,24.2,27.7,34.5 12 | Gemma-7B,64.7,32.7,66.0,67.4,63.8,36.0,68.4,31.1,28.1,31.4 13 | Yi-6B,63.1,30.4,54.7,71.2,51.4,77.4,76.4,17.0,37.2,19.4 14 | Qwen1.5-4B,58.2,23.5,47.8,59.6,51.0,65.7,79.2,14.0,22.9,24.7 15 | JetMoE-8B,57.1,27.0,53.4,55.3,44.1,89.2,60.4,22.8,27.3,19.2 16 | DeepSeek-7B,52.2,21.7,58.7,53.3,43.5,35.0,51.4,21.4,26.4,21.4 17 | Phi-2,51.9,21.9,37.0,62.5,50.4,20.2,68.9,7.3,29.0,27.1 18 | DeepSeekMoE-16B,51.4,24.2,64.2,49.9,41.1,28.6,48.6,24.9,30.7,12.2 19 | LLaMA-2-7B,43.1,22.1,55.5,40.8,37.6,24.9,30.7,19.5,24.7,14.9 20 | Gemma-2B,38.9,22.6,41.5,37.4,32.6,33.3,31.6,12.1,27.3,13.2 21 | OLMo-7B,31.8,21.2,38.4,29.7,24.0,26.9,25.5,16.0,25.1,11.1 22 | MPT-7B,30.8,17.4,33.5,30.9,26.8,19.2,28.8,6.6,24.2,9.2 23 | Claude 3.5 Sonnet-0620,89.9,68.1,92.6,84.2,93.7,94.6,85.4,73.3,58.4,80.4 24 | LLaMA-3.1-405B-Instruct,-,66.2,-,-,-,-,-,72.0,57.1,69.2 25 | GPT-4o-2024-05-13,87.9,64.7,88.0,85.4,87.9,94.3,86.8,70.3,57.1,67.5 26 | Claude 3 Opus,88.1,63.5,90.4,83.2,91.5,93.3,87.7,71.4,55.0,75.2 27 | GPT-4-Turbo-2024-04-09,88.8,62.6,91.2,82.8,91.0,92.6,85.4,73.1,45.5,71.0 28 | Gemini 1.5 Pro-API-0409,84.2,58.7,85.3,79.2,84.2,89.2,84.4,67.8,44.6,64.8 29 | Gemini 1.5 Pro-API-0514,84.8,58.3,83.7,84.0,82.5,91.2,82.5,59.4,54.5,55.2 30 | Mistral Large 2,86.1,57.4,88.2,81.9,89.3,80.1,81.6,64.8,42.9,72.0 31 | Yi-Large-preview,84.4,56.8,81.7,80.9,87.0,92.6,90.1,55.4,48.5,63.1 32 | LLaMA-3-70B-Instruct,84.0,55.9,83.1,80.5,90.1,81.8,83.0,60.5,46.3,74.5 33 | Qwen-Max-0428,86.1,55.8,86.7,80.6,85.4,93.6,88.2,61.5,41.6,53.5 34 | Claude 3 Sonnet,81.7,54.0,84.2,74.7,87.7,85.9,82.5,59.1,40.7,66.9 35 | Reka Core-20240415,83.3,52.9,82.8,79.3,88.1,88.6,81.6,51.6,46.3,66.6 36 | MAmmoTH2-8x7B-Plus,81.5,51.8,83.0,74.5,85.7,82.2,82.5,52.9,41.1,65.1 37 | DeepSeek-V2,83.7,51.7,84.4,77.3,85.3,88.2,84.0,51.7,42.0,62.8 38 | GPT-4o mini,84.2,51.6,83.1,82.3,87.7,83.8,84.9,45.3,45.0,68.1 39 | Command 
R+,81.5,51.4,83.3,78.9,80.4,83.5,82.1,57.5,42.0,65.0 40 | Yi-1.5-34B-Chat,81.7,51.2,78.4,76.4,87.0,90.2,86.8,44.4,38.1,67.4 41 | Mistral-Large,84.2,50.3,88.3,80.2,88.6,65.0,83.5,55.5,42.4,61.6 42 | Qwen1.5-72B-Chat,84.1,48.3,83.9,80.1,85.1,87.9,86.3,49.9,37.7,56.5 43 | Mistral-Medium,81.9,47.8,86.8,76.3,83.2,72.4,82.5,59.8,38.5,47.1 44 | Gemini 1.0 Pro,78.9,46.4,81.0,74.9,82.6,74.7,80.2,58.2,35.5,54.1 45 | Reka Flash-20240226,79.8,46.2,76.4,75.4,86.7,90.6,80.7,42.9,34.6,65.0 46 | Mistral-Small,81.2,46.2,85.1,75.2,86.1,73.4,77.8,56.0,33.8,52.6 47 | LLaMA-3-8B-Instruct,75.0,45.6,71.7,71.9,86.4,65.7,78.3,40.2,40.7,67.6 48 | Command R,77.0,45.2,80.9,75.0,72.0,75.8,77.4,57.0,39.0,42.0 49 | Qwen1.5-32B-Chat,81.0,43.3,75.7,78.0,82.9,85.9,88.2,39.1,29.9,54.4 50 | GPT-3.5-Turbo-0125,79.7,43.0,85.2,74.5,84.8,63.0,81.6,46.4,35.1,55.4 51 | Claude 3 Haiku,79.7,42.8,79.9,76.1,85.0,75.8,78.8,42.4,30.7,51.5 52 | Yi-34B-Chat,80.1,42.6,82.7,73.6,86.1,86.9,78.8,41.5,29.9,57.1 53 | Mixtral-8x7B-Instruct-v0.1,76.4,42.5,82.5,72.0,79.5,54.2,77.4,48.5,37.2,47.7 54 | Starling-LM-7B-beta,74.8,41.8,75.1,69.0,86.4,48.5,84.9,33.4,34.2,62.9 55 | Yi-1.5-9B-Chat,74.2,40.9,61.3,72.6,83.9,86.5,82.5,23.3,36.8,61.3 56 | Gemma-1.1-7B-IT,69.6,39.1,64.3,66.9,80.6,66.3,73.6,30.3,39.0,55.1 57 | Vicuna-33B-v1.3,66.3,38.7,79.2,59.2,71.4,30.3,61.8,42.5,39.4,36.6 58 | LLaMA-2-70B-Chat,74.6,38.0,80.0,69.8,79.8,67.3,74.1,42.2,27.7,42.2 59 | MAP-Neo-Instruct-v0.1,70.0,37.8,62.1,66.7,75.5,74.4,82.1,26.5,32.5,42.4 60 | Mistral-7B-Instruct-v0.2,70.0,36.2,73.7,67.3,72.8,54.2,66.0,33.5,29.4,44.3 61 | Qwen1.5-7B-Chat,71.4,35.5,64.1,68.7,76.4,76.1,82.1,29.0,29.0,50.0 62 | Reka Edge-20240208,68.5,32.2,60.0,63.6,80.0,74.7,80.7,18.6,26.4,56.9 63 | Zephyr-7B-β,69.1,31.6,74.7,64.9,77.3,39.1,69.3,30.2,24.2,45.3 64 | LLaMA-2-7B-Chat,61.7,30.8,68.8,59.4,69.3,35.7,61.3,24.8,30.3,44.3 65 | Yi-6B-Chat,65.6,30.1,66.1,65.4,70.5,52.5,69.8,18.9,26.8,43.7 66 | Qwen1.5-MoE-A2.7B-Chat,69.1,29.1,65.9,69.5,64.6,72.7,81.1,21.9,26.8,39.5 67 | Gemma-1.1-2B-IT,51.9,28.4,53.7,51.5,59.8,26.6,57.1,31.9,30.3,27.8 68 | Vicuna-7B-v1.5,60.3,27.8,66.4,58.7,68.3,24.9,62.7,25.9,23.4,33.2 69 | OLMo-7B-Instruct,55.0,26.7,51.7,57.1,53.1,55.9,64.6,24.7,27.3,22.9 70 | Qwen1.5-4B-Chat,57.2,24.6,46.0,61.4,57.2,54.9,74.1,16.5,17.3,28.6 71 | JetMoE-8B-Chat,51.6,24.3,46.8,58.5,27.0,86.2,68.4,19.2,25.5,11.5 72 | MPT-7B-Chat,43.8,23.8,50.2,37.8,50.0,25.6,36.3,17.5,24.7,31.0 73 | -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/mmlu_pro_240610.csv: -------------------------------------------------------------------------------- 1 | model,score,scenario,source,aggragated_from,tag 2 | llama_2_70b,0.3753,mmlu_pro,mmlu_pro_240610,[],knowledge 3 | llama_3_8b,0.3536,mmlu_pro,mmlu_pro_240610,[],knowledge 4 | deepseekmath_instruct,0.353,mmlu_pro,mmlu_pro_240610,[],knowledge 5 | gemma_7b,0.3373,mmlu_pro,mmlu_pro_240610,[],knowledge 6 | mistral_7b_v0.1,0.3088,mmlu_pro,mmlu_pro_240610,[],knowledge 7 | mistral_7b_instruct_v0.2,0.3084,mmlu_pro,mmlu_pro_240610,[],knowledge 8 | mistral_7b_v0.2,0.3043,mmlu_pro,mmlu_pro_240610,[],knowledge 9 | qwen1.5_7b_chat,0.2906,mmlu_pro,mmlu_pro_240610,[],knowledge 10 | yi_6b_chat,0.2884,mmlu_pro,mmlu_pro_240610,[],knowledge 11 | yi_6b,0.2651,mmlu_pro,mmlu_pro_240610,[],knowledge 12 | mistral_7b_instruct_v0.1,0.2575,mmlu_pro,mmlu_pro_240610,[],knowledge 13 | llama_2_13b,0.2534,mmlu_pro,mmlu_pro_240610,[],knowledge 14 | llemma_7b,0.2345,mmlu_pro,mmlu_pro_240610,[],knowledge 15 | 
llama_2_7b,0.2032,mmlu_pro,mmlu_pro_240610,[],knowledge 16 | gpt_4o,0.7255,mmlu_pro,mmlu_pro_240610,[],knowledge 17 | claude_3_opus,0.6845,mmlu_pro,mmlu_pro_240610,[],knowledge 18 | gpt_4_turbo,0.6371,mmlu_pro,mmlu_pro_240610,[],knowledge 19 | gemini_1.5_flash,0.5912,mmlu_pro,mmlu_pro_240610,[],knowledge 20 | yi_large,0.5753,mmlu_pro,mmlu_pro_240610,[],knowledge 21 | claude_3_sonnet,0.568,mmlu_pro,mmlu_pro_240610,[],knowledge 22 | llama_3_70b_instruct,0.562,mmlu_pro,mmlu_pro_240610,[],knowledge 23 | deepseek_v2,0.5481,mmlu_pro,mmlu_pro_240610,[],knowledge 24 | phi_3_medium_4k_instruct,0.5348,mmlu_pro,mmlu_pro_240610,[],knowledge 25 | llama_3_70b,0.5278,mmlu_pro,mmlu_pro_240610,[],knowledge 26 | qwen1.5_72b_chat,0.5162,mmlu_pro,mmlu_pro_240610,[],knowledge 27 | mammoth2_8x7b_plus,0.504,mmlu_pro,mmlu_pro_240610,[],knowledge 28 | qwen1.5_110b,0.4993,mmlu_pro,mmlu_pro_240610,[],knowledge 29 | mammoth2_8b_plus,0.4335,mmlu_pro,mmlu_pro_240610,[],knowledge 30 | mixtral_8x7b_instruct_v0.1,0.4327,mmlu_pro,mmlu_pro_240610,[],knowledge 31 | phi_3_mini_4k_instruct,0.4317,mmlu_pro,mmlu_pro_240610,[],knowledge 32 | yi_34b,0.4303,mmlu_pro,mmlu_pro_240610,[],knowledge 33 | mixtral_8x7b_v0.1,0.4103,mmlu_pro,mmlu_pro_240610,[],knowledge 34 | llama_3_8b_instruct,0.4098,mmlu_pro,mmlu_pro_240610,[],knowledge 35 | mammoth2_7b_plus,0.4085,mmlu_pro,mmlu_pro_240610,[],knowledge 36 | qwen1.5_14b_chat,0.3802,mmlu_pro,mmlu_pro_240610,[],knowledge 37 | c4ai_command_r_v01,0.379,mmlu_pro,mmlu_pro_240610,[],knowledge 38 | -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/olmes_260624.csv: -------------------------------------------------------------------------------- 1 | model,score,scenario,source,aggragated_from,tag 2 | pythia_1b,31.4,arc_c,olmes_260624,[],reasoning 3 | olmo_1b,38.6,arc_c,olmes_260624,[],reasoning 4 | tinyllama_1.1b,38.1,arc_c,olmes_260624,[],reasoning 5 | pythia_6.7b,44.6,arc_c,olmes_260624,[],reasoning 6 | rpj_incite_7b,45.3,arc_c,olmes_260624,[],reasoning 7 | stablelm2_1.6b,50.6,arc_c,olmes_260624,[],reasoning 8 | olmo_7b,46.4,arc_c,olmes_260624,[],reasoning 9 | mpt_7b,45.7,arc_c,olmes_260624,[],reasoning 10 | falcon_7b,49.7,arc_c,olmes_260624,[],reasoning 11 | llama2_7b,54.2,arc_c,olmes_260624,[],reasoning 12 | llama2_13b,67.3,arc_c,olmes_260624,[],reasoning 13 | olmo_1.7_7b,66.9,arc_c,olmes_260624,[],reasoning 14 | llama3_8b,79.3,arc_c,olmes_260624,[],reasoning 15 | mistral_7b_v0.1,78.6,arc_c,olmes_260624,[],reasoning 16 | llama3_70b,93.7,arc_c,olmes_260624,[],reasoning 17 | pythia_1b,63.4,arc_e,olmes_260624,[],reasoning 18 | olmo_1b,68.3,arc_e,olmes_260624,[],reasoning 19 | tinyllama_1.1b,69.5,arc_e,olmes_260624,[],reasoning 20 | pythia_6.7b,72.6,arc_e,olmes_260624,[],reasoning 21 | rpj_incite_7b,78.8,arc_e,olmes_260624,[],reasoning 22 | stablelm2_1.6b,75.3,arc_e,olmes_260624,[],reasoning 23 | olmo_7b,78.9,arc_e,olmes_260624,[],reasoning 24 | mpt_7b,78.0,arc_e,olmes_260624,[],reasoning 25 | falcon_7b,80.6,arc_e,olmes_260624,[],reasoning 26 | llama2_7b,84.0,arc_e,olmes_260624,[],reasoning 27 | llama2_13b,85.9,arc_e,olmes_260624,[],reasoning 28 | olmo_1.7_7b,83.6,arc_e,olmes_260624,[],reasoning 29 | llama3_8b,92.4,arc_e,olmes_260624,[],reasoning 30 | mistral_7b_v0.1,90.8,arc_e,olmes_260624,[],reasoning 31 | llama3_70b,97.7,arc_e,olmes_260624,[],reasoning 32 | pythia_1b,56.8,boolq,olmes_260624,[],knowledge 33 | olmo_1b,51.3,boolq,olmes_260624,[],knowledge 34 | tinyllama_1.1b,63.6,boolq,olmes_260624,[],knowledge 35 | 
pythia_6.7b,68.7,boolq,olmes_260624,[],knowledge 36 | rpj_incite_7b,72.0,boolq,olmes_260624,[],knowledge 37 | stablelm2_1.6b,82.3,boolq,olmes_260624,[],knowledge 38 | olmo_7b,78.7,boolq,olmes_260624,[],knowledge 39 | mpt_7b,82.4,boolq,olmes_260624,[],knowledge 40 | falcon_7b,78.2,boolq,olmes_260624,[],knowledge 41 | llama2_7b,86.1,boolq,olmes_260624,[],knowledge 42 | llama2_13b,86.7,boolq,olmes_260624,[],knowledge 43 | olmo_1.7_7b,85.9,boolq,olmes_260624,[],knowledge 44 | llama3_8b,87.5,boolq,olmes_260624,[],knowledge 45 | mistral_7b_v0.1,89.3,boolq,olmes_260624,[],knowledge 46 | llama3_70b,91.7,boolq,olmes_260624,[],knowledge 47 | pythia_1b,50.9,csqa,olmes_260624,[],knowledge 48 | olmo_1b,62.2,csqa,olmes_260624,[],knowledge 49 | tinyllama_1.1b,61.1,csqa,olmes_260624,[],knowledge 50 | pythia_6.7b,62.1,csqa,olmes_260624,[],knowledge 51 | rpj_incite_7b,69.2,csqa,olmes_260624,[],knowledge 52 | stablelm2_1.6b,70.4,csqa,olmes_260624,[],knowledge 53 | olmo_7b,70.8,csqa,olmes_260624,[],knowledge 54 | mpt_7b,70.9,csqa,olmes_260624,[],knowledge 55 | falcon_7b,73.4,csqa,olmes_260624,[],knowledge 56 | llama2_7b,74.2,csqa,olmes_260624,[],knowledge 57 | llama2_13b,74.0,csqa,olmes_260624,[],knowledge 58 | olmo_1.7_7b,85.8,csqa,olmes_260624,[],knowledge 59 | llama3_8b,73.9,csqa,olmes_260624,[],knowledge 60 | mistral_7b_v0.1,72.4,csqa,olmes_260624,[],knowledge 61 | llama3_70b,83.2,csqa,olmes_260624,[],knowledge 62 | pythia_1b,48.0,hellaswag,olmes_260624,[],reasoning 63 | olmo_1b,65.2,hellaswag,olmes_260624,[],reasoning 64 | tinyllama_1.1b,60.8,hellaswag,olmes_260624,[],reasoning 65 | pythia_6.7b,66.1,hellaswag,olmes_260624,[],reasoning 66 | rpj_incite_7b,72.8,hellaswag,olmes_260624,[],reasoning 67 | stablelm2_1.6b,70.3,hellaswag,olmes_260624,[],reasoning 68 | olmo_7b,78.1,hellaswag,olmes_260624,[],reasoning 69 | mpt_7b,79.6,hellaswag,olmes_260624,[],reasoning 70 | falcon_7b,79.0,hellaswag,olmes_260624,[],reasoning 71 | llama2_7b,78.9,hellaswag,olmes_260624,[],reasoning 72 | llama2_13b,83.9,hellaswag,olmes_260624,[],reasoning 73 | olmo_1.7_7b,80.1,hellaswag,olmes_260624,[],reasoning 74 | llama3_8b,81.8,hellaswag,olmes_260624,[],reasoning 75 | mistral_7b_v0.1,83.0,hellaswag,olmes_260624,[],reasoning 76 | llama3_70b,89.5,hellaswag,olmes_260624,[],reasoning 77 | pythia_1b,31.1,mmlu,olmes_260624,[],knowledge 78 | olmo_1b,33.4,mmlu,olmes_260624,[],knowledge 79 | tinyllama_1.1b,33.6,mmlu,olmes_260624,[],knowledge 80 | pythia_6.7b,37.7,mmlu,olmes_260624,[],knowledge 81 | rpj_incite_7b,40.1,mmlu,olmes_260624,[],knowledge 82 | stablelm2_1.6b,40.4,mmlu,olmes_260624,[],knowledge 83 | olmo_7b,40.5,mmlu,olmes_260624,[],knowledge 84 | mpt_7b,40.6,mmlu,olmes_260624,[],knowledge 85 | falcon_7b,42.1,mmlu,olmes_260624,[],knowledge 86 | llama2_7b,46.2,mmlu,olmes_260624,[],knowledge 87 | llama2_13b,55.8,mmlu,olmes_260624,[],knowledge 88 | olmo_1.7_7b,54.4,mmlu,olmes_260624,[],knowledge 89 | llama3_8b,66.6,mmlu,olmes_260624,[],knowledge 90 | mistral_7b_v0.1,64.0,mmlu,olmes_260624,[],knowledge 91 | llama3_70b,79.8,mmlu,olmes_260624,[],knowledge 92 | pythia_1b,40.4,openbookqa,olmes_260624,[],knowledge 93 | olmo_1b,47.6,openbookqa,olmes_260624,[],knowledge 94 | tinyllama_1.1b,45.0,openbookqa,olmes_260624,[],knowledge 95 | pythia_6.7b,50.4,openbookqa,olmes_260624,[],knowledge 96 | rpj_incite_7b,49.0,openbookqa,olmes_260624,[],knowledge 97 | stablelm2_1.6b,56.6,openbookqa,olmes_260624,[],knowledge 98 | olmo_7b,55.8,openbookqa,olmes_260624,[],knowledge 99 | mpt_7b,52.4,openbookqa,olmes_260624,[],knowledge 100 | 
falcon_7b,55.2,openbookqa,olmes_260624,[],knowledge 101 | llama2_7b,57.8,openbookqa,olmes_260624,[],knowledge 102 | llama2_13b,65.4,openbookqa,olmes_260624,[],knowledge 103 | olmo_1.7_7b,68.6,openbookqa,olmes_260624,[],knowledge 104 | llama3_8b,77.2,openbookqa,olmes_260624,[],knowledge 105 | mistral_7b_v0.1,80.6,openbookqa,olmes_260624,[],knowledge 106 | llama3_70b,93.4,openbookqa,olmes_260624,[],knowledge 107 | pythia_1b,68.9,piqa,olmes_260624,[],reasoning 108 | olmo_1b,74.1,piqa,olmes_260624,[],reasoning 109 | tinyllama_1.1b,71.7,piqa,olmes_260624,[],reasoning 110 | pythia_6.7b,74.9,piqa,olmes_260624,[],reasoning 111 | rpj_incite_7b,75.9,piqa,olmes_260624,[],reasoning 112 | stablelm2_1.6b,75.6,piqa,olmes_260624,[],reasoning 113 | olmo_7b,78.5,piqa,olmes_260624,[],reasoning 114 | mpt_7b,79.2,piqa,olmes_260624,[],reasoning 115 | falcon_7b,79.0,piqa,olmes_260624,[],reasoning 116 | llama2_7b,77.5,piqa,olmes_260624,[],reasoning 117 | llama2_13b,80.2,piqa,olmes_260624,[],reasoning 118 | olmo_1.7_7b,80.3,piqa,olmes_260624,[],reasoning 119 | llama3_8b,81.6,piqa,olmes_260624,[],reasoning 120 | mistral_7b_v0.1,82.8,piqa,olmes_260624,[],reasoning 121 | llama3_70b,91.6,piqa,olmes_260624,[],reasoning 122 | pythia_1b,46.4,siqa,olmes_260624,[],other 123 | olmo_1b,51.5,siqa,olmes_260624,[],other 124 | tinyllama_1.1b,50.4,siqa,olmes_260624,[],other 125 | pythia_6.7b,51.7,siqa,olmes_260624,[],other 126 | rpj_incite_7b,56.6,siqa,olmes_260624,[],other 127 | stablelm2_1.6b,64.3,siqa,olmes_260624,[],other 128 | olmo_7b,56.5,siqa,olmes_260624,[],other 129 | mpt_7b,57.4,siqa,olmes_260624,[],other 130 | falcon_7b,60.1,siqa,olmes_260624,[],other 131 | llama2_7b,59.6,siqa,olmes_260624,[],other 132 | llama2_13b,65.9,siqa,olmes_260624,[],other 133 | olmo_1.7_7b,76.1,siqa,olmes_260624,[],other 134 | llama3_8b,70.2,siqa,olmes_260624,[],other 135 | mistral_7b_v0.1,71.3,siqa,olmes_260624,[],other 136 | llama3_70b,78.9,siqa,olmes_260624,[],other 137 | pythia_1b,52.7,winogrande,olmes_260624,[],reasoning 138 | olmo_1b,59.3,winogrande,olmes_260624,[],reasoning 139 | tinyllama_1.1b,60.1,winogrande,olmes_260624,[],reasoning 140 | pythia_6.7b,62.3,winogrande,olmes_260624,[],reasoning 141 | rpj_incite_7b,68.0,winogrande,olmes_260624,[],reasoning 142 | stablelm2_1.6b,65.7,winogrande,olmes_260624,[],reasoning 143 | olmo_7b,68.5,winogrande,olmes_260624,[],reasoning 144 | mpt_7b,70.2,winogrande,olmes_260624,[],reasoning 145 | falcon_7b,71.3,winogrande,olmes_260624,[],reasoning 146 | llama2_7b,71.7,winogrande,olmes_260624,[],reasoning 147 | llama2_13b,74.9,winogrande,olmes_260624,[],reasoning 148 | olmo_1.7_7b,73.6,winogrande,olmes_260624,[],reasoning 149 | llama3_8b,76.2,winogrande,olmes_260624,[],reasoning 150 | mistral_7b_v0.1,77.9,winogrande,olmes_260624,[],reasoning 151 | llama3_70b,84.1,winogrande,olmes_260624,[],reasoning 152 | pythia_1b,49.0,olmes_average,olmes_260624,[],holistic 153 | olmo_1b,55.1,olmes_average,olmes_260624,[],holistic 154 | tinyllama_1.1b,55.4,olmes_average,olmes_260624,[],holistic 155 | pythia_6.7b,59.1,olmes_average,olmes_260624,[],holistic 156 | rpj_incite_7b,62.8,olmes_average,olmes_260624,[],holistic 157 | stablelm2_1.6b,65.1,olmes_average,olmes_260624,[],holistic 158 | olmo_7b,65.3,olmes_average,olmes_260624,[],holistic 159 | mpt_7b,65.6,olmes_average,olmes_260624,[],holistic 160 | falcon_7b,66.9,olmes_average,olmes_260624,[],holistic 161 | llama2_7b,69.0,olmes_average,olmes_260624,[],holistic 162 | llama2_13b,74.0,olmes_average,olmes_260624,[],holistic 163 | 
olmo_1.7_7b,75.5,olmes_average,olmes_260624,[],holistic 164 | llama3_8b,78.7,olmes_average,olmes_260624,[],holistic 165 | mistral_7b_v0.1,79.1,olmes_average,olmes_260624,[],holistic 166 | llama3_70b,88.4,olmes_average,olmes_260624,[],holistic 167 | -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/olmes_260624_frozen.csv: -------------------------------------------------------------------------------- 1 | model,score,scenario,source,aggragated_from,tag 2 | pythia_1b,31.4,arc_c,olmes_260624,[],reasoning 3 | olmo_1b,38.6,arc_c,olmes_260624,[],reasoning 4 | tinyllama_1.1b,38.1,arc_c,olmes_260624,[],reasoning 5 | pythia_6.7b,44.6,arc_c,olmes_260624,[],reasoning 6 | rpj_incite_7b,45.3,arc_c,olmes_260624,[],reasoning 7 | stablelm2_1.6b,50.6,arc_c,olmes_260624,[],reasoning 8 | olmo_7b,46.4,arc_c,olmes_260624,[],reasoning 9 | mpt_7b,45.7,arc_c,olmes_260624,[],reasoning 10 | falcon_7b,49.7,arc_c,olmes_260624,[],reasoning 11 | llama2_7b,54.2,arc_c,olmes_260624,[],reasoning 12 | llama2_13b,67.3,arc_c,olmes_260624,[],reasoning 13 | olmo_1.7_7b,66.9,arc_c,olmes_260624,[],reasoning 14 | llama3_8b,79.3,arc_c,olmes_260624,[],reasoning 15 | mistral_7b_v0.1,78.6,arc_c,olmes_260624,[],reasoning 16 | llama3_70b,93.7,arc_c,olmes_260624,[],reasoning 17 | pythia_1b,63.4,arc_e,olmes_260624,[],reasoning 18 | olmo_1b,68.3,arc_e,olmes_260624,[],reasoning 19 | tinyllama_1.1b,69.5,arc_e,olmes_260624,[],reasoning 20 | pythia_6.7b,72.6,arc_e,olmes_260624,[],reasoning 21 | rpj_incite_7b,78.8,arc_e,olmes_260624,[],reasoning 22 | stablelm2_1.6b,75.3,arc_e,olmes_260624,[],reasoning 23 | olmo_7b,78.9,arc_e,olmes_260624,[],reasoning 24 | mpt_7b,78.0,arc_e,olmes_260624,[],reasoning 25 | falcon_7b,80.6,arc_e,olmes_260624,[],reasoning 26 | llama2_7b,84.0,arc_e,olmes_260624,[],reasoning 27 | llama2_13b,85.9,arc_e,olmes_260624,[],reasoning 28 | olmo_1.7_7b,83.6,arc_e,olmes_260624,[],reasoning 29 | llama3_8b,92.4,arc_e,olmes_260624,[],reasoning 30 | mistral_7b_v0.1,90.8,arc_e,olmes_260624,[],reasoning 31 | llama3_70b,97.7,arc_e,olmes_260624,[],reasoning 32 | pythia_1b,56.8,boolq,olmes_260624,[],knowledge 33 | olmo_1b,51.3,boolq,olmes_260624,[],knowledge 34 | tinyllama_1.1b,63.6,boolq,olmes_260624,[],knowledge 35 | pythia_6.7b,68.7,boolq,olmes_260624,[],knowledge 36 | rpj_incite_7b,72.0,boolq,olmes_260624,[],knowledge 37 | stablelm2_1.6b,82.3,boolq,olmes_260624,[],knowledge 38 | olmo_7b,78.7,boolq,olmes_260624,[],knowledge 39 | mpt_7b,82.4,boolq,olmes_260624,[],knowledge 40 | falcon_7b,78.2,boolq,olmes_260624,[],knowledge 41 | llama2_7b,86.1,boolq,olmes_260624,[],knowledge 42 | llama2_13b,86.7,boolq,olmes_260624,[],knowledge 43 | olmo_1.7_7b,85.9,boolq,olmes_260624,[],knowledge 44 | llama3_8b,87.5,boolq,olmes_260624,[],knowledge 45 | mistral_7b_v0.1,89.3,boolq,olmes_260624,[],knowledge 46 | llama3_70b,91.7,boolq,olmes_260624,[],knowledge 47 | pythia_1b,50.9,csqa,olmes_260624,[],knowledge 48 | olmo_1b,62.2,csqa,olmes_260624,[],knowledge 49 | tinyllama_1.1b,61.1,csqa,olmes_260624,[],knowledge 50 | pythia_6.7b,62.1,csqa,olmes_260624,[],knowledge 51 | rpj_incite_7b,69.2,csqa,olmes_260624,[],knowledge 52 | stablelm2_1.6b,70.4,csqa,olmes_260624,[],knowledge 53 | olmo_7b,70.8,csqa,olmes_260624,[],knowledge 54 | mpt_7b,70.9,csqa,olmes_260624,[],knowledge 55 | falcon_7b,73.4,csqa,olmes_260624,[],knowledge 56 | llama2_7b,74.2,csqa,olmes_260624,[],knowledge 57 | llama2_13b,74.0,csqa,olmes_260624,[],knowledge 58 | olmo_1.7_7b,85.8,csqa,olmes_260624,[],knowledge 59 | 
llama3_8b,73.9,csqa,olmes_260624,[],knowledge 60 | mistral_7b_v0.1,72.4,csqa,olmes_260624,[],knowledge 61 | llama3_70b,83.2,csqa,olmes_260624,[],knowledge 62 | pythia_1b,48.0,hellaswag,olmes_260624,[],reasoning 63 | olmo_1b,65.2,hellaswag,olmes_260624,[],reasoning 64 | tinyllama_1.1b,60.8,hellaswag,olmes_260624,[],reasoning 65 | pythia_6.7b,66.1,hellaswag,olmes_260624,[],reasoning 66 | rpj_incite_7b,72.8,hellaswag,olmes_260624,[],reasoning 67 | stablelm2_1.6b,70.3,hellaswag,olmes_260624,[],reasoning 68 | olmo_7b,78.1,hellaswag,olmes_260624,[],reasoning 69 | mpt_7b,79.6,hellaswag,olmes_260624,[],reasoning 70 | falcon_7b,79.0,hellaswag,olmes_260624,[],reasoning 71 | llama2_7b,78.9,hellaswag,olmes_260624,[],reasoning 72 | llama2_13b,83.9,hellaswag,olmes_260624,[],reasoning 73 | olmo_1.7_7b,80.1,hellaswag,olmes_260624,[],reasoning 74 | llama3_8b,81.8,hellaswag,olmes_260624,[],reasoning 75 | mistral_7b_v0.1,83.0,hellaswag,olmes_260624,[],reasoning 76 | llama3_70b,89.5,hellaswag,olmes_260624,[],reasoning 77 | pythia_1b,31.1,mmlu,olmes_260624,[],knowledge 78 | olmo_1b,33.4,mmlu,olmes_260624,[],knowledge 79 | tinyllama_1.1b,33.6,mmlu,olmes_260624,[],knowledge 80 | pythia_6.7b,37.7,mmlu,olmes_260624,[],knowledge 81 | rpj_incite_7b,40.1,mmlu,olmes_260624,[],knowledge 82 | stablelm2_1.6b,40.4,mmlu,olmes_260624,[],knowledge 83 | olmo_7b,40.5,mmlu,olmes_260624,[],knowledge 84 | mpt_7b,40.6,mmlu,olmes_260624,[],knowledge 85 | falcon_7b,42.1,mmlu,olmes_260624,[],knowledge 86 | llama2_7b,46.2,mmlu,olmes_260624,[],knowledge 87 | llama2_13b,55.8,mmlu,olmes_260624,[],knowledge 88 | olmo_1.7_7b,54.4,mmlu,olmes_260624,[],knowledge 89 | llama3_8b,66.6,mmlu,olmes_260624,[],knowledge 90 | mistral_7b_v0.1,64.0,mmlu,olmes_260624,[],knowledge 91 | llama3_70b,79.8,mmlu,olmes_260624,[],knowledge 92 | pythia_1b,40.4,openbookqa,olmes_260624,[],knowledge 93 | olmo_1b,47.6,openbookqa,olmes_260624,[],knowledge 94 | tinyllama_1.1b,45.0,openbookqa,olmes_260624,[],knowledge 95 | pythia_6.7b,50.4,openbookqa,olmes_260624,[],knowledge 96 | rpj_incite_7b,49.0,openbookqa,olmes_260624,[],knowledge 97 | stablelm2_1.6b,56.6,openbookqa,olmes_260624,[],knowledge 98 | olmo_7b,55.8,openbookqa,olmes_260624,[],knowledge 99 | mpt_7b,52.4,openbookqa,olmes_260624,[],knowledge 100 | falcon_7b,55.2,openbookqa,olmes_260624,[],knowledge 101 | llama2_7b,57.8,openbookqa,olmes_260624,[],knowledge 102 | llama2_13b,65.4,openbookqa,olmes_260624,[],knowledge 103 | olmo_1.7_7b,68.6,openbookqa,olmes_260624,[],knowledge 104 | llama3_8b,77.2,openbookqa,olmes_260624,[],knowledge 105 | mistral_7b_v0.1,80.6,openbookqa,olmes_260624,[],knowledge 106 | llama3_70b,93.4,openbookqa,olmes_260624,[],knowledge 107 | pythia_1b,68.9,piqa,olmes_260624,[],reasoning 108 | olmo_1b,74.1,piqa,olmes_260624,[],reasoning 109 | tinyllama_1.1b,71.7,piqa,olmes_260624,[],reasoning 110 | pythia_6.7b,74.9,piqa,olmes_260624,[],reasoning 111 | rpj_incite_7b,75.9,piqa,olmes_260624,[],reasoning 112 | stablelm2_1.6b,75.6,piqa,olmes_260624,[],reasoning 113 | olmo_7b,78.5,piqa,olmes_260624,[],reasoning 114 | mpt_7b,79.2,piqa,olmes_260624,[],reasoning 115 | falcon_7b,79.0,piqa,olmes_260624,[],reasoning 116 | llama2_7b,77.5,piqa,olmes_260624,[],reasoning 117 | llama2_13b,80.2,piqa,olmes_260624,[],reasoning 118 | olmo_1.7_7b,80.3,piqa,olmes_260624,[],reasoning 119 | llama3_8b,81.6,piqa,olmes_260624,[],reasoning 120 | mistral_7b_v0.1,82.8,piqa,olmes_260624,[],reasoning 121 | llama3_70b,91.6,piqa,olmes_260624,[],reasoning 122 | pythia_1b,46.4,siqa,olmes_260624,[],other 123 | 
olmo_1b,51.5,siqa,olmes_260624,[],other 124 | tinyllama_1.1b,50.4,siqa,olmes_260624,[],other 125 | pythia_6.7b,51.7,siqa,olmes_260624,[],other 126 | rpj_incite_7b,56.6,siqa,olmes_260624,[],other 127 | stablelm2_1.6b,64.3,siqa,olmes_260624,[],other 128 | olmo_7b,56.5,siqa,olmes_260624,[],other 129 | mpt_7b,57.4,siqa,olmes_260624,[],other 130 | falcon_7b,60.1,siqa,olmes_260624,[],other 131 | llama2_7b,59.6,siqa,olmes_260624,[],other 132 | llama2_13b,65.9,siqa,olmes_260624,[],other 133 | olmo_1.7_7b,76.1,siqa,olmes_260624,[],other 134 | llama3_8b,70.2,siqa,olmes_260624,[],other 135 | mistral_7b_v0.1,71.3,siqa,olmes_260624,[],other 136 | llama3_70b,78.9,siqa,olmes_260624,[],other 137 | pythia_1b,52.7,winogrande,olmes_260624,[],reasoning 138 | olmo_1b,59.3,winogrande,olmes_260624,[],reasoning 139 | tinyllama_1.1b,60.1,winogrande,olmes_260624,[],reasoning 140 | pythia_6.7b,62.3,winogrande,olmes_260624,[],reasoning 141 | rpj_incite_7b,68.0,winogrande,olmes_260624,[],reasoning 142 | stablelm2_1.6b,65.7,winogrande,olmes_260624,[],reasoning 143 | olmo_7b,68.5,winogrande,olmes_260624,[],reasoning 144 | mpt_7b,70.2,winogrande,olmes_260624,[],reasoning 145 | falcon_7b,71.3,winogrande,olmes_260624,[],reasoning 146 | llama2_7b,71.7,winogrande,olmes_260624,[],reasoning 147 | llama2_13b,74.9,winogrande,olmes_260624,[],reasoning 148 | olmo_1.7_7b,73.6,winogrande,olmes_260624,[],reasoning 149 | llama3_8b,76.2,winogrande,olmes_260624,[],reasoning 150 | mistral_7b_v0.1,77.9,winogrande,olmes_260624,[],reasoning 151 | llama3_70b,84.1,winogrande,olmes_260624,[],reasoning 152 | pythia_1b,49.0,olmes_average,olmes_260624,[],holistic 153 | olmo_1b,55.1,olmes_average,olmes_260624,[],holistic 154 | tinyllama_1.1b,55.4,olmes_average,olmes_260624,[],holistic 155 | pythia_6.7b,59.1,olmes_average,olmes_260624,[],holistic 156 | rpj_incite_7b,62.8,olmes_average,olmes_260624,[],holistic 157 | stablelm2_1.6b,65.1,olmes_average,olmes_260624,[],holistic 158 | olmo_7b,65.3,olmes_average,olmes_260624,[],holistic 159 | mpt_7b,65.6,olmes_average,olmes_260624,[],holistic 160 | falcon_7b,66.9,olmes_average,olmes_260624,[],holistic 161 | llama2_7b,69.0,olmes_average,olmes_260624,[],holistic 162 | llama2_13b,74.0,olmes_average,olmes_260624,[],holistic 163 | olmo_1.7_7b,75.5,olmes_average,olmes_260624,[],holistic 164 | llama3_8b,78.7,olmes_average,olmes_260624,[],holistic 165 | mistral_7b_v0.1,79.1,olmes_average,olmes_260624,[],holistic 166 | llama3_70b,88.4,olmes_average,olmes_260624,[],holistic 167 | -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/opencompass_240829.csv: -------------------------------------------------------------------------------- 1 | model,opencompass,OC_Language,OC_Knowledge,OC_Reasoning,OC_Math,OC_Code,OC_Instruct,OC_Agent 2 | Claude-3.5-Sonnet,67.9,50.9,85,57,71.1,69.6,66.2,81.7 3 | GPT-4o-20240513,67.7,55.5,85.2,55.8,71.1,69.1,60.3,84.4 4 | Mistral-Large,63.2,50.9,83.4,50.1,66.4,65.1,51.1,83.5 5 | Mistral-Large-Instruct-2407,62.5,50.3,83.3,50,72.8,55.6,50.3,84.5 6 | DeepSeek-V2-Chat(0618),61.7,46.3,78.8,47.4,68.2,66.2,44.1,83.7 7 | GPT-4o-mini-20240718,60.4,50.1,78.7,45.4,58.2,63.3,56,85.7 8 | Qwen-Max-0428,57.8,56.5,79,47.9,55.1,52.4,47.4,83.8 9 | Yi-Large,56.3,48.7,75.3,47.6,54.8,54.3,40,86.1 10 | Qwen2-72B-Instruct,55.4,45.8,84,44.7,57.7,49.5,34,85.9 11 | GLM-4,55.2,45.8,77.7,46.1,53.2,56.3,36.9,80.4 12 | Llama3.1-70B-Instruct,53.9,38.4,81.4,31.6,58,53.7,46.2,86.5 13 | Gemma-2-27B-it,53.5,45.2,58.5,45.4,50.1,54.6,45.2,85.5 14 | 
Qwen1.5-110B-Chat,51.9,53.4,79.3,45.8,39.6,49.5,36.8,79.6 15 | Doubao-pro-32k/240615,51,31.1,78.3,27.8,67.5,50.2,30.6,79.3 16 | Baichuan4,50.4,37.2,74.2,38.5,51.8,44.1,39.4,84.5 17 | Step-1-8K,49.9,40.6,72,35.8,51.4,44.2,38.9,84.2 18 | abab6.5,49.9,44.9,69.8,47,47.2,50.5,32,62.5 19 | Ernie-4.0-8K-Preview-0518,48.8,36.7,76.4,41.3,44.7,50.6,28.5,72.7 20 | Moonshot-v1-8K,48.6,46.3,61,46,46.6,47,35.9,63.5 21 | GLM-4-9B-Chat,47.9,44.3,68.9,40,38.7,45.1,36,81.9 22 | Yi-1.5-34B-Chat,46.9,50.5,65,42.7,38.1,44.8,38.8,63.5 23 | Hunyuan-Standard-256k,46.9,30.6,69.7,36.8,53.9,46.1,29.2,65.6 24 | Mixtral-8x22B-Instruct-v0.1,46.3,33,72.2,28.6,47.2,44.7,31.2,86 25 | Gemma-2-9B-it,45.5,40.8,53.7,41.9,40.7,42.2,40.9,69.9 26 | Qwen2-7B-Instruct,45.1,43.5,64.1,36.2,37.7,44,27.5,79.7 27 | InternLM2.5-7B-Chat,44.5,44.6,64.8,39.3,40.8,34.8,26.5,79 28 | Yi-1.5-9B-Chat,42.6,46.1,56,39.8,38.2,41.8,29.8,54.3 29 | Nanbeige2-16B-Chat,42.3,50.5,53.8,40.5,25.8,33.3,33.2,85.8 30 | Llama3.1-8B-Instruct,42.1,33.7,63.2,24.9,38,39.3,39.1,80.1 31 | DBRX-Instruct,37.6,25.6,66.3,20.8,35.3,32.2,32.5,75.3 32 | Yi-1.5-6B-Chat,36.5,43.6,41.3,36.5,28.4,34.4,26.3,55.4 33 | InternLM2-Chat-20B,36,36.7,60,18.9,27.4,36.2,18.5,80.3 34 | Mixtral-8x7B-Instruct-v0.1,34.5,36.6,50.4,28.1,24.8,26.7,28.2,71 35 | Mistral-7B-Instruct-v0.3,30.7,30.3,47.8,20.7,18.1,23.6,28.5,75.4 36 | DeepSeek-V2-Lite-Chat,30,31.4,41.3,28.1,22.8,16.3,20.6,72.4 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/opencompass_academic_240829.csv: -------------------------------------------------------------------------------- 1 | model,opencompass_academic,OC_MMLU,OC_MMLU-Pro,OC_CMMLU,OC_BBH,OC_GQPA-Dimand,OC_MATH,OC_HumanEval,OC_IFEval 2 | GPT-4o-20240513,77,88,73.8,78.3,87.6,49.5,73.7,86,79 3 | Qwen2-72B-Instruct,73.1,83.1,65.1,79.8,85.2,42.9,67.7,84.2,76.5 4 | GPT-4o-mini-20240718,72.5,82.9,63.2,65.6,81.9,47.5,69.9,87.8,81 5 | Llama3-70B-Instruct,66.6,80.7,61.8,66.2,83.2,39.4,47.8,76.2,77.5 6 | Qwen1.5-110B-Chat,61.7,74,51.8,79.4,74.2,28.3,54.3,77.4,54.3 7 | Yi-1.5-34B-Chat,60.4,71.3,50.9,63.4,73.8,32.8,53.7,77.4,59.5 8 | InternLM2.5-Chat-7B,60.3,70.6,44.9,73.8,74.5,29.3,61.4,73.2,54.5 9 | GLM-4-9B-Chat,59.5,72.9,48.3,71.6,60.6,26.8,51.3,75.6,69.1 10 | Qwen1.5-32B-Chat,57.1,72.5,49.8,76.3,68.2,31.3,42.9,67.7,48.4 11 | Qwen1.5-72B-Chat,56.9,70.9,47.1,67.8,72.8,28.3,46.9,67.7,53.8 12 | Yi-1.5-9B-Chat,56.1,67.8,45.9,65,67.9,25.2,51.1,68.9,56.8 13 | Qwen2-7B-Instruct,52,51.1,38.8,59.3,65.4,25.8,49.2,76.8,49.7 14 | Llama3-8B-Instruct,50.6,66.7,42.3,51.5,54.4,33.8,27.7,59.8,68.4 15 | Qwen1.5-14B-Chat,49.7,67,40.5,73.3,58.3,26.3,29.5,60.4,42 16 | InternLM2-Chat-20B,45.2,55.8,36.3,44.7,65.6,21.7,34.2,67.7,35.5 17 | Yi-1.5-6B-Chat,43.5,48.4,30.1,53.9,56.5,23.2,42.6,45.7,47.3 18 | Mixtral-8x7B-Instruct-v0.1,42.6,67.2,42.7,33.9,55.7,29.3,26.8,34.8,50.8 19 | InternLM2-Chat-7B,42.1,58.8,32.4,47.8,60.3,26.3,28.1,50.6,32.4 20 | Qwen1.5-7B-Chat,35.4,41.9,25.6,42.3,41,21.2,22.2,50,38.6 21 | Mistral-7B-Instruct-v0.3,31.2,30.9,22.1,35.5 -------------------------------------------------------------------------------- /src/bat/assets/benchmarks_old/wildbench_240829.csv: -------------------------------------------------------------------------------- 1 | model,WB-Elo_LC,WB-Info_Seek,WB-Creative,WB-Code_Debug,WB-Math_Data,WB-Reason_Plan,WB-Score 2 | gpt-4o-2024-05-13,1227.1,58.6,59.1,60.5,57.3,60.2,59.3 3 | Claude_3.5_Sonnet,1215.4,55.5,55.6,56.5,50.2,55.6,54.7 4 | Gemini_1.5_Pro,1214.6,52.2,55.1,55.2,48.6,53.7,53 5 | 
gpt-4-turbo-2024-04-09,1209.6,57.2,58.7,55.1,51,56.2,55.2 6 | Yi-Large-Preview,1208.9,57.7,57.6,54.3,51.9,56.6,55.3 7 | DeepSeek-V2-Chat_0628_API,1199.1,52.7,56.4,55,51.4,54.8,54 8 | gpt-4-0125-preview,1197.3,54.4,57.6,52.9,45.8,53.5,52.3 9 | Claude_3_Opus,1196.3,53.5,53,53.3,46.7,52.5,51.7 10 | Gemini_1.5_Flash,1192,48.7,51.7,48.7,45.3,50.8,48.9 11 | Llama-3-70B-Instruct,1187.5,52.3,54.3,44.7,42.1,50.1,47.8 12 | DeepSeek-V2-Coder_0614_API,1184.9,40,40.8,48.9,46.4,47.2,45.7 13 | Yi-Large,1181.8,51,51.8,47.7,44.5,51.3,48.9 14 | Athene-70B,1180.7,60.8,60.4,59,57.1,61,59.5 15 | Nemotron-4-340B-Inst,1178.6,53,53.3,46.3,40.8,49.1,47.7 16 | Gemma-2-27B-it,1176.4,50.5,53.6,47,43.9,50.6,48.5 17 | Mistral-Large-2,1176.3,57.4,58.9,53.8,52.7,57.2,55.6 18 | Claude_3_Sonnet,1174.7,47.1,46.3,46.1,40.6,47.4,45.5 19 | gpt-4o-mini-2024-07-18,1173.5,57.4,60.1,57.2,54,58.2,57.1 20 | Qwen2-72B-Instruct,1172.3,49.5,49.9,39.8,41,46.8,44.5 21 | Reka_Core,1170.4,52.3,55.5,40.6,40.3,48,45.9 22 | gemma-2-9b-it-SimPO,1166.6,56.5,58,50.9,48.6,55.6,53.3 23 | gemma-2-9b-it-DPO,1166.6,58.2,59.1,50.5,47.1,55.5,53.2 24 | Yi-1.5-34B-Chat,1159.6,50.3,53.5,42.1,39.4,48.1,45.6 25 | Claude_3_Haiku,1159.1,45.3,42.9,37,31.4,41.3,38.9 26 | Mistral-Nemo-Inst_12B,1158.6,51.9,54.6,39.7,35.6,47.4,44.4 27 | Mistral-Large,1157,46.1,49.7,33.7,30.9,41.8,38.9 28 | Gemma-2-9B-it,1156.4,49,51,36.7,36.4,46.7,42.7 29 | Command-R-Plus,1151.4,49.2,52.6,28.4,23.5,41.9,36.8 30 | GLM-4-9B-Chat,1148.5,46.3,47.8,35.4,29.8,42.5,39.1 31 | Magpie-8B-Align-v0.1,1148.4,48.9,49.2,33.7,29.8,42.7,39.3 32 | Yi-1.5-9B-Chat,1148,42.6,45.6,35,32.2,42.4,38.7 33 | Llama3-Inst-8B-SimPO,1147.5,47.9,50.6,31.8,24,40.9,37 34 | Llama3-Inst-8B-SimPO-v0.2,1147.4,47.9,51.8,31.5,24.4,40.7,37.2 35 | Qwen1.5-72B-Chat,1147.4,48.2,50.4,35.4,29.8,43.5,39.9 36 | Llama3-Inst-8B-SimPO-ExPO,1145.5,47.3,49.1,28.6,21.2,39.5,35 37 | SELM_Llama3-8B-Inst-iter3,1144,46.1,51.1,27.3,23.5,39.8,35.3 38 | Phi-3-medium-128k,1139.5,35.7,33.2,18.2,23,32.3,27.3 39 | Llama-3-8B-Instruct,1139.5,39.3,43.6,22,17,34.4,29.2 40 | Hermes-2-Theta-Llama-3-8B,1137.4,41.6,39.8,23.1,18.7,33.7,29.6 41 | Starling-LM-7B-beta-ExPO,1136,42.9,44.3,25.3,18.6,36.3,31.6 42 | SELM_Zephyr-7B-iter3,1134.3,41,44.7,11,12.7,31.6,25.1 43 | Reka_Flash,1132.7,41.5,42.4,22.1,20.5,35,30.4 44 | Gemma-2-2B-it,1129.7,39.9,43.6,17.9,15.8,33.8,27.8 45 | gpt-3.5-turbo-0125,1129.2,36.5,37.4,26.5,21.6,33.4,30 46 | DBRX_Instruct,1128.5,41.1,42.3,26.4,24.5,36.2,32.6 47 | Neo-7B-Instruct-ExPO,1126.6,34.9,38.5,12.8,12.6,28.7,23.1 48 | Neo-7B-Instruct,1126.2,36.3,39.5,14,15,31.4,25 49 | StarlingLM-7B-beta,1126.2,41.9,43.8,24.4,17,34.1,30.2 50 | Command-R,1125.6,44.1,47.4,19.3,16,34.6,29.5 51 | Mixtral-8x7B-Instruct,1124.7,41.9,42.8,25,22.1,34.6,31.5 52 | Yi-1.5-6B-Chat,1122.7,31.4,31.1,16.6,16.8,27.3,23.3 53 | Tulu-2-dpo-70b,1121,40.7,42.7,20.7,14.8,32.3,28 54 | Reka_Edge,1120.8,34.4,36.2,13.5,8.9,25,21.3 55 | Mistral-7B-Instruct-v0.2,1105,40.1,42.1,18.4,10.1,30.1,25.6 56 | Llama-2-70B-chat,1101.9,38.3,40,9.3,4.2,26.8,20.7 57 | Qwen1.5-7B-Chat,1092.7,34,38.3,14.9,11.9,28.9,23.4 58 | Hermes-2-Mixtral-8x7B-DPO,1085.8,39.8,37.9,26,21.8,34.2,30.7 59 | Phi-3-mini-128k,1082.1,28.6,30.6,21.6,18.6,28.1,24.7 60 | Gemma-7B-it,1079.2,12.7,21.2,1.8,-3.7,10.2,6.6 61 | Llama-2-7B-chat,1052.5,27.7,29.8,-6.8,-7.2,15.4,8.3 -------------------------------------------------------------------------------- /src/bat/assets/lower_is_better_benchmarks.txt: -------------------------------------------------------------------------------- 1 | helm_airbench_240916 2 | 
llm_trustworthy_241001
--------------------------------------------------------------------------------
/src/bat/assets/prettified_bencmark_names.json:
--------------------------------------------------------------------------------
1 | {
2 |     "holmes": "Holmes",
3 |     "helm_lite_narrativeqa": "Helm Lite NarrativeQA",
4 |     "helm_lite_naturalquestionsopen": "Helm Lite NaturalQuestionsOpen",
5 |     "helm_lite_naturalquestionsclosed": "Helm Lite NaturalQuestionsClosed",
6 |     "helm_lite_openbookqa": "Helm Lite OpenBookQA",
7 |     "helm_lite_mmlu": "Helm Lite MMLU",
8 |     "helm_lite_math_equivalentcot": "Helm Lite MathEquivalentCOT",
9 |     "helm_lite_gsm8k": "Helm Lite GSM8K",
10 |     "helm_lite_legalbench": "Helm Lite LegalBench",
11 |     "helm_lite_medqa": "Helm Lite MedQA",
12 |     "helm_lite_wmt2014": "Helm Lite WMT2014",
13 |     "hfv2_bbh": "HFv2 BBH",
14 |     "hfv2_bbh_raw": "HFv2 BBH Raw",
15 |     "hfv2_gpqa": "HFv2 GPQA",
16 |     "hfv2_ifeval": "HFv2 IFEval",
17 |     "hfv2_math_lvl_5": "HFv2 Math Level 5",
18 |     "hfv2_mmlu_pro": "HFv2 MMLU Pro",
19 |     "hfv2_musr": "HFv2 MuSR",
20 |     "oc_mmlu": "OpenCompass MMLU",
21 |     "oc_mmlu_pro": "OpenCompass MMLU Pro",
22 |     "oc_cmmlu": "OpenCompass CMMLU",
23 |     "oc_bbh": "OpenCompass BBH",
24 |     "oc_gqpa_dimand": "OpenCompass GPQA Diamond",
25 |     "oc_humaneval": "OpenCompass HumanEval",
26 |     "oc_ifeval": "OpenCompass IFEval",
27 |     "helm_mmlu": "Helm MMLU",
28 |     "helm_boolq": "Helm BoolQ",
29 |     "helm_narrativeqa": "Helm NarrativeQA",
30 |     "helm_naturalquestionsclosed": "Helm NaturalQuestionsClosed",
31 |     "helm_naturalquestionsopen": "Helm NaturalQuestionsOpen",
32 |     "helm_quac": "Helm QuAC",
33 |     "helm_openbookqa": "Helm OpenBookQA",
34 |     "helm_imdb": "Helm IMDB",
35 |     "helm_civilcomments": "Helm CivilComments",
36 |     "helm_raft": "Helm RAFT",
37 |     "helm_ms_marcoregular": "Helm MSMARCO Regular",
38 |     "helm_ms_marcotrec": "Helm MSMARCO Trec",
39 |     "xsum": "Helm XSUM",
40 |     "mmlu_pro": "MMLU Pro",
41 |     "mixeval_triviaqa": "MixEval TriviaQA",
42 |     "mixeval_mmlu": "MixEval MMLU",
43 |     "mixeval_drop": "MixEval DROP",
44 |     "mixeval_hellaswag": "MixEval HellaSwag",
45 |     "mixeval_commonsenseqa": "MixEval CommonsenseQA",
46 |     "mixeval_triviaqa_hard": "MixEval TriviaQA Hard",
47 |     "mixeval_mmlu_hard": "MixEval MMLU Hard",
48 |     "mixeval_drop_hard": "MixEval DROP Hard",
49 |     "oc_language": "OpenCompass Language",
50 |     "oc_knowledge": "OpenCompass Knowledge",
51 |     "oc_reasoning": "OpenCompass Reasoning",
52 |     "oc_math": "OpenCompass Math",
53 |     "oc_code": "OpenCompass Code",
54 |     "oc_instruct": "OpenCompass Instruction",
55 |     "oc_agent": "OpenCompass Agent",
56 |     "oc_arena": "OpenCompass Arena",
57 |     "lb_reasoning": "LiveBench Reasoning",
58 |     "lb_coding": "LiveBench Coding",
59 |     "lb_mathematics": "LiveBench Mathematics",
60 |     "lb_data_analysis": "LiveBench Data Analysis",
61 |     "lb_language": "LiveBench Language",
62 |     "lb_if": "LiveBench Instruction Following",
63 |     "wb_info_seek": "WildBench Information Seeking",
64 |     "wb_creative": "WildBench Creative",
65 |     "wb_code_debug": "WildBench Code Debugging",
66 |     "wb_math_data": "WildBench Math & Data",
67 |     "wb_reason_plan": "WildBench Reasoning & Planning",
68 |     "wb_score": "WildBench Score",
69 |     "hfv1_arc": "HFv1 ARC",
70 |     "hfv1_gsm8k": "HFv1 GSM8K",
71 |     "hfv1_hellaswag": "HFv1 HellaSwag",
72 |     "hfv1_mmlu": "HFv1 MMLU",
73 |     "hfv1_truthfulqa": "HFv1 TruthfulQA",
74 |     "hfv1_winogrande": "HFv1 Winogrande",
75 |     "biggen_grounding": "BIGGEN Grounding",
76 |     "biggen_instruction_following": "BIGGEN Instruction Following",
77 |     "biggen_planning": "BIGGEN Planning",
78 |     "biggen_reasoning": "BIGGEN Reasoning",
79 |     "biggen_refinement": "BIGGEN Refinement",
80 |     "biggen_safety": "BIGGEN Safety",
81 |     "biggen_theory_of_mind": "BIGGEN Theory of Mind",
82 |     "biggen_tool_usage": "BIGGEN Tool Usage",
83 |     "biggen_multilingual": "BIGGEN Multilingual",
84 |     "lb_reasoning_average": "LiveBench Reasoning Average",
85 |     "lb_coding_average": "LiveBench Coding Average",
86 |     "lb_mathematics_average": "LiveBench Mathematics Average",
87 |     "lb_data_analysis_average": "LiveBench Data Analysis Average",
88 |     "lb_language_average": "LiveBench Language Average",
89 |     "lb_if_average": "LiveBench Instruction Following Average",
90 |     "helm_lite": "Helm Lite",
91 |     "hf_open_llm_v2": "HF OpenLLM v2",
92 |     "opencompass_academic": "OpenCompass Academic",
93 |     "arena_elo": "LMSys Arena",
94 |     "helm_classic": "Helm Classic",
95 |     "mixeval": "MixEval",
96 |     "mixeval_hard": "MixEval Hard",
97 |     "opencompass": "OpenCompass",
98 |     "alphacaeval_v2lc": "AlpacaEval v2 LC",
99 |     "livebench_240725": "LiveBench 240725",
100 |     "wb_elo_lc": "WildBench Elo LC",
101 |     "arena_hard": "Arena Hard",
102 |     "agentbench": "AgentBench",
103 |     "hf_open_llm_v1": "HF OpenLLM v1",
104 |     "biggen": "BIGGEN",
105 |     "livebench_240624": "LiveBench 240624",
106 |     "mt_bench": "MT-Bench",
107 |     "bfcl": "BFCL",
108 |     "helm_airbench_security_risks": "HELM AirBench Security Risks",
109 |     "helm_airbench_operational_misuses": "HELM AirBench Operational Misuses",
110 |     "helm_airbench_violence_&_extremism": "HELM AirBench Violence & Extremism",
111 |     "helm_airbench_hate/toxicity": "HELM AirBench Hate/Toxicity",
112 |     "helm_airbench_sexual_content": "HELM AirBench Sexual Content",
113 |     "helm_airbench_child_harm": "HELM AirBench Child Harm",
114 |     "helm_airbench_self_harm": "HELM AirBench Self Harm",
115 |     "helm_airbench_political_usage": "HELM AirBench Political Usage",
116 |     "helm_airbench_economic_harm": "HELM AirBench Economic Harm",
117 |     "helm_airbench_deception": "HELM AirBench Deception",
118 |     "helm_airbench_manipulation": "HELM AirBench Manipulation",
119 |     "helm_airbench_defamation": "HELM AirBench Defamation",
120 |     "helm_airbench_fundamental_rights": "HELM AirBench Fundamental Rights",
121 |     "helm_airbench_discrimination/bias": "HELM AirBench Discrimination/Bias",
122 |     "helm_airbench_privacy": "HELM AirBench Privacy",
123 |     "helm_airbench_criminal_activities": "HELM AirBench Criminal Activities",
124 |     "helm_airbench_air_score": "HELM AirBench AIR Score",
125 |     "enkrypt_ai_safety": "Enkrypt AI Safety",
126 |     "decentralized_arena": "Decentralized Arena (0-1 Normalized)",
127 |     "hydrox_safety": "Hydrox Safety",
128 |     "hydrox_privacy": "Hydrox Privacy",
129 |     "hydrox_security": "Hydrox Security",
130 |     "hydrox_integrity": "Hydrox Integrity",
131 |     "hydrox_overall_score": "Hydrox Overall Score",
132 |     "ruler": "RULER",
133 |     "trustworthy_average": "Trustworthy Average",
134 |     "trustworthy_Non-toxicity": "Trustworthy Non-Toxicity",
135 |     "trustworthy_Non-Stereotype": "Trustworthy Non-Stereotype",
136 |     "trustworthy_AdvGLUE_PP": "Trustworthy AdvGLUE PP",
137 |     "trustworthy_OoD": "Trustworthy Out-of-Distribution",
138 |     "trustworthy_Adv_Demo": "Trustworthy Adversarial Demos",
139 |     "trustworthy_Privacy": "Trustworthy Privacy",
140 |     "trustworthy_Ethics": "Trustworthy Ethics",
141 |     "trustworthy_Fairness": "Trustworthy Fairness"
142 | }
--------------------------------------------------------------------------------
/src/bat/configs.py: 
-------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List 3 | 4 | 5 | @dataclass 6 | class Config: 7 | exp_to_run: str 8 | n_models_taken_list: List[int] = field(default_factory=lambda: []) 9 | model_select_strategy_list: List[str] = field(default_factory=lambda: []) 10 | n_exps: int = 10 11 | corr_types: List[str] = field(default_factory=lambda: ["kendall"]) 12 | include_aggregate_as_scenario: bool = False 13 | scenario_blacklist: List[str] = field(default_factory=lambda: []) 14 | aggregate_scenarios: List[str] = field(default_factory=lambda: []) 15 | reference_data_path: str = "src/bat/assets/combined_holistic.csv" 16 | external_benchmarks_tested: List[str] = field(default_factory=lambda: []) 17 | min_n_models_intersect: int = 5 18 | 19 | def __post_init__(self): 20 | self.validate_n_models_taken_list() 21 | self.validate_model_select_strategy_list() 22 | self.validate_corr_types() 23 | 24 | def validate_n_models_taken_list(self): 25 | if not all(isinstance(x, int) for x in self.n_models_taken_list): 26 | raise ValueError("All items in n_models_taken_list must be integers") 27 | 28 | def validate_model_select_strategy_list(self): 29 | valid_strategies = { 30 | "somewhere_aggregate", 31 | "middle_aggregate", 32 | "top_aggregate", 33 | "bottom_aggregate", 34 | "random", 35 | } 36 | if not all( 37 | item in valid_strategies for item in self.model_select_strategy_list 38 | ): 39 | raise ValueError( 40 | f"Invalid strategy in model_select_strategy_list. Valid options are: {valid_strategies}" 41 | ) 42 | 43 | def validate_corr_types(self): 44 | valid_types = {"kendall", "pearson"} 45 | if not all(item in valid_types for item in self.corr_types): 46 | raise ValueError( 47 | f"Invalid correlation type. Valid options are: {valid_types}" 48 | ) 49 | 50 | def update_or_add_fields(self, **kwargs): 51 | """ 52 | Add or update fields dynamically. All new fields are validated. 
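
        Example (a minimal sketch, not from the original file; "example_exp"
        is an arbitrary experiment name, and both fields shown already exist
        on Config):

            cfg = Config(exp_to_run="example_exp")
            cfg.update_or_add_fields(n_exps=3, corr_types=["pearson"])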
53 | """ 54 | for key, value in kwargs.items(): 55 | setattr(self, key, value) 56 | # Re-validate the fields if necessary 57 | if "n_models_taken_list" in kwargs: 58 | self.validate_n_models_taken_list() 59 | if "model_select_strategy_list" in kwargs: 60 | self.validate_model_select_strategy_list() 61 | if "corr_types" in kwargs: 62 | self.validate_corr_types() 63 | 64 | 65 | # if __name__ == "__main__": 66 | # manager = ConfigurationManager() 67 | # # Example access to configurations: 68 | # print(manager.configs["resolution_matters"].n_exps) 69 | -------------------------------------------------------------------------------- /src/bat/logic.py: -------------------------------------------------------------------------------- 1 | import random 2 | import pandas as pd 3 | from scipy.stats import pearsonr, kendalltau 4 | import numpy as np 5 | 6 | 7 | def get_pair_agreement(pair_scen_res, res_to_sort_by, cfg, models_intersect): 8 | # how many models occur in both 9 | 10 | model_subset_size_taken = ( 11 | min(cfg["model_subset_size_requested"], len(models_intersect)) 12 | if cfg["model_subset_size_requested"] != 0 13 | else len(models_intersect) 14 | ) 15 | 16 | if any( 17 | [ 18 | x in cfg["model_select_strategy"] 19 | for x in ["top", "bottom", "middle", "somewhere"] 20 | ] 21 | ): 22 | if cfg["exp_n"] != 0 and "somewhere" not in cfg["model_select_strategy"]: 23 | return None, None # skipping experimentation since deterministic 24 | 25 | models_taken = sample_models_directed( 26 | res_to_sort_by, 27 | cfg["model_select_strategy"], 28 | models_intersect, 29 | model_subset_size_taken, 30 | ) 31 | 32 | elif "random" in cfg["model_select_strategy"]: 33 | random.seed(cfg["exp_n"]) 34 | models_taken = random.sample( 35 | models_intersect, 36 | k=model_subset_size_taken, 37 | ) 38 | 39 | else: 40 | raise NotImplementedError 41 | 42 | agreement, p_value = get_agreement( 43 | pair_scen_res[pair_scen_res["model"].isin(models_taken)][ 44 | ["model", "scenario", "score"] 45 | ], 46 | cfg["corr_type"], 47 | ) 48 | 49 | return agreement, p_value 50 | 51 | 52 | def get_df_of_scenario_to_order_by(df, model_select_strategy): 53 | if "aggregate" in model_select_strategy: 54 | order_by = "Aggregate" 55 | 56 | elif "arena" in model_select_strategy: 57 | order_by = "Arena Elo" 58 | 59 | else: 60 | raise NotImplementedError 61 | 62 | return df[df["scenario"] == order_by] 63 | 64 | 65 | def sample_models_directed( 66 | res_to_sort_by, 67 | model_select_strategy, 68 | models_intersect, 69 | n_models_really_taken, 70 | ): 71 | df_of_scenario_to_order_by = res_to_sort_by.query("model in @models_intersect") 72 | # get_df_of_scenario_to_order_by( 73 | # bench_res, model_select_strategy 74 | # ) 75 | 76 | if "top" in model_select_strategy: 77 | models_taken = df_of_scenario_to_order_by.nlargest( 78 | n_models_really_taken, 79 | "score", 80 | )["model"].tolist() 81 | elif "bottom" in model_select_strategy: 82 | models_taken = df_of_scenario_to_order_by.nsmallest( 83 | n_models_really_taken, 84 | "score", 85 | )["model"].tolist() 86 | 87 | elif "middle" in model_select_strategy: 88 | df_sorted = df_of_scenario_to_order_by.sort_values("score", ascending=False) 89 | middle_idx = len(df_sorted) // 2 90 | half_n = n_models_really_taken // 2 91 | 92 | if n_models_really_taken % 2 == 0: 93 | sampled_df = df_sorted.iloc[middle_idx - half_n : middle_idx + half_n] 94 | else: 95 | sampled_df = df_sorted.iloc[middle_idx - half_n : middle_idx + half_n + 1] 96 | 97 | models_taken = sampled_df["model"].unique().tolist() 98 | 99 | elif 
"somewhere": 100 | df_sorted = df_of_scenario_to_order_by.sort_values("score", ascending=False) 101 | 102 | idx = random.randrange(len(df_sorted) - n_models_really_taken + 1) 103 | models_taken = ( 104 | df_sorted.iloc[idx : idx + n_models_really_taken]["model"].unique().tolist() 105 | ) 106 | 107 | else: 108 | raise NotImplementedError 109 | 110 | return models_taken 111 | 112 | 113 | def sample_sublists_for_list( 114 | all_models_sorted, sublists_size=1, n_sublists=0, drop_from_top=False 115 | ): 116 | # assert not ( 117 | # drop_from_top and sublists_size != len(all_models_sorted) 118 | # ), "drop from top defines the length of resulting" 119 | 120 | if drop_from_top: 121 | sublists = [] 122 | top_models_to_remove = [] 123 | for window_num, model in enumerate(all_models_sorted): 124 | sublists.append( 125 | [ 126 | model 127 | for model in all_models_sorted[: sublists_size + window_num] 128 | if model not in top_models_to_remove 129 | ] 130 | ) 131 | top_models_to_remove.append(sublists[-1][0]) # drop the first model 132 | if len(sublists) == n_sublists: 133 | break 134 | 135 | else: 136 | random.seed(0) 137 | sublists = [] 138 | for _ in range(n_sublists): 139 | sublists.append(random.sample(all_models_sorted, sublists_size)) 140 | 141 | return sublists 142 | 143 | 144 | def calculate_win_rate(series): 145 | assert len(series) > 1, "no meaning for a win rate with only one object" 146 | 147 | def win_rate(x): 148 | win_count = sum(1 for value in series if x > value) 149 | return win_count / (len(series) - 1) 150 | 151 | return series.transform(win_rate) 152 | 153 | 154 | def add_aggragete_with_mwr(df, scenarios_for_aggragate): 155 | if "wr" not in df.columns: 156 | df["wr"] = df.groupby(["scenario"])["score"].transform(calculate_win_rate) 157 | 158 | mean_df = pd.DataFrame(columns=df.columns) 159 | mean_df = ( 160 | df.query("scenario in @scenarios_for_aggragate") 161 | .groupby(["model"]) 162 | .agg({"score": "mean", "wr": "mean"}) 163 | .reset_index() 164 | ) 165 | mean_df["score"] = mean_df["wr"] 166 | mean_df["scenario"] = "Aggregate" 167 | df = pd.concat([df, mean_df]).drop(columns=["wr"]) 168 | return df 169 | 170 | 171 | def get_agreement(df, corr_type): 172 | if corr_type == "pearson": 173 | corr_func = pearsonr 174 | elif corr_type == "kendall": 175 | corr_func = kendalltau 176 | else: 177 | raise IOError(f"corr_type {corr_type} is not supported") 178 | 179 | pivot_df = df.pivot( 180 | index="model", 181 | columns="scenario", 182 | values="score", 183 | ) 184 | 185 | similarity = pivot_df.corr(method=lambda x, y: corr_func(x, y)[0]).iloc[0, 1] 186 | p_value = ( 187 | pivot_df.corr(method=lambda x, y: corr_func(x, y)[1]) 188 | - np.eye(len(pivot_df.columns)) 189 | ).iloc[0, 1] 190 | 191 | return similarity, p_value 192 | -------------------------------------------------------------------------------- /src/bat/reporting.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import matplotlib.pyplot as plt 3 | 4 | import os 5 | import pandas as pd 6 | 7 | 8 | # def plot_experiments_results(agreement_df, cfg): 9 | # sns.set() 10 | 11 | # exp_to_run = cfg.exp_to_run 12 | 13 | # if exp_to_run == "resolution_matters": 14 | # sns.set(font_scale=1.2, style="white") 15 | 16 | # fig, ax = plt.subplots(width_ratios=[1.5]) 17 | 18 | # # correlation as a function of model_subset_size_requested 19 | # sns.pointplot( 20 | # ax=ax, 21 | # # kind="point", 22 | # data=agreement_df.query('corr_type=="kendall"').replace( 23 | # { 24 | # 
"somewhere_aggregate": "Adjacent sampling", 25 | # "random": "Random sampling", 26 | # } 27 | # ), 28 | # y="correlation", 29 | # x="model_subset_size_requested", 30 | # hue="model_select_strategy", 31 | # markersize=10, 32 | # linewidth=4, 33 | # # legend=False, 34 | # # errorbar="se", 35 | # # linestyle="", 36 | # # col="corr_type", 37 | # # sharey=False, 38 | # # aspect=1.5, 39 | # ) 40 | # # scneario-wise agreement (lines) 41 | # sns.pointplot( 42 | # ax=ax, 43 | # # kind="point", 44 | # data=agreement_df.query( 45 | # 'corr_type=="kendall"' 46 | # # " and scenario not in @scenarios_not_to_show and ref_scenario not in @scenarios_not_to_show" 47 | # " and model_select_strategy=='somewhere_aggregate'" 48 | # ), 49 | # y="correlation", 50 | # x="model_subset_size_requested", 51 | # hue="scenario", 52 | # errorbar=None, 53 | # alpha=0.2, 54 | # legend=False, 55 | # # aspect=1.5, 56 | # # col="corr_type", 57 | # # aspect=1.5, 58 | # ) 59 | # plt.xlabel("Granularity (Number of models)") 60 | # plt.ylabel("Mean Benchmark Agreement\n(Kendall-tau correlation)") 61 | # ax.invert_xaxis() 62 | # handles, labels = ax.get_legend_handles_labels() 63 | # handles, labels = ax.get_legend_handles_labels() 64 | # ax.legend(handles=handles, labels=labels, frameon=False) 65 | # # sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1)) 66 | # plt.tight_layout() 67 | # plt.savefig("figures/final_for_paper/pointplot_granularity_matters.pdf") 68 | 69 | 70 | class Reporter: 71 | def __init__(self) -> None: 72 | os.makedirs("figures", exist_ok=True) 73 | 74 | @staticmethod 75 | def draw_agreements_for_one_source( 76 | agreements, source_of_interest, ref_sources=None 77 | ): 78 | filtered_agreements = Reporter.filter_with_sources( 79 | agreements, ref_sources, source_of_interest 80 | ) 81 | 82 | # Grouping and calculating mean for 'correlation' and 'p_value' 83 | grouped = ( 84 | filtered_agreements.groupby(["scenario", "ref_scenario"]) 85 | .agg( 86 | correlation_mean=("correlation", "mean"), 87 | p_value_mean=("p_value", "mean"), 88 | ) 89 | .reset_index() 90 | ).dropna() 91 | 92 | sns.set_theme(font_scale=1.2) 93 | 94 | g = sns.catplot( 95 | kind="bar", 96 | data=grouped.sort_values("correlation_mean"), 97 | x="ref_scenario", 98 | y="correlation_mean", 99 | # palette="viridis", # or any other Seaborn palette 100 | edgecolor=".2", # Add edge color for better visibility 101 | linewidth=1, # Adjust line width 102 | # width=2, 103 | aspect=1.8, 104 | # legend=True, 105 | ) 106 | plt.xticks(rotation=90, fontsize=10) # Adjust fontsize 107 | plt.xlabel("Reference Scenario", fontsize=12) # Add labels with fontsize 108 | plt.ylabel("Mean Correlation", fontsize=12) 109 | plt.title( 110 | f"Mean Agreement Between {source_of_interest} and All other Benchmark", 111 | fontsize=14, 112 | ) # Add title 113 | 114 | plt.tight_layout() 115 | plt.show(block=True) 116 | # plt.savefig("figures/temp.png") 117 | 118 | @staticmethod 119 | def draw_agreement_matrix(agreements, sources_hide=None): 120 | filtered_agreements = Reporter.filter_with_sources( 121 | agreements, sources_hide, sources_hide 122 | ) 123 | 124 | # Grouping and calculating mean for 'correlation' and 'p_value' 125 | grouped = ( 126 | filtered_agreements.groupby(["scenario", "ref_scenario"]) 127 | .agg( 128 | correlation_mean=("correlation", "mean"), 129 | p_value_mean=("p_value", "mean"), 130 | ) 131 | .reset_index() 132 | ).dropna() 133 | 134 | # Pivoting the data 135 | correlation_pivot = grouped[ 136 | ["scenario", "ref_scenario", "correlation_mean"] 137 | 
].pivot(index="scenario", columns="ref_scenario") 138 | p_value_pivot = grouped[["scenario", "ref_scenario", "p_value_mean"]].pivot( 139 | index="scenario", columns="ref_scenario" 140 | ) 141 | 142 | plt.figure(figsize=(10, 8)) # Increase figure size for better visualization 143 | 144 | sns.heatmap( 145 | correlation_pivot["correlation_mean"].round(2), 146 | annot=True, # combined_annotations, 147 | fmt=".2f", # Format annotations to two decimal places 148 | cmap="coolwarm", # Adjust color map as needed 149 | center=0, # Center the colormap around 0 for better contrast 150 | linewidths=0.5, # Add lines between cells for better separation 151 | linecolor="lightgray", # Set line color to light gray 152 | ) 153 | plt.xticks( 154 | rotation=90, fontsize=10 155 | ) # Rotate x-axis labels for better readability 156 | plt.yticks(fontsize=10) # Adjust y-axis label font size 157 | plt.xlabel("Reference Scenario", fontsize=12) # Add labels with fontsize 158 | plt.ylabel("Scenario", fontsize=12) # Add y-axis label 159 | plt.title("Mean Benchmark Agreement Across Scenarios", fontsize=14) # Add title 160 | plt.tight_layout() 161 | plt.show(block=True) 162 | 163 | @staticmethod 164 | def filter_with_sources(agreements, ref_sources_to_keep, scenario_sources_to_keep): 165 | if not scenario_sources_to_keep and not ref_sources_to_keep: # use all 166 | scenario_sources_to_keep = agreements["scenario_source"].unique().tolist() 167 | ref_sources_to_keep = agreements["ref_source"].unique().tolist() 168 | 169 | elif scenario_sources_to_keep and not ref_sources_to_keep: 170 | ref_sources_to_keep = [ 171 | scen 172 | for scen in agreements["ref_source"].unique().tolist() 173 | if scen not in scenario_sources_to_keep 174 | ] 175 | 176 | elif scenario_sources_to_keep and ref_sources_to_keep: 177 | pass 178 | 179 | else: 180 | raise NotImplementedError 181 | 182 | filtered_agreements = agreements.query( 183 | "scenario_source in @scenario_sources_to_keep and ref_source in @ref_sources_to_keep" 184 | ) 185 | 186 | return filtered_agreements 187 | # plt.tight_layout() 188 | # plt.savefig("figures/newbench_cluster_within.png") 189 | # print("figure saved to figures/newbench_heatmap_within.png") 190 | # plt.clf() 191 | 192 | @staticmethod 193 | def get_all_z_scores(agreements, aggragate_name="aggregate"): 194 | z_scores = [] 195 | for observed_scenario in agreements["scenario"].unique(): 196 | if ( 197 | observed_scenario == aggragate_name 198 | or len( 199 | agreements.dropna().query( 200 | "scenario==@observed_scenario" 201 | " and ref_scenario==@aggragate_name" 202 | ) 203 | ) 204 | == 0 205 | ): 206 | continue 207 | 208 | ( 209 | z_score, 210 | corr_with_agg, 211 | p_value_of_corr_with_agg, 212 | n_models_of_corr_with_agg, 213 | ) = Reporter.get_z_score( 214 | agreements=agreements, 215 | observed_scenario=observed_scenario, 216 | aggragate_name="aggregate", 217 | ) 218 | 219 | z_scores.append( 220 | { 221 | "scenario": observed_scenario, 222 | "z_score": z_score, 223 | "corr_with_agg": corr_with_agg, 224 | "p_value_of_corr_with_agg": p_value_of_corr_with_agg, 225 | "n_models_of_corr_with_agg": n_models_of_corr_with_agg, 226 | "source": agreements.query("scenario==@observed_scenario")[ 227 | "scenario_source" 228 | ].iloc[0], 229 | } 230 | ) 231 | 232 | return pd.DataFrame(z_scores).sort_values('z_score') 233 | 234 | @staticmethod 235 | def get_z_score( 236 | agreements, 237 | observed_scenario, 238 | aggragate_name="aggregate", 239 | blacklist_sources=[], 240 | ): 241 | if ( 242 | not len( 243 | 
agreements.dropna().query( 244 | "scenario==@observed_scenario" " and ref_scenario==@aggragate_name" 245 | ) 246 | ) 247 | > 0 248 | ): 249 | raise IOError 250 | 251 | ref_agreements_with_agg = ( 252 | agreements.dropna() 253 | .query( 254 | "scenario_source not in @blacklist_sources" 255 | " and ref_scenario==@aggragate_name" 256 | ) 257 | .groupby(["scenario"]) 258 | .agg( 259 | correlation_mean=("correlation", "mean"), 260 | p_value_mean=("p_value", "mean"), 261 | n_models_mean=("model_subset_size_requested", "mean"), 262 | ) 263 | ) 264 | 265 | obs_with_agg = agreements.query( 266 | "scenario==@observed_scenario" " and ref_scenario==@aggragate_name" 267 | ).agg( 268 | correlation_mean=("correlation", "mean"), 269 | p_value_mean=("p_value", "mean"), 270 | n_models_mean=("model_subset_size_requested", "mean"), 271 | ) 272 | 273 | obs_agreements_with_agg = float(obs_with_agg.iloc[0, 0]) 274 | obs_agreements_with_agg_p_value = float(obs_with_agg.iloc[1, 1]) 275 | obs_agreements_with_agg_n_models = float(obs_with_agg.iloc[2, 2]) 276 | 277 | ref_mean = ref_agreements_with_agg["correlation_mean"].mean() 278 | ref_std = ref_agreements_with_agg["correlation_mean"].std() 279 | z_score = float((obs_agreements_with_agg - ref_mean) / ref_std) 280 | 281 | return ( 282 | z_score, 283 | obs_agreements_with_agg, 284 | obs_agreements_with_agg_p_value, 285 | obs_agreements_with_agg_n_models, 286 | ) 287 | -------------------------------------------------------------------------------- /src/bat/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | from bat import benchmark 4 | from bat.benchmark import Benchmark 5 | 6 | 7 | def get_holistic_benchmark(file_name="assets/combined_holistic_20240708.csv"): 8 | if os.path.exists(file_name): 9 | df = pd.read_csv(file_name) 10 | else: 11 | df = pd.read_csv(f"src/bat/{file_name}") 12 | 13 | return Benchmark(df) 14 | 15 | if __name__ == "__main__": 16 | 17 | csv_path = 'src/bat/assets/combined_20240704.csv' 18 | from bat.benchmark import Benchmark 19 | benchmark = Benchmark(pd.read_csv(csv_path)) 20 | for source, df in benchmark.df.groupby('source'): 21 | df.to_csv(f'src/bat/assets/benchmarks/{source}.csv', index=False) 22 | 23 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit test package for bat.""" 2 | -------------------------------------------------------------------------------- /tests/test_benchmark.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas as pd 3 | from bat import Benchmark # Replace your_module with the actual module name 4 | 5 | 6 | class TestBenchmark(unittest.TestCase): 7 | def setUp(self): 8 | # Create a sample DataFrame for testing 9 | data = { 10 | "model": ["model_a", "model_b", "model_a", "model_b"], 11 | "scenario": ["scenario_1", "scenario_1", "scenario_2", "scenario_2"], 12 | "score": [0.8, 0.7, 0.9, 0.6], 13 | } 14 | self.df = pd.DataFrame(data) 15 | self.benchmark = Benchmark(self.df, "test_source") 16 | 17 | def test_assign_df(self): 18 | # Check if DataFrame is assigned correctly 19 | self.assertEqual(self.benchmark.df.shape, (4, 5)) 20 | self.assertEqual(self.benchmark.df["source"].unique()[0], "test_source") 21 | 22 | def test_normalize_scores_per_scenario(self): 23 | # Test score normalization 24 | normalized_df = 
self.benchmark.normalize_scores_per_scenario() 25 | scenario_1_scores = normalized_df[normalized_df["scenario"] == "scenario_1"][ 26 | "score" 27 | ] 28 | scenario_2_scores = normalized_df[normalized_df["scenario"] == "scenario_2"][ 29 | "score" 30 | ] 31 | self.assertEqual(scenario_1_scores.min(), 0.0) 32 | self.assertEqual(scenario_1_scores.max(), 1.0) 33 | self.assertEqual(scenario_2_scores.min(), 0.0) 34 | self.assertEqual(scenario_2_scores.max(), 1.0) 35 | 36 | def test_add_aggragete(self): 37 | # Test aggregate column addition 38 | self.benchmark.add_aggregate( 39 | new_col_name="aggregate", agg_source_name="aggregated_source" 40 | ) 41 | self.assertIn("aggregate", self.benchmark.df["scenario"].unique()) 42 | aggregate_rows = self.benchmark.df[self.benchmark.df["scenario"] == "aggregate"] 43 | self.assertEqual(len(aggregate_rows), 2) # Two models, so two aggregate rows 44 | 45 | def test_validate_dataframe(self): 46 | # Test DataFrame validation (should pass with the sample DataFrame) 47 | self.benchmark.validate_dataframe_post_formatting() 48 | 49 | def test_extend(self): 50 | # Test extending the Benchmark object 51 | new_data = { 52 | "model": ["model_c"], 53 | "scenario": ["scenario_3"], 54 | "score": [0.5], 55 | "source": ["new_source"], 56 | "aggragated_from": [[]], 57 | } 58 | new_df = pd.DataFrame(new_data) 59 | new_benchmark = Benchmark(new_df, "new_source") 60 | self.benchmark.extend(new_benchmark) 61 | self.assertEqual(len(self.benchmark.df), 5) # Original 4 rows + 1 new row 62 | 63 | def test_get_models(self): 64 | # Test getting unique model names 65 | models = self.benchmark.get_models() 66 | self.assertEqual(set(models), {"model_a", "model_b"}) 67 | 68 | def test_get_scenarios(self): 69 | # Test getting unique scenario names 70 | scenarios = self.benchmark.get_scenarios() 71 | self.assertEqual(set(scenarios), {"scenario_1", "scenario_2"}) 72 | 73 | def test_get_model_appearences_count(self): 74 | # Test counting model appearances 75 | counts = self.benchmark.get_model_appearences_count() 76 | self.assertEqual(counts["model_a"], 2) 77 | self.assertEqual(counts["model_b"], 2) 78 | 79 | def test_get_scenario_appearences_count(self): 80 | # Test counting scenario appearances 81 | counts = self.benchmark.get_scenario_appearences_count() 82 | self.assertEqual(counts["scenario_1"], 2) 83 | self.assertEqual(counts["scenario_2"], 2) 84 | 85 | # Tests for show_overlapping_model_counts and clear_repeated_scenarios are more 86 | # complex and might require mocking or specific data setups to test effectively. 87 | # Consider adding these tests based on your specific needs and how you 88 | # handle plotting and data cleaning in those methods. 
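
    # A sketch of one such test, kept commented out because it assumes (rather
    # than verifies) the semantics of clear_repeated_scenarios() -- namely that
    # after cleaning, each scenario is kept under a single source. Check the
    # method in benchmark.py before enabling it:
    #
    # def test_clear_repeated_scenarios(self):
    #     other = Benchmark(self.df.copy(), "other_source")
    #     self.benchmark.extend(other)
    #     self.benchmark.clear_repeated_scenarios()
    #     sources_per_scenario = self.benchmark.df.groupby("scenario")["source"].nunique()
    #     self.assertTrue((sources_per_scenario == 1).all())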
89 | 90 | def test_validate_df_pre_formatting_unnamed_0(self): 91 | # Test DataFrame validation with 'Unnamed: 0' column 92 | bad_data = { 93 | "Unnamed: 0": [0, 1], 94 | "model": ["model_a", "model_b"], 95 | "scenario_1": [0.8, 0.7], 96 | "scenario_2": [0.9, 0.6], 97 | } 98 | bad_df = pd.DataFrame(bad_data) 99 | with self.assertRaises(ValueError) as context: 100 | Benchmark(bad_df, "test_source") 101 | self.assertIn( 102 | "DataFrame should not contain 'Unnamed: 0' column", str(context.exception) 103 | ) 104 | 105 | def test_validate_df_pre_formatting_missing_model(self): 106 | # Test DataFrame validation with missing 'model' column 107 | bad_data = { 108 | "scenario": ["scenario_1", "scenario_1", "scenario_2", "scenario_2"], 109 | "score": [0.8, 0.7, 0.9, 0.6], 110 | } 111 | bad_df = pd.DataFrame(bad_data) 112 | with self.assertRaises(ValueError) as context: 113 | Benchmark(bad_df, "test_source") 114 | self.assertIn("DataFrame must contain a 'model' column", str(context.exception)) 115 | 116 | def test_validate_df_pre_formatting_missing_scenario(self): 117 | # Test DataFrame validation with missing 'scenario' (and only 'model') 118 | bad_data = { 119 | "model": ["model_a", "model_b"], 120 | } 121 | bad_df = pd.DataFrame(bad_data) 122 | with self.assertRaises(ValueError) as context: 123 | Benchmark(bad_df, "test_source") 124 | self.assertIn( 125 | "DataFrame must contain at least 'model' and one scenario column", 126 | str(context.exception), 127 | ) 128 | 129 | def test_validate_df_pre_formatting_duplicate_model_scenario(self): 130 | # Test DataFrame validation with duplicate model-scenario pairs 131 | bad_data = { 132 | "model": ["model_a", "model_a", "model_b"], 133 | "scenario_1": [0.8, 0.9, 0.7], # Duplicate model_a for scenario_1 134 | "scenario_2": [0.7, 0.6, 0.8], 135 | } 136 | bad_df = pd.DataFrame(bad_data) 137 | with self.assertRaises(ValueError) as context: 138 | Benchmark(bad_df, "test_source") 139 | self.assertIn( 140 | "DataFrame contains duplicate model-scenario pairs", str(context.exception) 141 | ) 142 | 143 | def test_validate_df_pre_formatting_non_numeric_score(self): 144 | # Test DataFrame validation with non-numeric score 145 | bad_data = { 146 | "model": ["model_a", "model_b"], 147 | "scenario": ["scenario_1", "scenario_2"], 148 | "score": ["not_a_number", "also_not_a_number"], 149 | } 150 | bad_df = pd.DataFrame(bad_data) 151 | with self.assertRaises(ValueError) as context: 152 | Benchmark(bad_df, "test_source") 153 | self.assertIn("score must be numeric", str(context.exception)) 154 | 155 | def test_validate_dataframe_post_formatting_missing_columns(self): 156 | # Test with missing required columns after formatting 157 | data = {"model": ["model_a"], "scenario": ["scenario_1"], "score": [0.8]} 158 | df = pd.DataFrame(data) 159 | benchmark = Benchmark(df, "test_source") 160 | 161 | # Remove required columns and check if ValueError is raised 162 | benchmark.df.drop(columns=["source", "aggragated_from"], inplace=True) 163 | with self.assertRaises(ValueError) as context: 164 | benchmark.validate_dataframe_post_formatting() 165 | self.assertIn( 166 | "DataFrame must contain the following columns", str(context.exception) 167 | ) 168 | 169 | def test_validate_dataframe_post_formatting_non_numeric_score_after_formatting( 170 | self, 171 | ): 172 | # Test with non-numeric score after formatting 173 | data = { 174 | "model": ["model_a"], 175 | "scenario": ["scenario_1"], 176 | "score": [0.8], 177 | "source": ["test_source"], 178 | "aggragated_from": [[]], 179 | } 180 | df = 
pd.DataFrame(data) 181 | benchmark = Benchmark(df, "test_source") 182 | 183 | # Change score to non-numeric and check if ValueError is raised 184 | benchmark.df["score"] = "not_a_number" 185 | with self.assertRaises(ValueError) as context: 186 | benchmark.validate_dataframe_post_formatting() 187 | self.assertIn("score must be numeric", str(context.exception)) 188 | 189 | 190 | if __name__ == "__main__": 191 | unittest.main() 192 | --------------------------------------------------------------------------------
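
A minimal end-to-end sketch of how the pieces above compose (an illustration,
not part of the repository: it assumes the package is installed so that
bat.logic is importable, and it sticks to get_agreement, whose source appears
in logic.py above; the higher-level agreement_tester.py entry point is not
shown in this dump):

    import pandas as pd
    from bat.logic import get_agreement

    # one of the shipped result files; schema: model,score,scenario,source,aggragated_from,tag
    df = pd.read_csv("src/bat/assets/benchmarks_old/olmes_260624.csv")

    # Kendall-tau agreement between two OLMES scenarios over their shared models
    pair = df[df["scenario"].isin(["arc_c", "mmlu"])][["model", "scenario", "score"]]
    tau, p_value = get_agreement(pair, corr_type="kendall")
    print(f"arc_c vs. mmlu: tau={tau:.2f}, p={p_value:.3f}")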