├── .env_sample ├── .github ├── ISSUE_TEMPLATE │ ├── 1-bug-report.yml │ ├── 2-feature-request.yml │ ├── 3-new-eval-request.yml │ ├── 4-documentation-improve.yml │ └── config.yml └── pull_request_template.md ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── assets ├── Evalverse_Color.png ├── Evalverse_White.png ├── architecture.png ├── intro-evalverse.png ├── overview.png └── sample_report.png ├── contribution └── CONTRIBUTING.md ├── evalverse ├── README.md ├── __init__.py ├── connector.py ├── evaluator.py ├── reporter.py ├── slack_bot.py ├── tests │ ├── test_evaluator.py │ ├── test_reporter.py │ ├── test_reproducibility.py │ └── test_results │ │ └── SOLAR-10.7B-Instruct-v1.0 │ │ └── h6_en │ │ ├── arc_challenge_25.json │ │ ├── gsm8k_5.json │ │ ├── hellaswag_10.json │ │ ├── mmlu_5.json │ │ ├── truthfulqa_mc2_0.json │ │ └── winogrande_5.json └── utils.py ├── examples ├── 01_basic_usage.ipynb ├── 02_advanced_usage.ipynb ├── README.md ├── db │ ├── figures │ │ └── figure_20240402_105011.jpeg │ ├── score_df.csv │ └── scores │ │ └── table_20240402_105011.csv └── results │ ├── Llama-2-7b-chat-hf │ ├── eq_bench │ │ ├── benchmark_results.csv │ │ └── raw_results.json │ ├── h6_en │ │ ├── arc_challenge_25.json │ │ ├── gsm8k_5.json │ │ ├── hellaswag_10.json │ │ ├── mmlu_5.json │ │ ├── truthfulqa_mc2_0.json │ │ └── winogrande_5.json │ ├── ifeval │ │ ├── eval_results_loose.jsonl │ │ ├── eval_results_strict.jsonl │ │ ├── output.jsonl │ │ └── scores.txt │ └── mt_bench │ │ ├── model_answer │ │ └── Llama-2-7b-chat-hf.jsonl │ │ ├── model_judgment │ │ └── gpt-4_single.jsonl │ │ └── scores.txt │ └── SOLAR-10.7B-Instruct-v1.0 │ ├── eq_bench │ ├── benchmark_results.csv │ └── raw_results.json │ ├── h6_en │ ├── arc_challenge_25.json │ ├── gsm8k_5.json │ ├── hellaswag_10.json │ ├── mmlu_5.json │ ├── truthfulqa_mc2_0.json │ └── winogrande_5.json │ ├── ifeval │ ├── eval_results_loose.jsonl │ ├── eval_results_strict.jsonl │ ├── output.jsonl │ └── scores.txt │ └── mt_bench │ ├── model_answer │ └── SOLAR-10.7B-Instruct-v1.0.jsonl │ ├── model_judgment │ └── gpt-4_single.jsonl │ └── scores.txt ├── poetry.lock └── pyproject.toml /.env_sample: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=sk-... 2 | 3 | SLACK_BOT_TOKEN=xoxb-... 4 | SLACK_APP_TOKEN=xapp-... -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/1-bug-report.yml: -------------------------------------------------------------------------------- 1 | name: "🐛 Bug Report" 2 | description: Create a new ticket for a bug. 3 | title: "🐛 [BUG] - " 4 | labels: [ 5 | "bug" 6 | ] 7 | 8 | body: 9 | - type: textarea 10 | id: environment-setting 11 | attributes: 12 | label: "Environment Settings" 13 | description: Python version, ... 14 | placeholder: Let us explain your environment settings to reproduce 15 | validations: 16 | required: true 17 | 18 | - type: textarea 19 | id: expected-behavior 20 | attributes: 21 | label: "Expected Behavior" 22 | placeholder: A clear and concise description of what you would expect to happen. 23 | validations: 24 | required: true 25 | 26 | - type: textarea 27 | id: actual-behavior 28 | attributes: 29 | label: "Actual Behavior" 30 | placeholder: A clear and concise description of what actually happened. 31 | 32 | - type: textarea 33 | id: reproduction 34 | attributes: 35 | label: Reproduction 36 | description: | 37 | Please enter an explicit steps to reproduce your problem. 38 | If you have any code snippets, error messages, and etc., please provide them here. 39 | 40 | placeholder: | 41 | Steps to reproduce: 42 | 43 | 1. 44 | 2. 45 | 3. 46 | 4. 47 | validations: 48 | required: true -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/2-feature-request.yml: -------------------------------------------------------------------------------- 1 | name: "🚀 Feature Request" 2 | description: Suggesting new desired feature and enhancement of existing feature 3 | title: "🚀 [REQUEST] - <title>" 4 | labels: [ 5 | "enhancement", "feature" 6 | ] 7 | 8 | body: 9 | - type: textarea 10 | id: feature-request 11 | attributes: 12 | label: Feature request 13 | description: | 14 | Please describe the feature you want to add or needs to be enhanced. 15 | If you have any related paper or code, please provide us. 16 | validations: 17 | required: true 18 | 19 | 20 | - type: textarea 21 | id: context 22 | validations: 23 | required: false 24 | attributes: 25 | label: Context 26 | description: | 27 | Please let us know your motivation or additional context for this suggestion. 28 | Knowing the reason why it needs to be add/enhanced makes us easy to understand the need. 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/3-new-eval-request.yml: -------------------------------------------------------------------------------- 1 | name: "🖍️ New Eval Request" 2 | description: Suggesting new desired evluation method or enhancement of existing evaluation method 3 | title: "🖍️ [REQUEST] - <title>" 4 | labels: [ 5 | "enhancement", "feature" 6 | ] 7 | 8 | body: 9 | - type: textarea 10 | id: new-eval-request 11 | attributes: 12 | label: New evaluation method request 13 | description: | 14 | Please describe the evaluation method you want to add or needs to be enhanced. 15 | If you have any related paper or code, please provide us. 16 | validations: 17 | required: true 18 | 19 | 20 | - type: textarea 21 | id: context 22 | validations: 23 | required: false 24 | attributes: 25 | label: Context 26 | description: | 27 | Please let us know your motivation or additional context for this suggestion. 28 | Knowing the reason why it needs to be add/enhanced makes us easy to understand the need. 29 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/4-documentation-improve.yml: -------------------------------------------------------------------------------- 1 | name: "📝 Documentation Improvement" 2 | description: Report wrong or missing documentation. You can suggest new document or document that needs any improvement. 3 | title: "📝 [Docs] - <title>" 4 | labels: [ 5 | "docs" 6 | ] 7 | 8 | body: 9 | - type: checkboxes 10 | attributes: 11 | label: evalverse version checks 12 | options: 13 | - label: > 14 | I have checked that the issue still exists on the latest versions of the _evalverse_. 15 | required: true 16 | 17 | - type: textarea 18 | id: location 19 | attributes: 20 | label: Location of the documentation 21 | description: > 22 | Please provide the location of the documentation. 23 | If you are suggesting new document, please provide appropriate place it has to be. 24 | validations: 25 | required: true 26 | 27 | - type: textarea 28 | id: problem 29 | attributes: 30 | label: Documentation problem 31 | description: > 32 | Please provide a description of what documentation you believe needs to be fixed/improved/added. 33 | validations: 34 | required: true 35 | 36 | - type: textarea 37 | id: suggestion 38 | attributes: 39 | label: Suggestion 40 | description: > 41 | Please explain the suggested fix and **why** it's better than the existing documentation. 42 | Or it could be content of new document you are suggesting. 43 | validations: 44 | required: true -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## PR Checklist 2 | Please check if your PR fulfills the following requirements: 3 | 4 | - [ ] The commit message follows _evalverse_ guidelines [link](https://github.com/UpstageAI/evalverse/blob/main/contribution/CONTRIBUTING.md#commit-messages): 5 | - [ ] Tests for the changes have been added (for bug fixes / features) 6 | - [ ] Docs have been added / updated (for bug fixes / features) 7 | 8 | 9 | ## What does this PR do? 10 | <!-- Please describe the link to a relevant issue and current behavior that you are modifying.--> 11 | 12 | - Issue Number: # 13 | - Description: -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | # Files from evalverse slack_bot 163 | db/* 164 | 165 | # Files from reproducibility test 166 | evalverse/tests/test_results_reproduced/* 167 | 168 | # Evaluation results -> please use git add -f <diff> for commiting results 169 | results/* 170 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "evalverse/submodules/FastChat"] 2 | path = evalverse/submodules/FastChat 3 | url = https://github.com/UpstageAI/evalverse-FastChat.git 4 | branch = main 5 | [submodule "evalverse/submodules/IFEval"] 6 | path = evalverse/submodules/IFEval 7 | url = https://github.com/UpstageAI/evalverse-IFEval.git 8 | branch = main 9 | [submodule "evalverse/submodules/EQBench"] 10 | path = evalverse/submodules/EQBench 11 | url = https://github.com/UpstageAI/evalverse-EQBench.git 12 | branch = main 13 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v3.2.0 4 | hooks: 5 | # - id: trailing-whitespace 6 | - id: check-added-large-files 7 | - id: detect-private-key 8 | - id: detect-aws-credentials 9 | args: [--allow-missing-credentials] 10 | - repo: https://github.com/pycqa/isort 11 | rev: 5.13.2 12 | hooks: 13 | - id: isort 14 | args: [ 15 | --profile=black, 16 | ] 17 | - repo: https://github.com/psf/black 18 | rev: 23.12.1 19 | hooks: 20 | - id: black 21 | args: [ 22 | --line-length=100, 23 | ] 24 | 25 | - repo: https://github.com/myint/autoflake 26 | rev: v2.2.0 27 | hooks: 28 | - id: autoflake 29 | args: [ 30 | # --in-place, 31 | # --remove-unused-variables, 32 | # --remove-all-unused-imports, 33 | --expand-star-imports, 34 | ] 35 | - repo: https://github.com/PyCQA/flake8 36 | rev: 6.0.0 37 | hooks: 38 | - id: flake8 39 | args: [ 40 | "--ignore=E203, E221, E231, E501, W503", 41 | ] 42 | # E203: Whitespace before ':' 43 | # E221: multiple spaces before operator 44 | # E231: missing whitespace after ',' 45 | # E501: line length - because black checks and this makes error even on commented code 46 | # W503: PEP8 now recommends to break before binary operator (https://peps.python.org/pep-0008/#should-a-line-break-before-or-after-a-binary-operator) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | <div align="center"> 2 | <picture> 3 | <source media="(prefers-color-scheme: dark)" srcset="assets/Evalverse_White.png" width=300> 4 | <source media="(prefers-color-scheme: light)" srcset="assets/Evalverse_Color.png" width=300> 5 | <img alt="Evalverse" src="assets/Evalverse_Color.png" width=300> 6 | </picture> 7 | 8 | The Universe of Evaluation. 9 | All about the evaluation for LLMs. </br> 10 | Upstage Solar is powered by Evalverse! Try at Upstage [Console](https://console.upstage.ai/)! 11 | 12 | [🤗HugginFace Space](https://huggingface.co/spaces/upstage/evalverse-space) • [📚Docs](https://evalverse.gitbook.io/evalverse-docs) • [📄Paper](https://arxiv.org/abs/2404.00943) 13 | 14 | [Examples](https://github.com/UpstageAI/evalverse/tree/main/examples) • [FAQ](https://evalverse.gitbook.io/evalverse-docs/documents/faqs) • [Contribution Guide](https://github.com/UpstageAI/evalverse/blob/main/contribution/CONTRIBUTING.md) • [Contact](mailto:evalverse@upstage.ai) • [Discord](https://discord.gg/D3bBj66K) 15 | </div> 16 | 17 | ### 🚀 Newly updated 18 | - [2024.05.10] LLM-Evaluation Report of Evalverse is now available on [HuggingFace Space](https://huggingface.co/spaces/upstage/evalverse-space). 19 | 20 | <div align="center"><img alt="overview" src="assets/overview.png" width=500></div> 21 | 22 | 23 | ## 👋 Welcome to Evalverse! 24 | Evalverse is a freely accessible, open-source project designed to support your LLM (Large Language Model) evaluation needs. We provide a simple, standardized, and user-friendly solution for the processing and management of LLM evaluations, catering to the needs of AI research engineers and scientists. We also support no-code evaluation processes for people who may have less experience working with LLMs. Moreover, you will receive a well-organized report with figures summarizing the evaluation results. 25 | 26 | ### With Evalverse, you are empowered to 27 | - access various evaluation methods without juggling multiple libraries. 28 | - receive insightful report about the evaluation results that helps you to compare the varied scores across different models. 29 | - initiate evaluation and generate reports without any code via Slack bot. 30 | 31 | 32 | ### Architecture of Evalverse 33 | <div align="center"><img alt="architecture" src="assets/architecture.png" width=700></div> 34 | 35 | ### Key Features of Evalverse 36 | - **Unified evaluation with Submodules**: Evalverse extends its evaluation capabilities through Git submodules, effortlessly incorporating frameworks like [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) and [FastChat](https://github.com/lm-sys/FastChat). Swiftly add new tools and keep pace with the latest in LLM evaluation. 37 | - **No-code evaluation request**: With Evalverse, request LLM evaluations without any code, simply by sending `Request!` in a direct message or Slack channel with an activate Evalverse Slack bot. Enter the model name in the Huggingface hub or local model directory path in Slack, and let the bot handle the rest. 38 | - **LLM evaluation report**: Obtain comprehensive, no-code reports from Evalverse. Request with a simple command -`Report!`-, select the model and evaluation criteria, and receive detailed reports with scores, rankings, and visuals, all generated from the stored score database. 39 | 40 | 41 | If you want to know more about Evalverse, please checkout our [docs](https://evalverse.gitbook.io/evalverse-docs). </br> 42 | By clicking below image, it'll take you to a short intro video! 43 | [![Brief Introduction](./assets/intro-evalverse.png)](https://www.youtube.com/watch?v=-VviAutjpgM) 44 | </br> 45 | 46 | ## 🌌 Installation 47 | ### 🌠 Option 1: Git clone 48 | Before cloning, please make sure you've registered proper SSH keys linked to your GitHub account. 49 | 50 | #### 1. Clone the Evalverse repository 51 | - Notes: add `--recursive` option to also clone submodules 52 | ``` 53 | git clone --recursive https://github.com/UpstageAI/evalverse.git 54 | ``` 55 | #### 2. Install requirement packages 56 | ``` 57 | cd evalverse 58 | pip install -e . 59 | ``` 60 | 61 | ### 🌠 Option 2: Install via Pypi *(WIP)* 62 | > Currently, installation via Pypi is not supported. Please install Evalverse with option 1. 63 | 64 | 65 | </br> 66 | 67 | ## 🌌 Configuration 68 | You have to set an API key and/or Token in the `.env` file (rename `.env_sample` to `.env`) to use all features of Evalverse. 69 | - OpenAI API Key (required for `mt_bench`) 70 | - Slack BOT/APP Token (required for slack reporter) 71 | ``` 72 | OPENAI_API_KEY=sk-... 73 | 74 | SLACK_BOT_TOKEN=xoxb-... 75 | SLACK_APP_TOKEN=xapp-... 76 | ``` 77 | 78 | </br> 79 | 80 | ## 🌌 Quickstart 81 | More detailed tutorials are [here](https://github.com/UpstageAI/evalverse/tree/main/examples). 82 | 83 | - [basic_usage.ipynb](https://github.com/UpstageAI/evalverse/tree/main/examples/01_basic_usage.ipynb): Very basic usage, like how to use `Evaluator` for evaluation and `Reporter` for generating report. 84 | - [advanced_usage.ipynb](https://github.com/UpstageAI/evalverse/tree/main/examples/02_advanced_usage.ipynb): Introduces methods for evaluating each benchmark and all benchmarks collectively. 85 | 86 | ### 🌠 Evaluation 87 | #### 💫 Evaluation with Library 88 | The following code is a simple example to evaluate the [SOLAR-10.7B-Instruct-v1.0 model](https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0) on the `h6_en` ([Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)) benchmark. 89 | 90 | ```python 91 | import evalverse as ev 92 | 93 | evaluator = ev.Evaluator() 94 | 95 | model = "upstage/SOLAR-10.7B-Instruct-v1.0" 96 | benchmark = "h6_en" 97 | 98 | evaluator.run(model=model, benchmark=benchmark) 99 | ``` 100 | 101 | 102 | #### 💫 Evaluation with CLI 103 | Here is a CLI script that produces the same result as the above code: 104 | 105 | ```bash 106 | cd evalverse 107 | 108 | python3 evaluator.py \ 109 | --h6_en \ 110 | --ckpt_path upstage/SOLAR-10.7B-Instruct-v1.0 111 | ``` 112 | ### 🌠 Report 113 | Currently, generating a report is only available through the library. We will work on a Command Line Interface (CLI) version as soon as possible. 114 | 115 | ```python 116 | import evalverse as ev 117 | 118 | db_path = "./db" 119 | output_path = "./results" 120 | reporter = ev.Reporter(db_path=db_path, output_path=output_path) 121 | 122 | reporter.update_db(save=True) 123 | 124 | model_list = ["SOLAR-10.7B-Instruct-v1.0", "Llama-2-7b-chat-hf"] 125 | benchmark_list = ["h6_en"] 126 | reporter.run(model_list=model_list, benchmark_list=benchmark_list) 127 | ``` 128 | <img alt="architecture" src="assets/sample_report.png" width=700> 129 | 130 | | Model | Ranking | total_avg | H6-ARC | H6-Hellaswag | H6-MMLU | H6-TruthfulQA | H6-Winogrande | H6-GSM8k | 131 | |--------------------------:|--------:|----------:|-------:|-------------:|--------:|--------------:|--------------:|---------:| 132 | | SOLAR-10.7B-Instruct-v1.0 | 1 | 74.62 | 71.33 | 88.19 | 65.52 | 71.72 | 83.19 | 67.78 | 133 | | Llama-2-7b-chat-hf | 2 | 53.51 | 53.16 | 78.59 | 47.38 | 45.31 | 72.69 | 23.96 | 134 | 135 | </br> 136 | 137 | ## 🌌 Supported Evaluations 138 | We currently support four evaluation methods. If you have suggestions for new methods, we welcome your input! 139 | 140 | | Evaluation | Original Repository | 141 | |---------------------------|--------------------------------------------| 142 | | H6 (Open LLM Leaderboard) | [EleutherAI](https://github.com/EleutherAI)/[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)| 143 | | MT-bench | [lm-sys](https://github.com/lm-sys)/[FastChat](https://github.com/lm-sys/FastChat)| 144 | | IFEval | [google-research](https://github.com/google-research/google-research/tree/master)/[instruction_following_eval](https://github.com/google-research/google-research/tree/master/instruction_following_eval)| 145 | | EQ-Bench | [EQ-bench](https://github.com/EQ-bench)/[EQ-Bench](https://github.com/EQ-bench/EQ-Bench)| 146 | 147 | </br> 148 | 149 | ## 🌌 Evalverse use-case 150 | > If you have any use-cases of your own, please feel free to let us know. </br>We would love to hear about them and possibly feature your case. 151 | 152 | 153 | *✨* [`Upstage`](https://www.upstage.ai/) is using Evalverse for evaluating [Solar](https://console.upstage.ai/services/solar?utm_source=upstage.ai&utm_medium=referral&utm_campaign=Main+hero+Solar+card&utm_term=Try+API+for+Free&utm_content=home). </br> 154 | *✨* [`Upstage`](https://www.upstage.ai/) is using Evalverse for evaluating models at [Open Ko-LLM Leaderboard](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard). 155 | 156 | </br> 157 | 158 | ## 🌌 Contributors 159 | <a href="https://github.com/UpstageAI/evalverse/graphs/contributors"> 160 | <img src="https://contrib.rocks/image?repo=UpstageAI/evalverse"/> 161 | </a> 162 | 163 | 164 | ## 🌌 Acknowledgements 165 | Evalverse is an open-source project orchestrated by the **Data-Centric LLM Team** at `Upstage`, designed as an ecosystem for LLM evaluation. Launched in April 2024, this initiative stands at the forefront of advancing evaluation handling in the realm of large language models (LLMs). 166 | 167 | ## 🌌 License 168 | Evalverse is completely freely-accessible open-source and licensed under the Apache License 2.0. 169 | 170 | ## 🌌 Citation 171 | If you want to cite our 🌌 Evalverse project, feel free to use the following bibtex. You can check our paper via [link](https://arxiv.org/abs/2404.00943). 172 | 173 | ```bibtex 174 | @misc{kim2024evalverse, 175 | title={Evalverse: Unified and Accessible Library for Large Language Model Evaluation}, 176 | author={Jihoo Kim and Wonho Song and Dahyun Kim and Yunsu Kim and Yungi Kim and Chanjun Park}, 177 | year={2024}, 178 | eprint={2404.00943}, 179 | archivePrefix={arXiv}, 180 | primaryClass={cs.CL} 181 | } 182 | ``` 183 | -------------------------------------------------------------------------------- /assets/Evalverse_Color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/06f85eefd4b82385c549ea3bc29ff3e177f3a84b/assets/Evalverse_Color.png -------------------------------------------------------------------------------- /assets/Evalverse_White.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/06f85eefd4b82385c549ea3bc29ff3e177f3a84b/assets/Evalverse_White.png -------------------------------------------------------------------------------- /assets/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/06f85eefd4b82385c549ea3bc29ff3e177f3a84b/assets/architecture.png -------------------------------------------------------------------------------- /assets/intro-evalverse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/06f85eefd4b82385c549ea3bc29ff3e177f3a84b/assets/intro-evalverse.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/06f85eefd4b82385c549ea3bc29ff3e177f3a84b/assets/overview.png -------------------------------------------------------------------------------- /assets/sample_report.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/06f85eefd4b82385c549ea3bc29ff3e177f3a84b/assets/sample_report.png -------------------------------------------------------------------------------- /contribution/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # __Contribution Guidelines__ 2 | Welcome to _Evalverse_! We warmly welcome any kind of contribution 😊✨. </br> 3 | This page provides an outline on how to contribute to _Evalverse_ and suggestions for nice conventions to follow. 4 | > __These are guidelines, NOT rules 💡__ <p> 5 | This page is not the Constituion of the _Evalverse_. We are providing guidelines to help you make a useful and efficient contribution to _Evalverse_. While we think these guidelines are sensible and we appreciate when they are observed, following them isn't strictly required. We hope you won't be tired by these guidelines. Also, we'd love to hear your ideas on how to improve our guidelines! 6 | 7 | </br> 8 | 9 | # Table of Contents 10 | - [Questions or Feedback](#questions-or-feedback) 11 | - [🤝 How to Contribute?](#how-to-contribute) 12 | - [Commit Guidelines](#commit-guidelines) 13 | - [Style Guides](#style-guides) 14 | 15 | </br> 16 | 17 | # Questions or Feedback 18 | Join the conversation on our GitHub discussion board! It's the go-to spot for questions, chats, and a helping hand from the _Evalverse_ community. Drop by and say hello here: [link](https://github.com/UpstageAI/evalverse/discussions) 19 | 20 | And if there's a shiny new feature you're dreaming of, don't be shy—head over to our [issue page](https://github.com/UpstageAI/evalverse/issues) to let us know! Your input could help shape the future. ✨ 21 | 22 | </br> 23 | 24 | # How to Contribute? 25 | - Any kind of improvement of document: fixing typo, enhancing grammar or semantic structuring or adding new examples. 26 | - Submit issues related to bugs, new desired features, or enhancement of existing features. 27 | - Fix a bug, implement new feature or improving existing feature. 28 | - Answer other users' question or help. 29 | 30 | 31 | ## __Report a Bug / Request New Feature / Suggest Enhancements__ 32 | Please open an issue whenever you find a bug or have an idea to enhance _Evalverse_. Maintainers will label it or leave comment on it as soon as they check the issue. Issues labeled as `Open for contribution` mean they are open for contribution. 33 | 34 | ## __Fix a Bug / Add New Feature / Improve Existing Feature__ 35 | If you have a particular roadmap, goals, or new feature, share it via issue. already fixed a bug or have new feature that enhances _Evalverse_, you can jump on to fourth step which is opening pull requests. Please note that when you open pull requests without opening an issue or maintainers' check, it can be declined if it does not aligh with philosophy of _Evalverse_. 36 | 37 | ### __1️⃣ Check issues labeled as__ `Open for contribution` 38 | You can find issues waiting for your contribution by filtering label with `Open for contribution`. This label does not stand alone. It is always with `Bug`, `Docs` or `Enhancement`. Issues with `Critical` or `ASAP` label are more urgent. 39 | 40 | 41 | ### __2️⃣ Leave a comment on the issue you want to contribute__ 42 | Once we review your comment, we'll entrust the issue to you by swapping out the `Open for contribution` label for a `WIP` (Work in Progress) label. 43 | 44 | ### __3️⃣ Work on it__ 45 | Before diving into coding, do take a moment to familiarize yourself with our coding style by visiting this [style guides](#style-guides). And hey, if you hit a snag while tackling the issue, don't hesitate to drop a comment right there. Our community is a supportive bunch and will jump in to assist or brainstorm with you. 46 | 47 | 1. Fork the repository of _Evalverse_. 48 | 2. Clone your fork to your local disk. 49 | 3. Create a new branch to hold your develompment changes. </br> 50 | It's not required to adhere strictly to the branch naming example provided; consider it a mild suggestion. 51 | ```bash 52 | git checkout -b {prefix}/{issue-number}-{description} 53 | ``` 54 | 4. Set up a development environment 55 | 5. Develop the features in your branch 56 | 57 | 58 | ### __4️⃣ Create a Pull Request__ 59 | Go ahead and visit your GitHub fork, then initiate a pull request — it's time to share your awesome work! Before you do, double-check that you've completed everything on the checklist we provided. Once you're all set, submit your contributions for the project maintainers to review. 60 | 61 | Don't worry if the maintainers have some feedback or suggest changes—it's all part of the process and happens to even our most experienced contributors. Keep your updates flowing by working in your local branch and pushing any new changes to your fork. Your pull request will update automatically for everyone to see the progress. 62 | 63 | </br> 64 | 65 | # Commit Guidelines 66 | ### Commit strategy 67 | - Avoid mixing multiple, unrelated modifications in a single commit. One commit is related with one issue. 68 | - Each commit should encapsulate a complete, autonomous upgrade to the code. 69 | 70 | ### Commit messages 71 | Please make sure your commit messages follow `type`: `title (#<related issue number>)` format. <br/> 72 | For example: 73 | ```plain text 74 | <TYPE>: Short summary with 72 characters or less (#<Issue number>) 75 | 76 | If you have more detalied explanatory text, put it as body. 77 | But the body is optional. 78 | ``` 79 | - Find adequate type in the below list: 80 | - `NEW`: introducing a new feature 81 | - `ENHANCE`: improve an existing code/feature. 82 | - `FIX`: fix a code bug 83 | - `DOCS`: write/update/add any kind of documents including docstring 84 | - `REFACTOR`: refactor existing code without any specific improvements 85 | - `STYLE`: changes that do not affect the meaning of the code (ex. white-space, line length) 86 | - `TEST`: add additional testing 87 | - `DEL`: remove code or files 88 | - `RELEASE`: release new version of evalverse 89 | - `OTHER`: anything not covered above (not recommended) 90 | - Use the present tense ("Add feature" not "Added feature") 91 | - Do not end the subject line with a punctuation 92 | 93 | </br> 94 | 95 | # Style Guides 96 | ### Pre-commit hook 97 | We provide a pre-commit git hook for style check. You can find exact check list in this [file](https://github.com/UpstageAI/evalverse/blob/main/.pre-commit-config.yaml). <br/> Please run the code below before a commit is created: 98 | ```bash 99 | pre-commit run 100 | ``` 101 | 102 | -------------------------------------------------------------------------------- /evalverse/README.md: -------------------------------------------------------------------------------- 1 | # Evalverse 2 | > The Universe of Evaluation. All about the evaluation for LLMs. 3 | 4 | 5 | ## 🌌 Submodule 6 | > The Submodule serves as the evaluation engine that is responsible for the heavy lifting involved in evaluating LLMs. Publicly available LLM evaluation libraries can be integrated into Evalverse as submodules. This component makes Evalverse expandable, thereby ensuring that the library remains up-to-date. 7 | 8 | ## 🌌 Connector 9 | > The Connector plays a role in linking the Submodules with the Evaluator. It contains evaluation scripts, along with the necessary arguments, from various external libraries. 10 | 11 | ## 🌌 Evaluator 12 | > The Evaluator performs the requested evaluations on the Compute Cluster by utilizing the evaluation scripts from the Connector. The Evaluator can receive evaluation requests either from the Reporter, which facilitates a no-code evaluation approach, or directly from the end-user for code-based evaluation. 13 | 14 | ## 🌌 Reporter 15 | > The Reporter handles the evaluation and report requests sent by the users, allowing for a no-code approach to LLM evaluation. The Reporter sends the requested evaluation jobs to the Evaluator and fetches the evaluation results from the Database, which are sent to the user via an external communication platform such as Slack. Through this, users can receive table and figure that summarize evaluation results. -------------------------------------------------------------------------------- /evalverse/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib.metadata 2 | 3 | from evalverse.evaluator import Evaluator 4 | from evalverse.reporter import Reporter 5 | 6 | __version__ = importlib.metadata.version("evalverse") 7 | 8 | __all__ = [Evaluator, Reporter] 9 | -------------------------------------------------------------------------------- /evalverse/connector.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024-present Upstage Co., Ltd. 3 | Apache-2.0 license 4 | """ 5 | import json 6 | import os 7 | 8 | from evalverse.utils import EVALVERSE_MODULE_PATH, print_command, print_txt_file 9 | 10 | 11 | def lm_evaluation_harness( 12 | model_path="upstage/SOLAR-10.7B-Instruct-v1.0", 13 | tasks="arc_challenge", 14 | batch_size=16, 15 | use_vllm=False, 16 | gpu_memory_utilization=0.8, 17 | tensor_parallel_size=1, 18 | data_parallel_size=1, 19 | num_fewshot=0, 20 | use_fast_tokenizer=False, 21 | use_flash_attention_2=False, 22 | load_in_8bit=False, 23 | load_in_4bit=False, 24 | output_path="../results", 25 | ): 26 | output_json_path = os.path.join(output_path, f"{tasks}_{num_fewshot}.json") 27 | 28 | if not os.path.exists(output_json_path): 29 | if use_vllm: 30 | tokenizer_mode = "auto" if use_fast_tokenizer else "slow" 31 | eval_cmd = f""" 32 | lm_eval --model vllm \ 33 | --model_args pretrained={model_path},trust_remote_code=True,tensor_parallel_size={tensor_parallel_size},dtype=float16,gpu_memory_utilization={gpu_memory_utilization},data_parallel_size={data_parallel_size},tokenizer_mode={tokenizer_mode} \ 34 | --tasks {tasks} \ 35 | --batch_size {batch_size} \ 36 | --num_fewshot {num_fewshot} \ 37 | --output_path {output_json_path} \ 38 | """ 39 | else: 40 | hf_cmd = "lm_eval --model hf" 41 | model_args = f"pretrained={model_path},trust_remote_code=True,dtype=float16,use_fast_tokenizer={use_fast_tokenizer},use_flash_attention_2={use_flash_attention_2}" 42 | 43 | if data_parallel_size > 1: 44 | hf_cmd = "accelerate launch -m " + hf_cmd 45 | if tensor_parallel_size > 1: 46 | model_args = model_args + ",parallelize=True" 47 | if load_in_8bit: 48 | model_args = model_args + ",load_in_8bit=True" 49 | if load_in_4bit: 50 | model_args = model_args + ",load_in_4bit=True" 51 | 52 | eval_cmd = f""" 53 | NCCL_P2P_DISABLE=1 {hf_cmd} \ 54 | --model_args {model_args} \ 55 | --tasks {tasks} \ 56 | --batch_size {batch_size} \ 57 | --num_fewshot {num_fewshot} \ 58 | --output_path {output_json_path} \ 59 | """ 60 | print_command(eval_cmd) 61 | os.system(eval_cmd) 62 | 63 | else: 64 | print(f"The result already exists: {os.path.abspath(output_json_path)}") 65 | 66 | 67 | def fastchat_llm_judge( 68 | model_path="upstage/SOLAR-10.7B-Instruct-v1.0", 69 | model_id="SOLAR-10.7B-Instruct-v1.0", 70 | mt_bench_name="mt_bench", 71 | baselines=None, 72 | judge_model="gpt-4", 73 | num_gpus_per_model=1, 74 | num_gpus_total=1, 75 | parallel_api=1, 76 | output_path="../results", 77 | ): 78 | scores_file = os.path.join(output_path, model_id, "mt_bench", "scores.txt") 79 | 80 | if not os.path.exists(scores_file): 81 | if baselines: 82 | model_list = " ".join([model_id] + baselines.split(",")) 83 | else: 84 | model_list = model_id 85 | 86 | eval_code_path = os.path.join( 87 | EVALVERSE_MODULE_PATH, "submodules/FastChat/fastchat/llm_judge" 88 | ) 89 | answer_path = os.path.join(output_path, model_id, "mt_bench", "model_answer") 90 | answer_file = os.path.join(answer_path, f"{model_id}.jsonl") 91 | judgement_path = os.path.join(output_path, model_id, "mt_bench", "model_judgment") 92 | judgement_file = os.path.join(judgement_path, "gpt-4_single.jsonl") 93 | 94 | gen_answer_cmd = f"python3 gen_model_answer.py --model-path {model_path} --model-id {model_id} --bench-name {mt_bench_name} --answer-file {answer_file} --num-gpus-per-model {num_gpus_per_model} --num-gpus-total {num_gpus_total}" 95 | gen_judgment_cmd = f"echo -e '\n' | python3 gen_judgment.py --model-list {model_list} --bench-name {mt_bench_name} --model-answer-dir {answer_path} --model-judgement-dir {judgement_path} --judge-model {judge_model} --parallel {parallel_api}" 96 | save_result_cmd = f"python3 show_result.py --model-list {model_list} --bench-name {mt_bench_name} --judge-model {judge_model} --input-file {judgement_file} > {os.path.join(output_path, model_id, 'mt_bench', 'scores.txt')}" 97 | 98 | eval_cmd = f"cd {eval_code_path}" 99 | if not os.path.exists(answer_file): 100 | eval_cmd += f" && {gen_answer_cmd}" 101 | if not os.path.exists(judgement_file): 102 | eval_cmd += f" && {gen_judgment_cmd}" 103 | eval_cmd += f" && {save_result_cmd}" 104 | print_command(eval_cmd) 105 | os.system(eval_cmd) 106 | else: 107 | print(f"The result already exists: {os.path.abspath(scores_file)}") 108 | # print results 109 | print_txt_file(scores_file) 110 | 111 | 112 | def instruction_following_eval( 113 | model_path="upstage/SOLAR-10.7B-Instruct-v1.0", 114 | model_name="SOLAR-10.7B-Instruct-v1.0", 115 | gpu_per_inst_eval=1, 116 | devices="0", 117 | output_path="../results", 118 | ): 119 | scores_file = os.path.join(output_path, model_name, "ifeval", "scores.txt") 120 | 121 | if not os.path.exists(scores_file): 122 | eval_code_path = os.path.join(os.path.join(EVALVERSE_MODULE_PATH, "submodules/IFEval")) 123 | 124 | eval_cmd = f""" 125 | cd {eval_code_path} && python3 inst_eval.py \ 126 | --model {model_path} \ 127 | --model_name {model_name} \ 128 | --gpu_per_inst_eval {gpu_per_inst_eval} \ 129 | --output_path {output_path} \ 130 | --devices {devices} 131 | """ 132 | print_command(eval_cmd) 133 | os.system(eval_cmd) 134 | else: 135 | print(f"The result already exists: {os.path.abspath(scores_file)}") 136 | # print results 137 | print_txt_file(scores_file) 138 | 139 | 140 | def eq_bench( 141 | model_name="SOLAR-10.7B-Instruct-v1.0", # model name for saving results 142 | prompt_type="ChatML", # Chat template 143 | model_path="upstage/SOLAR-10.7B-Instruct-v1.0", # model path 144 | lora_path=None, # lora adapter path 145 | quantization=None, # quantization, [None, "8bit", "4bit"] for load_in_8bit etc. 146 | n_iterations=1, # number of iterations to repeat the inference 147 | devices="0", # cuda devices 148 | use_fast_tokenizer=False, # use fast tokenizer 149 | gpu_per_proc=1, # gpu per process, currently only supports 1 150 | use_flash_attention_2=True, # use flash attention 2 151 | torch_dtype="b16", # torch dtype, [b16, f16, f32] 152 | output_path="../results", # output path 153 | ): 154 | result_file = os.path.join(output_path, model_name, "eq_bench", "raw_results.json") 155 | if not os.path.exists(result_file): 156 | assert gpu_per_proc == 1, "Currently only supports 1 gpu per process" 157 | 158 | eval_code_path = os.path.join(os.path.join(EVALVERSE_MODULE_PATH, "submodules/EQBench")) 159 | single_eval_code = f""" 160 | CUDA_VISIBLE_DEVICES={devices} python3 eq-bench.py --model_name {model_name} --prompt_type {prompt_type} \ 161 | --model_path {model_path} --quantization {quantization} --n_iterations {n_iterations} \ 162 | --gpu_per_proc {gpu_per_proc} --torch_dtype {torch_dtype} --output_path {output_path} \ 163 | --devices {devices}""" 164 | if use_fast_tokenizer: 165 | single_eval_code += " --use_fast_tokenizer" 166 | if use_flash_attention_2: 167 | single_eval_code += " --use_flash_attention_2" 168 | if lora_path is not None: 169 | single_eval_code += f" --lora_path {lora_path}" 170 | 171 | eval_cmd = f""" 172 | cd {eval_code_path} && {single_eval_code} 173 | """ 174 | print_command(eval_cmd) 175 | os.system(eval_cmd) 176 | else: 177 | print(f"The result already exists: {os.path.abspath(result_file)}") 178 | # print results 179 | with open(result_file, "r") as f: 180 | data = json.load(f) 181 | result = data[list(data.keys())[0]]["iterations"]["1"]["benchmark_results_fullscale"] 182 | print(json.dumps(result, indent=4)) 183 | -------------------------------------------------------------------------------- /evalverse/evaluator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024-present Upstage Co., Ltd. 3 | Apache-2.0 license 4 | """ 5 | import logging 6 | import os 7 | import time 8 | from argparse import ArgumentParser 9 | from pathlib import Path 10 | from typing import Union, Optional 11 | 12 | from evalverse.connector import ( 13 | eq_bench, 14 | fastchat_llm_judge, 15 | instruction_following_eval, 16 | lm_evaluation_harness, 17 | ) 18 | from evalverse.reporter import AVAILABLE_BENCHMARKS 19 | from evalverse.utils import ( 20 | EVALVERSE_LOG_FORMAT, 21 | EVALVERSE_OUTPUT_PATH, 22 | get_h6_en_scores, 23 | get_logger, 24 | ) 25 | 26 | logging.basicConfig(format=EVALVERSE_LOG_FORMAT, datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO) 27 | 28 | 29 | class Evaluator: 30 | def __init__(self, mode="lib", log_path=None): 31 | self.mode = mode # lib or cli 32 | self.logger = get_logger(log_path) 33 | 34 | def get_args(self): 35 | parser = ArgumentParser() 36 | 37 | # Common Args 38 | parser.add_argument("--ckpt_path", type=str, default="upstage/SOLAR-10.7B-Instruct-v1.0") 39 | parser.add_argument("--output_path", type=str, default=EVALVERSE_OUTPUT_PATH) 40 | parser.add_argument("--model_name", type=str, help="using in save_path") 41 | parser.add_argument("--use_fast_tokenizer", action="store_true", default=False) 42 | parser.add_argument("--devices", type=str, default="0", help="The size of data parallel.") 43 | parser.add_argument("--use_flash_attention_2", action="store_true", default=False) 44 | 45 | # lm-evaluation-harness 46 | parser.add_argument("--h6_en", action="store_true", default=False) 47 | parser.add_argument("--batch_size", type=int, default=16) 48 | parser.add_argument("--use_vllm", action="store_true", default=False) 49 | parser.add_argument("--gpu_memory_utilization", type=float, default=0.8) 50 | parser.add_argument( 51 | "--model_parallel", type=int, default=1, help="The size of model parallel" 52 | ) 53 | parser.add_argument( 54 | "--data_parallel", type=int, default=1, help="The size of data parallel" 55 | ) 56 | parser.add_argument("--load_in_8bit", action="store_true", default=False) 57 | parser.add_argument("--load_in_4bit", action="store_true", default=False) 58 | 59 | # FastChat 60 | parser.add_argument("--mt_bench", action="store_true", default=False) 61 | parser.add_argument("--baselines", type=str, default=None) 62 | parser.add_argument("--judge_model", type=str, default="gpt-4") 63 | parser.add_argument( 64 | "--num_gpus_total", type=int, default=1, help="The total number of GPUs." 65 | ) 66 | parser.add_argument( 67 | "--num_gpus_per_model", type=int, default=1, help="The number of GPUs per model." 68 | ) 69 | parser.add_argument( 70 | "--parallel_api", type=int, default=1, help="The number of concurrent API calls." 71 | ) 72 | 73 | # Instruction Following Eval 74 | parser.add_argument("--ifeval", action="store_true", default=False) 75 | parser.add_argument( 76 | "--gpu_per_inst_eval", type=int, default=1, help="The number of GPUs per model." 77 | ) 78 | 79 | # EQ-Bench 80 | parser.add_argument("--eq_bench", action="store_true", default=False) 81 | parser.add_argument("--eq_bench_prompt_type", type=str, default="ChatML") 82 | parser.add_argument("--eq_bench_lora_path", type=str, default=None) 83 | parser.add_argument( 84 | "--eq_bench_quantization", type=str, default=None, choices=["8bit", "4bit", None] 85 | ) 86 | 87 | if self.mode == "lib": 88 | args = parser.parse_args(args=[]) 89 | elif self.mode == "cli": 90 | args = parser.parse_args() 91 | 92 | # update path to work regardless of / 93 | args.ckpt_path = str(Path(args.ckpt_path)) 94 | args.output_path = str(Path(args.output_path)) 95 | 96 | # handle model name 97 | if args.model_name is None: 98 | args.model_name = args.ckpt_path.split("/")[-1] 99 | 100 | # change relative path to absolute path 101 | if not os.path.isabs(args.output_path): 102 | args.output_path = os.path.abspath(args.output_path) 103 | 104 | return args 105 | 106 | def update_args(self, args, model, benchmark, kwargs): 107 | for k, v in kwargs.items(): 108 | if k in args: 109 | setattr(args, k, v) 110 | self.logger.info(f'The value of argument "{k}" has been changed to "{v}".') 111 | else: 112 | self.logger.warning(f'The argument "{k}" does not exist.') 113 | if model: 114 | args.ckpt_path = model 115 | if benchmark: 116 | if benchmark == "all": 117 | benchmark = AVAILABLE_BENCHMARKS 118 | self.logger.info(f"All available benchmarks are selected: {AVAILABLE_BENCHMARKS}") 119 | if benchmark in AVAILABLE_BENCHMARKS: 120 | setattr(args, benchmark, True) 121 | self.logger.info(f'The value of argument "{benchmark}" has been changed to "True".') 122 | elif type(benchmark) == list: 123 | for b in benchmark: 124 | if b in AVAILABLE_BENCHMARKS: 125 | setattr(args, b, True) 126 | self.logger.info(f'The value of argument "{b}" has been changed to "True".') 127 | else: 128 | raise ValueError( 129 | f'"{b}" is not in Available_Benchmarks: {AVAILABLE_BENCHMARKS}' 130 | ) 131 | else: 132 | raise ValueError( 133 | f'"{benchmark}" is not in Available_Benchmarks: {AVAILABLE_BENCHMARKS}' 134 | ) 135 | else: 136 | self.logger.info( 137 | f"No selected benchmarks. Available_Benchmarks: {AVAILABLE_BENCHMARKS}" 138 | ) 139 | self.logger.info(f"Args {vars(args)}") 140 | 141 | return args 142 | 143 | def run(self, model: Optional[str] = None, benchmark: Optional[Union[str, list]] = None, **kwargs): 144 | 145 | # update args 146 | args = self.get_args() 147 | args = self.update_args(args, model, benchmark, kwargs) 148 | 149 | # h6_en (with lm-evaluation-harness) 150 | if args.h6_en: 151 | task_and_shot = [ 152 | ("arc_challenge", 25), 153 | ("hellaswag", 10), 154 | ("mmlu", 5), 155 | ("truthfulqa_mc2", 0), 156 | ("winogrande", 5), 157 | ("gsm8k", 5), 158 | ] 159 | model_name = args.ckpt_path.split("/")[-1] 160 | h6_en_output_path = os.path.join(args.output_path, model_name, "h6_en") 161 | for _task_name, _num_fewshot in task_and_shot: 162 | start_time = time.time() 163 | ############################################# 164 | lm_evaluation_harness( 165 | model_path=args.ckpt_path, 166 | tasks=_task_name, 167 | batch_size=args.batch_size, 168 | use_vllm=args.use_vllm, 169 | gpu_memory_utilization=args.gpu_memory_utilization, 170 | tensor_parallel_size=args.model_parallel, 171 | data_parallel_size=args.data_parallel, 172 | num_fewshot=_num_fewshot, 173 | use_fast_tokenizer=args.use_fast_tokenizer, 174 | use_flash_attention_2=args.use_flash_attention_2, 175 | load_in_8bit=args.load_in_8bit, 176 | load_in_4bit=args.load_in_4bit, 177 | output_path=h6_en_output_path, 178 | ) 179 | ############################################# 180 | end_time = time.time() 181 | total_min = round((end_time - start_time) / 60) 182 | bench_name = _task_name + "_" + str(_num_fewshot) + "shot" 183 | self.logger.info( 184 | f"{bench_name} done! exec_time: {total_min} min for {args.ckpt_path}" 185 | ) 186 | get_h6_en_scores(h6_en_output_path, print_results=True) 187 | # mt_bench (with evalverse-FastChat) 188 | if args.mt_bench: 189 | if "OPENAI_API_KEY" not in os.environ: 190 | self.logger.warning("No OPENAI_API_KEY provided. Please add it.") 191 | start_time = time.time() 192 | ############################################# 193 | fastchat_llm_judge( 194 | model_path=args.ckpt_path, 195 | model_id=args.model_name, 196 | mt_bench_name="mt_bench", 197 | baselines=args.baselines, 198 | judge_model=args.judge_model, 199 | num_gpus_per_model=args.num_gpus_per_model, 200 | num_gpus_total=args.num_gpus_total, 201 | parallel_api=args.parallel_api, 202 | output_path=args.output_path, 203 | ) 204 | ############################################# 205 | end_time = time.time() 206 | total_min = round((end_time - start_time) / 60) 207 | bench_name = "mt_bench" 208 | self.logger.info(f"{bench_name} done! exec_time: {total_min} min for {args.ckpt_path}") 209 | 210 | # ifeval (with evalverse-IFEval) 211 | if args.ifeval: 212 | start_time = time.time() 213 | ############################################# 214 | instruction_following_eval( 215 | model_path=args.ckpt_path, 216 | model_name=args.model_name, 217 | gpu_per_inst_eval=args.gpu_per_inst_eval, 218 | devices=args.devices, 219 | output_path=args.output_path, 220 | ) 221 | ############################################# 222 | end_time = time.time() 223 | total_min = round((end_time - start_time) / 60) 224 | bench_name = "ifeval" 225 | self.logger.info(f"{bench_name} done! exec_time: {total_min} min for {args.ckpt_path}") 226 | 227 | # eq_bench (with evalverse-EQBench) 228 | if args.eq_bench: 229 | start_time = time.time() 230 | ############################################# 231 | eq_bench( 232 | model_name=args.model_name, 233 | prompt_type=args.eq_bench_prompt_type, 234 | model_path=args.ckpt_path, 235 | lora_path=args.eq_bench_lora_path, 236 | quantization=args.eq_bench_quantization, 237 | devices=args.devices, 238 | use_fast_tokenizer=args.use_fast_tokenizer, 239 | use_flash_attention_2=args.use_flash_attention_2, 240 | output_path=args.output_path, 241 | ) 242 | ############################################# 243 | end_time = time.time() 244 | total_min = round((end_time - start_time) / 60) 245 | bench_name = "eq_bench" 246 | self.logger.info(f"{bench_name} done! exec_time: {total_min} min for {args.ckpt_path}") 247 | 248 | 249 | if __name__ == "__main__": 250 | from dotenv import load_dotenv 251 | 252 | load_dotenv(override=True) 253 | 254 | evaluator_cli = Evaluator(mode="cli") 255 | evaluator_cli.run() 256 | -------------------------------------------------------------------------------- /evalverse/reporter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024-present Upstage Co., Ltd. 3 | Apache-2.0 license 4 | """ 5 | import os 6 | from datetime import datetime, timedelta, timezone 7 | 8 | import pandas as pd 9 | from typing import Union, List 10 | 11 | from evalverse.utils import ( 12 | EVALVERSE_DB_PATH, 13 | EVALVERSE_MODULE_PATH, 14 | EVALVERSE_OUTPUT_PATH, 15 | get_eqbench_score, 16 | get_figure, 17 | get_h6_en_scores, 18 | get_ifeval_scores, 19 | get_logger, 20 | get_mt_bench_scores, 21 | ) 22 | 23 | KST = timezone(timedelta(hours=9)) 24 | AVAILABLE_BENCHMARKS = ["h6_en", "mt_bench", "ifeval", "eq_bench"] 25 | 26 | H6EN_NAMES = ["H6-ARC", "H6-Hellaswag", "H6-MMLU", "H6-TruthfulQA", "H6-Winogrande", "H6-GSM8k"] 27 | MTBENCH_NAMES = [ 28 | "MT-Bench-Coding", 29 | "MT-Bench-Extraction", 30 | "MT-Bench-Humanities", 31 | "MT-Bench-Math", 32 | "MT-Bench-Reasoning", 33 | "MT-Bench-Roleplay", 34 | "MT-Bench-Stem", 35 | "MT-Bench-Writing", 36 | ] 37 | IFEVAL_NAMES = [ 38 | "IFEval-strict-prompt", 39 | "IFEval-strict-instruction", 40 | "IFEval-loose-prompt", 41 | "IFEval-loose-instruction", 42 | ] 43 | EQBENCH_NAME = ["EQ-Bench"] 44 | 45 | 46 | class Reporter: 47 | def __init__(self, db_path=EVALVERSE_DB_PATH, output_path=EVALVERSE_OUTPUT_PATH, log_path=None): 48 | self.db_path = db_path 49 | self.output_path = output_path 50 | self.logger = get_logger(log_path) 51 | 52 | self.score_path = os.path.join(self.db_path, "score_df.csv") 53 | self.table_dir = os.path.join(self.db_path, "scores") 54 | self.figure_dir = os.path.join(self.db_path, "figures") 55 | 56 | self.model_list = self._get_dirname_list(self.output_path) 57 | 58 | for path in [self.db_path, self.table_dir, self.figure_dir]: 59 | if not os.path.exists(path): 60 | os.makedirs(path) 61 | 62 | if os.path.exists(self.score_path): 63 | self.score_df = pd.read_csv(self.score_path) 64 | else: 65 | self.update_db(git_fetch=False) 66 | 67 | def _get_dirname_list(self, path): 68 | return sorted(os.listdir(path), key=str.lower) 69 | 70 | def update_db(self, save=False, git_fetch=False): 71 | if git_fetch: 72 | import git 73 | 74 | repo = git.Repo("../") 75 | repo.remotes.origin.fetch() 76 | 77 | self.model_list = self._get_dirname_list(self.output_path) 78 | if len(self.model_list) > 0: 79 | values_list = [] 80 | for model_name in self.model_list: 81 | bench_list = self._get_dirname_list(os.path.join(self.output_path, model_name)) 82 | if len(bench_list) > 0: 83 | values = [model_name] 84 | if "h6_en" in bench_list: 85 | h6_en_path = os.path.join(self.output_path, model_name, "h6_en") 86 | h6_en_scores = get_h6_en_scores(h6_en_path) 87 | values += h6_en_scores 88 | self.logger.info(f"DB updated: h6_en for {model_name}") 89 | else: 90 | values += [0] * len(H6EN_NAMES) 91 | if "mt_bench" in bench_list: 92 | mtbench_path = os.path.join(self.output_path, model_name, "mt_bench") 93 | question_file = os.path.join( 94 | EVALVERSE_MODULE_PATH, 95 | "submodules/FastChat/fastchat/llm_judge/data/mt_bench/question.jsonl", 96 | ) 97 | judgement_file = os.path.join( 98 | mtbench_path, "model_judgment", "gpt-4_single.jsonl" 99 | ) 100 | mt_scores = get_mt_bench_scores(model_name, question_file, judgement_file) 101 | values += mt_scores 102 | self.logger.info(f"DB updated: mt_bench for {model_name}") 103 | else: 104 | values += [0] * len(MTBENCH_NAMES) 105 | if "ifeval" in bench_list: 106 | score_file = os.path.join( 107 | self.output_path, model_name, "ifeval", "scores.txt" 108 | ) 109 | ifeval_scores = get_ifeval_scores(score_file) 110 | values += ifeval_scores 111 | self.logger.info(f"DB updated: ifeval for {model_name}") 112 | else: 113 | values += [0] * len(IFEVAL_NAMES) 114 | if "eq_bench" in bench_list: 115 | eqbench_result_file = os.path.join( 116 | self.output_path, model_name, "eq_bench", "raw_results.json" 117 | ) 118 | eqbench_score = get_eqbench_score(eqbench_result_file) 119 | values += eqbench_score 120 | self.logger.info(f"DB updated: eq_bench for {model_name}") 121 | else: 122 | values += [0] * len(EQBENCH_NAME) 123 | values_list.append(values) 124 | else: 125 | pass 126 | column_list = ["Model"] + H6EN_NAMES + MTBENCH_NAMES + IFEVAL_NAMES + EQBENCH_NAME 127 | self.score_df = pd.DataFrame(data=values_list, columns=column_list) 128 | if save: 129 | self.score_df.to_csv(self.score_path, index=False) 130 | self.logger.info(f"DB saved to {self.score_path}") 131 | else: 132 | pass 133 | 134 | def run(self, model_list: Union[List, str] = "all", benchmark_list: Union[List, str] = "all", save: bool = False): 135 | 136 | if type(model_list) == list: 137 | for m in model_list: 138 | if m in self.model_list: 139 | pass 140 | else: 141 | raise ValueError(f'"{m}" is not in Available_Models: {self.model_list}') 142 | elif type(model_list) == str: 143 | if model_list in self.model_list: 144 | model_list = [model_list] 145 | elif model_list == "all": 146 | model_list = self.model_list 147 | else: 148 | raise ValueError(f'"{model_list}" is not in Available_Models: {self.model_list}') 149 | else: 150 | raise TypeError 151 | 152 | if type(benchmark_list) == list: 153 | for b in benchmark_list: 154 | if b in AVAILABLE_BENCHMARKS: 155 | pass 156 | else: 157 | raise ValueError( 158 | f'"{b}" is not in Available_Benchmarks: {AVAILABLE_BENCHMARKS}' 159 | ) 160 | elif type(benchmark_list) == str: 161 | if benchmark_list in AVAILABLE_BENCHMARKS: 162 | benchmark_list = [benchmark_list] 163 | elif benchmark_list == "all": 164 | benchmark_list = AVAILABLE_BENCHMARKS 165 | else: 166 | raise ValueError( 167 | f'"{benchmark_list}" is not in Available_Benchmarks: {AVAILABLE_BENCHMARKS}' 168 | ) 169 | selected_benchmarks = [] 170 | for b in benchmark_list: 171 | if b == "h6_en": 172 | selected_benchmarks += H6EN_NAMES 173 | if b == "mt_bench": 174 | selected_benchmarks += MTBENCH_NAMES 175 | if b == "ifeval": 176 | selected_benchmarks += IFEVAL_NAMES 177 | if b == "eq_bench": 178 | selected_benchmarks += EQBENCH_NAME 179 | 180 | score_df = self.score_df.copy() 181 | score_df = score_df[(score_df["Model"].isin(model_list))] 182 | score_df["total_avg"] = score_df[selected_benchmarks].mean(axis=1).round(2) 183 | score_df = score_df.sort_values("total_avg", ascending=False).reset_index(drop=True) 184 | score_df["Ranking"] = score_df["total_avg"].rank(ascending=False).astype(int) 185 | score_df = score_df[["Model", "Ranking", "total_avg"] + selected_benchmarks] 186 | 187 | if save: 188 | request_time = datetime.now(KST).strftime("%Y%m%d_%H%M%S") 189 | table_name = f"table_{request_time}.csv" 190 | figure_name = f"figure_{request_time}.jpeg" 191 | table_path = os.path.join(self.table_dir, table_name) 192 | figure_path = os.path.join(self.figure_dir, figure_name) 193 | 194 | score_df.to_csv(table_path, index=False) 195 | get_figure(score_df, selected_benchmarks, figure_path, save=True) 196 | self.logger.info(f"Table saved to {table_path}") 197 | self.logger.info(f"Figure saved to {figure_path}") 198 | return table_path, figure_path 199 | else: 200 | get_figure(score_df, selected_benchmarks, save=False) 201 | return score_df 202 | -------------------------------------------------------------------------------- /evalverse/slack_bot.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024-present Upstage Co., Ltd. 3 | Apache-2.0 license 4 | """ 5 | import os 6 | 7 | from dotenv import load_dotenv 8 | from slack_bolt import App 9 | from slack_bolt.adapter.socket_mode import SocketModeHandler 10 | from slack_sdk import WebClient 11 | from slack_sdk.errors import SlackApiError 12 | 13 | from evalverse.reporter import AVAILABLE_BENCHMARKS, Reporter 14 | from evalverse.utils import EVALVERSE_DB_PATH, EVALVERSE_OUTPUT_PATH, get_logger 15 | 16 | # Slack 17 | load_dotenv(override=True) 18 | bot_token = os.getenv("SLACK_BOT_TOKEN") 19 | app_token = os.getenv("SLACK_APP_TOKEN") 20 | client = WebClient(token=bot_token) 21 | app = App(token=bot_token) 22 | 23 | # Reporter 24 | reporter = Reporter(db_path=EVALVERSE_DB_PATH, output_path=EVALVERSE_OUTPUT_PATH) 25 | 26 | # Logger 27 | logger = get_logger(os.path.join(EVALVERSE_DB_PATH, "slack_bot.log")) 28 | 29 | 30 | def send_msg(msg, channel_id): 31 | try: 32 | result = client.chat_postMessage(channel=channel_id, text=msg) 33 | logger.info(result) 34 | 35 | except SlackApiError as e: 36 | logger.error(f"Error posting message: {e}") 37 | 38 | 39 | def upload_file(file_name, channel_id): 40 | try: 41 | result = client.files_upload_v2( 42 | channels=channel_id, 43 | file=file_name, 44 | ) 45 | logger.info(result) 46 | 47 | except SlackApiError as e: 48 | logger.error("Error uploading file: {}".format(e)) 49 | 50 | 51 | @app.message(r"Request!|request!|!Request|!request") 52 | def request_eval(ack, body, say, logger): 53 | ack() 54 | logger.info(body) 55 | say( 56 | text="", 57 | blocks=[ 58 | { 59 | "dispatch_action": True, 60 | "type": "input", 61 | "element": { 62 | "type": "plain_text_input", 63 | "action_id": "model_request_en", 64 | "placeholder": { 65 | "type": "plain_text", 66 | "text": "ex) upstage/SOLAR... or /my_local/checkpoints/SOLAR...", 67 | }, 68 | }, 69 | "label": { 70 | "type": "plain_text", 71 | "text": "Model name in HugginFace hub or checkpoint path in local", 72 | }, 73 | } 74 | ], 75 | ) 76 | 77 | 78 | @app.action("model_request_en") 79 | def confirm_eval(ack, body, say, logger): 80 | ack() 81 | logger.info(body) 82 | 83 | global user_input 84 | user_input = body["actions"][0]["value"] 85 | say( 86 | text="", 87 | blocks=[ 88 | { 89 | "type": "section", 90 | "text": { 91 | "type": "mrkdwn", 92 | "text": f'❗ Please double-check the model you requested evaluation for.\nIf the name or path of the model is [{user_input}], please press "Confirm" 👉', 93 | }, 94 | "accessory": { 95 | "type": "button", 96 | "text": { 97 | "type": "plain_text", 98 | "text": "Confirm", 99 | }, 100 | "value": "click_me_123", 101 | "action_id": "model_confirm_en", 102 | }, 103 | } 104 | ], 105 | ) 106 | 107 | 108 | @app.action("model_confirm_en") 109 | def run_eval(ack, body, say, logger): 110 | ack() 111 | logger.info(body) 112 | 113 | # Start 114 | start_msg = ( 115 | f"⏳ Evaluation in progress for the model <@{body['user']['id']}> requested.. [{user_input}]" 116 | ) 117 | say(start_msg) 118 | 119 | # Run an evaluation 120 | from evalverse import Evaluator 121 | 122 | evaluator = Evaluator() 123 | evaluator.run(model=user_input, benchmark="all") 124 | 125 | # End 126 | req_channel_id = body["channel"]["id"] 127 | complete_msg = f"Done! <@{body['user']['id']}>\n[{user_input}] is added." 128 | send_msg(complete_msg, req_channel_id) 129 | 130 | logger.info(f"@{body['user']['id']}::{user_input}") 131 | 132 | 133 | @app.message(r"Report!|report!|!Report|!report") 134 | def report_model_selection(ack, body, say, logger): 135 | ack() 136 | logger.info(body) 137 | 138 | reporter.update_db(save=True, git_fetch=False) 139 | 140 | model_options = sorted(os.listdir(EVALVERSE_OUTPUT_PATH), key=str.lower) 141 | say( 142 | text="", 143 | blocks=[ 144 | { 145 | "type": "section", 146 | "block_id": "section_1", 147 | "text": {"type": "mrkdwn", "text": "Please select the model to evaluate."}, 148 | "accessory": { 149 | "action_id": "model_select_en", 150 | "type": "multi_static_select", 151 | "placeholder": {"type": "plain_text", "text": "Model selection"}, 152 | "options": [ 153 | {"text": {"type": "plain_text", "text": m[:75]}, "value": f"value-{i}"} 154 | for i, m in enumerate(model_options) 155 | ], 156 | }, 157 | } 158 | ], 159 | ) 160 | 161 | 162 | @app.action("model_select_en") 163 | def report_bench_selection(ack, body, say, logger): 164 | ack() 165 | logger.info(body) 166 | 167 | global model_list 168 | model_list = [] 169 | for action in body["actions"]: 170 | for option in action["selected_options"]: 171 | model_list.append(option["text"]["text"]) 172 | 173 | say( 174 | text="", 175 | blocks=[ 176 | { 177 | "type": "section", 178 | "block_id": "section_2", 179 | "text": {"type": "mrkdwn", "text": "Please select the evaluation criteria."}, 180 | "accessory": { 181 | "action_id": "bench_select_en", 182 | "type": "multi_static_select", 183 | "placeholder": {"type": "plain_text", "text": "Metric selection"}, 184 | "options": [ 185 | {"text": {"type": "plain_text", "text": m}, "value": f"value-{i}"} 186 | for i, m in enumerate(AVAILABLE_BENCHMARKS) 187 | ], 188 | }, 189 | } 190 | ], 191 | ) 192 | 193 | 194 | @app.action("bench_select_en") 195 | def report_figure_and_table(ack, body, say, logger): 196 | ack() 197 | logger.info(body) 198 | 199 | bench_list = [] 200 | for action in body["actions"]: 201 | for option in action["selected_options"]: 202 | bench_list.append(option["text"]["text"]) 203 | 204 | table_path, figure_path = reporter.run( 205 | model_list=model_list, benchmark_list=bench_list, save=True 206 | ) 207 | 208 | models = "\n".join([f"• {m}" for m in model_list]) 209 | benchs = "\n".join([f"• {m}" for m in bench_list]) 210 | 211 | # message 212 | msg = f"LLM Evaluation Report requested by <@{body['user']['id']}>.\n\n🤖 Selected models\n{models}\n\n📊 Selected benchmarks\n{benchs}" 213 | say(msg) 214 | 215 | # upload files for request 216 | req_channel_id = body["channel"]["id"] 217 | upload_file(figure_path, req_channel_id) 218 | upload_file(table_path, req_channel_id) 219 | 220 | # logging 221 | logger.info(f"@{body['user']['id']}::{bench_list}::{model_list}") 222 | 223 | 224 | if __name__ == "__main__": 225 | SocketModeHandler(app, app_token).start() 226 | -------------------------------------------------------------------------------- /evalverse/tests/test_evaluator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from evalverse.evaluator import Evaluator 5 | 6 | TEST_PATH = os.path.dirname(os.path.abspath(__file__)) 7 | 8 | 9 | class TestEvaluator(unittest.TestCase): 10 | def setUp(self): 11 | self.evaluator = Evaluator(mode="lib") 12 | 13 | def test_get_args_default(self): 14 | args = self.evaluator.get_args() 15 | self.assertEqual(args.ckpt_path, "upstage/SOLAR-10.7B-Instruct-v1.0") 16 | 17 | def test_run_args_overriding(self): 18 | your_model = "your/Model" 19 | your_output_path = "/your/output_path" 20 | self.evaluator.run(model=your_model, output_path=your_output_path) 21 | self.assertEqual(self.evaluator.args.ckpt_path, your_model) 22 | self.assertEqual(self.evaluator.args.output_path, your_output_path) 23 | 24 | def test_run_h6_en_existing(self): 25 | benchmark = "h6_en" 26 | output_path = os.path.join(TEST_PATH, "test_results") 27 | self.evaluator.run(benchmark=benchmark, output_path=output_path) 28 | 29 | 30 | if __name__ == "__main__": 31 | unittest.main() 32 | -------------------------------------------------------------------------------- /evalverse/tests/test_reporter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from evalverse.reporter import Reporter 5 | 6 | TEST_PATH = os.path.dirname(os.path.abspath(__file__)) 7 | 8 | 9 | class TestEvaluator(unittest.TestCase): 10 | def setUp(self): 11 | output_path = os.path.join(TEST_PATH, "test_results") 12 | self.reporter = Reporter(output_path=output_path) 13 | 14 | def test_update_db(self): 15 | self.reporter.update_db() 16 | 17 | def test_run(self): 18 | model_list = ["SOLAR-10.7B-Instruct-v1.0"] 19 | benchmark_list = ["h6_en"] 20 | self.reporter.run(model_list=model_list, benchmark_list=benchmark_list) 21 | 22 | 23 | if __name__ == "__main__": 24 | unittest.main() 25 | -------------------------------------------------------------------------------- /evalverse/tests/test_reproducibility.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from evalverse.evaluator import Evaluator 5 | from evalverse.utils import get_h6_en_scores 6 | 7 | TEST_PATH = os.path.dirname(os.path.abspath(__file__)) 8 | 9 | 10 | class TestEvaluator(unittest.TestCase): 11 | def setUp(self): 12 | self.evaluator = Evaluator(mode="lib") 13 | 14 | def test_run_all_benchmarks(self): 15 | model = "upstage/SOLAR-10.7B-Instruct-v1.0" 16 | benchmark = "all" 17 | original_output_path = os.path.join(TEST_PATH, "test_results") 18 | reproduced_output_path = os.path.join(TEST_PATH, "test_results_reproduced") 19 | self.evaluator.run( 20 | model=model, 21 | benchmark=benchmark, 22 | data_parallel=8, 23 | num_gpus_total=8, 24 | parallel_api=4, 25 | devices="0,1,2,3,4,5,6,7", 26 | output_path=reproduced_output_path, 27 | ) 28 | 29 | # h6_score reproducilbility check 30 | model_name = model.split("/")[-1] 31 | original_scores = get_h6_en_scores(os.path.join(original_output_path, model_name, "h6_en")) 32 | original_stderr = get_h6_en_scores( 33 | os.path.join(original_output_path, model_name, "h6_en"), stderr=True 34 | ) 35 | reproduced_scores = get_h6_en_scores( 36 | os.path.join(reproduced_output_path, model_name, "h6_en") 37 | ) 38 | 39 | h6_list = ["arc_c_25", "hellaswag_10", "mmlu_5", "truthfulqa_0", "winogrande_5", "gsm8k_5"] 40 | for benchmark, original, stderr, reproduced in zip( 41 | h6_list, original_scores, original_stderr, reproduced_scores 42 | ): 43 | difference = abs(original - reproduced) 44 | print( 45 | f"[{benchmark}] \t original: {original} \t reproduced: {reproduced} \t difference: {round(difference, 2)} \t stderr: {stderr}" 46 | ) 47 | self.assertLessEqual(difference, stderr) 48 | 49 | 50 | if __name__ == "__main__": 51 | unittest.main() 52 | -------------------------------------------------------------------------------- /evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/arc_challenge_25.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": { 3 | "arc_challenge": { 4 | "acc,none": 0.6885665529010239, 5 | "acc_stderr,none": 0.01353247209985083, 6 | "acc_norm,none": 0.7133105802047781, 7 | "acc_norm_stderr,none": 0.013214986329274855, 8 | "alias": "arc_challenge" 9 | } 10 | }, 11 | "group_subtasks": { 12 | "arc_challenge": [] 13 | }, 14 | "configs": { 15 | "arc_challenge": { 16 | "task": "arc_challenge", 17 | "group": [ 18 | "ai2_arc" 19 | ], 20 | "dataset_path": "allenai/ai2_arc", 21 | "dataset_name": "ARC-Challenge", 22 | "training_split": "train", 23 | "validation_split": "validation", 24 | "test_split": "test", 25 | "doc_to_text": "Question: {{question}}\nAnswer:", 26 | "doc_to_target": "{{choices.label.index(answerKey)}}", 27 | "doc_to_choice": "{{choices.text}}", 28 | "description": "", 29 | "target_delimiter": " ", 30 | "fewshot_delimiter": "\n\n", 31 | "num_fewshot": 25, 32 | "metric_list": [ 33 | { 34 | "metric": "acc", 35 | "aggregation": "mean", 36 | "higher_is_better": true 37 | }, 38 | { 39 | "metric": "acc_norm", 40 | "aggregation": "mean", 41 | "higher_is_better": true 42 | } 43 | ], 44 | "output_type": "multiple_choice", 45 | "repeats": 1, 46 | "should_decontaminate": true, 47 | "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", 48 | "metadata": { 49 | "version": 1.0 50 | } 51 | } 52 | }, 53 | "versions": { 54 | "arc_challenge": 1.0 55 | }, 56 | "n-shot": { 57 | "arc_challenge": 25 58 | }, 59 | "config": { 60 | "model": "hf", 61 | "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False", 62 | "batch_size": "16", 63 | "batch_sizes": [], 64 | "device": null, 65 | "use_cache": null, 66 | "limit": null, 67 | "bootstrap_iters": 100000, 68 | "gen_kwargs": null 69 | }, 70 | "git_hash": "22f5854", 71 | "date": 1711604407.8730423, 72 | "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 43 bits physical, 48 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7763 64-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2813.569\nCPU max MHz: 2450.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4890.43\nVirtualization: AMD-V\nL1d cache: 4 MiB\nL1i cache: 4 MiB\nL2 cache: 64 MiB\nL3 cache: 512 MiB\nNUMA node0 CPU(s): 0-63\nNUMA node1 CPU(s): 64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.2.1 pypi_0 pypi", 73 | "transformers_version": "4.38.2", 74 | "upper_git_hash": null 75 | } -------------------------------------------------------------------------------- /evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/gsm8k_5.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": { 3 | "gsm8k": { 4 | "exact_match,strict-match": 0.6777862016679302, 5 | "exact_match_stderr,strict-match": 0.012872435481188778, 6 | "exact_match,flexible-extract": 0.6853677028051555, 7 | "exact_match_stderr,flexible-extract": 0.012791037227336034, 8 | "alias": "gsm8k" 9 | } 10 | }, 11 | "group_subtasks": { 12 | "gsm8k": [] 13 | }, 14 | "configs": { 15 | "gsm8k": { 16 | "task": "gsm8k", 17 | "group": [ 18 | "math_word_problems" 19 | ], 20 | "dataset_path": "gsm8k", 21 | "dataset_name": "main", 22 | "training_split": "train", 23 | "test_split": "test", 24 | "fewshot_split": "train", 25 | "doc_to_text": "Question: {{question}}\nAnswer:", 26 | "doc_to_target": "{{answer}}", 27 | "description": "", 28 | "target_delimiter": " ", 29 | "fewshot_delimiter": "\n\n", 30 | "num_fewshot": 5, 31 | "metric_list": [ 32 | { 33 | "metric": "exact_match", 34 | "aggregation": "mean", 35 | "higher_is_better": true, 36 | "ignore_case": true, 37 | "ignore_punctuation": false, 38 | "regexes_to_ignore": [ 39 | ",", 40 | "\\$", 41 | "(?s).*#### ", 42 | "\\.$" 43 | ] 44 | } 45 | ], 46 | "output_type": "generate_until", 47 | "generation_kwargs": { 48 | "until": [ 49 | "Question:", 50 | "</s>", 51 | "<|im_end|>" 52 | ], 53 | "do_sample": false, 54 | "temperature": 0.0 55 | }, 56 | "repeats": 1, 57 | "filter_list": [ 58 | { 59 | "name": "strict-match", 60 | "filter": [ 61 | { 62 | "function": "regex", 63 | "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" 64 | }, 65 | { 66 | "function": "take_first" 67 | } 68 | ] 69 | }, 70 | { 71 | "name": "flexible-extract", 72 | "filter": [ 73 | { 74 | "function": "regex", 75 | "group_select": -1, 76 | "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" 77 | }, 78 | { 79 | "function": "take_first" 80 | } 81 | ] 82 | } 83 | ], 84 | "should_decontaminate": false, 85 | "metadata": { 86 | "version": 3.0 87 | } 88 | } 89 | }, 90 | "versions": { 91 | "gsm8k": 3.0 92 | }, 93 | "n-shot": { 94 | "gsm8k": 5 95 | }, 96 | "config": { 97 | "model": "hf", 98 | "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False", 99 | "batch_size": "16", 100 | "batch_sizes": [], 101 | "device": null, 102 | "use_cache": null, 103 | "limit": null, 104 | "bootstrap_iters": 100000, 105 | "gen_kwargs": null 106 | }, 107 | "git_hash": "22f5854", 108 | "date": 1711605933.4303067, 109 | "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 43 bits physical, 48 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7763 64-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2475.477\nCPU max MHz: 2450.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4890.43\nVirtualization: AMD-V\nL1d cache: 4 MiB\nL1i cache: 4 MiB\nL2 cache: 64 MiB\nL3 cache: 512 MiB\nNUMA node0 CPU(s): 0-63\nNUMA node1 CPU(s): 64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.2.1 pypi_0 pypi", 110 | "transformers_version": "4.38.2", 111 | "upper_git_hash": null 112 | } -------------------------------------------------------------------------------- /evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/hellaswag_10.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": { 3 | "hellaswag": { 4 | "acc,none": 0.7061342362079267, 5 | "acc_stderr,none": 0.004546002255457021, 6 | "acc_norm,none": 0.8818960366460864, 7 | "acc_norm_stderr,none": 0.0032207161266851005, 8 | "alias": "hellaswag" 9 | } 10 | }, 11 | "group_subtasks": { 12 | "hellaswag": [] 13 | }, 14 | "configs": { 15 | "hellaswag": { 16 | "task": "hellaswag", 17 | "group": [ 18 | "multiple_choice" 19 | ], 20 | "dataset_path": "hellaswag", 21 | "training_split": "train", 22 | "validation_split": "validation", 23 | "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", 24 | "doc_to_text": "{{query}}", 25 | "doc_to_target": "{{label}}", 26 | "doc_to_choice": "choices", 27 | "description": "", 28 | "target_delimiter": " ", 29 | "fewshot_delimiter": "\n\n", 30 | "num_fewshot": 10, 31 | "metric_list": [ 32 | { 33 | "metric": "acc", 34 | "aggregation": "mean", 35 | "higher_is_better": true 36 | }, 37 | { 38 | "metric": "acc_norm", 39 | "aggregation": "mean", 40 | "higher_is_better": true 41 | } 42 | ], 43 | "output_type": "multiple_choice", 44 | "repeats": 1, 45 | "should_decontaminate": false, 46 | "metadata": { 47 | "version": 1.0 48 | } 49 | } 50 | }, 51 | "versions": { 52 | "hellaswag": 1.0 53 | }, 54 | "n-shot": { 55 | "hellaswag": 10 56 | }, 57 | "config": { 58 | "model": "hf", 59 | "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False", 60 | "batch_size": "16", 61 | "batch_sizes": [], 62 | "device": null, 63 | "use_cache": null, 64 | "limit": null, 65 | "bootstrap_iters": 100000, 66 | "gen_kwargs": null 67 | }, 68 | "git_hash": "22f5854", 69 | "date": 1711604551.2668173, 70 | "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 43 bits physical, 48 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7763 64-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2633.640\nCPU max MHz: 2450.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4890.43\nVirtualization: AMD-V\nL1d cache: 4 MiB\nL1i cache: 4 MiB\nL2 cache: 64 MiB\nL3 cache: 512 MiB\nNUMA node0 CPU(s): 0-63\nNUMA node1 CPU(s): 64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.2.1 pypi_0 pypi", 71 | "transformers_version": "4.38.2", 72 | "upper_git_hash": null 73 | } -------------------------------------------------------------------------------- /evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/truthfulqa_mc2_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": { 3 | "truthfulqa_mc2": { 4 | "acc,none": 0.7171838111166857, 5 | "acc_stderr,none": 0.01498853297119472, 6 | "alias": "truthfulqa_mc2" 7 | } 8 | }, 9 | "group_subtasks": { 10 | "truthfulqa_mc2": [] 11 | }, 12 | "configs": { 13 | "truthfulqa_mc2": { 14 | "task": "truthfulqa_mc2", 15 | "group": [ 16 | "truthfulqa" 17 | ], 18 | "dataset_path": "truthful_qa", 19 | "dataset_name": "multiple_choice", 20 | "validation_split": "validation", 21 | "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", 22 | "doc_to_target": 0, 23 | "doc_to_choice": "{{mc2_targets.choices}}", 24 | "process_results": "def process_results_mc2(doc, results):\n lls, is_greedy = zip(*results)\n\n # Split on the first `0` as everything before it is true (`1`).\n split_idx = list(doc[\"mc2_targets\"][\"labels\"]).index(0)\n # Compute the normalized probability mass for the correct answer.\n ll_true, ll_false = lls[:split_idx], lls[split_idx:]\n p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))\n p_true = p_true / (sum(p_true) + sum(p_false))\n\n return {\"acc\": sum(p_true)}\n", 25 | "description": "", 26 | "target_delimiter": " ", 27 | "fewshot_delimiter": "\n\n", 28 | "num_fewshot": 0, 29 | "metric_list": [ 30 | { 31 | "metric": "acc", 32 | "aggregation": "mean", 33 | "higher_is_better": true 34 | } 35 | ], 36 | "output_type": "multiple_choice", 37 | "repeats": 1, 38 | "should_decontaminate": true, 39 | "doc_to_decontamination_query": "question", 40 | "metadata": { 41 | "version": 2.0 42 | } 43 | } 44 | }, 45 | "versions": { 46 | "truthfulqa_mc2": 2.0 47 | }, 48 | "n-shot": { 49 | "truthfulqa_mc2": 0 50 | }, 51 | "config": { 52 | "model": "hf", 53 | "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False", 54 | "batch_size": "16", 55 | "batch_sizes": [], 56 | "device": null, 57 | "use_cache": null, 58 | "limit": null, 59 | "bootstrap_iters": 100000, 60 | "gen_kwargs": null 61 | }, 62 | "git_hash": "22f5854", 63 | "date": 1711605810.1983285, 64 | "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 43 bits physical, 48 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7763 64-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2474.946\nCPU max MHz: 2450.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4890.43\nVirtualization: AMD-V\nL1d cache: 4 MiB\nL1i cache: 4 MiB\nL2 cache: 64 MiB\nL3 cache: 512 MiB\nNUMA node0 CPU(s): 0-63\nNUMA node1 CPU(s): 64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.2.1 pypi_0 pypi", 65 | "transformers_version": "4.38.2", 66 | "upper_git_hash": null 67 | } -------------------------------------------------------------------------------- /evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/winogrande_5.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": { 3 | "winogrande": { 4 | "acc,none": 0.8318863456985004, 5 | "acc_stderr,none": 0.010510336954166734, 6 | "alias": "winogrande" 7 | } 8 | }, 9 | "group_subtasks": { 10 | "winogrande": [] 11 | }, 12 | "configs": { 13 | "winogrande": { 14 | "task": "winogrande", 15 | "dataset_path": "winogrande", 16 | "dataset_name": "winogrande_xl", 17 | "training_split": "train", 18 | "validation_split": "validation", 19 | "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n", 20 | "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n", 21 | "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n", 22 | "description": "", 23 | "target_delimiter": " ", 24 | "fewshot_delimiter": "\n\n", 25 | "num_fewshot": 5, 26 | "metric_list": [ 27 | { 28 | "metric": "acc", 29 | "aggregation": "mean", 30 | "higher_is_better": true 31 | } 32 | ], 33 | "output_type": "multiple_choice", 34 | "repeats": 1, 35 | "should_decontaminate": true, 36 | "doc_to_decontamination_query": "sentence", 37 | "metadata": { 38 | "version": 1.0 39 | } 40 | } 41 | }, 42 | "versions": { 43 | "winogrande": 1.0 44 | }, 45 | "n-shot": { 46 | "winogrande": 5 47 | }, 48 | "config": { 49 | "model": "hf", 50 | "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False", 51 | "batch_size": "16", 52 | "batch_sizes": [], 53 | "device": null, 54 | "use_cache": null, 55 | "limit": null, 56 | "bootstrap_iters": 100000, 57 | "gen_kwargs": null 58 | }, 59 | "git_hash": "22f5854", 60 | "date": 1711605880.7907126, 61 | "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 43 bits physical, 48 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7763 64-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2511.241\nCPU max MHz: 2450.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4890.43\nVirtualization: AMD-V\nL1d cache: 4 MiB\nL1i cache: 4 MiB\nL2 cache: 64 MiB\nL3 cache: 512 MiB\nNUMA node0 CPU(s): 0-63\nNUMA node1 CPU(s): 64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.2.1 pypi_0 pypi", 62 | "transformers_version": "4.38.2", 63 | "upper_git_hash": null 64 | } -------------------------------------------------------------------------------- /evalverse/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2024-present Upstage Co., Ltd. 3 | Apache-2.0 license 4 | """ 5 | import json 6 | import logging 7 | import os 8 | import re 9 | 10 | import numpy as np 11 | import pandas as pd 12 | import plotly.express as px 13 | 14 | EVALVERSE_MODULE_PATH = os.path.dirname(os.path.abspath(__file__)) 15 | EVALVERSE_DB_PATH = os.path.join(os.path.dirname(EVALVERSE_MODULE_PATH), "db") 16 | EVALVERSE_OUTPUT_PATH = os.path.join(os.path.dirname(EVALVERSE_MODULE_PATH), "results") 17 | EVALVERSE_LOG_FORMAT = ( 18 | "[%(asctime)s][%(levelname)s][evalverse - %(filename)s:%(lineno)d] >> %(message)s" 19 | ) 20 | 21 | 22 | def print_command(command, only_cmd=False): 23 | cmd = re.sub(r"\s+", " ", command).strip() 24 | if only_cmd: 25 | return cmd 26 | else: 27 | print(cmd) 28 | 29 | 30 | def print_txt_file(path): 31 | with open(path, "r") as file: 32 | file_contents = file.read() 33 | print(file_contents) 34 | 35 | 36 | def get_logger(log_path=None): 37 | logger = logging.getLogger(__name__) 38 | logger.setLevel(level=logging.INFO) 39 | formatter = logging.Formatter( 40 | fmt=EVALVERSE_LOG_FORMAT, 41 | datefmt="%Y-%m-%d %H:%M:%S", 42 | ) 43 | if log_path: 44 | fileHandler = logging.FileHandler(filename=log_path) 45 | fileHandler.setFormatter(formatter) 46 | logger.addHandler(fileHandler) 47 | 48 | return logger 49 | 50 | 51 | def get_figure(score_df, benchmarks_list, figure_path=None, save=False): 52 | scores = [] 53 | for b in benchmarks_list: 54 | for m, n in score_df[["Model", b]].values: 55 | scores.append([m, b, n]) 56 | figure_df = pd.DataFrame(scores, columns=["model", "benchmark", "score"]) 57 | 58 | fig = px.line_polar( 59 | figure_df, 60 | r="score", 61 | theta="benchmark", 62 | line_close=True, 63 | category_orders={"benchmark": benchmarks_list}, 64 | color="model", 65 | markers=True, 66 | color_discrete_sequence=px.colors.qualitative.Pastel, 67 | title="LLM Evaluation Report (by Evalverse)", 68 | width=800, 69 | ) 70 | if save: 71 | fig.write_image(figure_path, scale=2) 72 | else: 73 | fig.show() 74 | 75 | 76 | def get_h6_en_scores(exp_path, stderr=False, print_results=False): 77 | acc_metric = "acc,none" 78 | acc_norm_metric = "acc_norm,none" 79 | gsm8k_metrics = ["exact_match,get-answer", "exact_match,strict-match"] 80 | if stderr: 81 | acc_metric = "acc_stderr,none" 82 | acc_norm_metric = "acc_norm_stderr,none" 83 | gsm8k_metrics = ["exact_match_stderr,get-answer", "exact_match_stderr,strict-match"] 84 | 85 | with open(os.path.join(exp_path, "arc_challenge_25.json"), "r") as json_file: 86 | arc_challenge_25 = json.load(json_file) 87 | if print_results: 88 | print( 89 | "ARC-Challenge (25-shot)", 90 | json.dumps(arc_challenge_25["results"]["arc_challenge"], indent=4), 91 | ) 92 | else: 93 | arc_score = arc_challenge_25["results"]["arc_challenge"][acc_norm_metric] 94 | 95 | with open(os.path.join(exp_path, "hellaswag_10.json"), "r") as json_file: 96 | hellaswag_10 = json.load(json_file) 97 | if print_results: 98 | print("Hellaswag (10-shot)", json.dumps(hellaswag_10["results"]["hellaswag"], indent=4)) 99 | else: 100 | hellaswag_score = hellaswag_10["results"]["hellaswag"][acc_norm_metric] 101 | 102 | with open(os.path.join(exp_path, "mmlu_5.json"), "r") as json_file: 103 | mmlu_5 = json.load(json_file) 104 | if print_results: 105 | print("MMLU (5-shot)", json.dumps(mmlu_5["results"]["mmlu"], indent=4)) 106 | else: 107 | mmlu_score = mmlu_5["results"]["mmlu"][acc_metric] 108 | 109 | with open(os.path.join(exp_path, "truthfulqa_mc2_0.json"), "r") as json_file: 110 | truthfulqa_mc2_0 = json.load(json_file) 111 | if print_results: 112 | print( 113 | "TruthfulQA (0-shot)", 114 | json.dumps(truthfulqa_mc2_0["results"]["truthfulqa_mc2"], indent=4), 115 | ) 116 | else: 117 | truthfulqa_score = truthfulqa_mc2_0["results"]["truthfulqa_mc2"][acc_metric] 118 | 119 | with open(os.path.join(exp_path, "winogrande_5.json"), "r") as json_file: 120 | winogrande_5 = json.load(json_file) 121 | if print_results: 122 | print( 123 | "Winogrande (5-shot)", json.dumps(winogrande_5["results"]["winogrande"], indent=4) 124 | ) 125 | else: 126 | winogrande_score = winogrande_5["results"]["winogrande"][acc_metric] 127 | 128 | with open(os.path.join(exp_path, "gsm8k_5.json"), "r") as json_file: 129 | gsm8k_5 = json.load(json_file) 130 | if print_results: 131 | print("GSM8k (5-shot)", json.dumps(gsm8k_5["results"]["gsm8k"], indent=4)) 132 | else: 133 | match_key = next( 134 | (key for key in gsm8k_metrics if key in gsm8k_5["results"]["gsm8k"]), None 135 | ) 136 | gsm8k_score = gsm8k_5["results"]["gsm8k"][match_key] 137 | 138 | if print_results: 139 | pass 140 | else: 141 | score_list = [ 142 | arc_score, 143 | hellaswag_score, 144 | mmlu_score, 145 | truthfulqa_score, 146 | winogrande_score, 147 | gsm8k_score, 148 | ] 149 | score_list = list(np.round((np.array(score_list) * 100), 2)) 150 | 151 | return score_list 152 | 153 | 154 | def get_mt_bench_scores(model_id, question_path, judgement_path): 155 | question_df = pd.read_json(question_path, lines=True) 156 | judgement_df = pd.read_json(judgement_path, lines=True) 157 | 158 | df = judgement_df[["question_id", "model", "score", "turn"]] 159 | df = df[(df["model"] == model_id) & (df["score"] != -1)] 160 | df = df.merge(question_df[["question_id", "category"]], how="left") 161 | df = df[["category", "score"]].groupby(["category"]).mean() 162 | df = df.sort_values("category") 163 | 164 | score_list = df.score.values.tolist() 165 | score_list = list(np.round((np.array(score_list) * 10), 2)) 166 | 167 | return score_list 168 | 169 | 170 | def get_ifeval_scores(score_txt_file): 171 | score_list = [] 172 | with open(score_txt_file, "r") as file: 173 | content = file.read() 174 | 175 | pattern = r"(prompt-level|instruction-level):\s([\d.]+)" 176 | matches = re.findall(pattern, content) 177 | 178 | for _, score in matches: 179 | score_list.append(float(score)) 180 | score_list = list(np.round((np.array(score_list) * 100), 2)) 181 | 182 | return score_list 183 | 184 | 185 | def get_eqbench_score(eqbench_results_json): 186 | with open(eqbench_results_json, "r") as f: 187 | data = json.load(f) 188 | 189 | final_score = data[list(data.keys())[0]]["iterations"]["1"]["benchmark_results_fullscale"][ 190 | "final_score" 191 | ] 192 | score_list = [round(final_score, 2)] 193 | 194 | return score_list 195 | 196 | 197 | if __name__ == "__main__": 198 | print(f"EVALVERSE_MODULE_PATH: {EVALVERSE_MODULE_PATH}") 199 | print(f"EVALVERSE_DB_PATH: {EVALVERSE_DB_PATH}") 200 | print(f"EVALVERSE_OUTPUT_PATH: {EVALVERSE_OUTPUT_PATH}") 201 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # 🌍 Examples 2 | > This is a example collection for `Evalverse`. We will talk about the basic usage of `Evalverse`, knowhows, and how to use it in your project. 3 | 4 | 5 | ### 🙋 I'm very new to Evalverse. 6 | Introduces very basic, but core steps to use Evalverse. (e.g., `Evaluator`, `Reporter`) 7 | 8 | - [01_basic_usage.ipynb](https://github.com/UpstageAI/evalverse/blob/main/examples/01_basic_usage.ipynb) 9 | 10 | ### 🙋 I want to know how to run evaluation in details 11 | If you want to run each evaluation (`h6_en`, `mt_bench`, `ifeval`, `eq_bench`) in details. 12 | 13 | - [02_advanced_usage.ipynb](https://github.com/UpstageAI/evalverse/blob/main/examples/02_advanced_usage.ipynb) 14 | 15 | 16 | -------------------------------------------------------------------------------- /examples/db/figures/figure_20240402_105011.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UpstageAI/evalverse/06f85eefd4b82385c549ea3bc29ff3e177f3a84b/examples/db/figures/figure_20240402_105011.jpeg -------------------------------------------------------------------------------- /examples/db/score_df.csv: -------------------------------------------------------------------------------- 1 | Model,H6-ARC,H6-Hellaswag,H6-MMLU,H6-TruthfulQA,H6-Winogrande,H6-GSM8k,MT-Bench-Coding,MT-Bench-Extraction,MT-Bench-Humanities,MT-Bench-Math,MT-Bench-Reasoning,MT-Bench-Roleplay,MT-Bench-Stem,MT-Bench-Writing,IFEval-strict-prompt,IFEval-strict-instruction,IFEval-loose-prompt,IFEval-loose-instruction,EQ-Bench 2 | Llama-2-7b-chat-hf,53.16,78.59,47.38,45.31,72.69,23.96,28.95,66.25,96.5,23.5,52.5,77.5,89.0,86.75,39.19,47.93,48.43,56.09,36.46 3 | SOLAR-10.7B-Instruct-v1.0,71.33,88.19,65.52,71.72,83.19,67.78,44.5,77.89,98.5,43.5,66.0,82.5,88.75,94.0,51.57,57.97,56.01,62.92,72.17 4 | -------------------------------------------------------------------------------- /examples/db/scores/table_20240402_105011.csv: -------------------------------------------------------------------------------- 1 | Model,Ranking,total_avg,H6-ARC,H6-Hellaswag,H6-MMLU,H6-TruthfulQA,H6-Winogrande,H6-GSM8k 2 | SOLAR-10.7B-Instruct-v1.0,1,74.62,71.33,88.19,65.52,71.72,83.19,67.78 3 | -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/eq_bench/benchmark_results.csv: -------------------------------------------------------------------------------- 1 | Run ID, Benchmark Completed, Prompt Format, Model Path, Lora Path, Quantization, Benchmark Score, EQ-Bench Version, Num Questions Parseable, Num Iterations, Inference Engine, Ooba Params, Download Filters, Error 2 | Llama-2-7b-chat-hf,2024-04-02 13:34:43,ChatML,meta-llama/Llama-2-7b-chat-hf,None,None,36.46,v2,155.0,1,transformers,none,--include ["n", "o", "n", "e"] --exclude ["n", "o", "n", "e"], 3 | -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/h6_en/arc_challenge_25.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": { 3 | "arc_challenge": { 4 | "acc,none": 0.5093856655290102, 5 | "acc_stderr,none": 0.014608816322065, 6 | "acc_norm,none": 0.5315699658703071, 7 | "acc_norm_stderr,none": 0.014582236460866977, 8 | "alias": "arc_challenge" 9 | } 10 | }, 11 | "group_subtasks": { 12 | "arc_challenge": [] 13 | }, 14 | "configs": { 15 | "arc_challenge": { 16 | "task": "arc_challenge", 17 | "group": [ 18 | "ai2_arc" 19 | ], 20 | "dataset_path": "allenai/ai2_arc", 21 | "dataset_name": "ARC-Challenge", 22 | "training_split": "train", 23 | "validation_split": "validation", 24 | "test_split": "test", 25 | "doc_to_text": "Question: {{question}}\nAnswer:", 26 | "doc_to_target": "{{choices.label.index(answerKey)}}", 27 | "doc_to_choice": "{{choices.text}}", 28 | "description": "", 29 | "target_delimiter": " ", 30 | "fewshot_delimiter": "\n\n", 31 | "num_fewshot": 25, 32 | "metric_list": [ 33 | { 34 | "metric": "acc", 35 | "aggregation": "mean", 36 | "higher_is_better": true 37 | }, 38 | { 39 | "metric": "acc_norm", 40 | "aggregation": "mean", 41 | "higher_is_better": true 42 | } 43 | ], 44 | "output_type": "multiple_choice", 45 | "repeats": 1, 46 | "should_decontaminate": true, 47 | "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", 48 | "metadata": { 49 | "version": 1.0 50 | } 51 | } 52 | }, 53 | "versions": { 54 | "arc_challenge": 1.0 55 | }, 56 | "n-shot": { 57 | "arc_challenge": 25 58 | }, 59 | "config": { 60 | "model": "hf", 61 | "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False", 62 | "batch_size": "16", 63 | "batch_sizes": [], 64 | "device": null, 65 | "use_cache": null, 66 | "limit": null, 67 | "bootstrap_iters": 100000, 68 | "gen_kwargs": null 69 | }, 70 | "git_hash": "0ecf672", 71 | "date": 1712029901.2960556, 72 | "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 43 bits physical, 48 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7763 64-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2813.715\nCPU max MHz: 2450.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4890.43\nVirtualization: AMD-V\nL1d cache: 4 MiB\nL1i cache: 4 MiB\nL2 cache: 64 MiB\nL3 cache: 512 MiB\nNUMA node0 CPU(s): 0-63\nNUMA node1 CPU(s): 64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.1.2 pypi_0 pypi\n[conda] triton 2.1.0 pypi_0 pypi", 73 | "transformers_version": "4.37.2", 74 | "upper_git_hash": "0ecf67227578e1079c4d8d00a4db878a6f6523d6" 75 | } -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/h6_en/gsm8k_5.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": { 3 | "gsm8k": { 4 | "exact_match,strict-match": 0.2395754359363154, 5 | "exact_match_stderr,strict-match": 0.01175686434407741, 6 | "exact_match,flexible-extract": 0.24184988627748294, 7 | "exact_match_stderr,flexible-extract": 0.011794861371318698, 8 | "alias": "gsm8k" 9 | } 10 | }, 11 | "group_subtasks": { 12 | "gsm8k": [] 13 | }, 14 | "configs": { 15 | "gsm8k": { 16 | "task": "gsm8k", 17 | "group": [ 18 | "math_word_problems" 19 | ], 20 | "dataset_path": "gsm8k", 21 | "dataset_name": "main", 22 | "training_split": "train", 23 | "test_split": "test", 24 | "fewshot_split": "train", 25 | "doc_to_text": "Question: {{question}}\nAnswer:", 26 | "doc_to_target": "{{answer}}", 27 | "description": "", 28 | "target_delimiter": " ", 29 | "fewshot_delimiter": "\n\n", 30 | "num_fewshot": 5, 31 | "metric_list": [ 32 | { 33 | "metric": "exact_match", 34 | "aggregation": "mean", 35 | "higher_is_better": true, 36 | "ignore_case": true, 37 | "ignore_punctuation": false, 38 | "regexes_to_ignore": [ 39 | ",", 40 | "\\$", 41 | "(?s).*#### ", 42 | "\\.$" 43 | ] 44 | } 45 | ], 46 | "output_type": "generate_until", 47 | "generation_kwargs": { 48 | "until": [ 49 | "Question:", 50 | "</s>", 51 | "<|im_end|>" 52 | ], 53 | "do_sample": false, 54 | "temperature": 0.0 55 | }, 56 | "repeats": 1, 57 | "filter_list": [ 58 | { 59 | "name": "strict-match", 60 | "filter": [ 61 | { 62 | "function": "regex", 63 | "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" 64 | }, 65 | { 66 | "function": "take_first" 67 | } 68 | ] 69 | }, 70 | { 71 | "name": "flexible-extract", 72 | "filter": [ 73 | { 74 | "function": "regex", 75 | "group_select": -1, 76 | "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" 77 | }, 78 | { 79 | "function": "take_first" 80 | } 81 | ] 82 | } 83 | ], 84 | "should_decontaminate": false, 85 | "metadata": { 86 | "version": 3.0 87 | } 88 | } 89 | }, 90 | "versions": { 91 | "gsm8k": 3.0 92 | }, 93 | "n-shot": { 94 | "gsm8k": 5 95 | }, 96 | "config": { 97 | "model": "hf", 98 | "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False", 99 | "batch_size": "16", 100 | "batch_sizes": [], 101 | "device": null, 102 | "use_cache": null, 103 | "limit": null, 104 | "bootstrap_iters": 100000, 105 | "gen_kwargs": null 106 | }, 107 | "git_hash": "0ecf672", 108 | "date": 1712031337.2311273, 109 | "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 43 bits physical, 48 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7763 64-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2474.987\nCPU max MHz: 2450.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4890.43\nVirtualization: AMD-V\nL1d cache: 4 MiB\nL1i cache: 4 MiB\nL2 cache: 64 MiB\nL3 cache: 512 MiB\nNUMA node0 CPU(s): 0-63\nNUMA node1 CPU(s): 64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.1.2 pypi_0 pypi\n[conda] triton 2.1.0 pypi_0 pypi", 110 | "transformers_version": "4.37.2", 111 | "upper_git_hash": "0ecf67227578e1079c4d8d00a4db878a6f6523d6" 112 | } -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/h6_en/hellaswag_10.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": { 3 | "hellaswag": { 4 | "acc,none": 0.5944035052778331, 5 | "acc_stderr,none": 0.004900036261309049, 6 | "acc_norm,none": 0.7858992232622983, 7 | "acc_norm_stderr,none": 0.004093587404303701, 8 | "alias": "hellaswag" 9 | } 10 | }, 11 | "group_subtasks": { 12 | "hellaswag": [] 13 | }, 14 | "configs": { 15 | "hellaswag": { 16 | "task": "hellaswag", 17 | "group": [ 18 | "multiple_choice" 19 | ], 20 | "dataset_path": "hellaswag", 21 | "training_split": "train", 22 | "validation_split": "validation", 23 | "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", 24 | "doc_to_text": "{{query}}", 25 | "doc_to_target": "{{label}}", 26 | "doc_to_choice": "choices", 27 | "description": "", 28 | "target_delimiter": " ", 29 | "fewshot_delimiter": "\n\n", 30 | "num_fewshot": 10, 31 | "metric_list": [ 32 | { 33 | "metric": "acc", 34 | "aggregation": "mean", 35 | "higher_is_better": true 36 | }, 37 | { 38 | "metric": "acc_norm", 39 | "aggregation": "mean", 40 | "higher_is_better": true 41 | } 42 | ], 43 | "output_type": "multiple_choice", 44 | "repeats": 1, 45 | "should_decontaminate": false, 46 | "metadata": { 47 | "version": 1.0 48 | } 49 | } 50 | }, 51 | "versions": { 52 | "hellaswag": 1.0 53 | }, 54 | "n-shot": { 55 | "hellaswag": 10 56 | }, 57 | "config": { 58 | "model": "hf", 59 | "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False", 60 | "batch_size": "16", 61 | "batch_sizes": [], 62 | "device": null, 63 | "use_cache": null, 64 | "limit": null, 65 | "bootstrap_iters": 100000, 66 | "gen_kwargs": null 67 | }, 68 | "git_hash": "0ecf672", 69 | "date": 1712030282.4817123, 70 | "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 43 bits physical, 48 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7763 64-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2803.192\nCPU max MHz: 2450.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4890.43\nVirtualization: AMD-V\nL1d cache: 4 MiB\nL1i cache: 4 MiB\nL2 cache: 64 MiB\nL3 cache: 512 MiB\nNUMA node0 CPU(s): 0-63\nNUMA node1 CPU(s): 64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.1.2 pypi_0 pypi\n[conda] triton 2.1.0 pypi_0 pypi", 71 | "transformers_version": "4.37.2", 72 | "upper_git_hash": "0ecf67227578e1079c4d8d00a4db878a6f6523d6" 73 | } -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/h6_en/truthfulqa_mc2_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": { 3 | "truthfulqa_mc2": { 4 | "acc,none": 0.4531373800075501, 5 | "acc_stderr,none": 0.015639311798545347, 6 | "alias": "truthfulqa_mc2" 7 | } 8 | }, 9 | "group_subtasks": { 10 | "truthfulqa_mc2": [] 11 | }, 12 | "configs": { 13 | "truthfulqa_mc2": { 14 | "task": "truthfulqa_mc2", 15 | "group": [ 16 | "truthfulqa" 17 | ], 18 | "dataset_path": "truthful_qa", 19 | "dataset_name": "multiple_choice", 20 | "validation_split": "validation", 21 | "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", 22 | "doc_to_target": 0, 23 | "doc_to_choice": "{{mc2_targets.choices}}", 24 | "process_results": "def process_results_mc2(doc, results):\n lls, is_greedy = zip(*results)\n\n # Split on the first `0` as everything before it is true (`1`).\n split_idx = list(doc[\"mc2_targets\"][\"labels\"]).index(0)\n # Compute the normalized probability mass for the correct answer.\n ll_true, ll_false = lls[:split_idx], lls[split_idx:]\n p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))\n p_true = p_true / (sum(p_true) + sum(p_false))\n\n return {\"acc\": sum(p_true)}\n", 25 | "description": "", 26 | "target_delimiter": " ", 27 | "fewshot_delimiter": "\n\n", 28 | "num_fewshot": 0, 29 | "metric_list": [ 30 | { 31 | "metric": "acc", 32 | "aggregation": "mean", 33 | "higher_is_better": true 34 | } 35 | ], 36 | "output_type": "multiple_choice", 37 | "repeats": 1, 38 | "should_decontaminate": true, 39 | "doc_to_decontamination_query": "question", 40 | "metadata": { 41 | "version": 2.0 42 | } 43 | } 44 | }, 45 | "versions": { 46 | "truthfulqa_mc2": 2.0 47 | }, 48 | "n-shot": { 49 | "truthfulqa_mc2": 0 50 | }, 51 | "config": { 52 | "model": "hf", 53 | "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False", 54 | "batch_size": "16", 55 | "batch_sizes": [], 56 | "device": null, 57 | "use_cache": null, 58 | "limit": null, 59 | "bootstrap_iters": 100000, 60 | "gen_kwargs": null 61 | }, 62 | "git_hash": "0ecf672", 63 | "date": 1712031221.6043375, 64 | "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 43 bits physical, 48 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7763 64-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2746.724\nCPU max MHz: 2450.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4890.43\nVirtualization: AMD-V\nL1d cache: 4 MiB\nL1i cache: 4 MiB\nL2 cache: 64 MiB\nL3 cache: 512 MiB\nNUMA node0 CPU(s): 0-63\nNUMA node1 CPU(s): 64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.1.2 pypi_0 pypi\n[conda] triton 2.1.0 pypi_0 pypi", 65 | "transformers_version": "4.37.2", 66 | "upper_git_hash": "0ecf67227578e1079c4d8d00a4db878a6f6523d6" 67 | } -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/h6_en/winogrande_5.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": { 3 | "winogrande": { 4 | "acc,none": 0.7269139700078927, 5 | "acc_stderr,none": 0.012522020105869457, 6 | "alias": "winogrande" 7 | } 8 | }, 9 | "group_subtasks": { 10 | "winogrande": [] 11 | }, 12 | "configs": { 13 | "winogrande": { 14 | "task": "winogrande", 15 | "dataset_path": "winogrande", 16 | "dataset_name": "winogrande_xl", 17 | "training_split": "train", 18 | "validation_split": "validation", 19 | "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n", 20 | "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n", 21 | "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n", 22 | "description": "", 23 | "target_delimiter": " ", 24 | "fewshot_delimiter": "\n\n", 25 | "num_fewshot": 5, 26 | "metric_list": [ 27 | { 28 | "metric": "acc", 29 | "aggregation": "mean", 30 | "higher_is_better": true 31 | } 32 | ], 33 | "output_type": "multiple_choice", 34 | "repeats": 1, 35 | "should_decontaminate": true, 36 | "doc_to_decontamination_query": "sentence", 37 | "metadata": { 38 | "version": 1.0 39 | } 40 | } 41 | }, 42 | "versions": { 43 | "winogrande": 1.0 44 | }, 45 | "n-shot": { 46 | "winogrande": 5 47 | }, 48 | "config": { 49 | "model": "hf", 50 | "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False", 51 | "batch_size": "16", 52 | "batch_sizes": [], 53 | "device": null, 54 | "use_cache": null, 55 | "limit": null, 56 | "bootstrap_iters": 100000, 57 | "gen_kwargs": null 58 | }, 59 | "git_hash": "0ecf672", 60 | "date": 1712031284.3734753, 61 | "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 43 bits physical, 48 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7763 64-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2697.466\nCPU max MHz: 2450.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4890.43\nVirtualization: AMD-V\nL1d cache: 4 MiB\nL1i cache: 4 MiB\nL2 cache: 64 MiB\nL3 cache: 512 MiB\nNUMA node0 CPU(s): 0-63\nNUMA node1 CPU(s): 64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.1.2 pypi_0 pypi\n[conda] triton 2.1.0 pypi_0 pypi", 62 | "transformers_version": "4.37.2", 63 | "upper_git_hash": "0ecf67227578e1079c4d8d00a4db878a6f6523d6" 64 | } -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/ifeval/scores.txt: -------------------------------------------------------------------------------- 1 | ================================================================ 2 | /data/private/new_lib/evalverse/results/Llama-2-7b-chat-hf/ifeval/eval_results_strict.jsonl Accuracy Scores: 3 | prompt-level: 0.39186691312384475 4 | instruction-level: 0.47925033467202144 5 | 6 | change_case 0.2857142857142857 7 | combination 0.0 8 | detectable_content 0.8181818181818182 9 | detectable_format 0.5733333333333334 10 | keywords 0.744 11 | language 0.4666666666666667 12 | length_constraints 0.5436893203883495 13 | punctuation 0.05970149253731343 14 | startend 0.5454545454545454 15 | 16 | change_case:capital_word_frequency 0.6 17 | change_case:english_capital 0.23076923076923078 18 | change_case:english_lowercase 0.15789473684210525 19 | combination:repeat_prompt 0.0 20 | combination:two_responses 0.0 21 | detectable_content:number_placeholders 0.7692307692307693 22 | detectable_content:postscript 0.8620689655172413 23 | detectable_format:constrained_response 1.0 24 | detectable_format:json_format 0.11764705882352941 25 | detectable_format:multiple_sections 0.25 26 | detectable_format:number_bullet_lists 0.4230769230769231 27 | detectable_format:number_highlighted_sections 0.5918367346938775 28 | detectable_format:title 0.8611111111111112 29 | keywords:existence 0.6666666666666666 30 | keywords:forbidden_words 0.8333333333333334 31 | keywords:frequency 0.7692307692307693 32 | keywords:letter_frequency 0.6551724137931034 33 | language:response_language 0.4666666666666667 34 | length_constraints:nth_paragraph_first_word 0.25 35 | length_constraints:number_paragraphs 0.12 36 | length_constraints:number_sentences 0.75 37 | length_constraints:number_words 0.7666666666666667 38 | punctuation:no_comma 0.05970149253731343 39 | startend:end_checker 0.8 40 | startend:quotation 0.3902439024390244 41 | ================================================================ 42 | /data/private/new_lib/evalverse/results/Llama-2-7b-chat-hf/ifeval/eval_results_loose.jsonl Accuracy Scores: 43 | prompt-level: 0.48428835489833644 44 | instruction-level: 0.5609103078982597 45 | 46 | change_case 0.4642857142857143 47 | combination 0.1044776119402985 48 | detectable_content 0.8181818181818182 49 | detectable_format 0.6266666666666667 50 | keywords 0.768 51 | language 0.5666666666666667 52 | length_constraints 0.5728155339805825 53 | punctuation 0.22388059701492538 54 | startend 0.7121212121212122 55 | 56 | change_case:capital_word_frequency 0.65 57 | change_case:english_capital 0.3076923076923077 58 | change_case:english_lowercase 0.47368421052631576 59 | combination:repeat_prompt 0.047619047619047616 60 | combination:two_responses 0.2 61 | detectable_content:number_placeholders 0.7692307692307693 62 | detectable_content:postscript 0.8620689655172413 63 | detectable_format:constrained_response 1.0 64 | detectable_format:json_format 0.5882352941176471 65 | detectable_format:multiple_sections 0.25 66 | detectable_format:number_bullet_lists 0.4230769230769231 67 | detectable_format:number_highlighted_sections 0.5918367346938775 68 | detectable_format:title 0.8611111111111112 69 | keywords:existence 0.6666666666666666 70 | keywords:forbidden_words 0.9166666666666666 71 | keywords:frequency 0.7692307692307693 72 | keywords:letter_frequency 0.6551724137931034 73 | language:response_language 0.5666666666666667 74 | length_constraints:nth_paragraph_first_word 0.5 75 | length_constraints:number_paragraphs 0.12 76 | length_constraints:number_sentences 0.75 77 | length_constraints:number_words 0.7666666666666667 78 | punctuation:no_comma 0.22388059701492538 79 | startend:end_checker 0.8 80 | startend:quotation 0.6585365853658537 81 | -------------------------------------------------------------------------------- /examples/results/Llama-2-7b-chat-hf/mt_bench/scores.txt: -------------------------------------------------------------------------------- 1 | Mode: single 2 | Input file: /data/private/new_lib/evalverse/results/Llama-2-7b-chat-hf/mt_bench/model_judgment/gpt-4_single.jsonl 3 | 4 | ########## First turn ########## 5 | score 6 | model turn 7 | Llama-2-7b-chat-hf 1 7.025 8 | 9 | ########## Second turn ########## 10 | score 11 | model turn 12 | Llama-2-7b-chat-hf 2 6.037975 13 | 14 | ########## Average ########## 15 | score 16 | model 17 | Llama-2-7b-chat-hf 6.534591 18 | -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/eq_bench/benchmark_results.csv: -------------------------------------------------------------------------------- 1 | Run ID, Benchmark Completed, Prompt Format, Model Path, Lora Path, Quantization, Benchmark Score, EQ-Bench Version, Num Questions Parseable, Num Iterations, Inference Engine, Ooba Params, Download Filters, Error 2 | SOLAR-10.7B-Instruct-v1.0,2024-05-09 18:17:58,Solar-v1,upstage/SOLAR-10.7B-Instruct-v1.0,None,None,72.17,v2,165.0,1,transformers,none,--include ["n", "o", "n", "e"] --exclude ["n", "o", "n", "e"], 3 | -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/arc_challenge_25.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": { 3 | "arc_challenge": { 4 | "acc,none": 0.6885665529010239, 5 | "acc_stderr,none": 0.01353247209985083, 6 | "acc_norm,none": 0.7133105802047781, 7 | "acc_norm_stderr,none": 0.013214986329274855, 8 | "alias": "arc_challenge" 9 | } 10 | }, 11 | "group_subtasks": { 12 | "arc_challenge": [] 13 | }, 14 | "configs": { 15 | "arc_challenge": { 16 | "task": "arc_challenge", 17 | "group": [ 18 | "ai2_arc" 19 | ], 20 | "dataset_path": "allenai/ai2_arc", 21 | "dataset_name": "ARC-Challenge", 22 | "training_split": "train", 23 | "validation_split": "validation", 24 | "test_split": "test", 25 | "doc_to_text": "Question: {{question}}\nAnswer:", 26 | "doc_to_target": "{{choices.label.index(answerKey)}}", 27 | "doc_to_choice": "{{choices.text}}", 28 | "description": "", 29 | "target_delimiter": " ", 30 | "fewshot_delimiter": "\n\n", 31 | "num_fewshot": 25, 32 | "metric_list": [ 33 | { 34 | "metric": "acc", 35 | "aggregation": "mean", 36 | "higher_is_better": true 37 | }, 38 | { 39 | "metric": "acc_norm", 40 | "aggregation": "mean", 41 | "higher_is_better": true 42 | } 43 | ], 44 | "output_type": "multiple_choice", 45 | "repeats": 1, 46 | "should_decontaminate": true, 47 | "doc_to_decontamination_query": "Question: {{question}}\nAnswer:", 48 | "metadata": { 49 | "version": 1.0 50 | } 51 | } 52 | }, 53 | "versions": { 54 | "arc_challenge": 1.0 55 | }, 56 | "n-shot": { 57 | "arc_challenge": 25 58 | }, 59 | "config": { 60 | "model": "hf", 61 | "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False", 62 | "batch_size": "16", 63 | "batch_sizes": [], 64 | "device": null, 65 | "use_cache": null, 66 | "limit": null, 67 | "bootstrap_iters": 100000, 68 | "gen_kwargs": null 69 | }, 70 | "git_hash": "22f5854", 71 | "date": 1711604407.8730423, 72 | "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 43 bits physical, 48 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7763 64-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2813.569\nCPU max MHz: 2450.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4890.43\nVirtualization: AMD-V\nL1d cache: 4 MiB\nL1i cache: 4 MiB\nL2 cache: 64 MiB\nL3 cache: 512 MiB\nNUMA node0 CPU(s): 0-63\nNUMA node1 CPU(s): 64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.2.1 pypi_0 pypi", 73 | "transformers_version": "4.38.2", 74 | "upper_git_hash": null 75 | } -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/gsm8k_5.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": { 3 | "gsm8k": { 4 | "exact_match,strict-match": 0.6777862016679302, 5 | "exact_match_stderr,strict-match": 0.012872435481188778, 6 | "exact_match,flexible-extract": 0.6853677028051555, 7 | "exact_match_stderr,flexible-extract": 0.012791037227336034, 8 | "alias": "gsm8k" 9 | } 10 | }, 11 | "group_subtasks": { 12 | "gsm8k": [] 13 | }, 14 | "configs": { 15 | "gsm8k": { 16 | "task": "gsm8k", 17 | "group": [ 18 | "math_word_problems" 19 | ], 20 | "dataset_path": "gsm8k", 21 | "dataset_name": "main", 22 | "training_split": "train", 23 | "test_split": "test", 24 | "fewshot_split": "train", 25 | "doc_to_text": "Question: {{question}}\nAnswer:", 26 | "doc_to_target": "{{answer}}", 27 | "description": "", 28 | "target_delimiter": " ", 29 | "fewshot_delimiter": "\n\n", 30 | "num_fewshot": 5, 31 | "metric_list": [ 32 | { 33 | "metric": "exact_match", 34 | "aggregation": "mean", 35 | "higher_is_better": true, 36 | "ignore_case": true, 37 | "ignore_punctuation": false, 38 | "regexes_to_ignore": [ 39 | ",", 40 | "\\$", 41 | "(?s).*#### ", 42 | "\\.$" 43 | ] 44 | } 45 | ], 46 | "output_type": "generate_until", 47 | "generation_kwargs": { 48 | "until": [ 49 | "Question:", 50 | "</s>", 51 | "<|im_end|>" 52 | ], 53 | "do_sample": false, 54 | "temperature": 0.0 55 | }, 56 | "repeats": 1, 57 | "filter_list": [ 58 | { 59 | "name": "strict-match", 60 | "filter": [ 61 | { 62 | "function": "regex", 63 | "regex_pattern": "#### (\\-?[0-9\\.\\,]+)" 64 | }, 65 | { 66 | "function": "take_first" 67 | } 68 | ] 69 | }, 70 | { 71 | "name": "flexible-extract", 72 | "filter": [ 73 | { 74 | "function": "regex", 75 | "group_select": -1, 76 | "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)" 77 | }, 78 | { 79 | "function": "take_first" 80 | } 81 | ] 82 | } 83 | ], 84 | "should_decontaminate": false, 85 | "metadata": { 86 | "version": 3.0 87 | } 88 | } 89 | }, 90 | "versions": { 91 | "gsm8k": 3.0 92 | }, 93 | "n-shot": { 94 | "gsm8k": 5 95 | }, 96 | "config": { 97 | "model": "hf", 98 | "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False", 99 | "batch_size": "16", 100 | "batch_sizes": [], 101 | "device": null, 102 | "use_cache": null, 103 | "limit": null, 104 | "bootstrap_iters": 100000, 105 | "gen_kwargs": null 106 | }, 107 | "git_hash": "22f5854", 108 | "date": 1711605933.4303067, 109 | "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 43 bits physical, 48 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7763 64-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2475.477\nCPU max MHz: 2450.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4890.43\nVirtualization: AMD-V\nL1d cache: 4 MiB\nL1i cache: 4 MiB\nL2 cache: 64 MiB\nL3 cache: 512 MiB\nNUMA node0 CPU(s): 0-63\nNUMA node1 CPU(s): 64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.2.1 pypi_0 pypi", 110 | "transformers_version": "4.38.2", 111 | "upper_git_hash": null 112 | } -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/hellaswag_10.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": { 3 | "hellaswag": { 4 | "acc,none": 0.7061342362079267, 5 | "acc_stderr,none": 0.004546002255457021, 6 | "acc_norm,none": 0.8818960366460864, 7 | "acc_norm_stderr,none": 0.0032207161266851005, 8 | "alias": "hellaswag" 9 | } 10 | }, 11 | "group_subtasks": { 12 | "hellaswag": [] 13 | }, 14 | "configs": { 15 | "hellaswag": { 16 | "task": "hellaswag", 17 | "group": [ 18 | "multiple_choice" 19 | ], 20 | "dataset_path": "hellaswag", 21 | "training_split": "train", 22 | "validation_split": "validation", 23 | "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n", 24 | "doc_to_text": "{{query}}", 25 | "doc_to_target": "{{label}}", 26 | "doc_to_choice": "choices", 27 | "description": "", 28 | "target_delimiter": " ", 29 | "fewshot_delimiter": "\n\n", 30 | "num_fewshot": 10, 31 | "metric_list": [ 32 | { 33 | "metric": "acc", 34 | "aggregation": "mean", 35 | "higher_is_better": true 36 | }, 37 | { 38 | "metric": "acc_norm", 39 | "aggregation": "mean", 40 | "higher_is_better": true 41 | } 42 | ], 43 | "output_type": "multiple_choice", 44 | "repeats": 1, 45 | "should_decontaminate": false, 46 | "metadata": { 47 | "version": 1.0 48 | } 49 | } 50 | }, 51 | "versions": { 52 | "hellaswag": 1.0 53 | }, 54 | "n-shot": { 55 | "hellaswag": 10 56 | }, 57 | "config": { 58 | "model": "hf", 59 | "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False", 60 | "batch_size": "16", 61 | "batch_sizes": [], 62 | "device": null, 63 | "use_cache": null, 64 | "limit": null, 65 | "bootstrap_iters": 100000, 66 | "gen_kwargs": null 67 | }, 68 | "git_hash": "22f5854", 69 | "date": 1711604551.2668173, 70 | "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 43 bits physical, 48 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7763 64-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2633.640\nCPU max MHz: 2450.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4890.43\nVirtualization: AMD-V\nL1d cache: 4 MiB\nL1i cache: 4 MiB\nL2 cache: 64 MiB\nL3 cache: 512 MiB\nNUMA node0 CPU(s): 0-63\nNUMA node1 CPU(s): 64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.2.1 pypi_0 pypi", 71 | "transformers_version": "4.38.2", 72 | "upper_git_hash": null 73 | } -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/truthfulqa_mc2_0.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": { 3 | "truthfulqa_mc2": { 4 | "acc,none": 0.7171838111166857, 5 | "acc_stderr,none": 0.01498853297119472, 6 | "alias": "truthfulqa_mc2" 7 | } 8 | }, 9 | "group_subtasks": { 10 | "truthfulqa_mc2": [] 11 | }, 12 | "configs": { 13 | "truthfulqa_mc2": { 14 | "task": "truthfulqa_mc2", 15 | "group": [ 16 | "truthfulqa" 17 | ], 18 | "dataset_path": "truthful_qa", 19 | "dataset_name": "multiple_choice", 20 | "validation_split": "validation", 21 | "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}", 22 | "doc_to_target": 0, 23 | "doc_to_choice": "{{mc2_targets.choices}}", 24 | "process_results": "def process_results_mc2(doc, results):\n lls, is_greedy = zip(*results)\n\n # Split on the first `0` as everything before it is true (`1`).\n split_idx = list(doc[\"mc2_targets\"][\"labels\"]).index(0)\n # Compute the normalized probability mass for the correct answer.\n ll_true, ll_false = lls[:split_idx], lls[split_idx:]\n p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))\n p_true = p_true / (sum(p_true) + sum(p_false))\n\n return {\"acc\": sum(p_true)}\n", 25 | "description": "", 26 | "target_delimiter": " ", 27 | "fewshot_delimiter": "\n\n", 28 | "num_fewshot": 0, 29 | "metric_list": [ 30 | { 31 | "metric": "acc", 32 | "aggregation": "mean", 33 | "higher_is_better": true 34 | } 35 | ], 36 | "output_type": "multiple_choice", 37 | "repeats": 1, 38 | "should_decontaminate": true, 39 | "doc_to_decontamination_query": "question", 40 | "metadata": { 41 | "version": 2.0 42 | } 43 | } 44 | }, 45 | "versions": { 46 | "truthfulqa_mc2": 2.0 47 | }, 48 | "n-shot": { 49 | "truthfulqa_mc2": 0 50 | }, 51 | "config": { 52 | "model": "hf", 53 | "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False", 54 | "batch_size": "16", 55 | "batch_sizes": [], 56 | "device": null, 57 | "use_cache": null, 58 | "limit": null, 59 | "bootstrap_iters": 100000, 60 | "gen_kwargs": null 61 | }, 62 | "git_hash": "22f5854", 63 | "date": 1711605810.1983285, 64 | "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 43 bits physical, 48 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7763 64-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2474.946\nCPU max MHz: 2450.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4890.43\nVirtualization: AMD-V\nL1d cache: 4 MiB\nL1i cache: 4 MiB\nL2 cache: 64 MiB\nL3 cache: 512 MiB\nNUMA node0 CPU(s): 0-63\nNUMA node1 CPU(s): 64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.2.1 pypi_0 pypi", 65 | "transformers_version": "4.38.2", 66 | "upper_git_hash": null 67 | } -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/winogrande_5.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": { 3 | "winogrande": { 4 | "acc,none": 0.8318863456985004, 5 | "acc_stderr,none": 0.010510336954166734, 6 | "alias": "winogrande" 7 | } 8 | }, 9 | "group_subtasks": { 10 | "winogrande": [] 11 | }, 12 | "configs": { 13 | "winogrande": { 14 | "task": "winogrande", 15 | "dataset_path": "winogrande", 16 | "dataset_name": "winogrande_xl", 17 | "training_split": "train", 18 | "validation_split": "validation", 19 | "doc_to_text": "def doc_to_text(doc):\n answer_to_num = {\"1\": 0, \"2\": 1}\n return answer_to_num[doc[\"answer\"]]\n", 20 | "doc_to_target": "def doc_to_target(doc):\n idx = doc[\"sentence\"].index(\"_\") + 1\n return doc[\"sentence\"][idx:].strip()\n", 21 | "doc_to_choice": "def doc_to_choice(doc):\n idx = doc[\"sentence\"].index(\"_\")\n options = [doc[\"option1\"], doc[\"option2\"]]\n return [doc[\"sentence\"][:idx] + opt for opt in options]\n", 22 | "description": "", 23 | "target_delimiter": " ", 24 | "fewshot_delimiter": "\n\n", 25 | "num_fewshot": 5, 26 | "metric_list": [ 27 | { 28 | "metric": "acc", 29 | "aggregation": "mean", 30 | "higher_is_better": true 31 | } 32 | ], 33 | "output_type": "multiple_choice", 34 | "repeats": 1, 35 | "should_decontaminate": true, 36 | "doc_to_decontamination_query": "sentence", 37 | "metadata": { 38 | "version": 1.0 39 | } 40 | } 41 | }, 42 | "versions": { 43 | "winogrande": 1.0 44 | }, 45 | "n-shot": { 46 | "winogrande": 5 47 | }, 48 | "config": { 49 | "model": "hf", 50 | "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False", 51 | "batch_size": "16", 52 | "batch_sizes": [], 53 | "device": null, 54 | "use_cache": null, 55 | "limit": null, 56 | "bootstrap_iters": 100000, 57 | "gen_kwargs": null 58 | }, 59 | "git_hash": "22f5854", 60 | "date": 1711605880.7907126, 61 | "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nByte Order: Little Endian\nAddress sizes: 43 bits physical, 48 bits virtual\nCPU(s): 128\nOn-line CPU(s) list: 0-127\nThread(s) per core: 1\nCore(s) per socket: 64\nSocket(s): 2\nNUMA node(s): 2\nVendor ID: AuthenticAMD\nCPU family: 25\nModel: 1\nModel name: AMD EPYC 7763 64-Core Processor\nStepping: 1\nFrequency boost: enabled\nCPU MHz: 2511.241\nCPU max MHz: 2450.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4890.43\nVirtualization: AMD-V\nL1d cache: 4 MiB\nL1i cache: 4 MiB\nL2 cache: 64 MiB\nL3 cache: 512 MiB\nNUMA node0 CPU(s): 0-63\nNUMA node1 CPU(s): 64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Retbleed: Not affected\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] torch 2.2.1 pypi_0 pypi", 62 | "transformers_version": "4.38.2", 63 | "upper_git_hash": null 64 | } -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/ifeval/scores.txt: -------------------------------------------------------------------------------- 1 | ================================================================ 2 | /data/private/new_lib/results/SOLAR-10.7B-Instruct-v1.0/ifeval/eval_results_strict.jsonl Accuracy Scores: 3 | prompt-level: 0.5157116451016636 4 | instruction-level: 0.5796519410977242 5 | 6 | change_case 0.5595238095238095 7 | combination 0.16417910447761194 8 | detectable_content 0.8727272727272727 9 | detectable_format 0.6666666666666666 10 | keywords 0.744 11 | language 0.8 12 | length_constraints 0.5631067961165048 13 | punctuation 0.1791044776119403 14 | startend 0.6060606060606061 15 | 16 | change_case:capital_word_frequency 0.7 17 | change_case:english_capital 0.4230769230769231 18 | change_case:english_lowercase 0.5789473684210527 19 | combination:repeat_prompt 0.047619047619047616 20 | combination:two_responses 0.36 21 | detectable_content:number_placeholders 0.8076923076923077 22 | detectable_content:postscript 0.9310344827586207 23 | detectable_format:constrained_response 0.9 24 | detectable_format:json_format 0.7647058823529411 25 | detectable_format:multiple_sections 0.25 26 | detectable_format:number_bullet_lists 0.46153846153846156 27 | detectable_format:number_highlighted_sections 0.5714285714285714 28 | detectable_format:title 0.9722222222222222 29 | keywords:existence 0.9523809523809523 30 | keywords:forbidden_words 0.5833333333333334 31 | keywords:frequency 0.8461538461538461 32 | keywords:letter_frequency 0.6551724137931034 33 | language:response_language 0.8 34 | length_constraints:nth_paragraph_first_word 0.16666666666666666 35 | length_constraints:number_paragraphs 0.4 36 | length_constraints:number_sentences 0.6666666666666666 37 | length_constraints:number_words 0.7333333333333333 38 | punctuation:no_comma 0.1791044776119403 39 | startend:end_checker 0.72 40 | startend:quotation 0.5365853658536586 41 | ================================================================ 42 | /data/private/new_lib/results/SOLAR-10.7B-Instruct-v1.0/ifeval/eval_results_loose.jsonl Accuracy Scores: 43 | prompt-level: 0.5600739371534196 44 | instruction-level: 0.6291834002677377 45 | 46 | change_case 0.6071428571428571 47 | combination 0.16417910447761194 48 | detectable_content 0.8727272727272727 49 | detectable_format 0.68 50 | keywords 0.832 51 | language 0.8666666666666667 52 | length_constraints 0.6213592233009708 53 | punctuation 0.31343283582089554 54 | startend 0.6515151515151515 55 | 56 | change_case:capital_word_frequency 0.75 57 | change_case:english_capital 0.5 58 | change_case:english_lowercase 0.6052631578947368 59 | combination:repeat_prompt 0.047619047619047616 60 | combination:two_responses 0.36 61 | detectable_content:number_placeholders 0.8076923076923077 62 | detectable_content:postscript 0.9310344827586207 63 | detectable_format:constrained_response 0.9 64 | detectable_format:json_format 0.8823529411764706 65 | detectable_format:multiple_sections 0.25 66 | detectable_format:number_bullet_lists 0.46153846153846156 67 | detectable_format:number_highlighted_sections 0.5714285714285714 68 | detectable_format:title 0.9722222222222222 69 | keywords:existence 0.9523809523809523 70 | keywords:forbidden_words 0.8333333333333334 71 | keywords:frequency 0.8974358974358975 72 | keywords:letter_frequency 0.6551724137931034 73 | language:response_language 0.8666666666666667 74 | length_constraints:nth_paragraph_first_word 0.5 75 | length_constraints:number_paragraphs 0.44 76 | length_constraints:number_sentences 0.6666666666666666 77 | length_constraints:number_words 0.7666666666666667 78 | punctuation:no_comma 0.31343283582089554 79 | startend:end_checker 0.72 80 | startend:quotation 0.6097560975609756 81 | -------------------------------------------------------------------------------- /examples/results/SOLAR-10.7B-Instruct-v1.0/mt_bench/scores.txt: -------------------------------------------------------------------------------- 1 | Mode: single 2 | Input file: /data/private/new_lib/evalverse/results/SOLAR-10.7B-Instruct-v1.0/mt_bench/model_judgment/gpt-4_single.jsonl 3 | 4 | ########## First turn ########## 5 | score 6 | model turn 7 | SOLAR-10.7B-Instruct-v1.0 1 7.66875 8 | 9 | ########## Second turn ########## 10 | score 11 | model turn 12 | SOLAR-10.7B-Instruct-v1.0 2 7.21519 13 | 14 | ########## Average ########## 15 | score 16 | model 17 | SOLAR-10.7B-Instruct-v1.0 7.443396 18 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "evalverse" 3 | version = "0.0.1" 4 | description = "The Universe of Evaluation. All about the evaluation for LLMs." 5 | authors = ["Evalverse <evalverse@upstage.ai>"] 6 | license = "Apache License 2.0" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | ### Evalverse 11 | python = ">=3.9,<3.11" 12 | python-dotenv = "^1.0.1" 13 | pre-commit = "^3.7.0" 14 | pandas = "^2.2.1" 15 | 16 | ### Evalverse Reporter 17 | slack-sdk = "^3.27.1" 18 | slack-bolt = "^1.18.1" 19 | gitpython = "^3.1.42" 20 | plotly = "^5.20.0" 21 | kaleido = "0.2.1" 22 | nbformat = ">=4.2.0" 23 | 24 | ### lm-evaluation-harness & Common 25 | lm-eval = "0.4.2" 26 | transformers = "4.37.2" 27 | vllm = "0.3.1" 28 | ray = "^2.10.0" 29 | 30 | ### FastChat 31 | fschat = { path = "evalverse/submodules/FastChat" } 32 | openai = "<1" 33 | anthropic = ">=0.3" 34 | 35 | ### IFEval 36 | absl-py = "^2.1.0" 37 | langdetect = "^1.0.9" 38 | immutabledict = "^4.2.0" 39 | 40 | ### EQ-Bench 41 | gspread = "^6.1.0" 42 | oauth2client = "^4.1.3" 43 | firebase-admin = "^6.5.0" 44 | tensorboardx = "^2.6.2.2" 45 | hf-transfer = "^0.1.6" 46 | scipy = "^1.12.0" 47 | pexpect = "^4.9.0" 48 | 49 | 50 | [tool.poetry.group.dev.dependencies] 51 | ipykernel = "^6.29.3" 52 | 53 | [build-system] 54 | requires = ["poetry-core"] 55 | build-backend = "poetry.core.masonry.api" 56 | --------------------------------------------------------------------------------