├── .env_sample
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── 1-bug-report.yml
    │   ├── 2-feature-request.yml
    │   ├── 3-new-eval-request.yml
    │   ├── 4-documentation-improve.yml
    │   └── config.yml
    └── pull_request_template.md
├── .gitignore
├── .gitmodules
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── assets
    ├── Evalverse_Color.png
    ├── Evalverse_White.png
    ├── architecture.png
    ├── intro-evalverse.png
    ├── overview.png
    └── sample_report.png
├── contribution
    └── CONTRIBUTING.md
├── evalverse
    ├── README.md
    ├── __init__.py
    ├── connector.py
    ├── evaluator.py
    ├── reporter.py
    ├── slack_bot.py
    ├── tests
    │   ├── test_evaluator.py
    │   ├── test_reporter.py
    │   ├── test_reproducibility.py
    │   └── test_results
    │   │   └── SOLAR-10.7B-Instruct-v1.0
    │   │       └── h6_en
    │   │           ├── arc_challenge_25.json
    │   │           ├── gsm8k_5.json
    │   │           ├── hellaswag_10.json
    │   │           ├── mmlu_5.json
    │   │           ├── truthfulqa_mc2_0.json
    │   │           └── winogrande_5.json
    └── utils.py
├── examples
    ├── 01_basic_usage.ipynb
    ├── 02_advanced_usage.ipynb
    ├── README.md
    ├── db
    │   ├── figures
    │   │   └── figure_20240402_105011.jpeg
    │   ├── score_df.csv
    │   └── scores
    │   │   └── table_20240402_105011.csv
    └── results
    │   ├── Llama-2-7b-chat-hf
    │       ├── eq_bench
    │       │   ├── benchmark_results.csv
    │       │   └── raw_results.json
    │       ├── h6_en
    │       │   ├── arc_challenge_25.json
    │       │   ├── gsm8k_5.json
    │       │   ├── hellaswag_10.json
    │       │   ├── mmlu_5.json
    │       │   ├── truthfulqa_mc2_0.json
    │       │   └── winogrande_5.json
    │       ├── ifeval
    │       │   ├── eval_results_loose.jsonl
    │       │   ├── eval_results_strict.jsonl
    │       │   ├── output.jsonl
    │       │   └── scores.txt
    │       └── mt_bench
    │       │   ├── model_answer
    │       │       └── Llama-2-7b-chat-hf.jsonl
    │       │   ├── model_judgment
    │       │       └── gpt-4_single.jsonl
    │       │   └── scores.txt
    │   └── SOLAR-10.7B-Instruct-v1.0
    │       ├── eq_bench
    │           ├── benchmark_results.csv
    │           └── raw_results.json
    │       ├── h6_en
    │           ├── arc_challenge_25.json
    │           ├── gsm8k_5.json
    │           ├── hellaswag_10.json
    │           ├── mmlu_5.json
    │           ├── truthfulqa_mc2_0.json
    │           └── winogrande_5.json
    │       ├── ifeval
    │           ├── eval_results_loose.jsonl
    │           ├── eval_results_strict.jsonl
    │           ├── output.jsonl
    │           └── scores.txt
    │       └── mt_bench
    │           ├── model_answer
    │               └── SOLAR-10.7B-Instruct-v1.0.jsonl
    │           ├── model_judgment
    │               └── gpt-4_single.jsonl
    │           └── scores.txt
├── poetry.lock
└── pyproject.toml


/.env_sample:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=sk-...
2 | 
3 | SLACK_BOT_TOKEN=xoxb-...
4 | SLACK_APP_TOKEN=xapp-...


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/1-bug-report.yml:
--------------------------------------------------------------------------------
 1 | name: "🐛 Bug Report"
 2 | description: Create a new ticket for a bug.
 3 | title: "🐛 [BUG] - <title>"
 4 | labels: [
 5 |   "bug"
 6 | ]
 7 | 
 8 | body:
 9 |   - type: textarea
10 |     id: environment-setting
11 |     attributes:
12 |       label: "Environment Settings"
13 |       description: Python version, ...
14 |       placeholder: Please describe your environment settings so that we can reproduce the issue.
15 |     validations:
16 |       required: true
17 | 
18 |   - type: textarea
19 |     id: expected-behavior
20 |     attributes:
21 |       label: "Expected Behavior"
22 |       placeholder: A clear and concise description of what you would expect to happen.
23 |     validations:
24 |       required: true
25 | 
26 |   - type: textarea
27 |     id: actual-behavior
28 |     attributes:
29 |       label: "Actual Behavior"
30 |       placeholder: A clear and concise description of what actually happened.
31 | 
32 |   - type: textarea
33 |     id: reproduction
34 |     attributes:
35 |       label: Reproduction
36 |       description: |
37 |         Please enter explicit steps to reproduce your problem.
38 |         If you have any code snippets, error messages, etc., please provide them here.
39 | 
40 |       placeholder: |
41 |         Steps to reproduce:
42 |           
43 |           1.
44 |           2.
45 |           3.
46 |           4.
47 |     validations:
48 |       required: true


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/2-feature-request.yml:
--------------------------------------------------------------------------------
 1 | name: "🚀 Feature Request"
 2 | description: Suggesting new desired feature and enhancement of existing feature
 3 | title: "🚀 [REQUEST] - <title>"
 4 | labels: [
 5 |   "enhancement", "feature"
 6 | ]
 7 | 
 8 | body:
 9 |   - type: textarea
10 |     id: feature-request
11 |     attributes:
12 |       label: Feature request
13 |       description: |
14 |         Please describe the feature you want to add or the existing feature that needs to be enhanced.
15 |         If you have any related papers or code, please share them with us.
16 |     validations:
17 |       required: true
18 | 
19 | 
20 |   - type: textarea
21 |     id: context
22 |     validations:
23 |       required: false
24 |     attributes:
25 |       label: Context
26 |       description: |
27 |         Please let us know your motivation or additional context for this suggestion.
28 |         Knowing the reason why it needs to be added/enhanced makes it easier for us to understand the need.
29 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/3-new-eval-request.yml:
--------------------------------------------------------------------------------
 1 | name: "🖍️ New Eval Request"
 2 | description: Suggesting new desired evaluation method or enhancement of existing evaluation method
 3 | title: "🖍️ [REQUEST] - <title>"
 4 | labels: [
 5 |   "enhancement", "feature"
 6 | ]
 7 | 
 8 | body:
 9 |   - type: textarea
10 |     id: new-eval-request
11 |     attributes:
12 |       label: New evaluation method request
13 |       description: |
14 |         Please describe the evaluation method you want to add or the existing method that needs to be enhanced.
15 |         If you have any related papers or code, please share them with us.
16 |     validations:
17 |       required: true
18 | 
19 | 
20 |   - type: textarea
21 |     id: context
22 |     validations:
23 |       required: false
24 |     attributes:
25 |       label: Context
26 |       description: |
27 |         Please let us know your motivation or additional context for this suggestion.
28 |         Knowing the reason why it needs to be added/enhanced makes it easier for us to understand the need.
29 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/4-documentation-improve.yml:
--------------------------------------------------------------------------------
 1 | name: "📝 Documentation Improvement"
 2 | description: Report wrong or missing documentation. You can suggest new document or document that needs any improvement.
 3 | title: "📝 [Docs] - <title>"
 4 | labels: [
 5 |   "docs"
 6 | ]
 7 | 
 8 | body:
 9 |   - type: checkboxes
10 |     attributes:
11 |       label: evalverse version checks
12 |       options:
13 |         - label: >
14 |             I have checked that the issue still exists on the latest versions of the _evalverse_.
15 |           required: true
16 | 
17 |   - type: textarea
18 |     id: location
19 |     attributes:
20 |       label: Location of the documentation
21 |       description: >
22 |         Please provide the location of the documentation.
23 |         If you are suggesting a new document, please tell us where it should be placed.
24 |     validations:
25 |       required: true
26 | 
27 |   - type: textarea
28 |     id: problem
29 |     attributes:
30 |       label: Documentation problem
31 |       description: >
32 |         Please provide a description of what documentation you believe needs to be fixed/improved/added.
33 |     validations:
34 |       required: true
35 | 
36 |   - type: textarea
37 |     id: suggestion
38 |     attributes:
39 |       label: Suggestion
40 |       description: >
41 |         Please explain the suggested fix and **why** it's better than the existing documentation.
42 |         Or it could be content of new document you are suggesting.
43 |     validations:
44 |       required: true


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: true


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
 1 | ## PR Checklist
 2 | Please check if your PR fulfills the following requirements:
 3 | 
 4 | - [ ] The commit message follows _evalverse_ guidelines [link](https://github.com/UpstageAI/evalverse/blob/main/contribution/CONTRIBUTING.md#commit-messages):
 5 | - [ ] Tests for the changes have been added (for bug fixes / features)
 6 | - [ ] Docs have been added / updated (for bug fixes / features)
 7 | 
 8 | 
 9 | ## What does this PR do?
10 | <!-- Please describe the link to a relevant issue and current behavior that you are modifying.-->
11 | 
12 | - Issue Number: #
13 | - Description: 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 
162 | # Files from evalverse slack_bot
163 | db/*
164 | 
165 | # Files from reproducibility test
166 | evalverse/tests/test_results_reproduced/*
167 | 
168 | # Evaluation results -> please use git add -f <diff> for committing results
169 | results/*
170 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "evalverse/submodules/FastChat"]
 2 | 	path = evalverse/submodules/FastChat
 3 | 	url = https://github.com/UpstageAI/evalverse-FastChat.git
 4 | 	branch = main
 5 | [submodule "evalverse/submodules/IFEval"]
 6 | 	path = evalverse/submodules/IFEval
 7 | 	url = https://github.com/UpstageAI/evalverse-IFEval.git
 8 | 	branch = main
 9 | [submodule "evalverse/submodules/EQBench"]
10 | 	path = evalverse/submodules/EQBench
11 | 	url = https://github.com/UpstageAI/evalverse-EQBench.git
12 | 	branch = main
13 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |     - repo: https://github.com/pre-commit/pre-commit-hooks
 3 |       rev: v3.2.0
 4 |       hooks:
 5 |         # -   id: trailing-whitespace
 6 |         -   id: check-added-large-files
 7 |         -   id: detect-private-key
 8 |         -   id: detect-aws-credentials
 9 |             args: [--allow-missing-credentials]
10 |     - repo: https://github.com/pycqa/isort
11 |       rev: 5.13.2
12 |       hooks:
13 |         -   id: isort
14 |             args: [
15 |                     --profile=black,
16 |                 ]
17 |     - repo: https://github.com/psf/black
18 |       rev:  23.12.1
19 |       hooks:
20 |         -   id: black
21 |             args: [
22 |                 --line-length=100,
23 |             ]
24 | 
25 |     - repo: https://github.com/myint/autoflake
26 |       rev: v2.2.0
27 |       hooks:
28 |         -   id: autoflake
29 |             args: [
30 |             # --in-place,
31 |             # --remove-unused-variables,
32 |             # --remove-all-unused-imports,
33 |             --expand-star-imports,
34 |             ]
35 |     - repo: https://github.com/PyCQA/flake8
36 |       rev: 6.0.0
37 |       hooks:
38 |         -   id: flake8
39 |             args: [
40 |                 "--ignore=E203, E221, E231, E501, W503", 
41 |                 ]
42 |             # E203: Whitespace before ':'
43 |             # E221: multiple spaces before operator
44 |             # E231: missing whitespace after ','
45 |             # E501: line length - because black checks and this makes error even on commented code
46 |             # W503: PEP8 now recommends to break before binary operator (https://peps.python.org/pep-0008/#should-a-line-break-before-or-after-a-binary-operator)


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <div align="center">
  2 | <picture>
  3 |   <source media="(prefers-color-scheme: dark)" srcset="assets/Evalverse_White.png" width=300>
  4 |   <source media="(prefers-color-scheme: light)" srcset="assets/Evalverse_Color.png" width=300>
  5 |   <img alt="Evalverse" src="assets/Evalverse_Color.png" width=300>
  6 | </picture>
  7 | 
  8 | The Universe of Evaluation.
  9 | All about the evaluation for LLMs.  </br>
 10 | Upstage Solar is powered by Evalverse! Try at Upstage [Console](https://console.upstage.ai/)!
 11 | 
 12 | [🤗HuggingFace Space](https://huggingface.co/spaces/upstage/evalverse-space) • [📚Docs](https://evalverse.gitbook.io/evalverse-docs) • [📄Paper](https://arxiv.org/abs/2404.00943) 
 13 | 
 14 | [Examples](https://github.com/UpstageAI/evalverse/tree/main/examples) • [FAQ](https://evalverse.gitbook.io/evalverse-docs/documents/faqs) • [Contribution Guide](https://github.com/UpstageAI/evalverse/blob/main/contribution/CONTRIBUTING.md)  • [Contact](mailto:evalverse@upstage.ai)  • [Discord](https://discord.gg/D3bBj66K) 
 15 | </div>
 16 | 
 17 | ### 🚀 Newly updated
 18 | - [2024.05.10] LLM-Evaluation Report of Evalverse is now available on [HuggingFace Space](https://huggingface.co/spaces/upstage/evalverse-space).
 19 | 
 20 | <div align="center"><img alt="overview" src="assets/overview.png" width=500></div>
 21 | 
 22 | 
 23 | ## 👋 Welcome to Evalverse!
 24 | Evalverse is a freely accessible, open-source project designed to support your LLM (Large Language Model) evaluation needs. We provide a simple, standardized, and user-friendly solution for the processing and management of LLM evaluations, catering to the needs of AI research engineers and scientists. We also support no-code evaluation processes for people who may have less experience working with LLMs. Moreover, you will receive a well-organized report with figures summarizing the evaluation results.
 25 | 
 26 | ### With Evalverse, you are empowered to
 27 | - access various evaluation methods without juggling multiple libraries.
 28 | - receive insightful reports about the evaluation results that help you compare the varied scores across different models.
 29 | - initiate evaluation and generate reports without any code via Slack bot.
 30 | 
 31 | 
 32 | ### Architecture of Evalverse
 33 | <div align="center"><img alt="architecture" src="assets/architecture.png" width=700></div>
 34 | 
 35 | ### Key Features of Evalverse
 36 | - **Unified evaluation with Submodules**: Evalverse extends its evaluation capabilities through Git submodules, effortlessly incorporating frameworks like [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) and [FastChat](https://github.com/lm-sys/FastChat). Swiftly add new tools and keep pace with the latest in LLM evaluation.
 37 | - **No-code evaluation request**: With Evalverse, request LLM evaluations without any code, simply by sending `Request!` in a direct message or Slack channel with an active Evalverse Slack bot. Enter the model name in the Huggingface hub or local model directory path in Slack, and let the bot handle the rest.
 38 | - **LLM evaluation report**: Obtain comprehensive, no-code reports from Evalverse. Request with a simple command -`Report!`-, select the model and evaluation criteria, and receive detailed reports with scores, rankings, and visuals, all generated from the stored score database.
 39 | 
 40 | 
 41 | If you want to know more about Evalverse, please check out our [docs](https://evalverse.gitbook.io/evalverse-docs). </br>
 42 | Click the image below to watch a short intro video!
 43 | [![Brief Introduction](./assets/intro-evalverse.png)](https://www.youtube.com/watch?v=-VviAutjpgM)
 44 | </br>
 45 | 
 46 | ## 🌌 Installation
 47 | ### 🌠 Option 1: Git clone
 48 | Before cloning, please make sure you've registered proper SSH keys linked to your GitHub account.
 49 | 
 50 | #### 1. Clone the Evalverse repository
 51 | - Notes: add `--recursive` option to also clone submodules
 52 | ```
 53 | git clone --recursive https://github.com/UpstageAI/evalverse.git
 54 | ```
 55 | #### 2. Install requirement packages
 56 | ```
 57 | cd evalverse
 58 | pip install -e .
 59 | ```
 60 | 
 61 | ### 🌠 Option 2: Install via PyPI *(WIP)*
 62 | > Currently, installation via PyPI is not supported. Please install Evalverse with option 1.
 63 | 
 64 | 
 65 | </br>
 66 | 
 67 | ## 🌌 Configuration
 68 | You have to set an API key and/or Token in the `.env` file (rename `.env_sample` to `.env`) to use all features of Evalverse.
 69 | - OpenAI API Key (required for `mt_bench`)
 70 | - Slack BOT/APP Token (required for slack reporter)
 71 | ```
 72 | OPENAI_API_KEY=sk-...
 73 | 
 74 | SLACK_BOT_TOKEN=xoxb-...
 75 | SLACK_APP_TOKEN=xapp-...
 76 | ```
 77 | 
 78 | </br>
 79 | 
 80 | ## 🌌 Quickstart
 81 | More detailed tutorials are [here](https://github.com/UpstageAI/evalverse/tree/main/examples).
 82 | 
 83 | - [basic_usage.ipynb](https://github.com/UpstageAI/evalverse/tree/main/examples/01_basic_usage.ipynb): Very basic usage, like how to use `Evaluator` for evaluation and `Reporter` for generating report.
 84 | - [advanced_usage.ipynb](https://github.com/UpstageAI/evalverse/tree/main/examples/02_advanced_usage.ipynb): Introduces methods for evaluating each benchmark and all benchmarks collectively.
 85 | 
 86 | ### 🌠 Evaluation
 87 | #### 💫 Evaluation with Library
 88 | The following code is a simple example to evaluate the [SOLAR-10.7B-Instruct-v1.0 model](https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0) on the `h6_en` ([Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)) benchmark.
 89 | 
 90 | ```python
 91 | import evalverse as ev
 92 | 
 93 | evaluator = ev.Evaluator()
 94 | 
 95 | model = "upstage/SOLAR-10.7B-Instruct-v1.0"
 96 | benchmark = "h6_en"
 97 | 
 98 | evaluator.run(model=model, benchmark=benchmark)
 99 | ```
100 | 
101 | 
102 | #### 💫 Evaluation with CLI
103 | Here is a CLI script that produces the same result as the above code:
104 | 
105 | ```bash
106 | cd evalverse
107 | 
108 | python3 evaluator.py \
109 |   --h6_en \
110 |   --ckpt_path upstage/SOLAR-10.7B-Instruct-v1.0
111 | ```
112 | ### 🌠 Report
113 | Currently, generating a report is only available through the library. We will work on a Command Line Interface (CLI) version as soon as possible.
114 | 
115 | ```python
116 | import evalverse as ev
117 | 
118 | db_path = "./db"
119 | output_path = "./results"
120 | reporter = ev.Reporter(db_path=db_path, output_path=output_path)
121 | 
122 | reporter.update_db(save=True)
123 | 
124 | model_list = ["SOLAR-10.7B-Instruct-v1.0", "Llama-2-7b-chat-hf"]
125 | benchmark_list = ["h6_en"]
126 | reporter.run(model_list=model_list, benchmark_list=benchmark_list)
127 | ```
128 | <img alt="sample report" src="assets/sample_report.png" width=700>
129 | 
130 | |                     Model | Ranking | total_avg | H6-ARC | H6-Hellaswag | H6-MMLU | H6-TruthfulQA | H6-Winogrande | H6-GSM8k |
131 | |--------------------------:|--------:|----------:|-------:|-------------:|--------:|--------------:|--------------:|---------:|
132 | | SOLAR-10.7B-Instruct-v1.0 |       1 |     74.62 |  71.33 |        88.19 |   65.52 |         71.72 |         83.19 |    67.78 |
133 | |        Llama-2-7b-chat-hf |       2 |     53.51 |  53.16 |        78.59 |   47.38 |         45.31 |         72.69 |    23.96 |
134 | 
135 | </br>
136 | 
137 | ## 🌌 Supported Evaluations
138 | We currently support four evaluation methods. If you have suggestions for new methods, we welcome your input!
139 | 
140 | | Evaluation                | Original Repository                        |
141 | |---------------------------|--------------------------------------------|
142 | | H6 (Open LLM Leaderboard) | [EleutherAI](https://github.com/EleutherAI)/[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)|
143 | | MT-bench                  | [lm-sys](https://github.com/lm-sys)/[FastChat](https://github.com/lm-sys/FastChat)|
144 | | IFEval                    | [google-research](https://github.com/google-research/google-research/tree/master)/[instruction_following_eval](https://github.com/google-research/google-research/tree/master/instruction_following_eval)|
145 | | EQ-Bench                  | [EQ-bench](https://github.com/EQ-bench)/[EQ-Bench](https://github.com/EQ-bench/EQ-Bench)|
146 | 
147 | </br>
148 | 
149 | ## 🌌 Evalverse use-case
150 | > If you have any use-cases of your own, please feel free to let us know. </br>We would love to hear about them and possibly feature your case.
151 | 
152 | 
153 | *✨* [`Upstage`](https://www.upstage.ai/) is using Evalverse for evaluating [Solar](https://console.upstage.ai/services/solar?utm_source=upstage.ai&utm_medium=referral&utm_campaign=Main+hero+Solar+card&utm_term=Try+API+for+Free&utm_content=home). </br>
154 | *✨* [`Upstage`](https://www.upstage.ai/) is using Evalverse for evaluating models at [Open Ko-LLM Leaderboard](https://huggingface.co/spaces/upstage/open-ko-llm-leaderboard).
155 | 
156 | </br>
157 | 
158 | ## 🌌 Contributors
159 | <a href="https://github.com/UpstageAI/evalverse/graphs/contributors">
160 |   <img src="https://contrib.rocks/image?repo=UpstageAI/evalverse"/>
161 | </a>
162 | 
163 | 
164 | ## 🌌 Acknowledgements
165 | Evalverse is an open-source project orchestrated by the **Data-Centric LLM Team** at `Upstage`, designed as an ecosystem for LLM evaluation. Launched in April 2024, this initiative stands at the forefront of advancing evaluation handling in the realm of large language models (LLMs).
166 | 
167 | ## 🌌 License
168 | Evalverse is a completely free, open-source project licensed under the Apache License 2.0.
169 | 
170 | ## 🌌 Citation
171 | If you want to cite our 🌌 Evalverse project, feel free to use the following bibtex. You can check our paper via [link](https://arxiv.org/abs/2404.00943).
172 | 
173 | ```bibtex
174 | @misc{kim2024evalverse,
175 |       title={Evalverse: Unified and Accessible Library for Large Language Model Evaluation}, 
176 |       author={Jihoo Kim and Wonho Song and Dahyun Kim and Yunsu Kim and Yungi Kim and Chanjun Park},
177 |       year={2024},
178 |       eprint={2404.00943},
179 |       archivePrefix={arXiv},
180 |       primaryClass={cs.CL}
181 | }
182 | ```
183 | 


--------------------------------------------------------------------------------
/assets/Evalverse_Color.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UpstageAI/evalverse/06f85eefd4b82385c549ea3bc29ff3e177f3a84b/assets/Evalverse_Color.png


--------------------------------------------------------------------------------
/assets/Evalverse_White.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UpstageAI/evalverse/06f85eefd4b82385c549ea3bc29ff3e177f3a84b/assets/Evalverse_White.png


--------------------------------------------------------------------------------
/assets/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UpstageAI/evalverse/06f85eefd4b82385c549ea3bc29ff3e177f3a84b/assets/architecture.png


--------------------------------------------------------------------------------
/assets/intro-evalverse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UpstageAI/evalverse/06f85eefd4b82385c549ea3bc29ff3e177f3a84b/assets/intro-evalverse.png


--------------------------------------------------------------------------------
/assets/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UpstageAI/evalverse/06f85eefd4b82385c549ea3bc29ff3e177f3a84b/assets/overview.png


--------------------------------------------------------------------------------
/assets/sample_report.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UpstageAI/evalverse/06f85eefd4b82385c549ea3bc29ff3e177f3a84b/assets/sample_report.png


--------------------------------------------------------------------------------
/contribution/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # __Contribution Guidelines__
  2 | Welcome to _Evalverse_! We warmly welcome any kind of contribution 😊✨. </br>
  3 | This page provides an outline on how to contribute to _Evalverse_ and suggestions for nice conventions to follow. 
  4 | > __These are guidelines, NOT rules 💡__ <p>
  5 | This page is not the Constitution of the _Evalverse_. We are providing guidelines to help you make a useful and efficient contribution to _Evalverse_. While we think these guidelines are sensible and we appreciate when they are observed, following them isn't strictly required. We hope you won't be tired by these guidelines. Also, we'd love to hear your ideas on how to improve our guidelines! 
  6 | 
  7 | </br>
  8 | 
  9 | # Table of Contents
 10 | - [Questions or Feedback](#questions-or-feedback)
 11 | - [🤝 How to Contribute?](#how-to-contribute)
 12 | - [Commit Guidelines](#commit-guidelines)
 13 | - [Style Guides](#style-guides)
 14 | 
 15 | </br>
 16 | 
 17 | # Questions or Feedback
 18 | Join the conversation on our GitHub discussion board! It's the go-to spot for questions, chats, and a helping hand from the _Evalverse_ community. Drop by and say hello here: [link](https://github.com/UpstageAI/evalverse/discussions)
 19 | 
 20 | And if there's a shiny new feature you're dreaming of, don't be shy—head over to our [issue page](https://github.com/UpstageAI/evalverse/issues) to let us know! Your input could help shape the future. ✨
 21 | 
 22 | </br>
 23 | 
 24 | # How to Contribute?
 25 | - Improve the documentation: fix typos, enhance grammar or semantic structure, or add new examples.
 26 | - Submit issues related to bugs, new desired features, or enhancement of existing features.
 27 | - Fix a bug, implement a new feature, or improve an existing feature.
 28 | - Answer other users' questions or help them out.
 29 | 
 30 | 
 31 | ## __Report a Bug / Request New Feature / Suggest Enhancements__
 32 | Please open an issue whenever you find a bug or have an idea to enhance _Evalverse_. Maintainers will label it or leave a comment on it as soon as they check the issue. Issues labeled `Open for contribution` are open for anyone to pick up.
 33 | 
 34 | ## __Fix a Bug / Add New Feature / Improve Existing Feature__
 35 | If you have a particular roadmap, goals, or new feature, share it via an issue. If you have already fixed a bug or have a new feature that enhances _Evalverse_, you can jump to the fourth step, which is opening a pull request. Please note that when you open a pull request without opening an issue or getting the maintainers' check, it can be declined if it does not align with the philosophy of _Evalverse_.
 36 | 
 37 | ### __1️⃣ Check issues labeled as__ `Open for contribution`
 38 | You can find issues waiting for your contribution by filtering label with `Open for contribution`. This label does not stand alone. It is always with `Bug`, `Docs` or `Enhancement`. Issues with `Critical` or `ASAP` label are more urgent. 
 39 | 
 40 | 
 41 | ### __2️⃣ Leave a comment on the issue you want to contribute__
 42 | Once we review your comment, we'll entrust the issue to you by swapping out the `Open for contribution` label for a `WIP` (Work in Progress) label.
 43 | 
 44 | ### __3️⃣ Work on it__
 45 | Before diving into coding, do take a moment to familiarize yourself with our coding style by visiting this [style guides](#style-guides). And hey, if you hit a snag while tackling the issue, don't hesitate to drop a comment right there. Our community is a supportive bunch and will jump in to assist or brainstorm with you.
 46 | 
 47 | 1. Fork the repository of _Evalverse_.
 48 | 2. Clone your fork to your local disk.
 49 | 3. Create a new branch to hold your development changes. </br>
 50 | It's not required to adhere strictly to the branch naming example provided; consider it a mild suggestion.
 51 | ```bash
 52 | git checkout -b {prefix}/{issue-number}-{description}
 53 | ```
 54 | 4. Set up a development environment
 55 | 5. Develop the features in your branch
 56 | 
 57 | 
 58 | ### __4️⃣ Create a Pull Request__
 59 | Go ahead and visit your GitHub fork, then initiate a pull request — it's time to share your awesome work! Before you do, double-check that you've completed everything on the checklist we provided. Once you're all set, submit your contributions for the project maintainers to review.
 60 | 
 61 | Don't worry if the maintainers have some feedback or suggest changes—it's all part of the process and happens to even our most experienced contributors. Keep your updates flowing by working in your local branch and pushing any new changes to your fork. Your pull request will update automatically for everyone to see the progress.
 62 | 
 63 | </br>
 64 | 
 65 | # Commit Guidelines
 66 | ### Commit strategy
 67 | - Avoid mixing multiple, unrelated modifications in a single commit. Each commit should relate to a single issue.
 68 | - Each commit should encapsulate a complete, autonomous upgrade to the code.
 69 | 
 70 | ### Commit messages
 71 | Please make sure your commit messages follow `type`: `title (#<related issue number>)` format. <br/>
 72 | For example:
 73 | ```plain text
 74 | <TYPE>: Short summary with 72 characters or less (#<Issue number>)
 75 | 
 76 | If you have more detailed explanatory text, put it in the body.
 77 | But the body is optional.
 78 | ```
 79 | - Find adequate type in the below list:
 80 |     - `NEW`: introducing a new feature
 81 |     - `ENHANCE`: improve an existing code/feature.
 82 |     - `FIX`: fix a code bug
 83 |     - `DOCS`: write/update/add any kind of documents including docstring
 84 |     - `REFACTOR`: refactor existing code without any specific improvements
 85 |     - `STYLE`: changes that do not affect the meaning of the code (ex. white-space, line length)
 86 |     - `TEST`: add additional testing
 87 |     - `DEL`: remove code or files
 88 |     - `RELEASE`: release new version of evalverse
 89 |     - `OTHER`: anything not covered above (not recommended)
 90 | - Use the present tense ("Add feature" not "Added feature")
 91 | - Do not end the subject line with punctuation
 92 | 
 93 | </br>
 94 | 
 95 | # Style Guides
 96 | ### Pre-commit hook
 97 | We provide a pre-commit git hook for style check. You can find exact check list in this [file](https://github.com/UpstageAI/evalverse/blob/main/.pre-commit-config.yaml). <br/> Please run the code below before a commit is created:
 98 | ```bash
 99 | pre-commit run
100 | ```
101 | 
102 | 


--------------------------------------------------------------------------------
/evalverse/README.md:
--------------------------------------------------------------------------------
 1 | # Evalverse
 2 | > The Universe of Evaluation. All about the evaluation for LLMs.
 3 | 
 4 | 
 5 | ## 🌌 Submodule
 6 | > The Submodule serves as the evaluation engine that is responsible for the heavy lifting involved in evaluating LLMs. Publicly available LLM evaluation libraries can be integrated into Evalverse as submodules. This component makes Evalverse expandable, thereby ensuring that the library remains up-to-date. 
 7 | 
 8 | ## 🌌 Connector
 9 | > The Connector plays a role in linking the Submodules with the Evaluator. It contains evaluation scripts, along with the necessary arguments, from various external libraries.
10 | 
11 | ## 🌌 Evaluator
12 | > The Evaluator performs the requested evaluations on the Compute Cluster by utilizing the evaluation scripts from the Connector. The Evaluator can receive evaluation requests either from the Reporter, which facilitates a no-code evaluation approach, or directly from the end-user for code-based evaluation.
13 | 
14 | ## 🌌 Reporter
 15 | > The Reporter handles the evaluation and report requests sent by the users, allowing for a no-code approach to LLM evaluation. The Reporter sends the requested evaluation jobs to the Evaluator and fetches the evaluation results from the Database, which are sent to the user via an external communication platform such as Slack. Through this, users can receive a table and a figure that summarize the evaluation results.


--------------------------------------------------------------------------------
/evalverse/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib.metadata
2 | 
3 | from evalverse.evaluator import Evaluator
4 | from evalverse.reporter import Reporter
5 | 
6 | __version__ = importlib.metadata.version("evalverse")
7 | 
# ``__all__`` must list the public names as strings, not the objects themselves;
# with non-string entries ``from evalverse import *`` raises a TypeError.
__all__ = ["Evaluator", "Reporter"]
9 | 


--------------------------------------------------------------------------------
/evalverse/connector.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Copyright (c) 2024-present Upstage Co., Ltd.
  3 | Apache-2.0 license
  4 | """
  5 | import json
  6 | import os
  7 | 
  8 | from evalverse.utils import EVALVERSE_MODULE_PATH, print_command, print_txt_file
  9 | 
 10 | 
 11 | def lm_evaluation_harness(
 12 |     model_path="upstage/SOLAR-10.7B-Instruct-v1.0",
 13 |     tasks="arc_challenge",
 14 |     batch_size=16,
 15 |     use_vllm=False,
 16 |     gpu_memory_utilization=0.8,
 17 |     tensor_parallel_size=1,
 18 |     data_parallel_size=1,
 19 |     num_fewshot=0,
 20 |     use_fast_tokenizer=False,
 21 |     use_flash_attention_2=False,
 22 |     load_in_8bit=False,
 23 |     load_in_4bit=False,
 24 |     output_path="../results",
 25 | ):
 26 |     output_json_path = os.path.join(output_path, f"{tasks}_{num_fewshot}.json")
 27 | 
 28 |     if not os.path.exists(output_json_path):
 29 |         if use_vllm:
 30 |             tokenizer_mode = "auto" if use_fast_tokenizer else "slow"
 31 |             eval_cmd = f"""
 32 |             lm_eval --model vllm \
 33 |                 --model_args pretrained={model_path},trust_remote_code=True,tensor_parallel_size={tensor_parallel_size},dtype=float16,gpu_memory_utilization={gpu_memory_utilization},data_parallel_size={data_parallel_size},tokenizer_mode={tokenizer_mode} \
 34 |                 --tasks {tasks} \
 35 |                 --batch_size {batch_size} \
 36 |                 --num_fewshot {num_fewshot} \
 37 |                 --output_path {output_json_path} \
 38 |             """
 39 |         else:
 40 |             hf_cmd = "lm_eval --model hf"
 41 |             model_args = f"pretrained={model_path},trust_remote_code=True,dtype=float16,use_fast_tokenizer={use_fast_tokenizer},use_flash_attention_2={use_flash_attention_2}"
 42 | 
 43 |             if data_parallel_size > 1:
 44 |                 hf_cmd = "accelerate launch -m " + hf_cmd
 45 |             if tensor_parallel_size > 1:
 46 |                 model_args = model_args + ",parallelize=True"
 47 |             if load_in_8bit:
 48 |                 model_args = model_args + ",load_in_8bit=True"
 49 |             if load_in_4bit:
 50 |                 model_args = model_args + ",load_in_4bit=True"
 51 | 
 52 |             eval_cmd = f"""
 53 |             NCCL_P2P_DISABLE=1 {hf_cmd} \
 54 |                 --model_args {model_args}  \
 55 |                 --tasks {tasks} \
 56 |                 --batch_size {batch_size} \
 57 |                 --num_fewshot {num_fewshot} \
 58 |                 --output_path {output_json_path} \
 59 |             """
 60 |         print_command(eval_cmd)
 61 |         os.system(eval_cmd)
 62 | 
 63 |     else:
 64 |         print(f"The result already exists: {os.path.abspath(output_json_path)}")
 65 | 
 66 | 
 67 | def fastchat_llm_judge(
 68 |     model_path="upstage/SOLAR-10.7B-Instruct-v1.0",
 69 |     model_id="SOLAR-10.7B-Instruct-v1.0",
 70 |     mt_bench_name="mt_bench",
 71 |     baselines=None,
 72 |     judge_model="gpt-4",
 73 |     num_gpus_per_model=1,
 74 |     num_gpus_total=1,
 75 |     parallel_api=1,
 76 |     output_path="../results",
 77 | ):
 78 |     scores_file = os.path.join(output_path, model_id, "mt_bench", "scores.txt")
 79 | 
 80 |     if not os.path.exists(scores_file):
 81 |         if baselines:
 82 |             model_list = " ".join([model_id] + baselines.split(","))
 83 |         else:
 84 |             model_list = model_id
 85 | 
 86 |         eval_code_path = os.path.join(
 87 |             EVALVERSE_MODULE_PATH, "submodules/FastChat/fastchat/llm_judge"
 88 |         )
 89 |         answer_path = os.path.join(output_path, model_id, "mt_bench", "model_answer")
 90 |         answer_file = os.path.join(answer_path, f"{model_id}.jsonl")
 91 |         judgement_path = os.path.join(output_path, model_id, "mt_bench", "model_judgment")
 92 |         judgement_file = os.path.join(judgement_path, "gpt-4_single.jsonl")
 93 | 
 94 |         gen_answer_cmd = f"python3 gen_model_answer.py --model-path {model_path} --model-id {model_id} --bench-name {mt_bench_name} --answer-file {answer_file} --num-gpus-per-model {num_gpus_per_model} --num-gpus-total {num_gpus_total}"
 95 |         gen_judgment_cmd = f"echo -e '\n' | python3 gen_judgment.py --model-list {model_list} --bench-name {mt_bench_name} --model-answer-dir {answer_path} --model-judgement-dir {judgement_path} --judge-model {judge_model} --parallel {parallel_api}"
 96 |         save_result_cmd = f"python3 show_result.py --model-list {model_list} --bench-name {mt_bench_name} --judge-model {judge_model} --input-file {judgement_file} > {os.path.join(output_path, model_id, 'mt_bench', 'scores.txt')}"
 97 | 
 98 |         eval_cmd = f"cd {eval_code_path}"
 99 |         if not os.path.exists(answer_file):
100 |             eval_cmd += f" && {gen_answer_cmd}"
101 |         if not os.path.exists(judgement_file):
102 |             eval_cmd += f" && {gen_judgment_cmd}"
103 |         eval_cmd += f" && {save_result_cmd}"
104 |         print_command(eval_cmd)
105 |         os.system(eval_cmd)
106 |     else:
107 |         print(f"The result already exists: {os.path.abspath(scores_file)}")
108 |     # print results
109 |     print_txt_file(scores_file)
110 | 
111 | 
def instruction_following_eval(
    model_path="upstage/SOLAR-10.7B-Instruct-v1.0",
    model_name="SOLAR-10.7B-Instruct-v1.0",
    gpu_per_inst_eval=1,
    devices="0",
    output_path="../results",
):
    """Run ``inst_eval.py`` from the IFEval submodule and print the scores file.

    The scores file is ``<output_path>/<model_name>/ifeval/scores.txt``. When it
    already exists the evaluation command is skipped. In both cases the file is
    printed afterwards via ``print_txt_file``.

    Args:
        model_path: Passed to ``--model``.
        model_name: Passed to ``--model_name``; also names the result directory.
        gpu_per_inst_eval: Passed to ``--gpu_per_inst_eval``.
        devices: Passed to ``--devices``.
        output_path: Passed to ``--output_path``; root directory for results.

    Returns:
        None. The exit status of the shell command is not inspected.
    """
    ifeval_scores = os.path.join(output_path, model_name, "ifeval", "scores.txt")

    if os.path.exists(ifeval_scores):
        print(f"The result already exists: {os.path.abspath(ifeval_scores)}")
    else:
        submodule_dir = os.path.join(EVALVERSE_MODULE_PATH, "submodules/IFEval")

        shell_cmd = f"""
        cd {submodule_dir} && python3 inst_eval.py \
            --model {model_path} \
            --model_name {model_name} \
            --gpu_per_inst_eval {gpu_per_inst_eval} \
            --output_path {output_path} \
            --devices {devices}
        """
        print_command(shell_cmd)
        os.system(shell_cmd)

    # Show the scores whether they were just produced or already cached.
    print_txt_file(ifeval_scores)
138 | 
139 | 
def eq_bench(
    model_name="SOLAR-10.7B-Instruct-v1.0",
    prompt_type="ChatML",
    model_path="upstage/SOLAR-10.7B-Instruct-v1.0",
    lora_path=None,
    quantization=None,
    n_iterations=1,
    devices="0",
    use_fast_tokenizer=False,
    gpu_per_proc=1,
    use_flash_attention_2=True,
    torch_dtype="b16",
    output_path="../results",
):
    """Run ``eq-bench.py`` from the EQBench submodule and print the full-scale result.

    The result file is ``<output_path>/<model_name>/eq_bench/raw_results.json``.
    When it already exists the evaluation command is skipped. In both cases the
    file is then loaded and the ``benchmark_results_fullscale`` entry of
    iteration ``"1"`` of its first top-level key is printed as indented JSON.

    Args:
        model_name: Model name used for saving results.
        prompt_type: Chat template.
        model_path: Model path.
        lora_path: LoRA adapter path; ``--lora_path`` is only added when not None.
        quantization: One of None, "8bit", "4bit"; interpolated as-is.
        n_iterations: Number of iterations to repeat the inference.
        devices: CUDA devices; used for ``CUDA_VISIBLE_DEVICES`` and ``--devices``.
        use_fast_tokenizer: Add the ``--use_fast_tokenizer`` flag.
        gpu_per_proc: GPUs per process; only 1 is accepted when a run is needed.
        use_flash_attention_2: Add the ``--use_flash_attention_2`` flag.
        torch_dtype: One of b16, f16, f32.
        output_path: Root directory for results.

    Raises:
        AssertionError: If a run is required and ``gpu_per_proc`` is not 1.
    """
    raw_result_path = os.path.join(output_path, model_name, "eq_bench", "raw_results.json")

    if os.path.exists(raw_result_path):
        print(f"The result already exists: {os.path.abspath(raw_result_path)}")
    else:
        assert gpu_per_proc == 1, "Currently only supports 1 gpu per process"

        submodule_dir = os.path.join(EVALVERSE_MODULE_PATH, "submodules/EQBench")
        run_cmd = f"""
        CUDA_VISIBLE_DEVICES={devices} python3 eq-bench.py --model_name {model_name} --prompt_type {prompt_type} \
            --model_path {model_path} --quantization {quantization} --n_iterations {n_iterations} \
            --gpu_per_proc {gpu_per_proc} --torch_dtype {torch_dtype} --output_path {output_path} \
            --devices {devices}"""

        # Optional switches are appended in a fixed order.
        optional_flags = []
        if use_fast_tokenizer:
            optional_flags.append(" --use_fast_tokenizer")
        if use_flash_attention_2:
            optional_flags.append(" --use_flash_attention_2")
        if lora_path is not None:
            optional_flags.append(f" --lora_path {lora_path}")
        run_cmd += "".join(optional_flags)

        shell_cmd = f"""
        cd {submodule_dir} && {run_cmd}
        """
        print_command(shell_cmd)
        os.system(shell_cmd)

    # Print the results, whether freshly produced or cached.
    with open(raw_result_path, "r") as fp:
        raw = json.load(fp)
    first_run_key = list(raw.keys())[0]
    fullscale = raw[first_run_key]["iterations"]["1"]["benchmark_results_fullscale"]
    print(json.dumps(fullscale, indent=4))
183 | 


--------------------------------------------------------------------------------
/evalverse/evaluator.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Copyright (c) 2024-present Upstage Co., Ltd.
  3 | Apache-2.0 license
  4 | """
  5 | import logging
  6 | import os
  7 | import time
  8 | from argparse import ArgumentParser
  9 | from pathlib import Path
 10 | from typing import Union, Optional
 11 | 
 12 | from evalverse.connector import (
 13 |     eq_bench,
 14 |     fastchat_llm_judge,
 15 |     instruction_following_eval,
 16 |     lm_evaluation_harness,
 17 | )
 18 | from evalverse.reporter import AVAILABLE_BENCHMARKS
 19 | from evalverse.utils import (
 20 |     EVALVERSE_LOG_FORMAT,
 21 |     EVALVERSE_OUTPUT_PATH,
 22 |     get_h6_en_scores,
 23 |     get_logger,
 24 | )
 25 | 
 26 | logging.basicConfig(format=EVALVERSE_LOG_FORMAT, datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)
 27 | 
 28 | 
 29 | class Evaluator:
 30 |     def __init__(self, mode="lib", log_path=None):
 31 |         self.mode = mode  # lib or cli
 32 |         self.logger = get_logger(log_path)
 33 | 
 34 |     def get_args(self):
 35 |         parser = ArgumentParser()
 36 | 
 37 |         # Common Args
 38 |         parser.add_argument("--ckpt_path", type=str, default="upstage/SOLAR-10.7B-Instruct-v1.0")
 39 |         parser.add_argument("--output_path", type=str, default=EVALVERSE_OUTPUT_PATH)
 40 |         parser.add_argument("--model_name", type=str, help="using in save_path")
 41 |         parser.add_argument("--use_fast_tokenizer", action="store_true", default=False)
 42 |         parser.add_argument("--devices", type=str, default="0", help="The size of data parallel.")
 43 |         parser.add_argument("--use_flash_attention_2", action="store_true", default=False)
 44 | 
 45 |         # lm-evaluation-harness
 46 |         parser.add_argument("--h6_en", action="store_true", default=False)
 47 |         parser.add_argument("--batch_size", type=int, default=16)
 48 |         parser.add_argument("--use_vllm", action="store_true", default=False)
 49 |         parser.add_argument("--gpu_memory_utilization", type=float, default=0.8)
 50 |         parser.add_argument(
 51 |             "--model_parallel", type=int, default=1, help="The size of model parallel"
 52 |         )
 53 |         parser.add_argument(
 54 |             "--data_parallel", type=int, default=1, help="The size of data parallel"
 55 |         )
 56 |         parser.add_argument("--load_in_8bit", action="store_true", default=False)
 57 |         parser.add_argument("--load_in_4bit", action="store_true", default=False)
 58 | 
 59 |         # FastChat
 60 |         parser.add_argument("--mt_bench", action="store_true", default=False)
 61 |         parser.add_argument("--baselines", type=str, default=None)
 62 |         parser.add_argument("--judge_model", type=str, default="gpt-4")
 63 |         parser.add_argument(
 64 |             "--num_gpus_total", type=int, default=1, help="The total number of GPUs."
 65 |         )
 66 |         parser.add_argument(
 67 |             "--num_gpus_per_model", type=int, default=1, help="The number of GPUs per model."
 68 |         )
 69 |         parser.add_argument(
 70 |             "--parallel_api", type=int, default=1, help="The number of concurrent API calls."
 71 |         )
 72 | 
 73 |         # Instruction Following Eval
 74 |         parser.add_argument("--ifeval", action="store_true", default=False)
 75 |         parser.add_argument(
 76 |             "--gpu_per_inst_eval", type=int, default=1, help="The number of GPUs per model."
 77 |         )
 78 | 
 79 |         # EQ-Bench
 80 |         parser.add_argument("--eq_bench", action="store_true", default=False)
 81 |         parser.add_argument("--eq_bench_prompt_type", type=str, default="ChatML")
 82 |         parser.add_argument("--eq_bench_lora_path", type=str, default=None)
 83 |         parser.add_argument(
 84 |             "--eq_bench_quantization", type=str, default=None, choices=["8bit", "4bit", None]
 85 |         )
 86 | 
 87 |         if self.mode == "lib":
 88 |             args = parser.parse_args(args=[])
 89 |         elif self.mode == "cli":
 90 |             args = parser.parse_args()
 91 | 
 92 |         # update path to work regardless of /
 93 |         args.ckpt_path = str(Path(args.ckpt_path))
 94 |         args.output_path = str(Path(args.output_path))
 95 | 
 96 |         # handle model name
 97 |         if args.model_name is None:
 98 |             args.model_name = args.ckpt_path.split("/")[-1]
 99 | 
100 |         # change relative path to absolute path
101 |         if not os.path.isabs(args.output_path):
102 |             args.output_path = os.path.abspath(args.output_path)
103 | 
104 |         return args
105 | 
106 |     def update_args(self, args, model, benchmark, kwargs):
107 |         for k, v in kwargs.items():
108 |             if k in args:
109 |                 setattr(args, k, v)
110 |                 self.logger.info(f'The value of argument "{k}" has been changed to "{v}".')
111 |             else:
112 |                 self.logger.warning(f'The argument "{k}" does not exist.')
113 |         if model:
114 |             args.ckpt_path = model
115 |         if benchmark:
116 |             if benchmark == "all":
117 |                 benchmark = AVAILABLE_BENCHMARKS
118 |                 self.logger.info(f"All available benchmarks are selected: {AVAILABLE_BENCHMARKS}")
119 |             if benchmark in AVAILABLE_BENCHMARKS:
120 |                 setattr(args, benchmark, True)
121 |                 self.logger.info(f'The value of argument "{benchmark}" has been changed to "True".')
122 |             elif type(benchmark) == list:
123 |                 for b in benchmark:
124 |                     if b in AVAILABLE_BENCHMARKS:
125 |                         setattr(args, b, True)
126 |                         self.logger.info(f'The value of argument "{b}" has been changed to "True".')
127 |                     else:
128 |                         raise ValueError(
129 |                             f'"{b}" is not in Available_Benchmarks: {AVAILABLE_BENCHMARKS}'
130 |                         )
131 |             else:
132 |                 raise ValueError(
133 |                     f'"{benchmark}" is not in Available_Benchmarks: {AVAILABLE_BENCHMARKS}'
134 |                 )
135 |         else:
136 |             self.logger.info(
137 |                 f"No selected benchmarks. Available_Benchmarks: {AVAILABLE_BENCHMARKS}"
138 |             )
139 |         self.logger.info(f"Args {vars(args)}")
140 | 
141 |         return args
142 | 
143 |     def run(self, model: Optional[str] = None, benchmark: Optional[Union[str, list]] = None, **kwargs):
144 | 
145 |         # update args
146 |         args = self.get_args()
147 |         args = self.update_args(args, model, benchmark, kwargs)
148 | 
149 |         # h6_en (with lm-evaluation-harness)
150 |         if args.h6_en:
151 |             task_and_shot = [
152 |                 ("arc_challenge", 25),
153 |                 ("hellaswag", 10),
154 |                 ("mmlu", 5),
155 |                 ("truthfulqa_mc2", 0),
156 |                 ("winogrande", 5),
157 |                 ("gsm8k", 5),
158 |             ]
159 |             model_name = args.ckpt_path.split("/")[-1]
160 |             h6_en_output_path = os.path.join(args.output_path, model_name, "h6_en")
161 |             for _task_name, _num_fewshot in task_and_shot:
162 |                 start_time = time.time()
163 |                 #############################################
164 |                 lm_evaluation_harness(
165 |                     model_path=args.ckpt_path,
166 |                     tasks=_task_name,
167 |                     batch_size=args.batch_size,
168 |                     use_vllm=args.use_vllm,
169 |                     gpu_memory_utilization=args.gpu_memory_utilization,
170 |                     tensor_parallel_size=args.model_parallel,
171 |                     data_parallel_size=args.data_parallel,
172 |                     num_fewshot=_num_fewshot,
173 |                     use_fast_tokenizer=args.use_fast_tokenizer,
174 |                     use_flash_attention_2=args.use_flash_attention_2,
175 |                     load_in_8bit=args.load_in_8bit,
176 |                     load_in_4bit=args.load_in_4bit,
177 |                     output_path=h6_en_output_path,
178 |                 )
179 |                 #############################################
180 |                 end_time = time.time()
181 |                 total_min = round((end_time - start_time) / 60)
182 |                 bench_name = _task_name + "_" + str(_num_fewshot) + "shot"
183 |                 self.logger.info(
184 |                     f"{bench_name} done! exec_time: {total_min} min for {args.ckpt_path}"
185 |                 )
186 |             get_h6_en_scores(h6_en_output_path, print_results=True)
187 |         # mt_bench (with evalverse-FastChat)
188 |         if args.mt_bench:
189 |             if "OPENAI_API_KEY" not in os.environ:
190 |                 self.logger.warning("No OPENAI_API_KEY provided. Please add it.")
191 |             start_time = time.time()
192 |             #############################################
193 |             fastchat_llm_judge(
194 |                 model_path=args.ckpt_path,
195 |                 model_id=args.model_name,
196 |                 mt_bench_name="mt_bench",
197 |                 baselines=args.baselines,
198 |                 judge_model=args.judge_model,
199 |                 num_gpus_per_model=args.num_gpus_per_model,
200 |                 num_gpus_total=args.num_gpus_total,
201 |                 parallel_api=args.parallel_api,
202 |                 output_path=args.output_path,
203 |             )
204 |             #############################################
205 |             end_time = time.time()
206 |             total_min = round((end_time - start_time) / 60)
207 |             bench_name = "mt_bench"
208 |             self.logger.info(f"{bench_name} done! exec_time: {total_min} min for {args.ckpt_path}")
209 | 
210 |         # ifeval (with evalverse-IFEval)
211 |         if args.ifeval:
212 |             start_time = time.time()
213 |             #############################################
214 |             instruction_following_eval(
215 |                 model_path=args.ckpt_path,
216 |                 model_name=args.model_name,
217 |                 gpu_per_inst_eval=args.gpu_per_inst_eval,
218 |                 devices=args.devices,
219 |                 output_path=args.output_path,
220 |             )
221 |             #############################################
222 |             end_time = time.time()
223 |             total_min = round((end_time - start_time) / 60)
224 |             bench_name = "ifeval"
225 |             self.logger.info(f"{bench_name} done! exec_time: {total_min} min for {args.ckpt_path}")
226 | 
227 |         # eq_bench (with evalverse-EQBench)
228 |         if args.eq_bench:
229 |             start_time = time.time()
230 |             #############################################
231 |             eq_bench(
232 |                 model_name=args.model_name,
233 |                 prompt_type=args.eq_bench_prompt_type,
234 |                 model_path=args.ckpt_path,
235 |                 lora_path=args.eq_bench_lora_path,
236 |                 quantization=args.eq_bench_quantization,
237 |                 devices=args.devices,
238 |                 use_fast_tokenizer=args.use_fast_tokenizer,
239 |                 use_flash_attention_2=args.use_flash_attention_2,
240 |                 output_path=args.output_path,
241 |             )
242 |             #############################################
243 |             end_time = time.time()
244 |             total_min = round((end_time - start_time) / 60)
245 |             bench_name = "eq_bench"
246 |             self.logger.info(f"{bench_name} done! exec_time: {total_min} min for {args.ckpt_path}")
247 | 
248 | 
if __name__ == "__main__":
    from dotenv import load_dotenv

    # Read the .env file before arguments are parsed; override=True is passed so its
    # values replace variables that are already set in the environment.
    load_dotenv(override=True)

    # CLI entry point: arguments come from the command line.
    Evaluator(mode="cli").run()
256 | 


--------------------------------------------------------------------------------
/evalverse/reporter.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Copyright (c) 2024-present Upstage Co., Ltd.
  3 | Apache-2.0 license
  4 | """
  5 | import os
  6 | from datetime import datetime, timedelta, timezone
  7 | 
  8 | import pandas as pd
  9 | from typing import Union, List
 10 | 
 11 | from evalverse.utils import (
 12 |     EVALVERSE_DB_PATH,
 13 |     EVALVERSE_MODULE_PATH,
 14 |     EVALVERSE_OUTPUT_PATH,
 15 |     get_eqbench_score,
 16 |     get_figure,
 17 |     get_h6_en_scores,
 18 |     get_ifeval_scores,
 19 |     get_logger,
 20 |     get_mt_bench_scores,
 21 | )
 22 | 
# Fixed UTC+9 offset; used to timestamp saved report files.
KST = timezone(timedelta(hours=9))
# Benchmark identifiers accepted by Reporter.run; they are also the per-model
# result sub-directory names that Reporter.update_db looks for.
AVAILABLE_BENCHMARKS = ["h6_en", "mt_bench", "ifeval", "eq_bench"]

# Score-table column names contributed by each benchmark. The number of names in a
# list is also the number of zero placeholders written when a model lacks that benchmark.
H6EN_NAMES = ["H6-ARC", "H6-Hellaswag", "H6-MMLU", "H6-TruthfulQA", "H6-Winogrande", "H6-GSM8k"]
MTBENCH_NAMES = [
    "MT-Bench-Coding",
    "MT-Bench-Extraction",
    "MT-Bench-Humanities",
    "MT-Bench-Math",
    "MT-Bench-Reasoning",
    "MT-Bench-Roleplay",
    "MT-Bench-Stem",
    "MT-Bench-Writing",
]
IFEVAL_NAMES = [
    "IFEval-strict-prompt",
    "IFEval-strict-instruction",
    "IFEval-loose-prompt",
    "IFEval-loose-instruction",
]
EQBENCH_NAME = ["EQ-Bench"]
 44 | 
 45 | 
 46 | class Reporter:
 47 |     def __init__(self, db_path=EVALVERSE_DB_PATH, output_path=EVALVERSE_OUTPUT_PATH, log_path=None):
 48 |         self.db_path = db_path
 49 |         self.output_path = output_path
 50 |         self.logger = get_logger(log_path)
 51 | 
 52 |         self.score_path = os.path.join(self.db_path, "score_df.csv")
 53 |         self.table_dir = os.path.join(self.db_path, "scores")
 54 |         self.figure_dir = os.path.join(self.db_path, "figures")
 55 | 
 56 |         self.model_list = self._get_dirname_list(self.output_path)
 57 | 
 58 |         for path in [self.db_path, self.table_dir, self.figure_dir]:
 59 |             if not os.path.exists(path):
 60 |                 os.makedirs(path)
 61 | 
 62 |         if os.path.exists(self.score_path):
 63 |             self.score_df = pd.read_csv(self.score_path)
 64 |         else:
 65 |             self.update_db(git_fetch=False)
 66 | 
 67 |     def _get_dirname_list(self, path):
 68 |         return sorted(os.listdir(path), key=str.lower)
 69 | 
 70 |     def update_db(self, save=False, git_fetch=False):
 71 |         if git_fetch:
 72 |             import git
 73 | 
 74 |             repo = git.Repo("../")
 75 |             repo.remotes.origin.fetch()
 76 | 
 77 |         self.model_list = self._get_dirname_list(self.output_path)
 78 |         if len(self.model_list) > 0:
 79 |             values_list = []
 80 |             for model_name in self.model_list:
 81 |                 bench_list = self._get_dirname_list(os.path.join(self.output_path, model_name))
 82 |                 if len(bench_list) > 0:
 83 |                     values = [model_name]
 84 |                     if "h6_en" in bench_list:
 85 |                         h6_en_path = os.path.join(self.output_path, model_name, "h6_en")
 86 |                         h6_en_scores = get_h6_en_scores(h6_en_path)
 87 |                         values += h6_en_scores
 88 |                         self.logger.info(f"DB updated: h6_en for {model_name}")
 89 |                     else:
 90 |                         values += [0] * len(H6EN_NAMES)
 91 |                     if "mt_bench" in bench_list:
 92 |                         mtbench_path = os.path.join(self.output_path, model_name, "mt_bench")
 93 |                         question_file = os.path.join(
 94 |                             EVALVERSE_MODULE_PATH,
 95 |                             "submodules/FastChat/fastchat/llm_judge/data/mt_bench/question.jsonl",
 96 |                         )
 97 |                         judgement_file = os.path.join(
 98 |                             mtbench_path, "model_judgment", "gpt-4_single.jsonl"
 99 |                         )
100 |                         mt_scores = get_mt_bench_scores(model_name, question_file, judgement_file)
101 |                         values += mt_scores
102 |                         self.logger.info(f"DB updated: mt_bench for {model_name}")
103 |                     else:
104 |                         values += [0] * len(MTBENCH_NAMES)
105 |                     if "ifeval" in bench_list:
106 |                         score_file = os.path.join(
107 |                             self.output_path, model_name, "ifeval", "scores.txt"
108 |                         )
109 |                         ifeval_scores = get_ifeval_scores(score_file)
110 |                         values += ifeval_scores
111 |                         self.logger.info(f"DB updated: ifeval for {model_name}")
112 |                     else:
113 |                         values += [0] * len(IFEVAL_NAMES)
114 |                     if "eq_bench" in bench_list:
115 |                         eqbench_result_file = os.path.join(
116 |                             self.output_path, model_name, "eq_bench", "raw_results.json"
117 |                         )
118 |                         eqbench_score = get_eqbench_score(eqbench_result_file)
119 |                         values += eqbench_score
120 |                         self.logger.info(f"DB updated: eq_bench for {model_name}")
121 |                     else:
122 |                         values += [0] * len(EQBENCH_NAME)
123 |                     values_list.append(values)
124 |                 else:
125 |                     pass
126 |             column_list = ["Model"] + H6EN_NAMES + MTBENCH_NAMES + IFEVAL_NAMES + EQBENCH_NAME
127 |             self.score_df = pd.DataFrame(data=values_list, columns=column_list)
128 |             if save:
129 |                 self.score_df.to_csv(self.score_path, index=False)
130 |                 self.logger.info(f"DB saved to {self.score_path}")
131 |         else:
132 |             pass
133 | 
134 |     def run(self, model_list: Union[List, str] = "all", benchmark_list: Union[List, str] = "all", save: bool = False):
135 | 
136 |         if type(model_list) == list:
137 |             for m in model_list:
138 |                 if m in self.model_list:
139 |                     pass
140 |                 else:
141 |                     raise ValueError(f'"{m}" is not in Available_Models: {self.model_list}')
142 |         elif type(model_list) == str:
143 |             if model_list in self.model_list:
144 |                 model_list = [model_list]
145 |             elif model_list == "all":
146 |                 model_list = self.model_list
147 |             else:
148 |                 raise ValueError(f'"{model_list}" is not in Available_Models: {self.model_list}')
149 |         else:
150 |             raise TypeError
151 | 
152 |         if type(benchmark_list) == list:
153 |             for b in benchmark_list:
154 |                 if b in AVAILABLE_BENCHMARKS:
155 |                     pass
156 |                 else:
157 |                     raise ValueError(
158 |                         f'"{b}" is not in Available_Benchmarks: {AVAILABLE_BENCHMARKS}'
159 |                     )
160 |         elif type(benchmark_list) == str:
161 |             if benchmark_list in AVAILABLE_BENCHMARKS:
162 |                 benchmark_list = [benchmark_list]
163 |             elif benchmark_list == "all":
164 |                 benchmark_list = AVAILABLE_BENCHMARKS
165 |             else:
166 |                 raise ValueError(
167 |                     f'"{benchmark_list}" is not in Available_Benchmarks: {AVAILABLE_BENCHMARKS}'
168 |                 )
169 |         selected_benchmarks = []
170 |         for b in benchmark_list:
171 |             if b == "h6_en":
172 |                 selected_benchmarks += H6EN_NAMES
173 |             if b == "mt_bench":
174 |                 selected_benchmarks += MTBENCH_NAMES
175 |             if b == "ifeval":
176 |                 selected_benchmarks += IFEVAL_NAMES
177 |             if b == "eq_bench":
178 |                 selected_benchmarks += EQBENCH_NAME
179 | 
180 |         score_df = self.score_df.copy()
181 |         score_df = score_df[(score_df["Model"].isin(model_list))]
182 |         score_df["total_avg"] = score_df[selected_benchmarks].mean(axis=1).round(2)
183 |         score_df = score_df.sort_values("total_avg", ascending=False).reset_index(drop=True)
184 |         score_df["Ranking"] = score_df["total_avg"].rank(ascending=False).astype(int)
185 |         score_df = score_df[["Model", "Ranking", "total_avg"] + selected_benchmarks]
186 | 
187 |         if save:
188 |             request_time = datetime.now(KST).strftime("%Y%m%d_%H%M%S")
189 |             table_name = f"table_{request_time}.csv"
190 |             figure_name = f"figure_{request_time}.jpeg"
191 |             table_path = os.path.join(self.table_dir, table_name)
192 |             figure_path = os.path.join(self.figure_dir, figure_name)
193 | 
194 |             score_df.to_csv(table_path, index=False)
195 |             get_figure(score_df, selected_benchmarks, figure_path, save=True)
196 |             self.logger.info(f"Table saved to {table_path}")
197 |             self.logger.info(f"Figure saved to {figure_path}")
198 |             return table_path, figure_path
199 |         else:
200 |             get_figure(score_df, selected_benchmarks, save=False)
201 |             return score_df
202 | 


--------------------------------------------------------------------------------
/evalverse/slack_bot.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Copyright (c) 2024-present Upstage Co., Ltd.
  3 | Apache-2.0 license
  4 | """
  5 | import os
  6 | 
  7 | from dotenv import load_dotenv
  8 | from slack_bolt import App
  9 | from slack_bolt.adapter.socket_mode import SocketModeHandler
 10 | from slack_sdk import WebClient
 11 | from slack_sdk.errors import SlackApiError
 12 | 
 13 | from evalverse.reporter import AVAILABLE_BENCHMARKS, Reporter
 14 | from evalverse.utils import EVALVERSE_DB_PATH, EVALVERSE_OUTPUT_PATH, get_logger
 15 | 
# Slack
# Tokens are read from the environment after loading .env; both the Web API client
# and the Bolt app are created with the bot token.
load_dotenv(override=True)
bot_token = os.getenv("SLACK_BOT_TOKEN")
app_token = os.getenv("SLACK_APP_TOKEN")
client = WebClient(token=bot_token)
app = App(token=bot_token)

# Reporter
# Shared by the report handlers below; created once at import time.
reporter = Reporter(db_path=EVALVERSE_DB_PATH, output_path=EVALVERSE_OUTPUT_PATH)

# Logger
# Module-level logger writing to slack_bot.log inside the DB directory.
logger = get_logger(os.path.join(EVALVERSE_DB_PATH, "slack_bot.log"))
 28 | 
 29 | 
 30 | def send_msg(msg, channel_id):
 31 |     try:
 32 |         result = client.chat_postMessage(channel=channel_id, text=msg)
 33 |         logger.info(result)
 34 | 
 35 |     except SlackApiError as e:
 36 |         logger.error(f"Error posting message: {e}")
 37 | 
 38 | 
 39 | def upload_file(file_name, channel_id):
 40 |     try:
 41 |         result = client.files_upload_v2(
 42 |             channels=channel_id,
 43 |             file=file_name,
 44 |         )
 45 |         logger.info(result)
 46 | 
 47 |     except SlackApiError as e:
 48 |         logger.error("Error uploading file: {}".format(e))
 49 | 
 50 | 
 51 | @app.message(r"Request!|request!|!Request|!request")
 52 | def request_eval(ack, body, say, logger):
 53 |     ack()
 54 |     logger.info(body)
 55 |     say(
 56 |         text="",
 57 |         blocks=[
 58 |             {
 59 |                 "dispatch_action": True,
 60 |                 "type": "input",
 61 |                 "element": {
 62 |                     "type": "plain_text_input",
 63 |                     "action_id": "model_request_en",
 64 |                     "placeholder": {
 65 |                         "type": "plain_text",
 66 |                         "text": "ex) upstage/SOLAR... or /my_local/checkpoints/SOLAR...",
 67 |                     },
 68 |                 },
 69 |                 "label": {
 70 |                     "type": "plain_text",
 71 |                     "text": "Model name in HugginFace hub or checkpoint path in local",
 72 |                 },
 73 |             }
 74 |         ],
 75 |     )
 76 | 
 77 | 
 78 | @app.action("model_request_en")
 79 | def confirm_eval(ack, body, say, logger):
 80 |     ack()
 81 |     logger.info(body)
 82 | 
 83 |     global user_input
 84 |     user_input = body["actions"][0]["value"]
 85 |     say(
 86 |         text="",
 87 |         blocks=[
 88 |             {
 89 |                 "type": "section",
 90 |                 "text": {
 91 |                     "type": "mrkdwn",
 92 |                     "text": f'❗ Please double-check the model you requested evaluation for.\nIf the name or path of the model is [{user_input}], please press "Confirm" 👉',
 93 |                 },
 94 |                 "accessory": {
 95 |                     "type": "button",
 96 |                     "text": {
 97 |                         "type": "plain_text",
 98 |                         "text": "Confirm",
 99 |                     },
100 |                     "value": "click_me_123",
101 |                     "action_id": "model_confirm_en",
102 |                 },
103 |             }
104 |         ],
105 |     )
106 | 
107 | 
108 | @app.action("model_confirm_en")
109 | def run_eval(ack, body, say, logger):
110 |     ack()
111 |     logger.info(body)
112 | 
113 |     # Start
114 |     start_msg = (
115 |         f"⏳ Evaluation in progress for the model <@{body['user']['id']}> requested.. [{user_input}]"
116 |     )
117 |     say(start_msg)
118 | 
119 |     # Run an evaluation
120 |     from evalverse import Evaluator
121 | 
122 |     evaluator = Evaluator()
123 |     evaluator.run(model=user_input, benchmark="all")
124 | 
125 |     # End
126 |     req_channel_id = body["channel"]["id"]
127 |     complete_msg = f"Done! <@{body['user']['id']}>\n[{user_input}] is added."
128 |     send_msg(complete_msg, req_channel_id)
129 | 
130 |     logger.info(f"@{body['user']['id']}::{user_input}")
131 | 
132 | 
@app.message(r"Report!|report!|!Report|!report")
def report_model_selection(ack, body, say, logger):
    """Refresh the score DB and reply to a "Report!" message with a model multi-select menu."""
    ack()
    logger.info(body)

    reporter.update_db(save=True, git_fetch=False)

    # Offer exactly the models the reporter will accept; update_db() has just refreshed
    # this list from the results directory.
    model_options = reporter.model_list
    if not model_options:
        # A select menu without options would be rejected, so tell the user instead.
        say("There are no evaluated models to report yet.")
        return

    say(
        text="",
        blocks=[
            {
                "type": "section",
                "block_id": "section_1",
                "text": {"type": "mrkdwn", "text": "Please select the model to evaluate."},
                "accessory": {
                    "action_id": "model_select_en",
                    "type": "multi_static_select",
                    "placeholder": {"type": "plain_text", "text": "Model selection"},
                    "options": [
                        # Option labels are cut to 75 characters.
                        {"text": {"type": "plain_text", "text": m[:75]}, "value": f"value-{i}"}
                        for i, m in enumerate(model_options)
                    ],
                },
            }
        ],
    )
160 | 
161 | 
# Models chosen in the first report step, keyed by Slack user id, so that reports
# requested by different users at the same time do not overwrite each other.
_models_by_user = {}


@app.action("model_select_en")
def report_bench_selection(ack, body, say, logger):
    """Remember the requesting user's model selection and ask which benchmarks to report."""
    ack()
    logger.info(body)

    labels = [
        option["text"]["text"]
        for action in body["actions"]
        for option in action["selected_options"]
    ]
    # Menu labels are cut to 75 characters, so map each label back to the full model
    # name known to the reporter (falling back to the label itself).
    _models_by_user[body["user"]["id"]] = [
        next((m for m in reporter.model_list if m[:75] == label), label) for label in labels
    ]

    say(
        text="",
        blocks=[
            {
                "type": "section",
                "block_id": "section_2",
                "text": {"type": "mrkdwn", "text": "Please select the evaluation criteria."},
                "accessory": {
                    "action_id": "bench_select_en",
                    "type": "multi_static_select",
                    "placeholder": {"type": "plain_text", "text": "Metric selection"},
                    "options": [
                        {"text": {"type": "plain_text", "text": m}, "value": f"value-{i}"}
                        for i, m in enumerate(AVAILABLE_BENCHMARKS)
                    ],
                },
            }
        ],
    )


@app.action("bench_select_en")
def report_figure_and_table(ack, body, say, logger):
    """Build the report for the user's selected models and benchmarks and upload it."""
    ack()
    logger.info(body)

    bench_list = [
        option["text"]["text"]
        for action in body["actions"]
        for option in action["selected_options"]
    ]
    model_list = _models_by_user.get(body["user"]["id"], [])

    # Either selection can be empty (nothing chosen, or no model step recorded for this
    # user); a report cannot be ranked without both.
    if not model_list or not bench_list:
        say("Please select at least one model and at least one benchmark.")
        return

    table_path, figure_path = reporter.run(
        model_list=model_list, benchmark_list=bench_list, save=True
    )

    models = "\n".join([f"• {m}" for m in model_list])
    benchs = "\n".join([f"• {m}" for m in bench_list])

    # message
    msg = f"LLM Evaluation Report requested by <@{body['user']['id']}>.\n\n🤖 Selected models\n{models}\n\n📊 Selected benchmarks\n{benchs}"
    say(msg)

    # upload files for request
    req_channel_id = body["channel"]["id"]
    upload_file(figure_path, req_channel_id)
    upload_file(table_path, req_channel_id)

    # logging
    logger.info(f"@{body['user']['id']}::{bench_list}::{model_list}")
222 | 
223 | 
# When run as a script, start the Socket Mode handler for the app with SLACK_APP_TOKEN.
if __name__ == "__main__":
    SocketModeHandler(app, app_token).start()
226 | 


--------------------------------------------------------------------------------
/evalverse/tests/test_evaluator.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | 
 4 | from evalverse.evaluator import Evaluator
 5 | 
# Absolute path of the directory that contains this test module.
TEST_PATH = os.path.dirname(os.path.abspath(__file__))
 7 | 
 8 | 
 9 | class TestEvaluator(unittest.TestCase):
10 |     def setUp(self):
11 |         self.evaluator = Evaluator(mode="lib")
12 | 
13 |     def test_get_args_default(self):
14 |         args = self.evaluator.get_args()
15 |         self.assertEqual(args.ckpt_path, "upstage/SOLAR-10.7B-Instruct-v1.0")
16 | 
17 |     def test_run_args_overriding(self):
18 |         your_model = "your/Model"
19 |         your_output_path = "/your/output_path"
20 |         self.evaluator.run(model=your_model, output_path=your_output_path)
21 |         self.assertEqual(self.evaluator.args.ckpt_path, your_model)
22 |         self.assertEqual(self.evaluator.args.output_path, your_output_path)
23 | 
24 |     def test_run_h6_en_existing(self):
25 |         benchmark = "h6_en"
26 |         output_path = os.path.join(TEST_PATH, "test_results")
27 |         self.evaluator.run(benchmark=benchmark, output_path=output_path)
28 | 
29 | 
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
32 | 


--------------------------------------------------------------------------------
/evalverse/tests/test_reporter.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | 
 4 | from evalverse.reporter import Reporter
 5 | 
# Absolute path of the directory that contains this test module.
TEST_PATH = os.path.dirname(os.path.abspath(__file__))
 7 | 
 8 | 
 9 | class TestEvaluator(unittest.TestCase):
10 |     def setUp(self):
11 |         output_path = os.path.join(TEST_PATH, "test_results")
12 |         self.reporter = Reporter(output_path=output_path)
13 | 
14 |     def test_update_db(self):
15 |         self.reporter.update_db()
16 | 
17 |     def test_run(self):
18 |         model_list = ["SOLAR-10.7B-Instruct-v1.0"]
19 |         benchmark_list = ["h6_en"]
20 |         self.reporter.run(model_list=model_list, benchmark_list=benchmark_list)
21 | 
22 | 
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
25 | 


--------------------------------------------------------------------------------
/evalverse/tests/test_reproducibility.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import unittest
 3 | 
 4 | from evalverse.evaluator import Evaluator
 5 | from evalverse.utils import get_h6_en_scores
 6 | 
# Absolute path of the directory that contains this test module.
TEST_PATH = os.path.dirname(os.path.abspath(__file__))
 8 | 
 9 | 
class TestEvaluator(unittest.TestCase):
    """Reproducibility check: re-run all benchmarks and compare h6_en scores to the stored ones."""

    def setUp(self):
        self.evaluator = Evaluator(mode="lib")

    def test_run_all_benchmarks(self):
        model = "upstage/SOLAR-10.7B-Instruct-v1.0"
        original_output_path = os.path.join(TEST_PATH, "test_results")
        reproduced_output_path = os.path.join(TEST_PATH, "test_results_reproduced")

        run_options = dict(
            data_parallel=8,
            num_gpus_total=8,
            parallel_api=4,
            devices="0,1,2,3,4,5,6,7",
            output_path=reproduced_output_path,
        )
        self.evaluator.run(model=model, benchmark="all", **run_options)

        # h6_score reproducibility check
        model_name = model.split("/")[-1]
        original_h6_dir = os.path.join(original_output_path, model_name, "h6_en")
        reproduced_h6_dir = os.path.join(reproduced_output_path, model_name, "h6_en")

        original_scores = get_h6_en_scores(original_h6_dir)
        original_stderr = get_h6_en_scores(original_h6_dir, stderr=True)
        reproduced_scores = get_h6_en_scores(reproduced_h6_dir)

        h6_list = ["arc_c_25", "hellaswag_10", "mmlu_5", "truthfulqa_0", "winogrande_5", "gsm8k_5"]
        rows = zip(h6_list, original_scores, original_stderr, reproduced_scores)
        for task, original, stderr, reproduced in rows:
            difference = abs(original - reproduced)
            print(
                f"[{task}] \t original: {original} \t reproduced: {reproduced} \t difference: {round(difference, 2)} \t stderr: {stderr}"
            )
            # A task counts as reproduced when it deviates by at most the stored stderr.
            self.assertLessEqual(difference, stderr)
48 | 
49 | 
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
52 | 


--------------------------------------------------------------------------------
/evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/arc_challenge_25.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "results": {
 3 |     "arc_challenge": {
 4 |       "acc,none": 0.6885665529010239,
 5 |       "acc_stderr,none": 0.01353247209985083,
 6 |       "acc_norm,none": 0.7133105802047781,
 7 |       "acc_norm_stderr,none": 0.013214986329274855,
 8 |       "alias": "arc_challenge"
 9 |     }
10 |   },
11 |   "group_subtasks": {
12 |     "arc_challenge": []
13 |   },
14 |   "configs": {
15 |     "arc_challenge": {
16 |       "task": "arc_challenge",
17 |       "group": [
18 |         "ai2_arc"
19 |       ],
20 |       "dataset_path": "allenai/ai2_arc",
21 |       "dataset_name": "ARC-Challenge",
22 |       "training_split": "train",
23 |       "validation_split": "validation",
24 |       "test_split": "test",
25 |       "doc_to_text": "Question: {{question}}\nAnswer:",
26 |       "doc_to_target": "{{choices.label.index(answerKey)}}",
27 |       "doc_to_choice": "{{choices.text}}",
28 |       "description": "",
29 |       "target_delimiter": " ",
30 |       "fewshot_delimiter": "\n\n",
31 |       "num_fewshot": 25,
32 |       "metric_list": [
33 |         {
34 |           "metric": "acc",
35 |           "aggregation": "mean",
36 |           "higher_is_better": true
37 |         },
38 |         {
39 |           "metric": "acc_norm",
40 |           "aggregation": "mean",
41 |           "higher_is_better": true
42 |         }
43 |       ],
44 |       "output_type": "multiple_choice",
45 |       "repeats": 1,
46 |       "should_decontaminate": true,
47 |       "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
48 |       "metadata": {
49 |         "version": 1.0
50 |       }
51 |     }
52 |   },
53 |   "versions": {
54 |     "arc_challenge": 1.0
55 |   },
56 |   "n-shot": {
57 |     "arc_challenge": 25
58 |   },
59 |   "config": {
60 |     "model": "hf",
61 |     "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False",
62 |     "batch_size": "16",
63 |     "batch_sizes": [],
64 |     "device": null,
65 |     "use_cache": null,
66 |     "limit": null,
67 |     "bootstrap_iters": 100000,
68 |     "gen_kwargs": null
69 |   },
70 |   "git_hash": "22f5854",
71 |   "date": 1711604407.8730423,
72 |   "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nByte Order:                         Little Endian\nAddress sizes:                      43 bits physical, 48 bits virtual\nCPU(s):                             128\nOn-line CPU(s) list:                0-127\nThread(s) per core:                 1\nCore(s) per socket:                 64\nSocket(s):                          2\nNUMA node(s):                       2\nVendor ID:                          AuthenticAMD\nCPU family:                         25\nModel:                 
             1\nModel name:                         AMD EPYC 7763 64-Core Processor\nStepping:                           1\nFrequency boost:                    enabled\nCPU MHz:                            2813.569\nCPU max MHz:                        2450.0000\nCPU min MHz:                        1500.0000\nBogoMIPS:                           4890.43\nVirtualization:                     AMD-V\nL1d cache:                          4 MiB\nL1i cache:                          4 MiB\nL2 cache:                           64 MiB\nL3 cache:                           512 MiB\nNUMA node0 CPU(s):                  0-63\nNUMA node1 CPU(s):                  64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single 
hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] torch                     2.2.1                    pypi_0    pypi",
73 |   "transformers_version": "4.38.2",
74 |   "upper_git_hash": null
75 | }


--------------------------------------------------------------------------------
/evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/gsm8k_5.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "results": {
  3 |     "gsm8k": {
  4 |       "exact_match,strict-match": 0.6777862016679302,
  5 |       "exact_match_stderr,strict-match": 0.012872435481188778,
  6 |       "exact_match,flexible-extract": 0.6853677028051555,
  7 |       "exact_match_stderr,flexible-extract": 0.012791037227336034,
  8 |       "alias": "gsm8k"
  9 |     }
 10 |   },
 11 |   "group_subtasks": {
 12 |     "gsm8k": []
 13 |   },
 14 |   "configs": {
 15 |     "gsm8k": {
 16 |       "task": "gsm8k",
 17 |       "group": [
 18 |         "math_word_problems"
 19 |       ],
 20 |       "dataset_path": "gsm8k",
 21 |       "dataset_name": "main",
 22 |       "training_split": "train",
 23 |       "test_split": "test",
 24 |       "fewshot_split": "train",
 25 |       "doc_to_text": "Question: {{question}}\nAnswer:",
 26 |       "doc_to_target": "{{answer}}",
 27 |       "description": "",
 28 |       "target_delimiter": " ",
 29 |       "fewshot_delimiter": "\n\n",
 30 |       "num_fewshot": 5,
 31 |       "metric_list": [
 32 |         {
 33 |           "metric": "exact_match",
 34 |           "aggregation": "mean",
 35 |           "higher_is_better": true,
 36 |           "ignore_case": true,
 37 |           "ignore_punctuation": false,
 38 |           "regexes_to_ignore": [
 39 |             ",",
 40 |             "\\$",
 41 |             "(?s).*#### ",
 42 |             "\\.$"
 43 |           ]
 44 |         }
 45 |       ],
 46 |       "output_type": "generate_until",
 47 |       "generation_kwargs": {
 48 |         "until": [
 49 |           "Question:",
 50 |           "</s>",
 51 |           "<|im_end|>"
 52 |         ],
 53 |         "do_sample": false,
 54 |         "temperature": 0.0
 55 |       },
 56 |       "repeats": 1,
 57 |       "filter_list": [
 58 |         {
 59 |           "name": "strict-match",
 60 |           "filter": [
 61 |             {
 62 |               "function": "regex",
 63 |               "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
 64 |             },
 65 |             {
 66 |               "function": "take_first"
 67 |             }
 68 |           ]
 69 |         },
 70 |         {
 71 |           "name": "flexible-extract",
 72 |           "filter": [
 73 |             {
 74 |               "function": "regex",
 75 |               "group_select": -1,
 76 |               "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
 77 |             },
 78 |             {
 79 |               "function": "take_first"
 80 |             }
 81 |           ]
 82 |         }
 83 |       ],
 84 |       "should_decontaminate": false,
 85 |       "metadata": {
 86 |         "version": 3.0
 87 |       }
 88 |     }
 89 |   },
 90 |   "versions": {
 91 |     "gsm8k": 3.0
 92 |   },
 93 |   "n-shot": {
 94 |     "gsm8k": 5
 95 |   },
 96 |   "config": {
 97 |     "model": "hf",
 98 |     "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False",
 99 |     "batch_size": "16",
100 |     "batch_sizes": [],
101 |     "device": null,
102 |     "use_cache": null,
103 |     "limit": null,
104 |     "bootstrap_iters": 100000,
105 |     "gen_kwargs": null
106 |   },
107 |   "git_hash": "22f5854",
108 |   "date": 1711605933.4303067,
109 |   "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nByte Order:                         Little Endian\nAddress sizes:                      43 bits physical, 48 bits virtual\nCPU(s):                             128\nOn-line CPU(s) list:                0-127\nThread(s) per core:                 1\nCore(s) per socket:                 64\nSocket(s):                          2\nNUMA node(s):                       2\nVendor ID:                          AuthenticAMD\nCPU family:                         25\nModel:                
              1\nModel name:                         AMD EPYC 7763 64-Core Processor\nStepping:                           1\nFrequency boost:                    enabled\nCPU MHz:                            2475.477\nCPU max MHz:                        2450.0000\nCPU min MHz:                        1500.0000\nBogoMIPS:                           4890.43\nVirtualization:                     AMD-V\nL1d cache:                          4 MiB\nL1i cache:                          4 MiB\nL2 cache:                           64 MiB\nL3 cache:                           512 MiB\nNUMA node0 CPU(s):                  0-63\nNUMA node1 CPU(s):                  64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single 
hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] torch                     2.2.1                    pypi_0    pypi",
110 |   "transformers_version": "4.38.2",
111 |   "upper_git_hash": null
112 | }


--------------------------------------------------------------------------------
/evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/hellaswag_10.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "results": {
 3 |     "hellaswag": {
 4 |       "acc,none": 0.7061342362079267,
 5 |       "acc_stderr,none": 0.004546002255457021,
 6 |       "acc_norm,none": 0.8818960366460864,
 7 |       "acc_norm_stderr,none": 0.0032207161266851005,
 8 |       "alias": "hellaswag"
 9 |     }
10 |   },
11 |   "group_subtasks": {
12 |     "hellaswag": []
13 |   },
14 |   "configs": {
15 |     "hellaswag": {
16 |       "task": "hellaswag",
17 |       "group": [
18 |         "multiple_choice"
19 |       ],
20 |       "dataset_path": "hellaswag",
21 |       "training_split": "train",
22 |       "validation_split": "validation",
23 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
24 |       "doc_to_text": "{{query}}",
25 |       "doc_to_target": "{{label}}",
26 |       "doc_to_choice": "choices",
27 |       "description": "",
28 |       "target_delimiter": " ",
29 |       "fewshot_delimiter": "\n\n",
30 |       "num_fewshot": 10,
31 |       "metric_list": [
32 |         {
33 |           "metric": "acc",
34 |           "aggregation": "mean",
35 |           "higher_is_better": true
36 |         },
37 |         {
38 |           "metric": "acc_norm",
39 |           "aggregation": "mean",
40 |           "higher_is_better": true
41 |         }
42 |       ],
43 |       "output_type": "multiple_choice",
44 |       "repeats": 1,
45 |       "should_decontaminate": false,
46 |       "metadata": {
47 |         "version": 1.0
48 |       }
49 |     }
50 |   },
51 |   "versions": {
52 |     "hellaswag": 1.0
53 |   },
54 |   "n-shot": {
55 |     "hellaswag": 10
56 |   },
57 |   "config": {
58 |     "model": "hf",
59 |     "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False",
60 |     "batch_size": "16",
61 |     "batch_sizes": [],
62 |     "device": null,
63 |     "use_cache": null,
64 |     "limit": null,
65 |     "bootstrap_iters": 100000,
66 |     "gen_kwargs": null
67 |   },
68 |   "git_hash": "22f5854",
69 |   "date": 1711604551.2668173,
70 |   "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nByte Order:                         Little Endian\nAddress sizes:                      43 bits physical, 48 bits virtual\nCPU(s):                             128\nOn-line CPU(s) list:                0-127\nThread(s) per core:                 1\nCore(s) per socket:                 64\nSocket(s):                          2\nNUMA node(s):                       2\nVendor ID:                          AuthenticAMD\nCPU family:                         25\nModel:                 
             1\nModel name:                         AMD EPYC 7763 64-Core Processor\nStepping:                           1\nFrequency boost:                    enabled\nCPU MHz:                            2633.640\nCPU max MHz:                        2450.0000\nCPU min MHz:                        1500.0000\nBogoMIPS:                           4890.43\nVirtualization:                     AMD-V\nL1d cache:                          4 MiB\nL1i cache:                          4 MiB\nL2 cache:                           64 MiB\nL3 cache:                           512 MiB\nNUMA node0 CPU(s):                  0-63\nNUMA node1 CPU(s):                  64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single 
hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] torch                     2.2.1                    pypi_0    pypi",
71 |   "transformers_version": "4.38.2",
72 |   "upper_git_hash": null
73 | }


--------------------------------------------------------------------------------
/evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/truthfulqa_mc2_0.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "results": {
 3 |     "truthfulqa_mc2": {
 4 |       "acc,none": 0.7171838111166857,
 5 |       "acc_stderr,none": 0.01498853297119472,
 6 |       "alias": "truthfulqa_mc2"
 7 |     }
 8 |   },
 9 |   "group_subtasks": {
10 |     "truthfulqa_mc2": []
11 |   },
12 |   "configs": {
13 |     "truthfulqa_mc2": {
14 |       "task": "truthfulqa_mc2",
15 |       "group": [
16 |         "truthfulqa"
17 |       ],
18 |       "dataset_path": "truthful_qa",
19 |       "dataset_name": "multiple_choice",
20 |       "validation_split": "validation",
21 |       "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
22 |       "doc_to_target": 0,
23 |       "doc_to_choice": "{{mc2_targets.choices}}",
24 |       "process_results": "def process_results_mc2(doc, results):\n    lls, is_greedy = zip(*results)\n\n    # Split on the first `0` as everything before it is true (`1`).\n    split_idx = list(doc[\"mc2_targets\"][\"labels\"]).index(0)\n    # Compute the normalized probability mass for the correct answer.\n    ll_true, ll_false = lls[:split_idx], lls[split_idx:]\n    p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))\n    p_true = p_true / (sum(p_true) + sum(p_false))\n\n    return {\"acc\": sum(p_true)}\n",
25 |       "description": "",
26 |       "target_delimiter": " ",
27 |       "fewshot_delimiter": "\n\n",
28 |       "num_fewshot": 0,
29 |       "metric_list": [
30 |         {
31 |           "metric": "acc",
32 |           "aggregation": "mean",
33 |           "higher_is_better": true
34 |         }
35 |       ],
36 |       "output_type": "multiple_choice",
37 |       "repeats": 1,
38 |       "should_decontaminate": true,
39 |       "doc_to_decontamination_query": "question",
40 |       "metadata": {
41 |         "version": 2.0
42 |       }
43 |     }
44 |   },
45 |   "versions": {
46 |     "truthfulqa_mc2": 2.0
47 |   },
48 |   "n-shot": {
49 |     "truthfulqa_mc2": 0
50 |   },
51 |   "config": {
52 |     "model": "hf",
53 |     "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False",
54 |     "batch_size": "16",
55 |     "batch_sizes": [],
56 |     "device": null,
57 |     "use_cache": null,
58 |     "limit": null,
59 |     "bootstrap_iters": 100000,
60 |     "gen_kwargs": null
61 |   },
62 |   "git_hash": "22f5854",
63 |   "date": 1711605810.1983285,
64 |   "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nByte Order:                         Little Endian\nAddress sizes:                      43 bits physical, 48 bits virtual\nCPU(s):                             128\nOn-line CPU(s) list:                0-127\nThread(s) per core:                 1\nCore(s) per socket:                 64\nSocket(s):                          2\nNUMA node(s):                       2\nVendor ID:                          AuthenticAMD\nCPU family:                         25\nModel:                 
             1\nModel name:                         AMD EPYC 7763 64-Core Processor\nStepping:                           1\nFrequency boost:                    enabled\nCPU MHz:                            2474.946\nCPU max MHz:                        2450.0000\nCPU min MHz:                        1500.0000\nBogoMIPS:                           4890.43\nVirtualization:                     AMD-V\nL1d cache:                          4 MiB\nL1i cache:                          4 MiB\nL2 cache:                           64 MiB\nL3 cache:                           512 MiB\nNUMA node0 CPU(s):                  0-63\nNUMA node1 CPU(s):                  64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single 
hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] torch                     2.2.1                    pypi_0    pypi",
65 |   "transformers_version": "4.38.2",
66 |   "upper_git_hash": null
67 | }


--------------------------------------------------------------------------------
/evalverse/tests/test_results/SOLAR-10.7B-Instruct-v1.0/h6_en/winogrande_5.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "results": {
 3 |     "winogrande": {
 4 |       "acc,none": 0.8318863456985004,
 5 |       "acc_stderr,none": 0.010510336954166734,
 6 |       "alias": "winogrande"
 7 |     }
 8 |   },
 9 |   "group_subtasks": {
10 |     "winogrande": []
11 |   },
12 |   "configs": {
13 |     "winogrande": {
14 |       "task": "winogrande",
15 |       "dataset_path": "winogrande",
16 |       "dataset_name": "winogrande_xl",
17 |       "training_split": "train",
18 |       "validation_split": "validation",
19 |       "doc_to_text": "def doc_to_text(doc):\n    answer_to_num = {\"1\": 0, \"2\": 1}\n    return answer_to_num[doc[\"answer\"]]\n",
20 |       "doc_to_target": "def doc_to_target(doc):\n    idx = doc[\"sentence\"].index(\"_\") + 1\n    return doc[\"sentence\"][idx:].strip()\n",
21 |       "doc_to_choice": "def doc_to_choice(doc):\n    idx = doc[\"sentence\"].index(\"_\")\n    options = [doc[\"option1\"], doc[\"option2\"]]\n    return [doc[\"sentence\"][:idx] + opt for opt in options]\n",
22 |       "description": "",
23 |       "target_delimiter": " ",
24 |       "fewshot_delimiter": "\n\n",
25 |       "num_fewshot": 5,
26 |       "metric_list": [
27 |         {
28 |           "metric": "acc",
29 |           "aggregation": "mean",
30 |           "higher_is_better": true
31 |         }
32 |       ],
33 |       "output_type": "multiple_choice",
34 |       "repeats": 1,
35 |       "should_decontaminate": true,
36 |       "doc_to_decontamination_query": "sentence",
37 |       "metadata": {
38 |         "version": 1.0
39 |       }
40 |     }
41 |   },
42 |   "versions": {
43 |     "winogrande": 1.0
44 |   },
45 |   "n-shot": {
46 |     "winogrande": 5
47 |   },
48 |   "config": {
49 |     "model": "hf",
50 |     "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False",
51 |     "batch_size": "16",
52 |     "batch_sizes": [],
53 |     "device": null,
54 |     "use_cache": null,
55 |     "limit": null,
56 |     "bootstrap_iters": 100000,
57 |     "gen_kwargs": null
58 |   },
59 |   "git_hash": "22f5854",
60 |   "date": 1711605880.7907126,
61 |   "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nByte Order:                         Little Endian\nAddress sizes:                      43 bits physical, 48 bits virtual\nCPU(s):                             128\nOn-line CPU(s) list:                0-127\nThread(s) per core:                 1\nCore(s) per socket:                 64\nSocket(s):                          2\nNUMA node(s):                       2\nVendor ID:                          AuthenticAMD\nCPU family:                         25\nModel:                 
             1\nModel name:                         AMD EPYC 7763 64-Core Processor\nStepping:                           1\nFrequency boost:                    enabled\nCPU MHz:                            2511.241\nCPU max MHz:                        2450.0000\nCPU min MHz:                        1500.0000\nBogoMIPS:                           4890.43\nVirtualization:                     AMD-V\nL1d cache:                          4 MiB\nL1i cache:                          4 MiB\nL2 cache:                           64 MiB\nL3 cache:                           512 MiB\nNUMA node0 CPU(s):                  0-63\nNUMA node1 CPU(s):                  64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single 
hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] torch                     2.2.1                    pypi_0    pypi",
62 |   "transformers_version": "4.38.2",
63 |   "upper_git_hash": null
64 | }


--------------------------------------------------------------------------------
/evalverse/utils.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Copyright (c) 2024-present Upstage Co., Ltd.
  3 | Apache-2.0 license
  4 | """
  5 | import json
  6 | import logging
  7 | import os
  8 | import re
  9 | 
 10 | import numpy as np
 11 | import pandas as pd
 12 | import plotly.express as px
 13 | 
 14 | EVALVERSE_MODULE_PATH = os.path.dirname(os.path.abspath(__file__))
 15 | EVALVERSE_DB_PATH = os.path.join(os.path.dirname(EVALVERSE_MODULE_PATH), "db")
 16 | EVALVERSE_OUTPUT_PATH = os.path.join(os.path.dirname(EVALVERSE_MODULE_PATH), "results")
 17 | EVALVERSE_LOG_FORMAT = (
 18 |     "[%(asctime)s][%(levelname)s][evalverse - %(filename)s:%(lineno)d] >> %(message)s"
 19 | )
 20 | 
 21 | 
 22 | def print_command(command, only_cmd=False):
 23 |     cmd = re.sub(r"\s+", " ", command).strip()
 24 |     if only_cmd:
 25 |         return cmd
 26 |     else:
 27 |         print(cmd)
 28 | 
 29 | 
 30 | def print_txt_file(path):
 31 |     with open(path, "r") as file:
 32 |         file_contents = file.read()
 33 |         print(file_contents)
 34 | 
 35 | 
 36 | def get_logger(log_path=None):
 37 |     logger = logging.getLogger(__name__)
 38 |     logger.setLevel(level=logging.INFO)
 39 |     formatter = logging.Formatter(
 40 |         fmt=EVALVERSE_LOG_FORMAT,
 41 |         datefmt="%Y-%m-%d %H:%M:%S",
 42 |     )
 43 |     if log_path:
 44 |         fileHandler = logging.FileHandler(filename=log_path)
 45 |         fileHandler.setFormatter(formatter)
 46 |         logger.addHandler(fileHandler)
 47 | 
 48 |     return logger
 49 | 
 50 | 
 51 | def get_figure(score_df, benchmarks_list, figure_path=None, save=False):
 52 |     scores = []
 53 |     for b in benchmarks_list:
 54 |         for m, n in score_df[["Model", b]].values:
 55 |             scores.append([m, b, n])
 56 |     figure_df = pd.DataFrame(scores, columns=["model", "benchmark", "score"])
 57 | 
 58 |     fig = px.line_polar(
 59 |         figure_df,
 60 |         r="score",
 61 |         theta="benchmark",
 62 |         line_close=True,
 63 |         category_orders={"benchmark": benchmarks_list},
 64 |         color="model",
 65 |         markers=True,
 66 |         color_discrete_sequence=px.colors.qualitative.Pastel,
 67 |         title="LLM Evaluation Report (by Evalverse)",
 68 |         width=800,
 69 |     )
 70 |     if save:
 71 |         fig.write_image(figure_path, scale=2)
 72 |     else:
 73 |         fig.show()
 74 | 
 75 | 
 76 | def get_h6_en_scores(exp_path, stderr=False, print_results=False):
 77 |     acc_metric = "acc,none"
 78 |     acc_norm_metric = "acc_norm,none"
 79 |     gsm8k_metrics = ["exact_match,get-answer", "exact_match,strict-match"]
 80 |     if stderr:
 81 |         acc_metric = "acc_stderr,none"
 82 |         acc_norm_metric = "acc_norm_stderr,none"
 83 |         gsm8k_metrics = ["exact_match_stderr,get-answer", "exact_match_stderr,strict-match"]
 84 | 
 85 |     with open(os.path.join(exp_path, "arc_challenge_25.json"), "r") as json_file:
 86 |         arc_challenge_25 = json.load(json_file)
 87 |         if print_results:
 88 |             print(
 89 |                 "ARC-Challenge (25-shot)",
 90 |                 json.dumps(arc_challenge_25["results"]["arc_challenge"], indent=4),
 91 |             )
 92 |         else:
 93 |             arc_score = arc_challenge_25["results"]["arc_challenge"][acc_norm_metric]
 94 | 
 95 |     with open(os.path.join(exp_path, "hellaswag_10.json"), "r") as json_file:
 96 |         hellaswag_10 = json.load(json_file)
 97 |         if print_results:
 98 |             print("Hellaswag (10-shot)", json.dumps(hellaswag_10["results"]["hellaswag"], indent=4))
 99 |         else:
100 |             hellaswag_score = hellaswag_10["results"]["hellaswag"][acc_norm_metric]
101 | 
102 |     with open(os.path.join(exp_path, "mmlu_5.json"), "r") as json_file:
103 |         mmlu_5 = json.load(json_file)
104 |         if print_results:
105 |             print("MMLU (5-shot)", json.dumps(mmlu_5["results"]["mmlu"], indent=4))
106 |         else:
107 |             mmlu_score = mmlu_5["results"]["mmlu"][acc_metric]
108 | 
109 |     with open(os.path.join(exp_path, "truthfulqa_mc2_0.json"), "r") as json_file:
110 |         truthfulqa_mc2_0 = json.load(json_file)
111 |         if print_results:
112 |             print(
113 |                 "TruthfulQA (0-shot)",
114 |                 json.dumps(truthfulqa_mc2_0["results"]["truthfulqa_mc2"], indent=4),
115 |             )
116 |         else:
117 |             truthfulqa_score = truthfulqa_mc2_0["results"]["truthfulqa_mc2"][acc_metric]
118 | 
119 |     with open(os.path.join(exp_path, "winogrande_5.json"), "r") as json_file:
120 |         winogrande_5 = json.load(json_file)
121 |         if print_results:
122 |             print(
123 |                 "Winogrande (5-shot)", json.dumps(winogrande_5["results"]["winogrande"], indent=4)
124 |             )
125 |         else:
126 |             winogrande_score = winogrande_5["results"]["winogrande"][acc_metric]
127 | 
128 |     with open(os.path.join(exp_path, "gsm8k_5.json"), "r") as json_file:
129 |         gsm8k_5 = json.load(json_file)
130 |         if print_results:
131 |             print("GSM8k (5-shot)", json.dumps(gsm8k_5["results"]["gsm8k"], indent=4))
132 |         else:
133 |             match_key = next(
134 |                 (key for key in gsm8k_metrics if key in gsm8k_5["results"]["gsm8k"]), None
135 |             )
136 |             gsm8k_score = gsm8k_5["results"]["gsm8k"][match_key]
137 | 
138 |     if print_results:
139 |         pass
140 |     else:
141 |         score_list = [
142 |             arc_score,
143 |             hellaswag_score,
144 |             mmlu_score,
145 |             truthfulqa_score,
146 |             winogrande_score,
147 |             gsm8k_score,
148 |         ]
149 |         score_list = list(np.round((np.array(score_list) * 100), 2))
150 | 
151 |         return score_list
152 | 
153 | 
def get_mt_bench_scores(model_id, question_path, judgement_path):
    """Average one model's judgement scores per question category.

    Args:
        model_id: Value matched against the "model" column of the judgements.
        question_path: JSON-lines file with "question_id" and "category" fields.
        judgement_path: JSON-lines file with "question_id", "model", "score"
            and "turn" fields.

    Returns:
        List of NumPy floats, one per category in sorted category order: the
        mean score multiplied by 10 and rounded to two decimals. Judgements
        whose score is -1 are excluded.
    """
    questions = pd.read_json(question_path, lines=True)
    judgements = pd.read_json(judgement_path, lines=True)

    judged = judgements[["question_id", "model", "score", "turn"]]
    keep = (judged["model"] == model_id) & (judged["score"] != -1)
    per_category = (
        judged[keep]
        .merge(questions[["question_id", "category"]], how="left")[["category", "score"]]
        .groupby(["category"])
        .mean()
        .sort_values("category")
    )

    scaled = np.array(per_category.score.values.tolist()) * 10
    return list(np.round(scaled, 2))
168 | 
169 | 
def get_ifeval_scores(score_txt_file):
    """Extract the prompt-level and instruction-level scores from a score text file.

    Args:
        score_txt_file: Path of a text file containing ``prompt-level: <number>``
            and ``instruction-level: <number>`` entries.

    Returns:
        List of NumPy floats in order of appearance, each multiplied by 100
        and rounded to two decimals.
    """
    with open(score_txt_file, "r") as handle:
        text = handle.read()

    found = re.finditer(r"(prompt-level|instruction-level):\s([\d.]+)", text)
    values = [float(match.group(2)) for match in found]

    return list(np.round((np.array(values) * 100), 2))
183 | 
184 | 
def get_eqbench_score(eqbench_results_json):
    """Read the final score of the first entry in a raw results JSON file.

    Args:
        eqbench_results_json: Path of a JSON file whose first top-level entry
            holds ``["iterations"]["1"]["benchmark_results_fullscale"]["final_score"]``.

    Returns:
        Single-element list with that score rounded to two decimals.
    """
    with open(eqbench_results_json, "r") as handle:
        results = json.load(handle)

    first_run_id = list(results)[0]
    first_iteration = results[first_run_id]["iterations"]["1"]
    final_score = first_iteration["benchmark_results_fullscale"]["final_score"]

    return [round(final_score, 2)]
195 | 
196 | 
if __name__ == "__main__":
    # Show the resolved package paths when the module is run as a script.
    for label, value in (
        ("EVALVERSE_MODULE_PATH", EVALVERSE_MODULE_PATH),
        ("EVALVERSE_DB_PATH", EVALVERSE_DB_PATH),
        ("EVALVERSE_OUTPUT_PATH", EVALVERSE_OUTPUT_PATH),
    ):
        print(f"{label}: {value}")
201 | 


--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
 1 | # 🌍 Examples
 2 | > This is an example collection for `Evalverse`. It covers the basic usage of `Evalverse`, practical know-how, and how to use it in your project.
 3 | 
 4 | 
 5 | ### 🙋  I'm very new to Evalverse.
 6 | Introduces the basic but essential steps for using Evalverse (e.g., `Evaluator`, `Reporter`).
 7 | 
 8 | - [01_basic_usage.ipynb](https://github.com/UpstageAI/evalverse/blob/main/examples/01_basic_usage.ipynb)
 9 | 
10 | ### 🙋 I want to know how to run evaluations in detail
11 | Shows how to run each evaluation (`h6_en`, `mt_bench`, `ifeval`, `eq_bench`) in detail.
12 | 
13 | - [02_advanced_usage.ipynb](https://github.com/UpstageAI/evalverse/blob/main/examples/02_advanced_usage.ipynb)
14 | 
15 | 
16 | 


--------------------------------------------------------------------------------
/examples/db/figures/figure_20240402_105011.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UpstageAI/evalverse/06f85eefd4b82385c549ea3bc29ff3e177f3a84b/examples/db/figures/figure_20240402_105011.jpeg


--------------------------------------------------------------------------------
/examples/db/score_df.csv:
--------------------------------------------------------------------------------
1 | Model,H6-ARC,H6-Hellaswag,H6-MMLU,H6-TruthfulQA,H6-Winogrande,H6-GSM8k,MT-Bench-Coding,MT-Bench-Extraction,MT-Bench-Humanities,MT-Bench-Math,MT-Bench-Reasoning,MT-Bench-Roleplay,MT-Bench-Stem,MT-Bench-Writing,IFEval-strict-prompt,IFEval-strict-instruction,IFEval-loose-prompt,IFEval-loose-instruction,EQ-Bench
2 | Llama-2-7b-chat-hf,53.16,78.59,47.38,45.31,72.69,23.96,28.95,66.25,96.5,23.5,52.5,77.5,89.0,86.75,39.19,47.93,48.43,56.09,36.46
3 | SOLAR-10.7B-Instruct-v1.0,71.33,88.19,65.52,71.72,83.19,67.78,44.5,77.89,98.5,43.5,66.0,82.5,88.75,94.0,51.57,57.97,56.01,62.92,72.17
4 | 


--------------------------------------------------------------------------------
/examples/db/scores/table_20240402_105011.csv:
--------------------------------------------------------------------------------
1 | Model,Ranking,total_avg,H6-ARC,H6-Hellaswag,H6-MMLU,H6-TruthfulQA,H6-Winogrande,H6-GSM8k
2 | SOLAR-10.7B-Instruct-v1.0,1,74.62,71.33,88.19,65.52,71.72,83.19,67.78
3 | 


--------------------------------------------------------------------------------
/examples/results/Llama-2-7b-chat-hf/eq_bench/benchmark_results.csv:
--------------------------------------------------------------------------------
1 | Run ID, Benchmark Completed, Prompt Format, Model Path, Lora Path, Quantization, Benchmark Score, EQ-Bench Version, Num Questions Parseable, Num Iterations, Inference Engine, Ooba Params, Download Filters, Error
2 | Llama-2-7b-chat-hf,2024-04-02 13:34:43,ChatML,meta-llama/Llama-2-7b-chat-hf,None,None,36.46,v2,155.0,1,transformers,none,--include ["n", "o", "n", "e"] --exclude ["n", "o", "n", "e"],
3 | 


--------------------------------------------------------------------------------
/examples/results/Llama-2-7b-chat-hf/h6_en/arc_challenge_25.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "results": {
 3 |     "arc_challenge": {
 4 |       "acc,none": 0.5093856655290102,
 5 |       "acc_stderr,none": 0.014608816322065,
 6 |       "acc_norm,none": 0.5315699658703071,
 7 |       "acc_norm_stderr,none": 0.014582236460866977,
 8 |       "alias": "arc_challenge"
 9 |     }
10 |   },
11 |   "group_subtasks": {
12 |     "arc_challenge": []
13 |   },
14 |   "configs": {
15 |     "arc_challenge": {
16 |       "task": "arc_challenge",
17 |       "group": [
18 |         "ai2_arc"
19 |       ],
20 |       "dataset_path": "allenai/ai2_arc",
21 |       "dataset_name": "ARC-Challenge",
22 |       "training_split": "train",
23 |       "validation_split": "validation",
24 |       "test_split": "test",
25 |       "doc_to_text": "Question: {{question}}\nAnswer:",
26 |       "doc_to_target": "{{choices.label.index(answerKey)}}",
27 |       "doc_to_choice": "{{choices.text}}",
28 |       "description": "",
29 |       "target_delimiter": " ",
30 |       "fewshot_delimiter": "\n\n",
31 |       "num_fewshot": 25,
32 |       "metric_list": [
33 |         {
34 |           "metric": "acc",
35 |           "aggregation": "mean",
36 |           "higher_is_better": true
37 |         },
38 |         {
39 |           "metric": "acc_norm",
40 |           "aggregation": "mean",
41 |           "higher_is_better": true
42 |         }
43 |       ],
44 |       "output_type": "multiple_choice",
45 |       "repeats": 1,
46 |       "should_decontaminate": true,
47 |       "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
48 |       "metadata": {
49 |         "version": 1.0
50 |       }
51 |     }
52 |   },
53 |   "versions": {
54 |     "arc_challenge": 1.0
55 |   },
56 |   "n-shot": {
57 |     "arc_challenge": 25
58 |   },
59 |   "config": {
60 |     "model": "hf",
61 |     "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False",
62 |     "batch_size": "16",
63 |     "batch_sizes": [],
64 |     "device": null,
65 |     "use_cache": null,
66 |     "limit": null,
67 |     "bootstrap_iters": 100000,
68 |     "gen_kwargs": null
69 |   },
70 |   "git_hash": "0ecf672",
71 |   "date": 1712029901.2960556,
72 |   "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nByte Order:                         Little Endian\nAddress sizes:                      43 bits physical, 48 bits virtual\nCPU(s):                             128\nOn-line CPU(s) list:                0-127\nThread(s) per core:                 1\nCore(s) per socket:                 64\nSocket(s):                          2\nNUMA node(s):                       2\nVendor ID:                          AuthenticAMD\nCPU family:                         25\nModel:                              1\nModel 
name:                         AMD EPYC 7763 64-Core Processor\nStepping:                           1\nFrequency boost:                    enabled\nCPU MHz:                            2813.715\nCPU max MHz:                        2450.0000\nCPU min MHz:                        1500.0000\nBogoMIPS:                           4890.43\nVirtualization:                     AMD-V\nL1d cache:                          4 MiB\nL1i cache:                          4 MiB\nL2 cache:                           64 MiB\nL3 cache:                           512 MiB\nNUMA node0 CPU(s):                  0-63\nNUMA node1 CPU(s):                  64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba 
sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] torch                     2.1.2                    pypi_0    pypi\n[conda] triton                    2.1.0                    pypi_0    pypi",
73 |   "transformers_version": "4.37.2",
74 |   "upper_git_hash": "0ecf67227578e1079c4d8d00a4db878a6f6523d6"
75 | }


--------------------------------------------------------------------------------
/examples/results/Llama-2-7b-chat-hf/h6_en/gsm8k_5.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "results": {
  3 |     "gsm8k": {
  4 |       "exact_match,strict-match": 0.2395754359363154,
  5 |       "exact_match_stderr,strict-match": 0.01175686434407741,
  6 |       "exact_match,flexible-extract": 0.24184988627748294,
  7 |       "exact_match_stderr,flexible-extract": 0.011794861371318698,
  8 |       "alias": "gsm8k"
  9 |     }
 10 |   },
 11 |   "group_subtasks": {
 12 |     "gsm8k": []
 13 |   },
 14 |   "configs": {
 15 |     "gsm8k": {
 16 |       "task": "gsm8k",
 17 |       "group": [
 18 |         "math_word_problems"
 19 |       ],
 20 |       "dataset_path": "gsm8k",
 21 |       "dataset_name": "main",
 22 |       "training_split": "train",
 23 |       "test_split": "test",
 24 |       "fewshot_split": "train",
 25 |       "doc_to_text": "Question: {{question}}\nAnswer:",
 26 |       "doc_to_target": "{{answer}}",
 27 |       "description": "",
 28 |       "target_delimiter": " ",
 29 |       "fewshot_delimiter": "\n\n",
 30 |       "num_fewshot": 5,
 31 |       "metric_list": [
 32 |         {
 33 |           "metric": "exact_match",
 34 |           "aggregation": "mean",
 35 |           "higher_is_better": true,
 36 |           "ignore_case": true,
 37 |           "ignore_punctuation": false,
 38 |           "regexes_to_ignore": [
 39 |             ",",
 40 |             "\\$",
 41 |             "(?s).*#### ",
 42 |             "\\.$"
 43 |           ]
 44 |         }
 45 |       ],
 46 |       "output_type": "generate_until",
 47 |       "generation_kwargs": {
 48 |         "until": [
 49 |           "Question:",
 50 |           "</s>",
 51 |           "<|im_end|>"
 52 |         ],
 53 |         "do_sample": false,
 54 |         "temperature": 0.0
 55 |       },
 56 |       "repeats": 1,
 57 |       "filter_list": [
 58 |         {
 59 |           "name": "strict-match",
 60 |           "filter": [
 61 |             {
 62 |               "function": "regex",
 63 |               "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
 64 |             },
 65 |             {
 66 |               "function": "take_first"
 67 |             }
 68 |           ]
 69 |         },
 70 |         {
 71 |           "name": "flexible-extract",
 72 |           "filter": [
 73 |             {
 74 |               "function": "regex",
 75 |               "group_select": -1,
 76 |               "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
 77 |             },
 78 |             {
 79 |               "function": "take_first"
 80 |             }
 81 |           ]
 82 |         }
 83 |       ],
 84 |       "should_decontaminate": false,
 85 |       "metadata": {
 86 |         "version": 3.0
 87 |       }
 88 |     }
 89 |   },
 90 |   "versions": {
 91 |     "gsm8k": 3.0
 92 |   },
 93 |   "n-shot": {
 94 |     "gsm8k": 5
 95 |   },
 96 |   "config": {
 97 |     "model": "hf",
 98 |     "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False",
 99 |     "batch_size": "16",
100 |     "batch_sizes": [],
101 |     "device": null,
102 |     "use_cache": null,
103 |     "limit": null,
104 |     "bootstrap_iters": 100000,
105 |     "gen_kwargs": null
106 |   },
107 |   "git_hash": "0ecf672",
108 |   "date": 1712031337.2311273,
109 |   "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nByte Order:                         Little Endian\nAddress sizes:                      43 bits physical, 48 bits virtual\nCPU(s):                             128\nOn-line CPU(s) list:                0-127\nThread(s) per core:                 1\nCore(s) per socket:                 64\nSocket(s):                          2\nNUMA node(s):                       2\nVendor ID:                          AuthenticAMD\nCPU family:                         25\nModel:                              1\nModel 
name:                         AMD EPYC 7763 64-Core Processor\nStepping:                           1\nFrequency boost:                    enabled\nCPU MHz:                            2474.987\nCPU max MHz:                        2450.0000\nCPU min MHz:                        1500.0000\nBogoMIPS:                           4890.43\nVirtualization:                     AMD-V\nL1d cache:                          4 MiB\nL1i cache:                          4 MiB\nL2 cache:                           64 MiB\nL3 cache:                           512 MiB\nNUMA node0 CPU(s):                  0-63\nNUMA node1 CPU(s):                  64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba 
sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] torch                     2.1.2                    pypi_0    pypi\n[conda] triton                    2.1.0                    pypi_0    pypi",
110 |   "transformers_version": "4.37.2",
111 |   "upper_git_hash": "0ecf67227578e1079c4d8d00a4db878a6f6523d6"
112 | }


--------------------------------------------------------------------------------
/examples/results/Llama-2-7b-chat-hf/h6_en/hellaswag_10.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "results": {
 3 |     "hellaswag": {
 4 |       "acc,none": 0.5944035052778331,
 5 |       "acc_stderr,none": 0.004900036261309049,
 6 |       "acc_norm,none": 0.7858992232622983,
 7 |       "acc_norm_stderr,none": 0.004093587404303701,
 8 |       "alias": "hellaswag"
 9 |     }
10 |   },
11 |   "group_subtasks": {
12 |     "hellaswag": []
13 |   },
14 |   "configs": {
15 |     "hellaswag": {
16 |       "task": "hellaswag",
17 |       "group": [
18 |         "multiple_choice"
19 |       ],
20 |       "dataset_path": "hellaswag",
21 |       "training_split": "train",
22 |       "validation_split": "validation",
23 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
24 |       "doc_to_text": "{{query}}",
25 |       "doc_to_target": "{{label}}",
26 |       "doc_to_choice": "choices",
27 |       "description": "",
28 |       "target_delimiter": " ",
29 |       "fewshot_delimiter": "\n\n",
30 |       "num_fewshot": 10,
31 |       "metric_list": [
32 |         {
33 |           "metric": "acc",
34 |           "aggregation": "mean",
35 |           "higher_is_better": true
36 |         },
37 |         {
38 |           "metric": "acc_norm",
39 |           "aggregation": "mean",
40 |           "higher_is_better": true
41 |         }
42 |       ],
43 |       "output_type": "multiple_choice",
44 |       "repeats": 1,
45 |       "should_decontaminate": false,
46 |       "metadata": {
47 |         "version": 1.0
48 |       }
49 |     }
50 |   },
51 |   "versions": {
52 |     "hellaswag": 1.0
53 |   },
54 |   "n-shot": {
55 |     "hellaswag": 10
56 |   },
57 |   "config": {
58 |     "model": "hf",
59 |     "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False",
60 |     "batch_size": "16",
61 |     "batch_sizes": [],
62 |     "device": null,
63 |     "use_cache": null,
64 |     "limit": null,
65 |     "bootstrap_iters": 100000,
66 |     "gen_kwargs": null
67 |   },
68 |   "git_hash": "0ecf672",
69 |   "date": 1712030282.4817123,
70 |   "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nByte Order:                         Little Endian\nAddress sizes:                      43 bits physical, 48 bits virtual\nCPU(s):                             128\nOn-line CPU(s) list:                0-127\nThread(s) per core:                 1\nCore(s) per socket:                 64\nSocket(s):                          2\nNUMA node(s):                       2\nVendor ID:                          AuthenticAMD\nCPU family:                         25\nModel:                              1\nModel 
name:                         AMD EPYC 7763 64-Core Processor\nStepping:                           1\nFrequency boost:                    enabled\nCPU MHz:                            2803.192\nCPU max MHz:                        2450.0000\nCPU min MHz:                        1500.0000\nBogoMIPS:                           4890.43\nVirtualization:                     AMD-V\nL1d cache:                          4 MiB\nL1i cache:                          4 MiB\nL2 cache:                           64 MiB\nL3 cache:                           512 MiB\nNUMA node0 CPU(s):                  0-63\nNUMA node1 CPU(s):                  64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba 
sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] torch                     2.1.2                    pypi_0    pypi\n[conda] triton                    2.1.0                    pypi_0    pypi",
71 |   "transformers_version": "4.37.2",
72 |   "upper_git_hash": "0ecf67227578e1079c4d8d00a4db878a6f6523d6"
73 | }


--------------------------------------------------------------------------------
/examples/results/Llama-2-7b-chat-hf/h6_en/truthfulqa_mc2_0.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "results": {
 3 |     "truthfulqa_mc2": {
 4 |       "acc,none": 0.4531373800075501,
 5 |       "acc_stderr,none": 0.015639311798545347,
 6 |       "alias": "truthfulqa_mc2"
 7 |     }
 8 |   },
 9 |   "group_subtasks": {
10 |     "truthfulqa_mc2": []
11 |   },
12 |   "configs": {
13 |     "truthfulqa_mc2": {
14 |       "task": "truthfulqa_mc2",
15 |       "group": [
16 |         "truthfulqa"
17 |       ],
18 |       "dataset_path": "truthful_qa",
19 |       "dataset_name": "multiple_choice",
20 |       "validation_split": "validation",
21 |       "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
22 |       "doc_to_target": 0,
23 |       "doc_to_choice": "{{mc2_targets.choices}}",
24 |       "process_results": "def process_results_mc2(doc, results):\n    lls, is_greedy = zip(*results)\n\n    # Split on the first `0` as everything before it is true (`1`).\n    split_idx = list(doc[\"mc2_targets\"][\"labels\"]).index(0)\n    # Compute the normalized probability mass for the correct answer.\n    ll_true, ll_false = lls[:split_idx], lls[split_idx:]\n    p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))\n    p_true = p_true / (sum(p_true) + sum(p_false))\n\n    return {\"acc\": sum(p_true)}\n",
25 |       "description": "",
26 |       "target_delimiter": " ",
27 |       "fewshot_delimiter": "\n\n",
28 |       "num_fewshot": 0,
29 |       "metric_list": [
30 |         {
31 |           "metric": "acc",
32 |           "aggregation": "mean",
33 |           "higher_is_better": true
34 |         }
35 |       ],
36 |       "output_type": "multiple_choice",
37 |       "repeats": 1,
38 |       "should_decontaminate": true,
39 |       "doc_to_decontamination_query": "question",
40 |       "metadata": {
41 |         "version": 2.0
42 |       }
43 |     }
44 |   },
45 |   "versions": {
46 |     "truthfulqa_mc2": 2.0
47 |   },
48 |   "n-shot": {
49 |     "truthfulqa_mc2": 0
50 |   },
51 |   "config": {
52 |     "model": "hf",
53 |     "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False",
54 |     "batch_size": "16",
55 |     "batch_sizes": [],
56 |     "device": null,
57 |     "use_cache": null,
58 |     "limit": null,
59 |     "bootstrap_iters": 100000,
60 |     "gen_kwargs": null
61 |   },
62 |   "git_hash": "0ecf672",
63 |   "date": 1712031221.6043375,
64 |   "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nByte Order:                         Little Endian\nAddress sizes:                      43 bits physical, 48 bits virtual\nCPU(s):                             128\nOn-line CPU(s) list:                0-127\nThread(s) per core:                 1\nCore(s) per socket:                 64\nSocket(s):                          2\nNUMA node(s):                       2\nVendor ID:                          AuthenticAMD\nCPU family:                         25\nModel:                              1\nModel 
name:                         AMD EPYC 7763 64-Core Processor\nStepping:                           1\nFrequency boost:                    enabled\nCPU MHz:                            2746.724\nCPU max MHz:                        2450.0000\nCPU min MHz:                        1500.0000\nBogoMIPS:                           4890.43\nVirtualization:                     AMD-V\nL1d cache:                          4 MiB\nL1i cache:                          4 MiB\nL2 cache:                           64 MiB\nL3 cache:                           512 MiB\nNUMA node0 CPU(s):                  0-63\nNUMA node1 CPU(s):                  64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba 
sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] torch                     2.1.2                    pypi_0    pypi\n[conda] triton                    2.1.0                    pypi_0    pypi",
65 |   "transformers_version": "4.37.2",
66 |   "upper_git_hash": "0ecf67227578e1079c4d8d00a4db878a6f6523d6"
67 | }


--------------------------------------------------------------------------------
/examples/results/Llama-2-7b-chat-hf/h6_en/winogrande_5.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "results": {
 3 |     "winogrande": {
 4 |       "acc,none": 0.7269139700078927,
 5 |       "acc_stderr,none": 0.012522020105869457,
 6 |       "alias": "winogrande"
 7 |     }
 8 |   },
 9 |   "group_subtasks": {
10 |     "winogrande": []
11 |   },
12 |   "configs": {
13 |     "winogrande": {
14 |       "task": "winogrande",
15 |       "dataset_path": "winogrande",
16 |       "dataset_name": "winogrande_xl",
17 |       "training_split": "train",
18 |       "validation_split": "validation",
19 |       "doc_to_text": "def doc_to_text(doc):\n    answer_to_num = {\"1\": 0, \"2\": 1}\n    return answer_to_num[doc[\"answer\"]]\n",
20 |       "doc_to_target": "def doc_to_target(doc):\n    idx = doc[\"sentence\"].index(\"_\") + 1\n    return doc[\"sentence\"][idx:].strip()\n",
21 |       "doc_to_choice": "def doc_to_choice(doc):\n    idx = doc[\"sentence\"].index(\"_\")\n    options = [doc[\"option1\"], doc[\"option2\"]]\n    return [doc[\"sentence\"][:idx] + opt for opt in options]\n",
22 |       "description": "",
23 |       "target_delimiter": " ",
24 |       "fewshot_delimiter": "\n\n",
25 |       "num_fewshot": 5,
26 |       "metric_list": [
27 |         {
28 |           "metric": "acc",
29 |           "aggregation": "mean",
30 |           "higher_is_better": true
31 |         }
32 |       ],
33 |       "output_type": "multiple_choice",
34 |       "repeats": 1,
35 |       "should_decontaminate": true,
36 |       "doc_to_decontamination_query": "sentence",
37 |       "metadata": {
38 |         "version": 1.0
39 |       }
40 |     }
41 |   },
42 |   "versions": {
43 |     "winogrande": 1.0
44 |   },
45 |   "n-shot": {
46 |     "winogrande": 5
47 |   },
48 |   "config": {
49 |     "model": "hf",
50 |     "model_args": "pretrained=meta-llama/Llama-2-7b-chat-hf,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False",
51 |     "batch_size": "16",
52 |     "batch_sizes": [],
53 |     "device": null,
54 |     "use_cache": null,
55 |     "limit": null,
56 |     "bootstrap_iters": 100000,
57 |     "gen_kwargs": null
58 |   },
59 |   "git_hash": "0ecf672",
60 |   "date": 1712031284.3734753,
61 |   "pretty_env_info": "PyTorch version: 2.1.2+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nByte Order:                         Little Endian\nAddress sizes:                      43 bits physical, 48 bits virtual\nCPU(s):                             128\nOn-line CPU(s) list:                0-127\nThread(s) per core:                 1\nCore(s) per socket:                 64\nSocket(s):                          2\nNUMA node(s):                       2\nVendor ID:                          AuthenticAMD\nCPU family:                         25\nModel:                              1\nModel 
name:                         AMD EPYC 7763 64-Core Processor\nStepping:                           1\nFrequency boost:                    enabled\nCPU MHz:                            2697.466\nCPU max MHz:                        2450.0000\nCPU min MHz:                        1500.0000\nBogoMIPS:                           4890.43\nVirtualization:                     AMD-V\nL1d cache:                          4 MiB\nL1i cache:                          4 MiB\nL2 cache:                           64 MiB\nL3 cache:                           512 MiB\nNUMA node0 CPU(s):                  0-63\nNUMA node1 CPU(s):                  64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate sme ssbd mba 
sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.1.2\n[pip3] triton==2.1.0\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] torch                     2.1.2                    pypi_0    pypi\n[conda] triton                    2.1.0                    pypi_0    pypi",
62 |   "transformers_version": "4.37.2",
63 |   "upper_git_hash": "0ecf67227578e1079c4d8d00a4db878a6f6523d6"
64 | }


--------------------------------------------------------------------------------
/examples/results/Llama-2-7b-chat-hf/ifeval/scores.txt:
--------------------------------------------------------------------------------
 1 | ================================================================
 2 | /data/private/new_lib/evalverse/results/Llama-2-7b-chat-hf/ifeval/eval_results_strict.jsonl Accuracy Scores:
 3 | prompt-level: 0.39186691312384475
 4 | instruction-level: 0.47925033467202144
 5 | 
 6 | change_case 0.2857142857142857
 7 | combination 0.0
 8 | detectable_content 0.8181818181818182
 9 | detectable_format 0.5733333333333334
10 | keywords 0.744
11 | language 0.4666666666666667
12 | length_constraints 0.5436893203883495
13 | punctuation 0.05970149253731343
14 | startend 0.5454545454545454
15 | 
16 | change_case:capital_word_frequency 0.6
17 | change_case:english_capital 0.23076923076923078
18 | change_case:english_lowercase 0.15789473684210525
19 | combination:repeat_prompt 0.0
20 | combination:two_responses 0.0
21 | detectable_content:number_placeholders 0.7692307692307693
22 | detectable_content:postscript 0.8620689655172413
23 | detectable_format:constrained_response 1.0
24 | detectable_format:json_format 0.11764705882352941
25 | detectable_format:multiple_sections 0.25
26 | detectable_format:number_bullet_lists 0.4230769230769231
27 | detectable_format:number_highlighted_sections 0.5918367346938775
28 | detectable_format:title 0.8611111111111112
29 | keywords:existence 0.6666666666666666
30 | keywords:forbidden_words 0.8333333333333334
31 | keywords:frequency 0.7692307692307693
32 | keywords:letter_frequency 0.6551724137931034
33 | language:response_language 0.4666666666666667
34 | length_constraints:nth_paragraph_first_word 0.25
35 | length_constraints:number_paragraphs 0.12
36 | length_constraints:number_sentences 0.75
37 | length_constraints:number_words 0.7666666666666667
38 | punctuation:no_comma 0.05970149253731343
39 | startend:end_checker 0.8
40 | startend:quotation 0.3902439024390244
41 | ================================================================
42 | /data/private/new_lib/evalverse/results/Llama-2-7b-chat-hf/ifeval/eval_results_loose.jsonl Accuracy Scores:
43 | prompt-level: 0.48428835489833644
44 | instruction-level: 0.5609103078982597
45 | 
46 | change_case 0.4642857142857143
47 | combination 0.1044776119402985
48 | detectable_content 0.8181818181818182
49 | detectable_format 0.6266666666666667
50 | keywords 0.768
51 | language 0.5666666666666667
52 | length_constraints 0.5728155339805825
53 | punctuation 0.22388059701492538
54 | startend 0.7121212121212122
55 | 
56 | change_case:capital_word_frequency 0.65
57 | change_case:english_capital 0.3076923076923077
58 | change_case:english_lowercase 0.47368421052631576
59 | combination:repeat_prompt 0.047619047619047616
60 | combination:two_responses 0.2
61 | detectable_content:number_placeholders 0.7692307692307693
62 | detectable_content:postscript 0.8620689655172413
63 | detectable_format:constrained_response 1.0
64 | detectable_format:json_format 0.5882352941176471
65 | detectable_format:multiple_sections 0.25
66 | detectable_format:number_bullet_lists 0.4230769230769231
67 | detectable_format:number_highlighted_sections 0.5918367346938775
68 | detectable_format:title 0.8611111111111112
69 | keywords:existence 0.6666666666666666
70 | keywords:forbidden_words 0.9166666666666666
71 | keywords:frequency 0.7692307692307693
72 | keywords:letter_frequency 0.6551724137931034
73 | language:response_language 0.5666666666666667
74 | length_constraints:nth_paragraph_first_word 0.5
75 | length_constraints:number_paragraphs 0.12
76 | length_constraints:number_sentences 0.75
77 | length_constraints:number_words 0.7666666666666667
78 | punctuation:no_comma 0.22388059701492538
79 | startend:end_checker 0.8
80 | startend:quotation 0.6585365853658537
81 | 


--------------------------------------------------------------------------------
/examples/results/Llama-2-7b-chat-hf/mt_bench/scores.txt:
--------------------------------------------------------------------------------
 1 | Mode: single
 2 | Input file: /data/private/new_lib/evalverse/results/Llama-2-7b-chat-hf/mt_bench/model_judgment/gpt-4_single.jsonl
 3 | 
 4 | ########## First turn ##########
 5 |                          score
 6 | model              turn       
 7 | Llama-2-7b-chat-hf 1     7.025
 8 | 
 9 | ########## Second turn ##########
10 |                             score
11 | model              turn          
12 | Llama-2-7b-chat-hf 2     6.037975
13 | 
14 | ########## Average ##########
15 |                        score
16 | model                       
17 | Llama-2-7b-chat-hf  6.534591
18 | 


--------------------------------------------------------------------------------
/examples/results/SOLAR-10.7B-Instruct-v1.0/eq_bench/benchmark_results.csv:
--------------------------------------------------------------------------------
1 | Run ID, Benchmark Completed, Prompt Format, Model Path, Lora Path, Quantization, Benchmark Score, EQ-Bench Version, Num Questions Parseable, Num Iterations, Inference Engine, Ooba Params, Download Filters, Error
2 | SOLAR-10.7B-Instruct-v1.0,2024-05-09 18:17:58,Solar-v1,upstage/SOLAR-10.7B-Instruct-v1.0,None,None,72.17,v2,165.0,1,transformers,none,--include ["n", "o", "n", "e"] --exclude ["n", "o", "n", "e"],
3 | 


--------------------------------------------------------------------------------
/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/arc_challenge_25.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "results": {
 3 |     "arc_challenge": {
 4 |       "acc,none": 0.6885665529010239,
 5 |       "acc_stderr,none": 0.01353247209985083,
 6 |       "acc_norm,none": 0.7133105802047781,
 7 |       "acc_norm_stderr,none": 0.013214986329274855,
 8 |       "alias": "arc_challenge"
 9 |     }
10 |   },
11 |   "group_subtasks": {
12 |     "arc_challenge": []
13 |   },
14 |   "configs": {
15 |     "arc_challenge": {
16 |       "task": "arc_challenge",
17 |       "group": [
18 |         "ai2_arc"
19 |       ],
20 |       "dataset_path": "allenai/ai2_arc",
21 |       "dataset_name": "ARC-Challenge",
22 |       "training_split": "train",
23 |       "validation_split": "validation",
24 |       "test_split": "test",
25 |       "doc_to_text": "Question: {{question}}\nAnswer:",
26 |       "doc_to_target": "{{choices.label.index(answerKey)}}",
27 |       "doc_to_choice": "{{choices.text}}",
28 |       "description": "",
29 |       "target_delimiter": " ",
30 |       "fewshot_delimiter": "\n\n",
31 |       "num_fewshot": 25,
32 |       "metric_list": [
33 |         {
34 |           "metric": "acc",
35 |           "aggregation": "mean",
36 |           "higher_is_better": true
37 |         },
38 |         {
39 |           "metric": "acc_norm",
40 |           "aggregation": "mean",
41 |           "higher_is_better": true
42 |         }
43 |       ],
44 |       "output_type": "multiple_choice",
45 |       "repeats": 1,
46 |       "should_decontaminate": true,
47 |       "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
48 |       "metadata": {
49 |         "version": 1.0
50 |       }
51 |     }
52 |   },
53 |   "versions": {
54 |     "arc_challenge": 1.0
55 |   },
56 |   "n-shot": {
57 |     "arc_challenge": 25
58 |   },
59 |   "config": {
60 |     "model": "hf",
61 |     "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False",
62 |     "batch_size": "16",
63 |     "batch_sizes": [],
64 |     "device": null,
65 |     "use_cache": null,
66 |     "limit": null,
67 |     "bootstrap_iters": 100000,
68 |     "gen_kwargs": null
69 |   },
70 |   "git_hash": "22f5854",
71 |   "date": 1711604407.8730423,
72 |   "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nByte Order:                         Little Endian\nAddress sizes:                      43 bits physical, 48 bits virtual\nCPU(s):                             128\nOn-line CPU(s) list:                0-127\nThread(s) per core:                 1\nCore(s) per socket:                 64\nSocket(s):                          2\nNUMA node(s):                       2\nVendor ID:                          AuthenticAMD\nCPU family:                         25\nModel:                 
             1\nModel name:                         AMD EPYC 7763 64-Core Processor\nStepping:                           1\nFrequency boost:                    enabled\nCPU MHz:                            2813.569\nCPU max MHz:                        2450.0000\nCPU min MHz:                        1500.0000\nBogoMIPS:                           4890.43\nVirtualization:                     AMD-V\nL1d cache:                          4 MiB\nL1i cache:                          4 MiB\nL2 cache:                           64 MiB\nL3 cache:                           512 MiB\nNUMA node0 CPU(s):                  0-63\nNUMA node1 CPU(s):                  64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single 
hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] torch                     2.2.1                    pypi_0    pypi",
73 |   "transformers_version": "4.38.2",
74 |   "upper_git_hash": null
75 | }


--------------------------------------------------------------------------------
/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/gsm8k_5.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "results": {
  3 |     "gsm8k": {
  4 |       "exact_match,strict-match": 0.6777862016679302,
  5 |       "exact_match_stderr,strict-match": 0.012872435481188778,
  6 |       "exact_match,flexible-extract": 0.6853677028051555,
  7 |       "exact_match_stderr,flexible-extract": 0.012791037227336034,
  8 |       "alias": "gsm8k"
  9 |     }
 10 |   },
 11 |   "group_subtasks": {
 12 |     "gsm8k": []
 13 |   },
 14 |   "configs": {
 15 |     "gsm8k": {
 16 |       "task": "gsm8k",
 17 |       "group": [
 18 |         "math_word_problems"
 19 |       ],
 20 |       "dataset_path": "gsm8k",
 21 |       "dataset_name": "main",
 22 |       "training_split": "train",
 23 |       "test_split": "test",
 24 |       "fewshot_split": "train",
 25 |       "doc_to_text": "Question: {{question}}\nAnswer:",
 26 |       "doc_to_target": "{{answer}}",
 27 |       "description": "",
 28 |       "target_delimiter": " ",
 29 |       "fewshot_delimiter": "\n\n",
 30 |       "num_fewshot": 5,
 31 |       "metric_list": [
 32 |         {
 33 |           "metric": "exact_match",
 34 |           "aggregation": "mean",
 35 |           "higher_is_better": true,
 36 |           "ignore_case": true,
 37 |           "ignore_punctuation": false,
 38 |           "regexes_to_ignore": [
 39 |             ",",
 40 |             "\\$",
 41 |             "(?s).*#### ",
 42 |             "\\.$"
 43 |           ]
 44 |         }
 45 |       ],
 46 |       "output_type": "generate_until",
 47 |       "generation_kwargs": {
 48 |         "until": [
 49 |           "Question:",
 50 |           "</s>",
 51 |           "<|im_end|>"
 52 |         ],
 53 |         "do_sample": false,
 54 |         "temperature": 0.0
 55 |       },
 56 |       "repeats": 1,
 57 |       "filter_list": [
 58 |         {
 59 |           "name": "strict-match",
 60 |           "filter": [
 61 |             {
 62 |               "function": "regex",
 63 |               "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
 64 |             },
 65 |             {
 66 |               "function": "take_first"
 67 |             }
 68 |           ]
 69 |         },
 70 |         {
 71 |           "name": "flexible-extract",
 72 |           "filter": [
 73 |             {
 74 |               "function": "regex",
 75 |               "group_select": -1,
 76 |               "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
 77 |             },
 78 |             {
 79 |               "function": "take_first"
 80 |             }
 81 |           ]
 82 |         }
 83 |       ],
 84 |       "should_decontaminate": false,
 85 |       "metadata": {
 86 |         "version": 3.0
 87 |       }
 88 |     }
 89 |   },
 90 |   "versions": {
 91 |     "gsm8k": 3.0
 92 |   },
 93 |   "n-shot": {
 94 |     "gsm8k": 5
 95 |   },
 96 |   "config": {
 97 |     "model": "hf",
 98 |     "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False",
 99 |     "batch_size": "16",
100 |     "batch_sizes": [],
101 |     "device": null,
102 |     "use_cache": null,
103 |     "limit": null,
104 |     "bootstrap_iters": 100000,
105 |     "gen_kwargs": null
106 |   },
107 |   "git_hash": "22f5854",
108 |   "date": 1711605933.4303067,
109 |   "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nByte Order:                         Little Endian\nAddress sizes:                      43 bits physical, 48 bits virtual\nCPU(s):                             128\nOn-line CPU(s) list:                0-127\nThread(s) per core:                 1\nCore(s) per socket:                 64\nSocket(s):                          2\nNUMA node(s):                       2\nVendor ID:                          AuthenticAMD\nCPU family:                         25\nModel:                
              1\nModel name:                         AMD EPYC 7763 64-Core Processor\nStepping:                           1\nFrequency boost:                    enabled\nCPU MHz:                            2475.477\nCPU max MHz:                        2450.0000\nCPU min MHz:                        1500.0000\nBogoMIPS:                           4890.43\nVirtualization:                     AMD-V\nL1d cache:                          4 MiB\nL1i cache:                          4 MiB\nL2 cache:                           64 MiB\nL3 cache:                           512 MiB\nNUMA node0 CPU(s):                  0-63\nNUMA node1 CPU(s):                  64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single 
hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] torch                     2.2.1                    pypi_0    pypi",
110 |   "transformers_version": "4.38.2",
111 |   "upper_git_hash": null
112 | }


--------------------------------------------------------------------------------
/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/hellaswag_10.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "results": {
 3 |     "hellaswag": {
 4 |       "acc,none": 0.7061342362079267,
 5 |       "acc_stderr,none": 0.004546002255457021,
 6 |       "acc_norm,none": 0.8818960366460864,
 7 |       "acc_norm_stderr,none": 0.0032207161266851005,
 8 |       "alias": "hellaswag"
 9 |     }
10 |   },
11 |   "group_subtasks": {
12 |     "hellaswag": []
13 |   },
14 |   "configs": {
15 |     "hellaswag": {
16 |       "task": "hellaswag",
17 |       "group": [
18 |         "multiple_choice"
19 |       ],
20 |       "dataset_path": "hellaswag",
21 |       "training_split": "train",
22 |       "validation_split": "validation",
23 |       "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n    def _process_doc(doc):\n        ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n        out_doc = {\n            \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n            \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n            \"gold\": int(doc[\"label\"]),\n        }\n        return out_doc\n\n    return dataset.map(_process_doc)\n",
24 |       "doc_to_text": "{{query}}",
25 |       "doc_to_target": "{{label}}",
26 |       "doc_to_choice": "choices",
27 |       "description": "",
28 |       "target_delimiter": " ",
29 |       "fewshot_delimiter": "\n\n",
30 |       "num_fewshot": 10,
31 |       "metric_list": [
32 |         {
33 |           "metric": "acc",
34 |           "aggregation": "mean",
35 |           "higher_is_better": true
36 |         },
37 |         {
38 |           "metric": "acc_norm",
39 |           "aggregation": "mean",
40 |           "higher_is_better": true
41 |         }
42 |       ],
43 |       "output_type": "multiple_choice",
44 |       "repeats": 1,
45 |       "should_decontaminate": false,
46 |       "metadata": {
47 |         "version": 1.0
48 |       }
49 |     }
50 |   },
51 |   "versions": {
52 |     "hellaswag": 1.0
53 |   },
54 |   "n-shot": {
55 |     "hellaswag": 10
56 |   },
57 |   "config": {
58 |     "model": "hf",
59 |     "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False",
60 |     "batch_size": "16",
61 |     "batch_sizes": [],
62 |     "device": null,
63 |     "use_cache": null,
64 |     "limit": null,
65 |     "bootstrap_iters": 100000,
66 |     "gen_kwargs": null
67 |   },
68 |   "git_hash": "22f5854",
69 |   "date": 1711604551.2668173,
70 |   "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nByte Order:                         Little Endian\nAddress sizes:                      43 bits physical, 48 bits virtual\nCPU(s):                             128\nOn-line CPU(s) list:                0-127\nThread(s) per core:                 1\nCore(s) per socket:                 64\nSocket(s):                          2\nNUMA node(s):                       2\nVendor ID:                          AuthenticAMD\nCPU family:                         25\nModel:                 
             1\nModel name:                         AMD EPYC 7763 64-Core Processor\nStepping:                           1\nFrequency boost:                    enabled\nCPU MHz:                            2633.640\nCPU max MHz:                        2450.0000\nCPU min MHz:                        1500.0000\nBogoMIPS:                           4890.43\nVirtualization:                     AMD-V\nL1d cache:                          4 MiB\nL1i cache:                          4 MiB\nL2 cache:                           64 MiB\nL3 cache:                           512 MiB\nNUMA node0 CPU(s):                  0-63\nNUMA node1 CPU(s):                  64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single 
hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] torch                     2.2.1                    pypi_0    pypi",
71 |   "transformers_version": "4.38.2",
72 |   "upper_git_hash": null
73 | }


--------------------------------------------------------------------------------
/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/truthfulqa_mc2_0.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "results": {
 3 |     "truthfulqa_mc2": {
 4 |       "acc,none": 0.7171838111166857,
 5 |       "acc_stderr,none": 0.01498853297119472,
 6 |       "alias": "truthfulqa_mc2"
 7 |     }
 8 |   },
 9 |   "group_subtasks": {
10 |     "truthfulqa_mc2": []
11 |   },
12 |   "configs": {
13 |     "truthfulqa_mc2": {
14 |       "task": "truthfulqa_mc2",
15 |       "group": [
16 |         "truthfulqa"
17 |       ],
18 |       "dataset_path": "truthful_qa",
19 |       "dataset_name": "multiple_choice",
20 |       "validation_split": "validation",
21 |       "doc_to_text": "{% set prompt_qa = 'Q: What is human life expectancy in the United States?\nA: Human life expectancy in the United States is 78 years.\n\nQ: Who was president of the United States in 1955?\nA: Dwight D. Eisenhower was president of the United States in 1955.\n\nQ: Which party did he belong to?\nA: He belonged to the Republican Party.\n\nQ: What is the square root of banana?\nA: I have no comment.\n\nQ: How does a telescope work?\nA: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\nQ: Where were the 1992 Olympics held?\nA: The 1992 Olympics were held in Barcelona, Spain.'%}{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}",
22 |       "doc_to_target": 0,
23 |       "doc_to_choice": "{{mc2_targets.choices}}",
24 |       "process_results": "def process_results_mc2(doc, results):\n    lls, is_greedy = zip(*results)\n\n    # Split on the first `0` as everything before it is true (`1`).\n    split_idx = list(doc[\"mc2_targets\"][\"labels\"]).index(0)\n    # Compute the normalized probability mass for the correct answer.\n    ll_true, ll_false = lls[:split_idx], lls[split_idx:]\n    p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))\n    p_true = p_true / (sum(p_true) + sum(p_false))\n\n    return {\"acc\": sum(p_true)}\n",
25 |       "description": "",
26 |       "target_delimiter": " ",
27 |       "fewshot_delimiter": "\n\n",
28 |       "num_fewshot": 0,
29 |       "metric_list": [
30 |         {
31 |           "metric": "acc",
32 |           "aggregation": "mean",
33 |           "higher_is_better": true
34 |         }
35 |       ],
36 |       "output_type": "multiple_choice",
37 |       "repeats": 1,
38 |       "should_decontaminate": true,
39 |       "doc_to_decontamination_query": "question",
40 |       "metadata": {
41 |         "version": 2.0
42 |       }
43 |     }
44 |   },
45 |   "versions": {
46 |     "truthfulqa_mc2": 2.0
47 |   },
48 |   "n-shot": {
49 |     "truthfulqa_mc2": 0
50 |   },
51 |   "config": {
52 |     "model": "hf",
53 |     "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False",
54 |     "batch_size": "16",
55 |     "batch_sizes": [],
56 |     "device": null,
57 |     "use_cache": null,
58 |     "limit": null,
59 |     "bootstrap_iters": 100000,
60 |     "gen_kwargs": null
61 |   },
62 |   "git_hash": "22f5854",
63 |   "date": 1711605810.1983285,
64 |   "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nByte Order:                         Little Endian\nAddress sizes:                      43 bits physical, 48 bits virtual\nCPU(s):                             128\nOn-line CPU(s) list:                0-127\nThread(s) per core:                 1\nCore(s) per socket:                 64\nSocket(s):                          2\nNUMA node(s):                       2\nVendor ID:                          AuthenticAMD\nCPU family:                         25\nModel:                 
             1\nModel name:                         AMD EPYC 7763 64-Core Processor\nStepping:                           1\nFrequency boost:                    enabled\nCPU MHz:                            2474.946\nCPU max MHz:                        2450.0000\nCPU min MHz:                        1500.0000\nBogoMIPS:                           4890.43\nVirtualization:                     AMD-V\nL1d cache:                          4 MiB\nL1i cache:                          4 MiB\nL2 cache:                           64 MiB\nL3 cache:                           512 MiB\nNUMA node0 CPU(s):                  0-63\nNUMA node1 CPU(s):                  64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single 
hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] torch                     2.2.1                    pypi_0    pypi",
65 |   "transformers_version": "4.38.2",
66 |   "upper_git_hash": null
67 | }


--------------------------------------------------------------------------------
/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/winogrande_5.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "results": {
 3 |     "winogrande": {
 4 |       "acc,none": 0.8318863456985004,
 5 |       "acc_stderr,none": 0.010510336954166734,
 6 |       "alias": "winogrande"
 7 |     }
 8 |   },
 9 |   "group_subtasks": {
10 |     "winogrande": []
11 |   },
12 |   "configs": {
13 |     "winogrande": {
14 |       "task": "winogrande",
15 |       "dataset_path": "winogrande",
16 |       "dataset_name": "winogrande_xl",
17 |       "training_split": "train",
18 |       "validation_split": "validation",
19 |       "doc_to_text": "def doc_to_text(doc):\n    answer_to_num = {\"1\": 0, \"2\": 1}\n    return answer_to_num[doc[\"answer\"]]\n",
20 |       "doc_to_target": "def doc_to_target(doc):\n    idx = doc[\"sentence\"].index(\"_\") + 1\n    return doc[\"sentence\"][idx:].strip()\n",
21 |       "doc_to_choice": "def doc_to_choice(doc):\n    idx = doc[\"sentence\"].index(\"_\")\n    options = [doc[\"option1\"], doc[\"option2\"]]\n    return [doc[\"sentence\"][:idx] + opt for opt in options]\n",
22 |       "description": "",
23 |       "target_delimiter": " ",
24 |       "fewshot_delimiter": "\n\n",
25 |       "num_fewshot": 5,
26 |       "metric_list": [
27 |         {
28 |           "metric": "acc",
29 |           "aggregation": "mean",
30 |           "higher_is_better": true
31 |         }
32 |       ],
33 |       "output_type": "multiple_choice",
34 |       "repeats": 1,
35 |       "should_decontaminate": true,
36 |       "doc_to_decontamination_query": "sentence",
37 |       "metadata": {
38 |         "version": 1.0
39 |       }
40 |     }
41 |   },
42 |   "versions": {
43 |     "winogrande": 1.0
44 |   },
45 |   "n-shot": {
46 |     "winogrande": 5
47 |   },
48 |   "config": {
49 |     "model": "hf",
50 |     "model_args": "pretrained=upstage/SOLAR-10.7B-Instruct-v1.0,trust_remote_code=True,dtype=float16,use_fast_tokenizer=False,use_flash_attention_2=False",
51 |     "batch_size": "16",
52 |     "batch_sizes": [],
53 |     "device": null,
54 |     "use_cache": null,
55 |     "limit": null,
56 |     "bootstrap_iters": 100000,
57 |     "gen_kwargs": null
58 |   },
59 |   "git_hash": "22f5854",
60 |   "date": 1711605880.7907126,
61 |   "pretty_env_info": "PyTorch version: 2.2.1+cu121\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 20.04.6 LTS (x86_64)\nGCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.31\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.4.0-164-generic-x86_64-with-glibc2.31\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 525.125.06\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                       x86_64\nCPU op-mode(s):                     32-bit, 64-bit\nByte Order:                         Little Endian\nAddress sizes:                      43 bits physical, 48 bits virtual\nCPU(s):                             128\nOn-line CPU(s) list:                0-127\nThread(s) per core:                 1\nCore(s) per socket:                 64\nSocket(s):                          2\nNUMA node(s):                       2\nVendor ID:                          AuthenticAMD\nCPU family:                         25\nModel:                 
             1\nModel name:                         AMD EPYC 7763 64-Core Processor\nStepping:                           1\nFrequency boost:                    enabled\nCPU MHz:                            2511.241\nCPU max MHz:                        2450.0000\nCPU min MHz:                        1500.0000\nBogoMIPS:                           4890.43\nVirtualization:                     AMD-V\nL1d cache:                          4 MiB\nL1i cache:                          4 MiB\nL2 cache:                           64 MiB\nL3 cache:                           512 MiB\nNUMA node0 CPU(s):                  0-63\nNUMA node1 CPU(s):                  64-127\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit:        Not affected\nVulnerability L1tf:                 Not affected\nVulnerability Mds:                  Not affected\nVulnerability Meltdown:             Not affected\nVulnerability Mmio stale data:      Not affected\nVulnerability Retbleed:             Not affected\nVulnerability Spec store bypass:    Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1:           Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:           Mitigation; Retpolines, IBPB conditional, IBRS_FW, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected\nVulnerability Srbds:                Not affected\nVulnerability Tsx async abort:      Not affected\nFlags:                              fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single 
hw_pstate sme ssbd mba sev ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr wbnoinvd arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload vgif umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca\n\nVersions of relevant libraries:\n[pip3] numpy==1.26.4\n[pip3] torch==2.2.1\n[conda] numpy                     1.26.4                   pypi_0    pypi\n[conda] torch                     2.2.1                    pypi_0    pypi",
62 |   "transformers_version": "4.38.2",
63 |   "upper_git_hash": null
64 | }


--------------------------------------------------------------------------------
/examples/results/SOLAR-10.7B-Instruct-v1.0/ifeval/scores.txt:
--------------------------------------------------------------------------------
 1 | ================================================================
 2 | /data/private/new_lib/results/SOLAR-10.7B-Instruct-v1.0/ifeval/eval_results_strict.jsonl Accuracy Scores:
 3 | prompt-level: 0.5157116451016636
 4 | instruction-level: 0.5796519410977242
 5 | 
 6 | change_case 0.5595238095238095
 7 | combination 0.16417910447761194
 8 | detectable_content 0.8727272727272727
 9 | detectable_format 0.6666666666666666
10 | keywords 0.744
11 | language 0.8
12 | length_constraints 0.5631067961165048
13 | punctuation 0.1791044776119403
14 | startend 0.6060606060606061
15 | 
16 | change_case:capital_word_frequency 0.7
17 | change_case:english_capital 0.4230769230769231
18 | change_case:english_lowercase 0.5789473684210527
19 | combination:repeat_prompt 0.047619047619047616
20 | combination:two_responses 0.36
21 | detectable_content:number_placeholders 0.8076923076923077
22 | detectable_content:postscript 0.9310344827586207
23 | detectable_format:constrained_response 0.9
24 | detectable_format:json_format 0.7647058823529411
25 | detectable_format:multiple_sections 0.25
26 | detectable_format:number_bullet_lists 0.46153846153846156
27 | detectable_format:number_highlighted_sections 0.5714285714285714
28 | detectable_format:title 0.9722222222222222
29 | keywords:existence 0.9523809523809523
30 | keywords:forbidden_words 0.5833333333333334
31 | keywords:frequency 0.8461538461538461
32 | keywords:letter_frequency 0.6551724137931034
33 | language:response_language 0.8
34 | length_constraints:nth_paragraph_first_word 0.16666666666666666
35 | length_constraints:number_paragraphs 0.4
36 | length_constraints:number_sentences 0.6666666666666666
37 | length_constraints:number_words 0.7333333333333333
38 | punctuation:no_comma 0.1791044776119403
39 | startend:end_checker 0.72
40 | startend:quotation 0.5365853658536586
41 | ================================================================
42 | /data/private/new_lib/results/SOLAR-10.7B-Instruct-v1.0/ifeval/eval_results_loose.jsonl Accuracy Scores:
43 | prompt-level: 0.5600739371534196
44 | instruction-level: 0.6291834002677377
45 | 
46 | change_case 0.6071428571428571
47 | combination 0.16417910447761194
48 | detectable_content 0.8727272727272727
49 | detectable_format 0.68
50 | keywords 0.832
51 | language 0.8666666666666667
52 | length_constraints 0.6213592233009708
53 | punctuation 0.31343283582089554
54 | startend 0.6515151515151515
55 | 
56 | change_case:capital_word_frequency 0.75
57 | change_case:english_capital 0.5
58 | change_case:english_lowercase 0.6052631578947368
59 | combination:repeat_prompt 0.047619047619047616
60 | combination:two_responses 0.36
61 | detectable_content:number_placeholders 0.8076923076923077
62 | detectable_content:postscript 0.9310344827586207
63 | detectable_format:constrained_response 0.9
64 | detectable_format:json_format 0.8823529411764706
65 | detectable_format:multiple_sections 0.25
66 | detectable_format:number_bullet_lists 0.46153846153846156
67 | detectable_format:number_highlighted_sections 0.5714285714285714
68 | detectable_format:title 0.9722222222222222
69 | keywords:existence 0.9523809523809523
70 | keywords:forbidden_words 0.8333333333333334
71 | keywords:frequency 0.8974358974358975
72 | keywords:letter_frequency 0.6551724137931034
73 | language:response_language 0.8666666666666667
74 | length_constraints:nth_paragraph_first_word 0.5
75 | length_constraints:number_paragraphs 0.44
76 | length_constraints:number_sentences 0.6666666666666666
77 | length_constraints:number_words 0.7666666666666667
78 | punctuation:no_comma 0.31343283582089554
79 | startend:end_checker 0.72
80 | startend:quotation 0.6097560975609756
81 | 


--------------------------------------------------------------------------------
/examples/results/SOLAR-10.7B-Instruct-v1.0/mt_bench/scores.txt:
--------------------------------------------------------------------------------
 1 | Mode: single
 2 | Input file: /data/private/new_lib/evalverse/results/SOLAR-10.7B-Instruct-v1.0/mt_bench/model_judgment/gpt-4_single.jsonl
 3 | 
 4 | ########## First turn ##########
 5 |                                   score
 6 | model                     turn         
 7 | SOLAR-10.7B-Instruct-v1.0 1     7.66875
 8 | 
 9 | ########## Second turn ##########
10 |                                   score
11 | model                     turn         
12 | SOLAR-10.7B-Instruct-v1.0 2     7.21519
13 | 
14 | ########## Average ##########
15 |                               score
16 | model                              
17 | SOLAR-10.7B-Instruct-v1.0  7.443396
18 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]  # Package metadata read by Poetry when building/publishing
 2 | name = "evalverse"
 3 | version = "0.0.1"
 4 | description = "The Universe of Evaluation. All about the evaluation for LLMs."
 5 | authors = ["Evalverse <evalverse@upstage.ai>"]
 6 | license = "Apache-2.0"  # SPDX identifier (Poetry's recommended notation) rather than a free-form license name
 7 | readme = "README.md"
 8 | 
 9 | [tool.poetry.dependencies]  # Runtime dependencies, grouped below by the component that needs them
10 | ### Evalverse
11 | python = ">=3.9,<3.11"  # Python 3.9 and 3.10 only; 3.11+ is excluded by the upper bound
12 | python-dotenv = "^1.0.1"
13 | pre-commit = "^3.7.0"  # NOTE(review): listed as a runtime dependency; looks like contributor tooling - consider the dev group
14 | pandas = "^2.2.1"
15 | 
16 | ### Evalverse Reporter
17 | slack-sdk = "^3.27.1"
18 | slack-bolt = "^1.18.1"
19 | gitpython = "^3.1.42"
20 | plotly = "^5.20.0"
21 | kaleido = "0.2.1"  # exact pin (no caret), unlike the neighbouring entries
22 | nbformat = ">=4.2.0"  # lower bound only; no upper limit
23 | 
24 | ### lm-evaluation-harness & Common
25 | lm-eval = "0.4.2"  # exact pin
26 | transformers = "4.37.2"  # exact pin
27 | vllm = "0.3.1"  # exact pin
28 | ray = "^2.10.0"  # caret: any 2.x release at or above 2.10.0
29 | 
30 | ### FastChat
31 | fschat = { path = "evalverse/submodules/FastChat" }  # installed from a local path, not PyPI; presumably a git submodule that must be checked out first - TODO confirm
32 | openai = "<1"  # restricted to releases below 1.0
33 | anthropic = ">=0.3"  # lower bound only; no upper limit
34 | 
35 | ### IFEval
36 | absl-py = "^2.1.0"  # caret: >=2.1.0,<3.0.0
37 | langdetect = "^1.0.9"  # caret: >=1.0.9,<2.0.0
38 | immutabledict = "^4.2.0"  # caret: >=4.2.0,<5.0.0
39 | 
40 | ### EQ-Bench
41 | gspread = "^6.1.0"
42 | oauth2client = "^4.1.3"
43 | firebase-admin = "^6.5.0"
44 | tensorboardx = "^2.6.2.2"
45 | hf-transfer = "^0.1.6"  # caret on a 0.x version: >=0.1.6,<0.2.0
46 | scipy = "^1.12.0"
47 | pexpect = "^4.9.0"
48 | 
49 | 
50 | [tool.poetry.group.dev.dependencies]  # Dependencies of the "dev" group, kept separate from the runtime dependencies
51 | ipykernel = "^6.29.3"
52 | 
53 | [build-system]  # PEP 517 build configuration
54 | requires = ["poetry-core"]  # build requirement is unversioned; any poetry-core release may be used
55 | build-backend = "poetry.core.masonry.api"
56 | 


--------------------------------------------------------------------------------