├── .dockerignore ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── README.md ├── codegen ├── generate.py └── model.py ├── evoeval ├── __init__.py ├── data.py ├── eval_test │ ├── __init__.py │ ├── _creative_special_oracle.py │ ├── _difficult_special_oracle.py │ ├── _he_special_oracle.py │ └── _subtle_special_oracle.py ├── evaluate.py └── util │ └── api_request.py ├── pyproject.toml ├── requirements.txt ├── resources ├── butterfly_dark.png └── example.gif ├── setup.cfg └── tool └── sanitize.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | # nuclear option because steven uses PyCharm. 161 | .idea/ 162 | 163 | # VSCode 164 | .vscode/ 165 | *.jsonl 166 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | # nuclear option because steven uses PyCharm. 161 | .idea/ 162 | 163 | 164 | evoeval/_version.py 165 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pycqa/isort 3 | rev: 5.12.0 4 | hooks: 5 | - id: isort 6 | name: isort (python) 7 | args: ["--profile", "black"] 8 | - repo: https://github.com/psf/black 9 | rev: 22.6.0 10 | hooks: 11 | - id: black 12 | - repo: https://github.com/pre-commit/pre-commit-hooks 13 | rev: v4.3.0 14 | hooks: 15 | - id: check-yaml 16 | - id: end-of-file-fixer 17 | - id: trailing-whitespace 18 | exclude: (?x)^( 19 | resources/.*| 20 | README.* 21 | )$ 22 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # base env: py39 ubuntu20.04 2 | # 3.9 is needed for typing related stuff 3 | FROM python:3.9-slim-buster 4 | 5 | # install git 6 | RUN apt-get update && apt-get install -y git 7 | 8 | # upgrade to latest pip 9 | RUN pip install --upgrade pip 10 | 11 | COPY . /evoeval 12 | 13 | RUN cd /evoeval && ls -l && pip install . 14 | 15 | WORKDIR /app 16 | 17 | ENTRYPOINT ["python3", "-m", "evoeval.evaluate"] 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 
9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 
180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EvoEval: Evolving Coding Benchmarks via LLM 2 | 3 |

4 | 5 | 6 | 7 | 8 | 9 |

10 | 11 |

12 | ⚡Quick Start | 13 | 🔠Benchmarks | 14 | 🤖LLM Generated Code | 15 | 📝Citation | 16 | 🙏Acknowledgement 17 |

18 | 19 | ## About 20 | 21 | **EvoEval**¹ is a holistic benchmark suite created by _evolving_ **HumanEval** problems: 22 | - 🔥 Contains **828** new problems across **5** 🌠 semantic-altering and **2** ⭐ semantic-preserving benchmarks 23 | - 🔮 Allows evaluation/comparison across different **dimensions** and problem **types** (e.g., _Difficult_, _Creative_ or _Tool Use_ problems). See our [**visualization tool**](https://evo-eval.github.io/visualization.html) for ready-to-use comparisons 24 | - 🏆 Complete with [**leaderboard**](https://evo-eval.github.io/leaderboard.html), **groundtruth solutions**, **robust testcases** and **evaluation scripts** to easily fit into your evaluation pipeline 25 | - 🤖 Generated LLM code samples from **>50** different models to save you time in running experiments 26 | 27 | ¹ coincidentally similar in pronunciation to 😈 EvilEval 28 | 29 |

30 | 31 |

32 | 33 | Check out our 📃 [paper](https://arxiv.org/abs/2403.19114) and [webpage](https://evo-eval.github.io) for more details! 34 | 35 | 36 | 37 | ## ⚡ Quick Start 38 | 39 | Directly install the package: 40 | 41 | ```bash 42 | pip install evoeval --upgrade 43 | ``` 44 | 45 |
⏬ Nightly Version 46 |
47 | 48 | ```bash 49 | pip install "git+https://github.com/evo-eval/evoeval.git" --upgrade 50 | ``` 51 | 52 |
53 |
54 | 55 |
⏬ Local Repository 56 |
57 | 58 | ```bash 59 | git clone https://github.com/evo-eval/evoeval.git 60 | cd evoeval 61 | export PYTHONPATH=$PYTHONPATH:$(pwd) 62 | pip install -r requirements.txt 63 | ``` 64 | 65 |
66 |
67 | 68 | Now you are ready to download EvoEval benchmarks and perform evaluation! 69 | 70 | ### 🧑‍💻 Code generation 71 | 72 | To download our benchmarks, simply use the following code snippet: 73 | 74 | ```python 75 | from evoeval.data import get_evo_eval 76 | 77 | evoeval_benchmark = "EvoEval_difficult" # you can pick from 7 different benchmarks! 78 | 79 | problems = get_evo_eval(evoeval_benchmark) 80 | ``` 81 | 82 | For code generation and evaluation, we adopt the same style as [HumanEval+](https://github.com/evalplus/evalplus) and [HumanEval](https://github.com/openai/human-eval). 83 | 84 | Implement the `GEN_SOLUTION` function by calling the LLM to produce the complete solution (including the function header + code) and save the samples to `{benchmark}_samples.jsonl`: 85 | 86 | ```python 87 | from evoeval.data import get_evo_eval, write_jsonl 88 | 89 | evoeval_benchmark = "EvoEval_difficult" 90 | 91 | samples = [ 92 | dict(task_id=task_id, solution=GEN_SOLUTION(problem["prompt"])) 93 | for task_id, problem in get_evo_eval(evoeval_benchmark).items() 94 | ] 95 | write_jsonl(f"{evoeval_benchmark}_samples.jsonl", samples) 96 | ``` 97 | 98 | > [!TIP] 99 | > 100 | > EvoEval `samples.jsonl` expects the solution field to contain the **complete** code implementation; this is 101 | > slightly different from the original HumanEval, where the solution field only contains the function body. 102 | > 103 | > If you want to follow the HumanEval setup exactly, check out our 🤗 Huggingface [datasets](https://huggingface.co/evoeval), which can be directly run with the 104 | > HumanEval evaluation [script](https://huggingface.co/evoeval). 105 | 106 | ### 🕵️ Evaluation 107 | 108 | You can use our provided [docker](https://docs.docker.com/get-docker/) image: 109 | 110 | ```bash 111 | docker run --rm -v $(pwd):/app evoeval/evoeval:latest --dataset EvoEval_difficult --samples EvoEval_difficult_samples.jsonl 112 | ``` 113 | 114 | Or run it locally: 115 | 116 | ```bash 117 | evoeval.evaluate --dataset EvoEval_difficult --samples EvoEval_difficult_samples.jsonl 118 | ``` 119 | 120 | Or if you are using it as a local repository: 121 | 122 | ```bash 123 | export PYTHONPATH=$PYTHONPATH:$(pwd) 124 | python evoeval/evaluate.py --dataset EvoEval_difficult --samples EvoEval_difficult_samples.jsonl 125 | ``` 126 | 127 | You should expect to see the following output (when evaluated on GPT-4): 128 | ``` 129 | Computing expected output... 130 | Expected outputs computed in 11.24s 131 | Reading samples... 132 | 100it [00:00, 164.16it/s] 133 | 100%|████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 12.77it/s] 134 | EvoEval_difficult 135 | pass@1: 0.520 # for reference GPT-4 solves more than 80% of problems in HumanEval 136 | ``` 137 | This shows the pass@1 score for the EvoEval_difficult benchmark. You can use `--i-just-wanna-run` to recompute the evaluation result. 138 | 139 | > [!Note] 140 | > 141 | > You can also evaluate the LLM solutions in a folder format, with each subfolder containing 142 | > the LLM solution for each problem in the benchmark. 143 | > 144 | > For example, you can grab the [GPT-4 solutions](https://github.com/evo-eval/evoeval/releases/download/v0.1.0/gpt-4_temp_0.0.zip) in our [v0.1.0 release](https://github.com/evo-eval/evoeval/releases/tag/v0.1.0).
145 | > After unzipping, you can run the following command: 146 | > 147 | > ```bash 148 | > evoeval.evaluate --dataset EvoEval_difficult --samples gpt-4_temp_0.0/EvoEval_difficult 149 | > ``` 150 | > 151 | > to obtain the same result as above with the `.jsonl` samples. 152 | 153 | 154 | ## 🔠 Benchmarks 155 | 156 | **EvoEval** contains **7** different benchmarks, each with a unique set of problems 157 | evolved from the original **HumanEval** problems. 🌠 denotes semantic-altering benchmarks, 158 | while ⭐ denotes semantic-preserving benchmarks: 159 | 160 |
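As a quick orientation, here is a minimal sketch (an illustration only, assuming the dataset identifiers passed to `get_evo_eval` match the `EvoEval_*` names described below) that loads every benchmark and prints how many problems it contains:

```python
from evoeval.data import get_evo_eval

# The 7 EvoEval benchmarks (5 semantic-altering, 2 semantic-preserving).
BENCHMARKS = [
    "EvoEval_difficult",
    "EvoEval_creative",
    "EvoEval_subtle",
    "EvoEval_combine",
    "EvoEval_tool_use",
    "EvoEval_verbose",
    "EvoEval_concise",
]

for name in BENCHMARKS:
    problems = get_evo_eval(name)  # dict mapping task_id -> problem fields
    print(f"{name}: {len(problems)} problems")
```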
🌠EvoEval_difficult: 161 |
162 | 163 | > Introduce complexity by adding additional constraints and requirements, 164 | > replacing commonly used requirements with less common ones, or adding additional reasoning 165 | > steps to the original problem. 166 |
167 |
168 | 169 |
🌠EvoEval_creative: 170 |
171 | 172 | > Generate a more creative problem compared to the original through the use 173 | > of stories or uncommon narratives. 174 |
175 |
176 | 177 | 178 |
🌠EvoEval_subtle: 179 |
180 | 181 | > Make a subtle and minor change to the original problem such as inverting or 182 | > replacing a requirement. 183 |
184 |
185 | 186 | 187 |
🌠EvoEval_combine: 188 |
189 | 190 | > Combine two different problems by integrating the concepts from both problems. To select problems that make sense to combine, we apply a simple heuristic 191 | > that only combines problems of the same type, categorized based on the type of 192 | > input arguments in the original problem. 193 |
194 |
195 | 196 |
🌠EvoEval_tool_use: 197 |
198 | 199 | > Produce a new problem containing a main problem and one or more helper 200 | > functions which can be used to solve it. Each helper function is fully implemented and 201 | > provides hints or useful functionality for solving the main problem. The main problem 202 | > does not explicitly reference individual helper functions, and we do not require the model 203 | > to use the provided helpers. 204 |
205 |
206 | 207 | 208 |
⭐EvoEval_verbose: 209 |
210 | 211 | > Reword the original docstring to be more verbose. These verbose docstrings 212 | > can use more descriptive language to illustrate the problem, include detailed explanation 213 | > of the example output, and provide additional hints. 214 |
215 |
216 | 217 |
⭐EvoEval_concise: 218 |
219 | 220 | > Reword the original docstring to be more concise by removing unnecessary 221 | > details and using concise language. Furthermore, simple examples that are not required 222 | > to demonstrate edge cases may be removed. 223 | 224 |
225 |
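Coming back to the `GEN_SOLUTION` placeholder from the Code generation section above: it can be backed by any model you want to benchmark. Below is a minimal sketch assuming an OpenAI-style chat client; the model name, prompt wording, and code-fence stripping are illustrative assumptions, not part of EvoEval:

```python
import re

from openai import OpenAI  # assumption: benchmarking an OpenAI-hosted chat model

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def GEN_SOLUTION(prompt: str) -> str:
    """Return a complete implementation (function header + body) for one EvoEval prompt."""
    resp = client.chat.completions.create(
        model="gpt-4",  # illustrative model choice
        temperature=0.0,
        messages=[
            {
                "role": "user",
                "content": f"Please complete the following code snippet.\n```\n{prompt}\n```",
            }
        ],
    )
    text = resp.choices[0].message.content
    # Keep only the first fenced code block if the model wrapped its answer in one.
    match = re.search(r"```(?:python)?\n(.*?)```", text, re.DOTALL)
    return match.group(1) if match else text
```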
226 | 227 | For each problem in each **EvoEval** benchmark, we include the complete groundtruth as well as test cases for functional evaluation. 228 | 229 | > [!Note] 230 | > 231 | > **Problem Structure** 232 | > 233 | > ```json 234 | > { 235 | > "task_id": "identifier string for the task", 236 | > "entry_point": "name of the function", 237 | > "prompt": "function signature with docstring", 238 | > "canonical_solution": "groundtruth implementation", 239 | > "inputs": "test inputs for each problem", 240 | > "parent": "original HumanEval problem it evolved from", 241 | > "main": "special field of EvoEval_tool_use to show just the main problem description", 242 | > "helpers": "special field of EvoEval_tool_use to show the helper functions" 243 | > } 244 | > ``` 245 | 246 | ## 🤖 LLM Generated Code 247 | 248 | To view the performance of **>50** LLMs on the EvoEval benchmarks, 249 | we provide a complete [leaderboard](https://evo-eval.github.io/leaderboard.html) as well as a 250 | [visualization tool](https://evo-eval.github.io/visualization.html) to compare the performance of different models. 251 | 252 | Further, we also provide all code samples from LLMs on the **EvoEval** benchmarks: 253 | 254 | * See the attachment of our [v0.1.0 release](https://github.com/evo-eval/evoeval/releases/tag/v0.1.0). 255 | 256 | Each LLM generation is packaged in a zip file named like `{model_name}_temp_0.0.zip`. You can unzip the folder and obtain the 257 | LLM generation for each of our 7 benchmarks + the original HumanEval problems. Note that we only evaluate the greedy output for each LLM. 258 | 259 | ## 📝 Citation 260 | 261 | ```bibtex 262 | @article{evoeval, 263 | author = {Xia, Chunqiu Steven and Deng, Yinlin and Zhang, Lingming}, 264 | title = {Top Leaderboard Ranking = Top Coding Proficiency, Always? 
EvoEval: Evolving Coding Benchmarks via LLM}, 265 | year = {2024}, 266 | journal = {arXiv preprint}, 267 | } 268 | ``` 269 | 270 | > [!Note] 271 | > 272 | > The first two authors contributed equally to this work, with author order determined via [_Nigiri_](https://senseis.xmp.net/?Nigiri) 273 | 274 | ## 🙏 Acknowledgement 275 | 276 | * [HumanEval](https://github.com/openai/human-eval) 277 | * We especially thank [EvalPlus](https://github.com/evalplus/evalplus) 278 | 279 | 280 | 281 | -------------------------------------------------------------------------------- /codegen/generate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from os import PathLike 4 | 5 | from evalplus.data import get_human_eval_plus 6 | from model import DecoderBase, make_model 7 | from rich.progress import ( 8 | BarColumn, 9 | MofNCompleteColumn, 10 | Progress, 11 | TextColumn, 12 | TimeElapsedColumn, 13 | ) 14 | 15 | from evoeval.data import get_evo_eval 16 | 17 | 18 | def construct_contract_prompt(prompt: str, contract_type: str, contract: str) -> str: 19 | if contract_type == "no": 20 | return prompt 21 | elif contract_type == "docstring": 22 | # embed within the docstring 23 | sep = "" 24 | if '"""' in prompt: 25 | sep = '"""' 26 | elif "'''" in prompt: 27 | sep = "'''" 28 | assert sep != "" 29 | l = prompt.split(sep) 30 | contract = "\n".join([x.split("#")[0] for x in contract.splitlines()]) 31 | l[1] = ( 32 | l[1] + contract + "\n" + " " * (len(contract) - len(contract.lstrip()) - 1) 33 | ) 34 | return sep.join(l) 35 | elif contract_type == "code": 36 | # at the beginning of the function 37 | contract = "\n".join([x.split("#")[0] for x in contract.splitlines()]) 38 | return prompt + contract 39 | 40 | 41 | def code_generate(args, workdir: PathLike, model: DecoderBase, id_range=None): 42 | with Progress( 43 | TextColumn( 44 | f"{args.dataset} •" + "[progress.percentage]{task.percentage:>3.0f}%" 45 | ), 46 | BarColumn(), 47 | MofNCompleteColumn(), 48 | TextColumn("•"), 49 | TimeElapsedColumn(), 50 | ) as p: 51 | if args.dataset == "humaneval": 52 | dataset = get_human_eval_plus() 53 | else: 54 | dataset = get_evo_eval(args.dataset) 55 | 56 | for task_id, task in p.track(dataset.items()): 57 | if id_range is not None: 58 | id_num = int(task_id.split("/")[1]) 59 | low, high = id_range 60 | if id_num < low or id_num >= high: 61 | p.console.print(f"Skipping {task_id} as it is not in {id_range}") 62 | continue 63 | 64 | p_name = task_id.replace("/", "_") 65 | if args.use_contracts != "no" and task["contract"] == "": 66 | continue 67 | os.makedirs(os.path.join(workdir, p_name), exist_ok=True) 68 | log = f"Codegen: {p_name} @ {model}" 69 | n_existing = 0 70 | if args.resume: 71 | # count existing .py files 72 | n_existing = len( 73 | [ 74 | f 75 | for f in os.listdir(os.path.join(workdir, p_name)) 76 | if f.endswith(".py") 77 | ] 78 | ) 79 | if n_existing > 0: 80 | log += f" (resuming from {n_existing})" 81 | 82 | nsamples = args.n_samples - n_existing 83 | p.console.print(log) 84 | 85 | sidx = args.n_samples - nsamples 86 | while sidx < args.n_samples: 87 | outputs = model.codegen( 88 | construct_contract_prompt( 89 | task["prompt"], args.use_contracts, task["contract"] 90 | ), 91 | do_sample=not args.greedy, 92 | num_samples=args.n_samples - sidx, 93 | ) 94 | assert outputs, "No outputs from model!" 
95 | for impl in outputs: 96 | try: 97 | with open( 98 | os.path.join(workdir, p_name, f"{sidx}.py"), 99 | "w", 100 | encoding="utf-8", 101 | ) as f: 102 | if model.conversational: 103 | f.write(impl) 104 | else: 105 | f.write(task["prompt"] + impl) 106 | except UnicodeEncodeError: 107 | continue 108 | sidx += 1 109 | 110 | 111 | def main(): 112 | parser = argparse.ArgumentParser() 113 | parser.add_argument("--model", required=True, type=str) 114 | parser.add_argument("--bs", required=True, type=int) 115 | parser.add_argument("--temperature", required=True, type=float) 116 | parser.add_argument("--dataset", default="evileval", type=str) 117 | parser.add_argument("--root", type=str, required=True) 118 | parser.add_argument("--n_samples", default=200, type=int) 119 | parser.add_argument("--resume", action="store_true") 120 | parser.add_argument("--use_contracts", default="no", type=str) 121 | parser.add_argument("--greedy", action="store_true") 122 | # id_range is list 123 | parser.add_argument("--id-range", default=None, nargs="+", type=int) 124 | args = parser.parse_args() 125 | 126 | # if args.dataset not in ["evileval", "humaneval"]: 127 | # raise NotImplementedError("Unsupported dataset: {}".format(args.dataset)) 128 | 129 | if args.use_contracts not in ["no", "code", "docstring"]: 130 | raise NotImplementedError( 131 | "Unsupported contract usage: {}".format(args.use_contracts) 132 | ) 133 | if args.greedy and (args.temperature != 0 or args.bs != 1 or args.n_samples != 1): 134 | raise ValueError( 135 | f"Greedy decoding is only supported with temperature({args.temperature}) = 0, batch_size({args.bs}) = 1" 136 | f" and n_samples({args.n_samples}) = 1" 137 | ) 138 | 139 | if args.id_range is not None: 140 | assert len(args.id_range) == 2, "id_range must be a list of length 2" 141 | assert args.id_range[0] < args.id_range[1], "id_range must be increasing" 142 | args.id_range = tuple(args.id_range) 143 | 144 | # Make project dir 145 | os.makedirs(args.root, exist_ok=True) 146 | # Make dataset dir 147 | os.makedirs(os.path.join(args.root, args.dataset), exist_ok=True) 148 | # Make dir for codes generated by each model 149 | args.model = args.model.lower() 150 | model = make_model( 151 | name=args.model, batch_size=args.bs, temperature=args.temperature 152 | ) 153 | workdir = os.path.join( 154 | args.root, 155 | args.dataset, 156 | args.model 157 | + f"_temp_{args.temperature}" 158 | + ("" if args.use_contracts == "no" else f"-contract-{args.use_contracts}"), 159 | ) 160 | os.makedirs(workdir, exist_ok=True) 161 | 162 | with open(os.path.join(workdir, "args.txt"), "w") as f: 163 | f.write(str(args)) 164 | 165 | code_generate(args, workdir=workdir, model=model, id_range=args.id_range) 166 | 167 | 168 | if __name__ == "__main__": 169 | main() 170 | -------------------------------------------------------------------------------- /codegen/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC, abstractmethod 3 | from typing import List 4 | from warnings import warn 5 | 6 | # Communism 7 | os.environ["HF_HOME"] = os.environ.get("HF_HOME", "/ColossalTitan/huggingface/") 8 | 9 | import anthropic 10 | import google.generativeai as genai 11 | import openai 12 | import torch 13 | from transformers import ( 14 | AutoModelForCausalLM, 15 | AutoModelForSeq2SeqLM, 16 | AutoTokenizer, 17 | StoppingCriteria, 18 | StoppingCriteriaList, 19 | ) 20 | from vllm import LLM, SamplingParams 21 | 22 | from evoeval.util.api_request import ( 23 
| create_anthropic_config, 24 | create_chatgpt_config, 25 | create_gemini_config, 26 | create_palm_config, 27 | num_tokens_from_messages, 28 | request_anthropic_engine, 29 | request_chatgpt_engine, 30 | request_gemini_engine, 31 | request_palm_engine, 32 | ) 33 | 34 | HUMANEVAL_EOS = ["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif"] 35 | NON_CODE_EOS = ["<|endoftext|>", "\n```", "\n", "<|endofmask|>"] 36 | EOS = HUMANEVAL_EOS + NON_CODE_EOS 37 | 38 | 39 | # Adopted from https://github.com/huggingface/transformers/pull/14897 40 | class EndOfFunctionCriteria(StoppingCriteria): 41 | def __init__(self, start_length, eos, tokenizer, *args, **kwargs): 42 | super().__init__(*args, **kwargs) 43 | self.start_length = start_length 44 | self.eos = eos 45 | self.tokenizer = tokenizer 46 | self.end_length = {} 47 | 48 | def __call__(self, input_ids, scores, **kwargs): 49 | """Returns true if all generated sequences contain any of the end-of-function strings.""" 50 | decoded_generations = self.tokenizer.batch_decode( 51 | input_ids[:, self.start_length :] 52 | ) 53 | done = [] 54 | for index, decoded_generation in enumerate(decoded_generations): 55 | finished = any( 56 | [stop_string in decoded_generation for stop_string in self.eos] 57 | ) 58 | if ( 59 | finished and index not in self.end_length 60 | ): # ensures first time we see it 61 | for stop_string in self.eos: 62 | if stop_string in decoded_generation: 63 | self.end_length[index] = len( 64 | input_ids[ 65 | index, # get length of actual generation 66 | self.start_length : -len( 67 | self.tokenizer.encode( 68 | stop_string, 69 | add_special_tokens=False, 70 | return_tensors="pt", 71 | )[0] 72 | ), 73 | ] 74 | ) 75 | done.append(finished) 76 | return all(done) 77 | 78 | 79 | class DecoderBase(ABC): 80 | def __init__( 81 | self, 82 | name: str, 83 | batch_size: int = 1, 84 | temperature: float = 0.8, 85 | max_new_tokens: int = 512, 86 | conversational: bool = False, 87 | body: bool = False, 88 | ) -> None: 89 | print("Initializing a decoder model: {} ...".format(name)) 90 | self.name = name 91 | self.batch_size = batch_size 92 | self.temperature = temperature 93 | self.eos = EOS 94 | self.skip_special_tokens = False 95 | self.max_new_tokens = max_new_tokens 96 | self.conversational = conversational 97 | self.body = body 98 | 99 | @abstractmethod 100 | def codegen( 101 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 102 | ) -> List[str]: 103 | pass 104 | 105 | def __repr__(self) -> str: 106 | return self.name 107 | 108 | def __str__(self) -> str: 109 | return self.name 110 | 111 | 112 | class VLlmDecoder(DecoderBase): 113 | def __init__( 114 | self, 115 | name: str, 116 | batch_size: int = 1, 117 | temperature: float = 0.8, 118 | max_new_tokens: int = 512, 119 | conversational: bool = False, 120 | ) -> None: 121 | super().__init__(name, batch_size, temperature, max_new_tokens, conversational) 122 | kwargs = {"tensor_parallel_size": int(os.getenv("VLLM_N_GPUS", "1"))} 123 | 124 | if "CodeLlama" in name: 125 | kwargs["dtype"] = "bfloat16" 126 | elif "code-millenials" in name: 127 | kwargs["dtype"] = "float16" 128 | elif "uukuguy/speechless-code-mistral-7b-v1.0" == name: 129 | kwargs["dtype"] = "float16" 130 | elif "uukuguy/speechless-codellama-34b-v2.0" == name: 131 | kwargs["dtype"] = "float16" 132 | elif "CodeBooga" in name: 133 | kwargs["dtype"] = "float16" 134 | elif "WizardCoder" in name and "V1.1" in name: 135 | kwargs["dtype"] = "bfloat16" 136 | elif "WizardCoder" in name: 137 | kwargs["dtype"] = "float16" 138 | elif 
"deepseek" in name: 139 | kwargs["dtype"] = "bfloat16" 140 | elif "mixtral" in name.lower(): 141 | kwargs["dtype"] = "bfloat16" 142 | elif "solar" in name: 143 | kwargs["dtype"] = "float16" 144 | elif "mistral" in name.lower(): 145 | kwargs["dtype"] = "bfloat16" 146 | elif "phi" in name.lower(): 147 | kwargs["dtype"] = "float16" 148 | kwargs["trust_remote_code"] = True 149 | elif "openchat" in name.lower(): 150 | kwargs["dtype"] = "bfloat16" 151 | 152 | # reset the eos 153 | self.eos = [] 154 | self.llm = LLM(model=name, max_model_len=2048, **kwargs) 155 | 156 | def codegen( 157 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 158 | ) -> List[str]: 159 | if do_sample: 160 | assert self.temperature > 0, "Temperature must be greater than 0!" 161 | batch_size = min(self.batch_size, num_samples) 162 | 163 | vllm_outputs = self.llm.generate( 164 | [prompt] * batch_size, 165 | SamplingParams( 166 | temperature=self.temperature, 167 | max_tokens=self.max_new_tokens 168 | + len(self.llm.get_tokenizer().encode(prompt, return_tensors="pt")[0]), 169 | top_p=0.95 if do_sample else 1.0, 170 | stop=self.eos, 171 | ), 172 | use_tqdm=False, 173 | ) 174 | 175 | gen_strs = [x.outputs[0].text.replace("\t", " ") for x in vllm_outputs] 176 | 177 | return gen_strs 178 | 179 | 180 | class CodeLlamaInstructSmall(VLlmDecoder): 181 | def __init__(self, name: str, **kwargs) -> None: 182 | kwargs["conversational"] = True 183 | super().__init__(name, **kwargs) 184 | self.eos += ["\n```"] 185 | 186 | def codegen( 187 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 188 | ) -> List[str]: 189 | if do_sample: 190 | assert self.temperature > 0, "Temperature must be greater than 0!" 191 | 192 | input = f"""[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```: 193 | ```python 194 | {prompt} 195 | ``` 196 | [/INST] 197 | ```python 198 | """ 199 | 200 | return VLlmDecoder.codegen(self, input, do_sample, num_samples) 201 | 202 | 203 | class Alpaca(VLlmDecoder): 204 | def __init__(self, name: str, **kwargs) -> None: 205 | kwargs["conversational"] = True 206 | super().__init__(name, **kwargs) 207 | self.eos += ["\n```"] 208 | 209 | def codegen( 210 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 211 | ) -> List[str]: 212 | prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes request. 213 | 214 | ### Instruction: 215 | Create a Python script for this problem: 216 | {prompt} 217 | 218 | ### Response: 219 | ```python 220 | """ 221 | return VLlmDecoder.codegen(self, prompt, do_sample, num_samples) 222 | 223 | 224 | class OpenChat(VLlmDecoder): 225 | def __init__(self, name: str, **kwargs) -> None: 226 | kwargs["conversational"] = True 227 | super().__init__(name, **kwargs) 228 | self.eos += ["\n```"] 229 | 230 | def codegen( 231 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 232 | ) -> List[str]: 233 | if do_sample: 234 | assert self.temperature > 0, "Temperature must be greater than 0!" 235 | 236 | input = f"""GPT4 Correct User: Can you complete the following Python function? 
237 | ```python 238 | {prompt} 239 | ``` 240 | <|end_of_turn|>GPT4 Correct Assistant: 241 | ```python 242 | """ 243 | return VLlmDecoder.codegen(self, input, do_sample, num_samples) 244 | 245 | 246 | class WizardCoderDecoder(VLlmDecoder): 247 | def codegen( 248 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 249 | ) -> List[str]: 250 | if do_sample: 251 | assert self.temperature > 0, "Temperature must be greater than 0!" 252 | 253 | prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request. 254 | 255 | 256 | ### Instruction: 257 | Create a Python script for this problem: 258 | {prompt} 259 | 260 | ### Response:""" 261 | 262 | batch_size = min(self.batch_size, num_samples) 263 | 264 | num_of_tokens = len( 265 | self.llm.get_tokenizer().encode(prompt, return_tensors="pt")[0] 266 | ) 267 | 268 | vllm_outputs = self.llm.generate( 269 | [prompt] * batch_size, 270 | SamplingParams( 271 | temperature=self.temperature, 272 | max_tokens=num_of_tokens + self.max_new_tokens, 273 | top_p=0.95 if do_sample else 1.0, 274 | ), 275 | use_tqdm=False, 276 | ) 277 | 278 | return [x.outputs[0].text.replace("\t", " ") for x in vllm_outputs] 279 | 280 | 281 | class XwinCoder(VLlmDecoder): 282 | def __init__(self, name: str, **kwargs) -> None: 283 | kwargs["conversational"] = True 284 | super().__init__(name, **kwargs) 285 | self.eos += ["\n```"] 286 | 287 | def codegen( 288 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 289 | ) -> List[str]: 290 | 291 | prompt = f""": You are an AI coding assistant that helps people with programming. Write a response that appropriately completes the user's request. 292 | : Complete the following code for me and return a fully runable code. 
293 | ```python 294 | {prompt} 295 | ``` 296 | : 297 | ```python 298 | """ 299 | return VLlmDecoder.codegen(self, prompt, do_sample, num_samples) 300 | 301 | 302 | class HFTorchDecoder(DecoderBase): 303 | def __init__(self, name: str, **kwargs): 304 | super().__init__(name=name, **kwargs) 305 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 306 | kwargs = { 307 | "trust_remote_code": name 308 | in { 309 | "bigcode/santacoder", 310 | "Salesforce/codegen2-1B", 311 | "Salesforce/codegen2-3_7B", 312 | "Salesforce/codegen2-7B", 313 | "Salesforce/codegen2-16B", 314 | "deepseek-ai/deepseek-coder-6.7b-base", 315 | "deepseek-ai/deepseek-coder-33b-base", 316 | "stabilityai/stable-code-3b", 317 | "Qwen/Qwen-14B-Chat", 318 | "Qwen/Qwen-7B-Chat", 319 | } 320 | } 321 | 322 | if "codegen-" in name: # use fp16 for codegen models 323 | kwargs["torch_dtype"] = torch.float16 324 | if "codegen2-" in name: # avoid warning of trust remote code 325 | kwargs["revision"] = "main" 326 | if "16b" in name.lower(): 327 | kwargs["device_map"] = "auto" 328 | if "starcoder2" in name: 329 | kwargs["device_map"] = "auto" 330 | if "starcoder" in name: 331 | kwargs["torch_dtype"] = torch.bfloat16 332 | if "CodeLlama" in name: 333 | if "34b" in name.lower() or "70b" in name.lower(): 334 | kwargs["device_map"] = "auto" 335 | kwargs["torch_dtype"] = torch.bfloat16 336 | self.skip_special_tokens = True 337 | if "CodeBooga" in name: 338 | kwargs["torch_dtype"] = torch.float16 339 | kwargs["device_map"] = "auto" 340 | self.skip_special_tokens = True 341 | if "Mistral-7B-codealpaca-lora" == name: 342 | kwargs["torch_dtype"] = torch.float16 343 | self.skip_special_tokens = True 344 | elif "Mistral" in name or "zephyr-7b-beta" in name: 345 | kwargs["torch_dtype"] = torch.bfloat16 346 | if "Mixtral" in name: 347 | kwargs["torch_dtype"] = torch.bfloat16 348 | kwargs["device_map"] = "auto" 349 | if "deepseek" in name: 350 | kwargs["torch_dtype"] = torch.bfloat16 351 | if "33b" in name.lower(): 352 | kwargs["device_map"] = "auto" 353 | self.skip_special_tokens = True 354 | if "/phi" in name: 355 | kwargs["torch_dtype"] = torch.float16 356 | kwargs["trust_remote_code"] = True 357 | self.skip_special_tokens = True 358 | if "Qwen" in name: 359 | kwargs["torch_dtype"] = torch.bfloat16 360 | self.skip_special_tokens = True 361 | if "72B" in name: 362 | kwargs["device_map"] = "auto" 363 | if "Phind" in name: 364 | kwargs["torch_dtype"] = torch.bfloat16 365 | kwargs["device_map"] = "auto" 366 | if "gemma" in name: 367 | kwargs["torch_dtype"] = torch.bfloat16 368 | if "Magicoder" in name: 369 | kwargs["torch_dtype"] = torch.bfloat16 370 | kwargs["device_map"] = "auto" 371 | 372 | print(f"{kwargs = }") 373 | 374 | self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs) 375 | self.model = AutoModelForCausalLM.from_pretrained(name, **kwargs) 376 | if name in {"StabilityAI/stablelm-base-alpha-7b"}: 377 | print("Switching to float16 ...") 378 | self.model = self.model.half() 379 | self.skip_special_tokens = True 380 | 381 | if "device_map" not in kwargs: 382 | self.model = self.model.to(self.device) 383 | 384 | @torch.inference_mode() 385 | def codegen( 386 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 387 | ) -> List[str]: 388 | if self.temperature == 0: 389 | assert not do_sample 390 | assert num_samples == 1 391 | 392 | input_tokens = self.tokenizer.encode(prompt, return_tensors="pt").to( 393 | self.device 394 | ) 395 | scores = StoppingCriteriaList( 396 | [ 397 | EndOfFunctionCriteria( 398 
| start_length=len(input_tokens[0]), 399 | eos=self.eos, 400 | tokenizer=self.tokenizer, 401 | ) 402 | ] 403 | ) 404 | kwargs = {} 405 | if do_sample: 406 | kwargs["top_p"] = 0.95 407 | kwargs["temperature"] = self.temperature 408 | 409 | raw_outputs = self.model.generate( 410 | input_tokens, 411 | max_new_tokens=self.max_new_tokens, 412 | stopping_criteria=scores, 413 | do_sample=do_sample, 414 | output_scores=True, 415 | return_dict_in_generate=True, 416 | num_return_sequences=min(self.batch_size, num_samples), 417 | pad_token_id=self.tokenizer.eos_token_id, 418 | **kwargs, 419 | ) # remove warning 420 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :] 421 | gen_strs = self.tokenizer.batch_decode( 422 | gen_seqs, skip_special_tokens=self.skip_special_tokens 423 | ) 424 | outputs = [] 425 | # removes eos tokens. 426 | for output in gen_strs: 427 | min_index = 10000 428 | for eos in self.eos: 429 | if eos in output: 430 | # could be multiple eos in outputs, better pick minimum one 431 | min_index = min(min_index, output.index(eos)) 432 | outputs.append(output[:min_index]) 433 | return outputs 434 | 435 | 436 | class CodeLlamaInstructLarge(HFTorchDecoder): 437 | def __init__(self, name: str, **kwargs) -> None: 438 | kwargs["conversational"] = True 439 | super().__init__(name, **kwargs) 440 | self.eos = ["\n```"] 441 | 442 | def codegen( 443 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 444 | ) -> List[str]: 445 | if do_sample: 446 | assert self.temperature > 0, "Temperature must be greater than 0!" 447 | 448 | input = f"""'Source: system 449 | 450 | You are a helpful and honest code assistant expert in Python. Please, provide all answers to programming questions in Python. 451 | Source: user 452 | 453 | Provide a self-contained Python script that solves the following problem: 454 | ```python 455 | {prompt} 456 | ``` 457 | Source: assistant 458 | 459 | Here is a Python script that solves the problem: 460 | ```python 461 | """ 462 | 463 | input_tokens = self.tokenizer.encode(input, return_tensors="pt").to(self.device) 464 | scores = StoppingCriteriaList( 465 | [ 466 | EndOfFunctionCriteria( 467 | start_length=len(input_tokens[0]), 468 | eos=self.eos, 469 | tokenizer=self.tokenizer, 470 | ) 471 | ] 472 | ) 473 | kwargs = {} 474 | if do_sample: 475 | kwargs["top_p"] = 0.95 476 | kwargs["temperature"] = self.temperature 477 | 478 | max_new_tokens = self.max_new_tokens + len( 479 | self.tokenizer.encode(prompt, return_tensors="pt")[0] 480 | ) 481 | 482 | raw_outputs = self.model.generate( 483 | input_tokens, 484 | max_new_tokens=max_new_tokens, 485 | stopping_criteria=scores, 486 | do_sample=do_sample, 487 | output_scores=True, 488 | return_dict_in_generate=True, 489 | num_return_sequences=min(self.batch_size, num_samples), 490 | pad_token_id=self.tokenizer.eos_token_id, 491 | **kwargs, 492 | ) # remove warning 493 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :] 494 | gen_strs = self.tokenizer.batch_decode( 495 | gen_seqs, skip_special_tokens=self.skip_special_tokens 496 | ) 497 | outputs = [] 498 | # removes eos tokens. 
499 | for output in gen_strs: 500 | min_index = 10000 501 | for eos in self.eos: 502 | if eos in output: 503 | # could be multiple eos in outputs, better pick minimum one 504 | min_index = min(min_index, output.index(eos)) 505 | outputs.append(output[:min_index]) 506 | return outputs 507 | 508 | 509 | class QwenInstruct(HFTorchDecoder): 510 | 511 | generation_template = "Please implement the following Python function in a markdown style code block:\n\n```python\n{prompt}\n```\n" 512 | incorrect_code_template = "```python\n{incorrect_solution}\n```\n" 513 | feedback_template = "{feedback}" 514 | 515 | @torch.inference_mode() 516 | def codegen( 517 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 518 | ) -> List[str]: 519 | if self.temperature == 0: 520 | assert not do_sample 521 | assert num_samples == 1 522 | content = self.generation_template.format(prompt=prompt) 523 | 524 | input_tokens = self.tokenizer.apply_chat_template( 525 | [ 526 | { 527 | "role": "user", 528 | "content": content, 529 | } 530 | ], 531 | add_generation_prompt=True, 532 | return_tensors="pt", 533 | ).to(self.device) 534 | 535 | max_token = len(input_tokens[0]) + self.max_new_tokens 536 | 537 | kwargs = {} 538 | if do_sample: 539 | kwargs["top_p"] = 0.95 540 | kwargs["temperature"] = self.temperature 541 | 542 | raw_outputs = self.model.generate( 543 | input_tokens, 544 | max_new_tokens=max_token, 545 | do_sample=do_sample, 546 | output_scores=True, 547 | return_dict_in_generate=True, 548 | top_k=50, 549 | num_return_sequences=min(self.batch_size, num_samples), 550 | pad_token_id=self.tokenizer.eos_token_id, 551 | **kwargs, 552 | ) # remove warning 553 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :] 554 | gen_strs = self.tokenizer.batch_decode( 555 | gen_seqs, skip_special_tokens=self.skip_special_tokens 556 | ) 557 | return gen_strs 558 | 559 | 560 | class DeepSeekInstruct(HFTorchDecoder): 561 | 562 | generation_template = "Please implement the following Python function in a markdown style code block:\n\n```python\n{prompt}\n```\n" 563 | incorrect_code_template = "```python\n{incorrect_solution}\n```\n" 564 | feedback_template = "{feedback}" 565 | 566 | @torch.inference_mode() 567 | def codegen( 568 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 569 | ) -> List[str]: 570 | if self.temperature == 0: 571 | assert not do_sample 572 | assert num_samples == 1 573 | content = self.generation_template.format(prompt=prompt) 574 | 575 | input_tokens = self.tokenizer.apply_chat_template( 576 | [ 577 | { 578 | "role": "user", 579 | "content": content, 580 | } 581 | ], 582 | add_generation_prompt=True, 583 | return_tensors="pt", 584 | ).to(self.device) 585 | 586 | # set instruction model to have more max_tokens TODO: for all models 587 | max_token = len(input_tokens[0]) + self.max_new_tokens 588 | 589 | kwargs = {} 590 | if do_sample: 591 | kwargs["top_p"] = 0.95 592 | kwargs["temperature"] = self.temperature 593 | 594 | raw_outputs = self.model.generate( 595 | input_tokens, 596 | max_new_tokens=max_token, 597 | do_sample=do_sample, 598 | output_scores=True, 599 | return_dict_in_generate=True, 600 | top_k=50, 601 | num_return_sequences=min(self.batch_size, num_samples), 602 | pad_token_id=self.tokenizer.eos_token_id, 603 | eos_token_id=32021, 604 | **kwargs, 605 | ) # remove warning 606 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :] 607 | gen_strs = self.tokenizer.batch_decode( 608 | gen_seqs, skip_special_tokens=self.skip_special_tokens 609 | ) 610 | return 
gen_strs 611 | # return [x.split("```python")[-1].split("```")[0] for x in gen_strs] 612 | 613 | 614 | class MistralInstruct(DeepSeekInstruct): 615 | pass # just use the same as DeepSeekInstruct 616 | 617 | 618 | class MixtralSPMXInstruct(DeepSeekInstruct): 619 | pass # just use the same as DeepSeekInstruct 620 | 621 | 622 | class GemmaInstruct(QwenInstruct): 623 | pass # just use the same as QwenInstruct 624 | 625 | 626 | class MagicCoderInstruct(DeepSeekInstruct): 627 | 628 | generation_template = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n@@ Instruction\nWrite a solution to the following problem:\n```python\n{prompt}\n```\n\n@@ Response\n""" 629 | 630 | @torch.inference_mode() 631 | def codegen( 632 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 633 | ) -> List[str]: 634 | if self.temperature == 0: 635 | assert not do_sample 636 | assert num_samples == 1 637 | content = self.generation_template.format(prompt=prompt) 638 | 639 | input_tokens = self.tokenizer.encode(content, return_tensors="pt").to( 640 | self.device 641 | ) 642 | 643 | max_token = len(input_tokens[0]) + self.max_new_tokens 644 | 645 | kwargs = {} 646 | if do_sample: 647 | kwargs["top_p"] = 0.95 648 | kwargs["temperature"] = self.temperature 649 | 650 | raw_outputs = self.model.generate( 651 | input_tokens, 652 | max_new_tokens=max_token, 653 | do_sample=do_sample, 654 | output_scores=True, 655 | return_dict_in_generate=True, 656 | top_k=50, 657 | num_return_sequences=min(self.batch_size, num_samples), 658 | pad_token_id=self.tokenizer.eos_token_id, 659 | eos_token_id=self.tokenizer.eos_token_id, 660 | **kwargs, 661 | ) # remove warning 662 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :] 663 | gen_strs = self.tokenizer.batch_decode( 664 | gen_seqs, skip_special_tokens=self.skip_special_tokens 665 | ) 666 | return gen_strs 667 | 668 | 669 | class AnthropicDecoder(DecoderBase): 670 | generation_template = ( 671 | "Please complete the following code snippet.\n```\n{prompt}\n```" 672 | ) 673 | 674 | def __init__(self, name: str, model_name: str = "gpt-3.5-turbo", **kwargs) -> None: 675 | super().__init__(name, **kwargs) 676 | self.model_name = model_name 677 | self.client = anthropic.Anthropic( 678 | api_key=os.getenv("ANTHROPIC_API_KEY", "dummy") 679 | ) 680 | 681 | def _anthrophic_parse(self, ret, prompt, body=False): 682 | outputs = [] 683 | for returns in ret.content: 684 | raw_o = returns.text 685 | outputs.append(raw_o) 686 | return outputs 687 | 688 | def codegen( 689 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 690 | ) -> List[str]: 691 | if do_sample: 692 | assert self.temperature > 0, "Temperature must be positive for sampling" 693 | 694 | batch_size = min(self.batch_size, num_samples) 695 | assert batch_size <= 20, "Use larger batch size could blow up the memory!" 
696 | 697 | message = self.generation_template.format(prompt=prompt.strip()) 698 | 699 | # estimation 700 | num_tokens = num_tokens_from_messages(message, self.model_name) 701 | 702 | config = create_anthropic_config( 703 | message=message, 704 | max_tokens=num_tokens + self.max_new_tokens, 705 | temperature=self.temperature, 706 | batch_size=batch_size, 707 | model=self.model_name, 708 | ) 709 | ret = request_anthropic_engine(self.client, config) 710 | return self._anthrophic_parse(ret, prompt.strip(), body=self.body) 711 | 712 | 713 | class PalmDecoder(DecoderBase): 714 | generation_template = ( 715 | "Please complete the following code snippet.\n```\n{prompt}\n```" 716 | ) 717 | 718 | def __init__(self, name: str, model_name: str = "palm", **kwargs) -> None: 719 | super().__init__(name, **kwargs) 720 | genai.configure(api_key=os.getenv("GOOGLE_API_KEY", "dummy")) 721 | self.model_name = model_name 722 | 723 | def _palm_parse(self, ret, prompt): 724 | outputs = [] 725 | raw_o = ret.result 726 | outputs.append(raw_o) 727 | return outputs 728 | 729 | def codegen( 730 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 731 | ) -> List[str]: 732 | if do_sample: 733 | assert self.temperature > 0, "Temperature must be positive for sampling" 734 | 735 | batch_size = min(self.batch_size, num_samples) 736 | assert batch_size <= 20, "Use larger batch size could blow up the memory!" 737 | 738 | message = self.generation_template.format(prompt=prompt.strip()) 739 | 740 | # approximate ge 741 | num_tokens = num_tokens_from_messages(message, self.model_name) 742 | 743 | config = create_palm_config( 744 | message=message, 745 | max_tokens=num_tokens + self.max_new_tokens, 746 | temperature=self.temperature, 747 | batch_size=batch_size, 748 | model=self.model_name, 749 | ) 750 | ret = request_palm_engine(genai, config) 751 | # if "gpt-3.5" in self.model_name: 752 | return self._palm_parse(ret, prompt.strip()) 753 | 754 | 755 | class GeminiChatDecoder(DecoderBase): 756 | generation_template = ( 757 | "Please complete the following code snippet.\n```\n{prompt}\n```" 758 | ) 759 | 760 | def __init__( 761 | self, name: str, model_name: str = "models/gemini-pro", **kwargs 762 | ) -> None: 763 | super().__init__(name, **kwargs) 764 | self.model_name = model_name 765 | genai.configure(api_key=os.getenv("GOOGLE_API_KEY", "dummy")) 766 | self.model = genai.GenerativeModel(self.model_name) 767 | 768 | @staticmethod 769 | def _find_gen_func_sig(prompt): 770 | func_sig = "" 771 | for x in prompt.splitlines(): 772 | if x.startswith("def ") and x.endswith(":"): 773 | # always pick the last one, since there could pre-defined functions. 774 | func_sig = x 775 | return func_sig 776 | 777 | @staticmethod 778 | def _remove_eos(gen): 779 | min_index = 100000000 780 | for eos in EOS: 781 | if eos in gen: 782 | min_index = min(min_index, gen.index(eos)) 783 | return gen[:min_index] 784 | 785 | def _gemini_parse(self, ret, prompt): 786 | outputs = [] 787 | raw_o = ret.text 788 | outputs.append(raw_o) 789 | return outputs 790 | 791 | def codegen( 792 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 793 | ) -> List[str]: 794 | if do_sample: 795 | assert self.temperature > 0, "Temperature must be positive for sampling" 796 | 797 | batch_size = min(self.batch_size, num_samples) 798 | assert batch_size <= 20, "Use larger batch size could blow up the memory!" 
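        # The request below goes through create_gemini_config (a GenerationConfig
        # with candidate_count=batch_size) and request_gemini_engine, which applies
        # permissive safety settings and gradually raises the temperature when the
        # response keeps getting blocked; _gemini_parse then returns only ret.text.
        # The _find_gen_func_sig and _remove_eos helpers above are not used on this
        # code path.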
799 | 800 | message = self.generation_template.format(prompt=prompt.strip()) 801 | 802 | # approximate ge 803 | num_tokens = num_tokens_from_messages(message, self.model_name) 804 | 805 | config = create_gemini_config( 806 | max_tokens=num_tokens + self.max_new_tokens, 807 | temperature=self.temperature, 808 | batch_size=batch_size, 809 | ) 810 | ret = request_gemini_engine(self.model, message, config) 811 | # if "gpt-3.5" in self.model_name: 812 | return self._gemini_parse(ret, prompt.strip()) 813 | 814 | 815 | class OpenAIChatDecoder(DecoderBase): 816 | generation_template = ( 817 | "Please complete the following code snippet.\n```\n{prompt}\n```" 818 | ) 819 | 820 | def __init__(self, name: str, model_name: str = "gpt-3.5-turbo", **kwargs) -> None: 821 | super().__init__(name, **kwargs) 822 | self.model_name = model_name 823 | openai.api_key = os.environ.get("OPENAI_API_KEY", "dummy") 824 | 825 | def codegen( 826 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 827 | ) -> List[str]: 828 | if do_sample: 829 | assert self.temperature > 0, "Temperature must be positive for sampling" 830 | 831 | batch_size = min(self.batch_size, num_samples) 832 | assert batch_size <= 20, "Use larger batch size could blow up the memory!" 833 | 834 | # construct prompt 835 | # if "gpt-3.5" in self.model_name: # chatgpt 836 | message = self.generation_template.format(prompt=prompt.strip()) 837 | 838 | num_tokens = num_tokens_from_messages(message, self.model_name) 839 | 840 | config = create_chatgpt_config( 841 | message=message, 842 | max_tokens=num_tokens + self.max_new_tokens, 843 | temperature=self.temperature, 844 | batch_size=batch_size, 845 | model=self.model_name, 846 | ) 847 | ret = request_chatgpt_engine(config) 848 | outputs = [] 849 | for returns in ret.choices: 850 | outputs.append(returns.message.content) 851 | return outputs 852 | 853 | 854 | class StarCoder2(HFTorchDecoder): 855 | def codegen( 856 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 857 | ) -> List[str]: 858 | prompt = prompt.strip() # starcoder2 needs this, bad 859 | return HFTorchDecoder.codegen(self, prompt, do_sample, num_samples) 860 | 861 | 862 | class StarCoderInfill(HFTorchDecoder): 863 | def __init__(self, name: str, **kwargs) -> None: 864 | super().__init__(name, **kwargs) 865 | self.prefix_token = "" 866 | self.suffix_token = "" 867 | 868 | def codegen( 869 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 870 | ) -> List[str]: 871 | if self.temperature == 0: 872 | assert not do_sample 873 | assert num_samples == 1 874 | 875 | input = self.prefix_token + prompt + self.suffix_token 876 | input_tokens = self.tokenizer.encode(input, return_tensors="pt").to(self.device) 877 | scores = StoppingCriteriaList( 878 | [ 879 | EndOfFunctionCriteria( 880 | start_length=len(input_tokens[0]), 881 | eos=self.eos, 882 | tokenizer=self.tokenizer, 883 | ) 884 | ] 885 | ) 886 | temperature = max(self.temperature, 1e-2) 887 | raw_outputs = self.model.generate( 888 | input_tokens, 889 | max_new_tokens=self.max_new_tokens, 890 | stopping_criteria=scores, 891 | do_sample=do_sample, 892 | top_p=0.95, 893 | top_k=None, 894 | temperature=temperature, 895 | num_return_sequences=min(self.batch_size, num_samples), 896 | output_scores=True, 897 | return_dict_in_generate=True, 898 | repetition_penalty=1.0, 899 | pad_token_id=self.tokenizer.eos_token_id, 900 | ) 901 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :] 902 | gen_strs = self.tokenizer.batch_decode( 903 | gen_seqs, 
skip_special_tokens=self.skip_special_tokens 904 | ) 905 | outputs = [] 906 | # removes eos tokens. 907 | for output in gen_strs: 908 | min_index = 10000 909 | for eos in self.eos: 910 | if eos in output: 911 | min_index = min(min_index, output.index(eos)) 912 | outputs.append(output[:min_index]) 913 | return outputs 914 | 915 | 916 | def make_model(name: str, batch_size: int = 1, temperature: float = 0.8): 917 | if name == "claude-3": 918 | return AnthropicDecoder( 919 | batch_size=batch_size, 920 | name="claude", 921 | temperature=temperature, 922 | model_name="claude-3-opus-20240229", 923 | conversational=True, 924 | ) 925 | elif name == "claude-3-haiku": # cheaper model 926 | return AnthropicDecoder( 927 | batch_size=batch_size, 928 | name="claude", 929 | temperature=temperature, 930 | model_name="claude-3-haiku-20240307", 931 | conversational=True, 932 | ) 933 | elif name == "claude-2": 934 | return AnthropicDecoder( 935 | batch_size=batch_size, 936 | name="claude", 937 | temperature=temperature, 938 | model_name="claude-2.1", 939 | conversational=True, 940 | ) 941 | elif name == "gemini-pro": 942 | return GeminiChatDecoder( 943 | batch_size=batch_size, 944 | name="gemini-pro", 945 | temperature=temperature, 946 | model_name="models/gemini-pro", 947 | conversational=True, 948 | ) 949 | elif name == "palm": 950 | return PalmDecoder( 951 | batch_size=batch_size, 952 | name="palm", 953 | temperature=temperature, 954 | model_name="models/text-bison-001", 955 | conversational=True, 956 | ) 957 | elif name == "chatgpt": 958 | return OpenAIChatDecoder( 959 | batch_size=batch_size, 960 | name="ChatGPT", 961 | temperature=temperature, 962 | model_name="gpt-3.5-turbo", 963 | conversational=True, 964 | ) 965 | elif name == "gpt-4-turbo": 966 | return OpenAIChatDecoder( 967 | batch_size=batch_size, 968 | name="GPT4", 969 | temperature=temperature, 970 | model_name="gpt-4-turbo-preview", 971 | conversational=True, 972 | ) 973 | elif name in ["gpt-4", "gpt-4-1106-preview"]: 974 | return OpenAIChatDecoder( 975 | batch_size=batch_size, 976 | name="GPT4", 977 | temperature=temperature, 978 | model_name=name, 979 | conversational=True, 980 | ) 981 | elif name.startswith("starcoder2"): 982 | import re 983 | 984 | pattern = re.compile(r"starcoder2-(\d+)b") 985 | matches = pattern.findall(name) 986 | nb = int(matches[0]) 987 | assert float(nb) > 0 988 | return StarCoder2( 989 | batch_size=batch_size, 990 | name=f"bigcode/{name}", 991 | temperature=temperature, 992 | ) 993 | elif name.startswith("starcoder"): 994 | return StarCoderInfill( 995 | batch_size=batch_size, name=f"bigcode/{name}", temperature=temperature 996 | ) 997 | elif name.startswith("code-llama-"): 998 | import re 999 | 1000 | pattern = re.compile(r"code-llama-(\d+\.?\d*)b(.*)") 1001 | matches = pattern.findall(name)[0] 1002 | nb = matches[0] 1003 | assert float(nb) > 0 1004 | 1005 | if "instruct" in name: 1006 | if float(nb) < 69: # nice 1007 | return CodeLlamaInstructSmall( 1008 | batch_size=batch_size, 1009 | name=f"codellama/CodeLlama-{nb}b-Instruct-hf", 1010 | temperature=temperature, 1011 | ) 1012 | else: 1013 | return CodeLlamaInstructLarge( 1014 | batch_size=batch_size, 1015 | name=f"codellama/CodeLlama-{nb}b-Instruct-hf", 1016 | temperature=temperature, 1017 | ) 1018 | elif "python" in name: 1019 | return HFTorchDecoder( 1020 | batch_size=batch_size, 1021 | name=f"codellama/CodeLlama-{nb}b-Python-hf", 1022 | temperature=temperature, 1023 | ) 1024 | else: 1025 | return VLlmDecoder( 1026 | batch_size=batch_size, 1027 | 
name=f"codellama/CodeLlama-{nb}b-hf", 1028 | temperature=temperature, 1029 | ) 1030 | elif name.startswith("deepseek-coder"): 1031 | import re 1032 | 1033 | # format deepseek-coder-{nb}b* 1034 | pattern = re.compile(r"deepseek-coder-(\d+\.?\d*)b(.*)") 1035 | matches = pattern.findall(name)[0] 1036 | nb = matches[0] 1037 | assert float(nb) > 0 1038 | 1039 | if "instruct" in name: 1040 | return DeepSeekInstruct( 1041 | batch_size=batch_size, 1042 | name=f"deepseek-ai/{name}", 1043 | temperature=temperature, 1044 | conversational=True, 1045 | ) 1046 | else: 1047 | return HFTorchDecoder( 1048 | batch_size=batch_size, 1049 | name=f"deepseek-ai/deepseek-coder-{nb}b-base", 1050 | temperature=temperature, 1051 | ) 1052 | elif name == "magicoder-s-ds-6.7b": 1053 | return MagicCoderInstruct( 1054 | batch_size=batch_size, 1055 | name=f"ise-uiuc/Magicoder-S-DS-6.7B", 1056 | temperature=temperature, 1057 | conversational=True, 1058 | ) 1059 | elif name == "magicoder-s-cl-7b": 1060 | return MagicCoderInstruct( 1061 | batch_size=batch_size, 1062 | name=f"ise-uiuc/Magicoder-S-CL-7B", 1063 | temperature=temperature, 1064 | conversational=True, 1065 | ) 1066 | elif name.startswith("wizardcoder-34b"): 1067 | return WizardCoderDecoder( 1068 | batch_size=batch_size, 1069 | name=f"WizardLM/WizardCoder-Python-34B-V1.0", 1070 | temperature=temperature, 1071 | conversational=True, 1072 | ) 1073 | elif name.startswith("wizardcoder-33b-1.1"): 1074 | return WizardCoderDecoder( 1075 | batch_size=batch_size, 1076 | name=f"WizardLM/WizardCoder-33B-V1.1", 1077 | temperature=temperature, 1078 | conversational=True, 1079 | ) 1080 | elif name == "phind-code-llama-34b-v2": 1081 | return HFTorchDecoder( 1082 | batch_size=batch_size, 1083 | name="Phind/Phind-CodeLlama-34B-v2", 1084 | temperature=temperature, 1085 | ) 1086 | elif name.startswith("mistral-7b"): 1087 | if "instruct" in name: 1088 | if name.endswith("-v02"): 1089 | return MistralInstruct( 1090 | batch_size=batch_size, 1091 | name="mistralai/Mistral-7B-Instruct-v0.2", 1092 | temperature=temperature, 1093 | conversational=True, 1094 | ) 1095 | else: 1096 | return MistralInstruct( 1097 | batch_size=batch_size, 1098 | name="mistralai/Mistral-7B-Instruct-v0.1", 1099 | temperature=temperature, 1100 | conversational=True, 1101 | ) 1102 | else: 1103 | return HFTorchDecoder( 1104 | batch_size=batch_size, 1105 | name="mistralai/Mistral-7B-v0.1", 1106 | temperature=temperature, 1107 | ) 1108 | elif name.startswith("mixtral-8x7b"): 1109 | if "instruct" in name: 1110 | return MixtralSPMXInstruct( 1111 | batch_size=batch_size, 1112 | name="mistralai/Mixtral-8x7B-Instruct-v0.1", 1113 | temperature=temperature, 1114 | conversational=True, 1115 | ) 1116 | else: 1117 | return HFTorchDecoder( 1118 | batch_size=batch_size, 1119 | name="mistralai/Mixtral-8x7B-v0.1", 1120 | temperature=temperature, 1121 | ) 1122 | elif name == "stable-code-3b": 1123 | return HFTorchDecoder( 1124 | batch_size=batch_size, 1125 | name="stabilityai/stable-code-3b", 1126 | temperature=temperature, 1127 | ) 1128 | elif name == "speechless-codellama-34b": 1129 | return Alpaca( 1130 | batch_size=batch_size, 1131 | name="uukuguy/speechless-codellama-34b-v2.0", 1132 | temperature=temperature, 1133 | ) 1134 | elif name == "openchat": 1135 | return OpenChat( 1136 | batch_size=batch_size, 1137 | name="openchat/openchat-3.5-0106", 1138 | temperature=temperature, 1139 | ) 1140 | elif name.startswith("code-millenials-34b"): 1141 | return Alpaca( 1142 | batch_size=batch_size, 1143 | 
name="budecosystem/code-millenials-34b", 1144 | temperature=temperature, 1145 | conversational=True, 1146 | ) 1147 | elif name == "phi-2": 1148 | return VLlmDecoder( 1149 | batch_size=batch_size, 1150 | name="microsoft/phi-2", 1151 | temperature=temperature, 1152 | ) 1153 | elif name.startswith("qwen"): 1154 | # format deepseek-coder-{nb}b* 1155 | import re 1156 | 1157 | pattern = re.compile(r"qwen-(\d+\.?\d*)b(.*)") 1158 | matches = pattern.findall(name)[0] 1159 | nb = matches[0] 1160 | assert float(nb) > 0 1161 | 1162 | if "1.5" in name: 1163 | return QwenInstruct( 1164 | batch_size=batch_size, 1165 | name=f"Qwen/Qwen1.5-{nb}B-Chat", 1166 | temperature=temperature, 1167 | conversational=True, 1168 | ) 1169 | else: 1170 | return QwenInstruct( 1171 | batch_size=batch_size, 1172 | name=f"Qwen/Qwen-{nb}B-Chat", 1173 | temperature=temperature, 1174 | conversational=True, 1175 | ) 1176 | elif name.startswith("xwincoder-34b"): 1177 | return XwinCoder( 1178 | batch_size=batch_size, name="Xwin-LM/XwinCoder-34B", temperature=temperature 1179 | ) 1180 | elif name.startswith("gemma"): 1181 | import re 1182 | 1183 | pattern = re.compile(r"gemma-(\d+\.?\d*)b(.*)") 1184 | matches = pattern.findall(name)[0] 1185 | nb = matches[0] 1186 | assert float(nb) > 0 1187 | if "instruct" in name: 1188 | return GemmaInstruct( 1189 | batch_size=batch_size, 1190 | name=f"google/gemma-{nb}b-it", 1191 | temperature=temperature, 1192 | conversational=True, 1193 | ) 1194 | else: 1195 | return HFTorchDecoder( 1196 | batch_size=batch_size, 1197 | name=f"google/gemma-{nb}b", 1198 | temperature=temperature, 1199 | ) 1200 | 1201 | raise ValueError(f"Invalid model name: {name}") 1202 | -------------------------------------------------------------------------------- /evoeval/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from evoeval._version import __version__, __version_tuple__ 3 | except ImportError: 4 | __version__ = "local-dev" 5 | -------------------------------------------------------------------------------- /evoeval/data.py: -------------------------------------------------------------------------------- 1 | # largely adopted from EvalPlus 2 | import gzip 3 | import hashlib 4 | import json 5 | import os 6 | from typing import Dict, Iterable 7 | 8 | import tempdir 9 | import wget 10 | from appdirs import user_cache_dir 11 | 12 | CACHE_DIR = user_cache_dir("evoeval") 13 | 14 | 15 | EVOEVAL_VERSION = "v0.1.0" 16 | EVOEVAL_OVERRIDE_PATH = os.environ.get("EVOEVAL_OVERRIDE_PATH", None) 17 | 18 | 19 | def write_jsonl( 20 | filename: str, data: Iterable[Dict], append: bool = False, drop_builtin: bool = True 21 | ): 22 | """ 23 | Writes an iterable of dictionaries to jsonl 24 | """ 25 | if append: 26 | mode = "ab" 27 | else: 28 | mode = "wb" 29 | filename = os.path.expanduser(filename) 30 | if filename.endswith(".gz"): 31 | with open(filename, mode) as fp: 32 | with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp: 33 | for x in data: 34 | if drop_builtin: 35 | x = {k: v for k, v in x.items() if not k.startswith("_")} 36 | gzfp.write((json.dumps(x) + "\n").encode("utf-8")) 37 | else: 38 | with open(filename, mode) as fp: 39 | for x in data: 40 | if drop_builtin: 41 | x = {k: v for k, v in x.items() if not k.startswith("_")} 42 | fp.write((json.dumps(x) + "\n").encode("utf-8")) 43 | 44 | 45 | def make_cache(gzip_url, cache_path, dataset_name): 46 | # Check if human eval file exists in CACHE_DIR 47 | if not os.path.exists(cache_path): 48 | # Install HumanEval dataset and 
parse as jsonl 49 | print(f"Downloading dataset from {gzip_url}") 50 | with tempdir.TempDir() as tmpdir: 51 | # TODO need to test this. 52 | evoeval_gz_path = os.path.join(tmpdir, f"{dataset_name}-data.jsonl.gz") 53 | wget.download(gzip_url, evoeval_gz_path) 54 | 55 | with gzip.open(evoeval_gz_path, "rb") as f: 56 | evoeval = f.read().decode("utf-8") 57 | 58 | # create CACHE_DIR if not exists 59 | if not os.path.exists(CACHE_DIR): 60 | os.makedirs(CACHE_DIR) 61 | 62 | # Write the original human eval file to CACHE_DIR 63 | with open(cache_path, "w") as f: 64 | f.write(evoeval) 65 | 66 | 67 | def get_dataset_metadata(name: str, version: str): 68 | assert name in [ 69 | "EvoEval_difficult", 70 | "EvoEval_creative", 71 | "EvoEval_subtle", 72 | "EvoEval_combine", 73 | "EvoEval_tool_use", 74 | "EvoEval_verbose", 75 | "EvoEval_concise", 76 | ], f"Unknown/unsupported dataset: {name}" 77 | url = f"https://github.com/evo-eval/evoeval_release/releases/download/{version}/{name}.jsonl.gz" 78 | cache_path = os.path.join(CACHE_DIR, f"{name}-{version}.jsonl") 79 | return url, cache_path 80 | 81 | 82 | def _ready_evo_eval_path(dataset_name: str) -> str: 83 | if EVOEVAL_OVERRIDE_PATH is not None: 84 | # create CACHE_DIR if not exists 85 | if not os.path.exists(CACHE_DIR): 86 | os.makedirs(CACHE_DIR) 87 | return f"{EVOEVAL_OVERRIDE_PATH}/{dataset_name}.jsonl" 88 | 89 | url, cache_path = get_dataset_metadata(dataset_name, EVOEVAL_VERSION) 90 | make_cache(url, cache_path, dataset_name) 91 | 92 | return cache_path 93 | 94 | 95 | def get_evo_eval_plus_hash(dataset_name: str) -> str: 96 | evoeval_path = _ready_evo_eval_path(dataset_name) 97 | with open(evoeval_path, "rb") as f: 98 | evoeval = f.read() 99 | return hashlib.md5(evoeval).hexdigest() 100 | 101 | 102 | def get_evo_eval(dataset_name: str): 103 | evoeval_path = _ready_evo_eval_path(dataset_name) 104 | with open(evoeval_path, "r") as f: 105 | data = {json.loads(task)["task_id"]: json.loads(task) for task in f.readlines()} 106 | 107 | return data 108 | -------------------------------------------------------------------------------- /evoeval/eval_test/__init__.py: -------------------------------------------------------------------------------- 1 | # largely adopted from https://github.com/evalplus/evalplus 2 | 3 | import itertools 4 | import json 5 | import multiprocessing 6 | import os 7 | import time 8 | from enum import IntEnum, auto 9 | from multiprocessing import Array, Value 10 | from typing import Any, Dict, List, Tuple, Union 11 | 12 | import numpy as np 13 | from evalplus.eval.utils import ( 14 | create_tempdir, 15 | reliability_guard, 16 | swallow_io, 17 | time_limit, 18 | ) 19 | 20 | from evoeval.eval_test._creative_special_oracle import ( 21 | _check_maze, 22 | _check_path, 23 | _check_product, 24 | ) 25 | from evoeval.eval_test._difficult_special_oracle import ( 26 | _check_difficult_poly, 27 | _check_insensitive_palindrome, 28 | ) 29 | from evoeval.eval_test._he_special_oracle import _poly 30 | from evoeval.eval_test._subtle_special_oracle import _check_poly 31 | 32 | 33 | class CustomEncoder(json.JSONEncoder): 34 | def default(self, obj): 35 | if isinstance(obj, set): 36 | return list(obj) 37 | if isinstance(obj, object): 38 | return str(obj) 39 | return json.JSONEncoder.default(self, obj) 40 | 41 | 42 | def compatible_eval_result(results: Dict) -> Dict: 43 | # compatibility 44 | for task_results in results["eval"].values(): 45 | # update the "files" field to "nfiles" 46 | if "files" in task_results and "nfiles" not in task_results: 47 | 
task_results["nfiles"] = len(task_results.pop("files")) 48 | return results 49 | 50 | 51 | # unbiased estimator from https://github.com/openai/human-eval 52 | def estimate_pass_at_k( 53 | num_samples: Union[int, List[int], np.ndarray], 54 | num_correct: Union[List[int], np.ndarray], 55 | k: int, 56 | ) -> np.ndarray: 57 | """ 58 | Estimates pass@k of each problem and returns them in an array. 59 | """ 60 | 61 | def estimator(n: int, c: int, k: int) -> float: 62 | """ 63 | Calculates 1 - comb(n - c, k) / comb(n, k). 64 | """ 65 | if n - c < k: 66 | return 1.0 67 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 68 | 69 | if isinstance(num_samples, int): 70 | num_samples_it = itertools.repeat(num_samples, len(num_correct)) 71 | else: 72 | assert len(num_samples) == len(num_correct) 73 | num_samples_it = iter(num_samples) 74 | 75 | return np.array( 76 | [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)] 77 | ) 78 | 79 | 80 | PASS = "pass" 81 | FAIL = "fail" 82 | TIMEOUT = "timeout" 83 | 84 | _SUCCESS = 0 85 | _FAILED = 1 86 | _TIMEOUT = 2 87 | _UNKNOWN = 3 88 | 89 | _mapping = {_SUCCESS: PASS, _FAILED: FAIL, _TIMEOUT: TIMEOUT, _UNKNOWN: None} 90 | 91 | 92 | def is_floats(x) -> bool: 93 | # check if it is float; List[float]; Tuple[float] 94 | # TODO: search for any close floats? (other data structures) 95 | if isinstance(x, float): 96 | return True 97 | if isinstance(x, (list, tuple)): 98 | return all(isinstance(i, float) for i in x) 99 | if isinstance(x, np.ndarray): 100 | return x.dtype == np.float64 or x.dtype == np.float32 101 | return False 102 | 103 | 104 | class DataType(IntEnum): 105 | Float = auto() 106 | Bool = auto() 107 | Int = auto() 108 | Str = auto() 109 | Null = auto() 110 | Tuple = auto() 111 | List = auto() 112 | Dict = auto() 113 | Set = auto() 114 | Type = auto() 115 | Unknown = auto() 116 | 117 | 118 | def get_type(x): 119 | if x is None: 120 | return DataType.Null 121 | elif isinstance(x, bool): 122 | return DataType.Bool 123 | elif isinstance(x, int): 124 | return DataType.Int 125 | elif isinstance(x, str): 126 | return DataType.Str 127 | elif is_floats(x): 128 | return DataType.Float 129 | elif isinstance(x, tuple): 130 | return DataType.Tuple 131 | elif isinstance(x, list): 132 | return DataType.List 133 | elif isinstance(x, dict): 134 | return DataType.Dict 135 | elif isinstance(x, set): 136 | return DataType.Set 137 | elif isinstance(x, type): 138 | return DataType.Type 139 | else: 140 | return DataType.Unknown 141 | 142 | 143 | def is_equal(x, y) -> tuple[bool, str]: 144 | x_type, y_type = get_type(x), get_type(y) 145 | if x_type != y_type: 146 | return False, "Type mismatch: {} vs {}".format(str(x_type), str(y_type)) 147 | 148 | if x_type in [ 149 | DataType.Int, 150 | DataType.Bool, 151 | DataType.Null, 152 | DataType.Str, 153 | DataType.Set, 154 | DataType.Type, 155 | ]: 156 | if x == y: 157 | return True, None 158 | try: 159 | error_msg = "INT/BOOL/NULL/ Value mismatch: {} vs {}".format( 160 | repr(x)[:300], repr(y)[:300] 161 | ) 162 | except: 163 | error_msg = "Value mismatch: too large for display" 164 | return False, error_msg 165 | elif x_type == DataType.Float: 166 | if np.allclose(x, y, equal_nan=True, atol=1e-6): # guard against nan 167 | return True, None 168 | else: 169 | return False, "FLOAT Value mismatch: {} vs {}".format(x, y) 170 | elif x_type in [DataType.List, DataType.Tuple]: 171 | if len(x) != len(y): 172 | return False, "Length mismatch: {} vs {}".format(len(x), len(y)) 173 | for i in range(len(x)): 174 | 
equal, msg = is_equal(x[i], y[i]) 175 | if not equal: 176 | return False, msg 177 | return True, None 178 | elif x_type == DataType.Dict: 179 | if len(x) != len(y): 180 | return False, "Length mismatch: {} vs {}".format(len(x), len(y)) 181 | for k, v in x.items(): 182 | if k not in y: 183 | return False, "DICT Value mismatch: key {} in {} but not in {}".format( 184 | k, x, y 185 | ) 186 | equal, msg = is_equal(v, y[k]) 187 | if not equal: 188 | return False, msg 189 | return True, None 190 | else: 191 | # from IPython import embed 192 | # embed() 193 | try: 194 | if x == y: # e.g., object comparison 195 | return True, None 196 | else: 197 | return False, "ELSE Value mismatch: {} vs {}".format(x, y) 198 | except: 199 | return False, "Unsupported type: {} <-- {}".format(x_type, type(x)) 200 | 201 | 202 | def unsafe_execute( 203 | dataset: str, 204 | entry_point: str, 205 | task_id: str, 206 | code: str, 207 | inputs, 208 | expected: List, 209 | time_limits, 210 | atol, 211 | fast_check, 212 | stat: Value, 213 | details: Array, 214 | progress: Value, 215 | ): 216 | with create_tempdir(): 217 | # These system calls are needed when cleaning up tempdir. 218 | import os 219 | import shutil 220 | 221 | rmtree = shutil.rmtree 222 | rmdir = os.rmdir 223 | chdir = os.chdir 224 | # Disable functionalities that can make destructive changes to the test. 225 | # allow only 4GB memory usage 226 | maximum_memory_bytes = 4 * 1024 * 1024 * 1024 227 | reliability_guard(maximum_memory_bytes=maximum_memory_bytes) 228 | exec_globals = {} 229 | try: 230 | with swallow_io(): 231 | exec(code, exec_globals) 232 | fn = exec_globals[entry_point] 233 | for i, inp in enumerate(inputs): 234 | try: 235 | with time_limit(time_limits[i]): 236 | out = fn(*inp) 237 | exp = expected[i] 238 | # TODO, for special oracles, think about how to deal with case where 239 | # the function has side affect and changes the input ... 
240 | # this is true especially for some grid checking stuff 241 | # ================================================ # 242 | # ============== special oracles ================= # 243 | # use task_id and dataset to determine the oracle 244 | if ( 245 | dataset == "humaneval" 246 | or "verbose" in dataset 247 | or "concise" in dataset 248 | ) and task_id == "HumanEval/32": 249 | assert abs(_poly(*out, inp)) <= 1e-6 250 | 251 | # =================== Difficult ================== # 252 | elif "difficult" in dataset and task_id == "EvoEval/10": 253 | _check_insensitive_palindrome(out, *inp, exp) 254 | elif "difficult" in dataset and task_id == "EvoEval/32": 255 | _check_difficult_poly(*inp, out, exp) 256 | 257 | # =================== Creative =================== # 258 | elif "creative" in dataset and task_id == "EvoEval/26": 259 | _check_maze(*inp, out, exp) 260 | elif "creative" in dataset and task_id == "EvoEval/30": 261 | _check_path(*inp, out, exp) 262 | elif "creative" in dataset and task_id == "EvoEval/69": 263 | _check_product(*inp, out, exp) 264 | 265 | # =================== Subtle ===================== # 266 | elif "subtle" in dataset and task_id == "EvoEval/32": 267 | _check_poly(*inp, out) 268 | 269 | # =================== Combine ==================== # 270 | 271 | # =================== Tool Using ================= # 272 | 273 | # ============== special oracles ================= # 274 | # ================================================ # 275 | else: 276 | exact_match, _ = is_equal(exp, out) 277 | assert exact_match 278 | except BaseException: 279 | details[i] = False 280 | progress.value += 1 281 | if fast_check: 282 | raise 283 | continue 284 | 285 | details[i] = True 286 | progress.value += 1 287 | stat.value = _SUCCESS 288 | except BaseException: 289 | stat.value = _FAILED 290 | # Needed for cleaning up. 
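        # reliability_guard replaces destructive os/shutil functions before the
        # untrusted code runs; restoring the saved references here lets
        # create_tempdir delete the temporary working directory on exit.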
291 | shutil.rmtree = rmtree 292 | os.rmdir = rmdir 293 | os.chdir = chdir 294 | 295 | 296 | def untrusted_check( 297 | dataset: str, 298 | code: str, 299 | inputs: List[Any], 300 | entry_point: str, 301 | task_id: str, 302 | expected, 303 | atol, 304 | ref_time: List[float], 305 | fast_check: bool = False, 306 | min_time_limit: float = 0.1, 307 | gt_time_limit_factor: float = 2.0, 308 | ) -> Tuple[str, np.ndarray]: 309 | time_limits = [max(min_time_limit, gt_time_limit_factor * t) for t in ref_time] 310 | timeout = min(os.getenv("EVOEVAL_TIMEOUT_PER_TASK", 60), sum(time_limits)) + 1 311 | if not fast_check: 312 | timeout += 1 # extra time for data collection 313 | 314 | # shared memory objects 315 | progress = Value("i", 0) 316 | stat = Value("i", _UNKNOWN) 317 | details = Array("b", [False for _ in range(len(inputs))]) 318 | p = multiprocessing.Process( 319 | target=unsafe_execute, 320 | args=( 321 | dataset, 322 | entry_point, 323 | task_id, 324 | code, 325 | inputs, 326 | expected, 327 | time_limits, 328 | atol, 329 | fast_check, 330 | # return values 331 | stat, 332 | details, 333 | progress, 334 | ), 335 | ) 336 | p.start() 337 | p.join(timeout=timeout + 1) 338 | if p.is_alive(): 339 | p.terminate() 340 | time.sleep(0.1) 341 | if p.is_alive(): 342 | p.kill() 343 | time.sleep(0.1) 344 | 345 | stat = _mapping[stat.value] 346 | details = details[: progress.value] 347 | 348 | if not stat: 349 | stat = TIMEOUT 350 | 351 | if stat == PASS: 352 | if len(details) != len(inputs) or not all(details): 353 | stat = FAIL 354 | 355 | return stat, details 356 | 357 | 358 | def evaluateb_files( 359 | dataset: str, 360 | files: List[str], 361 | inputs: List, 362 | expected: List, 363 | entry_point: str, 364 | atol: float, 365 | ref_time: List[float], 366 | fast_check: bool = False, 367 | min_time_limit: float = 0.1, 368 | gt_time_limit_factor: float = 2.0, 369 | ) -> List[Tuple[str, List[bool]]]: 370 | ret = [] 371 | # sort files by the id in name (i.e., "../n.py") 372 | files = sorted(files, key=lambda x: int(x.split("/")[-1].split(".")[0])) 373 | for file in files: 374 | code = open(file, "r").read() 375 | stat, det = untrusted_check( 376 | dataset, 377 | code, 378 | inputs, 379 | entry_point, 380 | expected=expected, 381 | atol=atol, 382 | ref_time=ref_time, 383 | fast_check=fast_check, 384 | min_time_limit=min_time_limit, 385 | gt_time_limit_factor=gt_time_limit_factor, 386 | ) 387 | ret.append((stat, det.tolist())) 388 | return ret 389 | -------------------------------------------------------------------------------- /evoeval/eval_test/_creative_special_oracle.py: -------------------------------------------------------------------------------- 1 | # oracle for EvoEval/51 in creative 2 | def _check_maze(maze, start, end, solution_path, gt_path): 3 | if not gt_path: 4 | assert solution_path == [] 5 | else: 6 | # check the path according to solution reaches from start to end 7 | move_to_direction = { 8 | "right": (0, 1), 9 | "left": (0, -1), 10 | "up": (-1, 0), 11 | "down": (1, 0), 12 | } 13 | current_position = start 14 | for move in solution_path: 15 | current_position = ( 16 | current_position[0] + move_to_direction[move][0], 17 | current_position[1] + move_to_direction[move][1], 18 | ) 19 | assert maze[current_position[0]][current_position[1]] != 1 20 | 21 | assert current_position == end 22 | 23 | 24 | # oracle for EvoEval/55 in creative 25 | def _check_path(maze, start, end, solution_path, gt_path): 26 | if not gt_path: 27 | assert solution_path == [] 28 | else: 29 | # check the path 
according to solution reaches from start to end 30 | assert solution_path[0] == start 31 | assert solution_path[-1] == end 32 | assert maze[start[0]][start[0]] != 0 33 | for i in range(1, len(solution_path)): 34 | prev_x, prev_y = solution_path[i - 1] 35 | curr_x, curr_y = solution_path[i] 36 | assert maze[curr_x][curr_y] != 0 # not a wall 37 | assert abs(curr_x - prev_x) + abs(curr_y - prev_y) == 1 # adjacent 38 | 39 | 40 | # oracle for EvoEval/110 in creative 41 | def _check_product(arr, target, solution, gt): 42 | if gt == "No magic today": 43 | assert gt == solution 44 | else: 45 | assert isinstance(solution, tuple) 46 | i, j = solution 47 | assert 0 <= i < j < len(arr) # don't allow negative indexing 48 | assert arr[i] * arr[j] == target 49 | -------------------------------------------------------------------------------- /evoeval/eval_test/_difficult_special_oracle.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | # oracle for EvoEval/10 in difficult 5 | def _check_insensitive_palindrome(check_palindrome, string, gt_palindrome): 6 | assert len(check_palindrome) == len(gt_palindrome) 7 | assert check_palindrome.startswith(string) 8 | assert check_palindrome.lower() == check_palindrome[::-1].lower() 9 | 10 | 11 | def _poly(xs: list, x: float): 12 | """ 13 | Evaluates polynomial with coefficients xs at point x. 14 | return xs[0] + xs[1] * x + xs[1] * x^2 + .... xs[n] * x^n 15 | """ 16 | return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)]) 17 | 18 | 19 | # oracle for EvoEval/32 in difficult 20 | def _check_difficult_poly(xs, interval, solution, gt_solution): 21 | if gt_solution is None: 22 | assert solution is None 23 | return 24 | 25 | start, end = interval 26 | assert start <= solution <= end 27 | assert abs(_poly(xs, solution)) <= 2e-2 28 | -------------------------------------------------------------------------------- /evoeval/eval_test/_he_special_oracle.py: -------------------------------------------------------------------------------- 1 | # Adopted from EvalPlus 2 | import math 3 | 4 | 5 | # oracle for HumanEval/032 6 | def _poly(xs: list, x: float): 7 | """ 8 | Evaluates polynomial with coefficients xs at point x. 9 | return xs[0] + xs[1] * x + xs[1] * x^2 + .... xs[n] * x^n 10 | """ 11 | return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)]) 12 | -------------------------------------------------------------------------------- /evoeval/eval_test/_subtle_special_oracle.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | # oracle for EvoEval/32 in subtle 5 | def _poly(xs: list, x: float): 6 | """ 7 | Evaluates polynomial with coefficients xs at point x. 8 | return xs[0] + xs[1] * x + xs[1] * x^2 + .... 
xs[n] * x^n 9 | """ 10 | return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)]) 11 | 12 | 13 | def _check_poly(xs, solution): 14 | full_xs = [xs[0], xs[1]] 15 | for i in range(2, len(xs)): 16 | full_xs.extend([0, xs[i]]) 17 | assert abs(_poly(full_xs, solution)) <= 1e-6 18 | -------------------------------------------------------------------------------- /evoeval/evaluate.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/evalplus/evalplus 2 | import argparse 3 | import contextlib 4 | import json 5 | import multiprocessing 6 | import os 7 | import pickle 8 | import threading 9 | import time 10 | from collections import Counter, defaultdict 11 | from concurrent.futures import ProcessPoolExecutor, as_completed 12 | from typing import Any, Dict, List, Tuple 13 | from warnings import warn 14 | 15 | import numpy as np 16 | from evalplus.data import get_human_eval_plus 17 | from evalplus.data.utils import load_solutions 18 | from evalplus.gen.util import trusted_exec 19 | from termcolor import cprint 20 | from tqdm import tqdm 21 | 22 | from evoeval.data import CACHE_DIR, get_evo_eval, get_evo_eval_plus_hash 23 | from evoeval.eval_test import ( 24 | FAIL, 25 | PASS, 26 | CustomEncoder, 27 | compatible_eval_result, 28 | estimate_pass_at_k, 29 | untrusted_check, 30 | ) 31 | 32 | # 1st item: the status 33 | # 2nd item (optional): the detailed pass/fail boolean for each input 34 | Result = Tuple[str, List[bool]] 35 | 36 | 37 | def get_groundtruth( 38 | problems, hashcode, use_raw_inputs=False, compute_plus_inputs=False 39 | ) -> Dict[str, Any]: 40 | if hashcode is not None: 41 | cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl") 42 | if os.path.exists(cache_file): 43 | print(f"Load from ground-truth from {cache_file}") 44 | with open(cache_file, "rb") as f: 45 | return pickle.load(f) 46 | 47 | print("Computing expected output...") 48 | tbegin = time.time() 49 | expected_output = {} 50 | for task_id, problem in problems.items(): 51 | oracle = {} 52 | with contextlib.redirect_stdout(None): 53 | oracle["base"], oracle["base_time"] = trusted_exec( 54 | problem["prompt"] + "\n" + problem["canonical_solution"], 55 | problem["base_input"] 56 | if use_raw_inputs 57 | else [ 58 | eval(f"[{i}]") for i in problem["inputs"] 59 | ], # why do we do this? we have more complex input types. 
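                    # each entry in problem["inputs"] is a serialized argument
                    # string; eval(f"[{i}]") rebuilds it into a concrete Python
                    # argument list, which is why the more complex input types
                    # mentioned above survive the round trip.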
60 | problem["entry_point"], 61 | record_time=True, 62 | output_not_none=False, 63 | ) 64 | expected_output[task_id] = oracle 65 | 66 | if compute_plus_inputs: 67 | oracle["plus"], oracle["plus_time"] = trusted_exec( 68 | problem["prompt"] + "\n" + problem["canonical_solution"], 69 | problem["plus_input"], # assumption: we have plus_input 70 | problem["entry_point"], 71 | record_time=True, 72 | output_not_none=False, 73 | ) 74 | expected_output[task_id] = oracle 75 | 76 | # print(expected_output) 77 | print(f"Expected outputs computed in {time.time() - tbegin:.2f}s") 78 | 79 | if hashcode is not None: 80 | with open(cache_file, "wb") as f: 81 | pickle.dump(expected_output, f) 82 | 83 | return expected_output 84 | 85 | 86 | def check_correctness( 87 | dataset: str, 88 | completion_id: int, 89 | problem: Dict[str, Any], 90 | solution: str, 91 | expected_output: Dict[str, List], 92 | fast_check=False, 93 | identifier=None, 94 | min_time_limit: float = 0.1, 95 | gt_time_limit_factor: float = 2.0, 96 | use_raw_inputs=False, 97 | compute_plus_inputs=False, 98 | ) -> Dict[str, Result]: # {...}, "base" | "plus" -> (status, details) 99 | 100 | ret = { 101 | "completion_id": completion_id, 102 | "task_id": problem["task_id"], 103 | "_identifier": identifier, 104 | "solution": solution, 105 | } 106 | 107 | ret["result"] = untrusted_check( 108 | dataset, 109 | solution, 110 | problem["base_input"] 111 | if use_raw_inputs 112 | else [eval(f"[{i}]") for i in problem["inputs"]], 113 | problem["entry_point"], 114 | task_id=problem["task_id"], 115 | expected=expected_output["base"], 116 | atol=0, # TODO check 117 | ref_time=expected_output["base_time"], 118 | fast_check=fast_check, 119 | min_time_limit=min_time_limit, 120 | gt_time_limit_factor=gt_time_limit_factor, 121 | ) 122 | 123 | if compute_plus_inputs: 124 | ret["plus"] = untrusted_check( 125 | dataset, 126 | solution, 127 | problem["plus_input"], 128 | problem["entry_point"], 129 | task_id=problem["task_id"], 130 | expected=expected_output["plus"], 131 | atol=0, # TODO check 132 | ref_time=expected_output["plus_time"], 133 | fast_check=fast_check, 134 | min_time_limit=min_time_limit, 135 | gt_time_limit_factor=gt_time_limit_factor, 136 | ) 137 | 138 | return ret 139 | 140 | 141 | def evaluate(flags): 142 | if flags.parallel is None: 143 | n_workers = max(1, multiprocessing.cpu_count() // 2) 144 | else: 145 | n_workers = flags.parallel 146 | 147 | if os.path.isdir(flags.samples): 148 | result_path = os.path.join(flags.samples, "eval_results.json") 149 | else: 150 | assert flags.samples.endswith(".jsonl") 151 | result_path = flags.samples.replace(".jsonl", "_eval_results.json") 152 | 153 | compute_plus_inputs = False 154 | 155 | if os.path.isfile(result_path) and not flags.i_just_wanna_run: 156 | print(f"Load from previous results from {result_path}") 157 | with open(result_path, "r") as f: 158 | results = json.load(f) 159 | 160 | results = compatible_eval_result(results) 161 | else: 162 | use_raw_inputs = False 163 | if flags.dataset == "humaneval": 164 | use_raw_inputs = True 165 | compute_plus_inputs = True 166 | problems = get_human_eval_plus() 167 | expected_output = get_groundtruth( 168 | problems, 169 | None, 170 | use_raw_inputs=use_raw_inputs, 171 | compute_plus_inputs=compute_plus_inputs, 172 | ) 173 | elif "verbose" in flags.dataset or "concise" in flags.dataset: 174 | use_raw_inputs = True 175 | compute_plus_inputs = True 176 | problems = get_evo_eval(flags.dataset) 177 | expected_output = get_groundtruth( 178 | problems, 179 | None, 180 
| use_raw_inputs=use_raw_inputs, 181 | compute_plus_inputs=compute_plus_inputs, 182 | ) 183 | else: 184 | problems = get_evo_eval(flags.dataset) 185 | dataset_hash = get_evo_eval_plus_hash(flags.dataset) 186 | expected_output = get_groundtruth( 187 | problems, 188 | dataset_hash, 189 | use_raw_inputs=use_raw_inputs, 190 | compute_plus_inputs=compute_plus_inputs, 191 | ) 192 | 193 | results = { 194 | "eval": {}, 195 | } 196 | 197 | with ProcessPoolExecutor(max_workers=n_workers) as executor: 198 | futures = [] 199 | completion_id = Counter() 200 | n_samples = 0 201 | eval_results = defaultdict(list) # task_id -> 202 | remainings = set() 203 | 204 | print("Reading samples...") 205 | for sample in tqdm(load_solutions(flags.samples)): 206 | task_id = sample["task_id"] 207 | solution = ( 208 | sample["solution"] 209 | if "solution" in sample 210 | else problems[task_id]["prompt"] + sample["completion"] 211 | ) 212 | remainings.add(sample["_identifier"]) 213 | args = ( 214 | flags.dataset, 215 | completion_id[task_id], 216 | problems[task_id], 217 | solution, 218 | expected_output[task_id], 219 | not flags.test_details, # fast_check 220 | sample["_identifier"], 221 | flags.min_time_limit, 222 | flags.gt_time_limit_factor, 223 | use_raw_inputs, 224 | compute_plus_inputs, 225 | ) 226 | futures.append(executor.submit(check_correctness, *args)) 227 | completion_id[task_id] += 1 228 | n_samples += 1 229 | 230 | assert n_samples == len(remainings), "Missing problems in unfinished" 231 | assert len(completion_id) == len(problems), "Missing problems in samples" 232 | 233 | def stucking_checker(): 234 | while remainings: 235 | last_size = len(remainings) 236 | time.sleep(20) 237 | if last_size != len(remainings) or len(remainings) == 0: 238 | continue 239 | # Potentially stuck 240 | warn("No samples had finished testing in the last 20s") 241 | warn(f"{len(remainings)} samples to be tested: {remainings}") 242 | 243 | threading.Thread(target=stucking_checker).start() 244 | 245 | for future in tqdm(as_completed(futures), total=n_samples): 246 | result = future.result() 247 | remainings.remove(result["_identifier"]) 248 | eval_results[result["task_id"]].append(result) 249 | 250 | # sort the results for each problem by completion_id 251 | for task_id, task_results in eval_results.items(): 252 | task_results.sort(key=lambda x: x["completion_id"]) 253 | results["eval"][task_id] = [] 254 | for res in task_results: 255 | 256 | def get_failed_tests(stat, details, inputs) -> List[Any]: 257 | if stat == PASS or not details: 258 | return [] 259 | 260 | # if flags.test_details: 261 | return [inputs[i] for i in range(len(details)) if not details[i]] 262 | 263 | base_stat, base_details = res["result"] 264 | base_fail_tests = get_failed_tests( 265 | base_stat, 266 | base_details, 267 | problems[task_id]["base_input"] 268 | if use_raw_inputs 269 | else [eval(f"[{i}]") for i in problems[task_id]["inputs"]], 270 | ) 271 | 272 | # initialize plus tests 273 | plus_stat = None 274 | plus_fail_tests = [] 275 | 276 | # with plus tests 277 | if not flags.base_only and compute_plus_inputs: 278 | plus_stat, plus_details = res["plus"] 279 | plus_fail_tests = get_failed_tests( 280 | plus_stat, plus_details, problems[task_id]["plus_input"] 281 | ) 282 | 283 | results["eval"][task_id].append( 284 | { 285 | "task_id": task_id, 286 | "solution": res["solution"], 287 | "base_status": base_stat, 288 | "plus_status": plus_stat, 289 | "base_fail_tests": base_fail_tests, 290 | "plus_fail_tests": plus_fail_tests, 291 | } 292 | ) 293 | 294 | if 
os.path.isfile(result_path) and flags.i_just_wanna_run: 295 | decision = "" 296 | while decision.lower() not in ["y", "n"]: 297 | print(f"{result_path} already exists. Press [Y/N] to overwrite or exit...") 298 | decision = input() 299 | 300 | if decision.lower() == "y": 301 | # mv the file to a backup 302 | new_path = result_path + ".bak" 303 | while os.path.isfile(new_path): 304 | new_path += ".bak" 305 | os.rename(result_path, new_path) 306 | print(f"Backup {result_path} to {new_path}") 307 | 308 | if not os.path.isfile(result_path): 309 | with open(result_path, "w") as f: 310 | json.dump( 311 | results, f, cls=CustomEncoder 312 | ) # handle some unique cases where failure inputs are sets 313 | 314 | # Calculate pass@k. 315 | total = np.array([len(r) for r in results["eval"].values()]) 316 | correct = [] 317 | plus_correct = [] 318 | 319 | for res in results["eval"].values(): 320 | bc = sum([r["base_status"] == PASS for r in res]) 321 | correct.append(bc) 322 | if not flags.base_only and compute_plus_inputs: 323 | plus_correct.append( 324 | sum( 325 | [ 326 | res[i]["base_status"] == res[i]["plus_status"] == PASS 327 | for i in range(len(res)) 328 | ] 329 | ) 330 | ) 331 | 332 | correct = np.array(correct) 333 | pass_at_k = { 334 | f"pass@{k}": estimate_pass_at_k(total, correct, k).mean() 335 | for k in [1, 10, 100] 336 | if total.min() >= k 337 | } 338 | cprint(f"{flags.dataset}", "red") 339 | for k, v in pass_at_k.items(): 340 | cprint(f"{k}:\t{v:.3f}", "red") 341 | 342 | if plus_correct: 343 | cprint(f"{flags.dataset}+ (base + extra tests)", "green") 344 | pass_at_k = { 345 | f"pass@{k}": estimate_pass_at_k(total, np.array(plus_correct), k).mean() 346 | for k in [1, 10, 100] 347 | if (total >= k).all() 348 | } 349 | for k, v in pass_at_k.items(): 350 | cprint(f"{k}:\t{v:.3f}", "green") 351 | 352 | 353 | def main(): 354 | parser = argparse.ArgumentParser(description="Evaluator") 355 | parser.add_argument("--dataset", required=True, type=str) 356 | parser.add_argument("--samples", required=True, type=str) 357 | parser.add_argument("--base-only", action="store_true") 358 | parser.add_argument("--parallel", default=None, type=int) 359 | parser.add_argument("--i-just-wanna-run", action="store_true") 360 | parser.add_argument("--test-details", action="store_true") 361 | parser.add_argument("--min-time-limit", default=1, type=float) 362 | parser.add_argument("--gt-time-limit-factor", default=4.0, type=float) 363 | parser.add_argument("--mini", action="store_true") 364 | parser.add_argument( 365 | "--noextreme", action="store_true", help="Omit extreme test inputs" 366 | ) 367 | args = parser.parse_args() 368 | 369 | evaluate(args) 370 | 371 | 372 | if __name__ == "__main__": 373 | main() 374 | -------------------------------------------------------------------------------- /evoeval/util/api_request.py: -------------------------------------------------------------------------------- 1 | import signal 2 | import time 3 | from typing import Dict, Union 4 | 5 | import openai 6 | import tiktoken 7 | from google.generativeai import GenerationConfig 8 | from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory 9 | 10 | client = openai.OpenAI() 11 | 12 | 13 | def num_tokens_from_messages(message, model="gpt-3.5-turbo-0301"): 14 | """Returns the number of tokens used by a list of messages.""" 15 | try: 16 | encoding = tiktoken.encoding_for_model(model) 17 | except KeyError: 18 | encoding = tiktoken.get_encoding("cl100k_base") 19 | if isinstance(message, list): 20 | # use last 
message. 21 | num_tokens = len(encoding.encode(message[0]["content"])) 22 | else: 23 | num_tokens = len(encoding.encode(message)) 24 | return num_tokens 25 | 26 | 27 | def create_chatgpt_config( 28 | message: Union[str, list], 29 | max_tokens: int, 30 | temperature: float = 1, 31 | batch_size: int = 1, 32 | system_message: str = "You are a helpful assistant.", 33 | model: str = "gpt-3.5-turbo", 34 | ) -> Dict: 35 | if isinstance(message, list): 36 | config = { 37 | "model": model, 38 | "max_tokens": max_tokens, 39 | "temperature": temperature, 40 | "n": batch_size, 41 | "messages": [{"role": "system", "content": system_message}] + message, 42 | } 43 | else: 44 | config = { 45 | "model": model, 46 | "max_tokens": max_tokens, 47 | "temperature": temperature, 48 | "n": batch_size, 49 | "messages": [ 50 | {"role": "system", "content": system_message}, 51 | {"role": "user", "content": message}, 52 | ], 53 | } 54 | return config 55 | 56 | 57 | def handler(signum, frame): 58 | # swallow signum and frame 59 | raise Exception("end of time") 60 | 61 | 62 | def request_chatgpt_engine(config): 63 | ret = None 64 | while ret is None: 65 | try: 66 | signal.signal(signal.SIGALRM, handler) 67 | signal.alarm(100) 68 | ret = client.chat.completions.create(**config) 69 | signal.alarm(0) 70 | except openai._exceptions.BadRequestError as e: 71 | print(e) 72 | signal.alarm(0) 73 | except openai._exceptions.RateLimitError as e: 74 | print("Rate limit exceeded. Waiting...") 75 | print(e) 76 | signal.alarm(0) 77 | time.sleep(5) 78 | except openai._exceptions.APIConnectionError as e: 79 | print("API connection error. Waiting...") 80 | signal.alarm(0) 81 | time.sleep(5) 82 | except Exception as e: 83 | print("Unknown error. Waiting...") 84 | print(e) 85 | signal.alarm(0) 86 | time.sleep(1) 87 | return ret 88 | 89 | 90 | def create_gemini_config( 91 | max_tokens: int, 92 | temperature: float = 1, 93 | batch_size: int = 1, 94 | ) -> Dict: 95 | config = GenerationConfig( 96 | candidate_count=batch_size, 97 | max_output_tokens=max_tokens, 98 | temperature=temperature, 99 | ) 100 | return config 101 | 102 | 103 | safety_settings = [ 104 | { 105 | "category": "HARM_CATEGORY_DANGEROUS", 106 | "threshold": "BLOCK_NONE", 107 | }, 108 | { 109 | "category": "HARM_CATEGORY_HARASSMENT", 110 | "threshold": "BLOCK_NONE", 111 | }, 112 | { 113 | "category": "HARM_CATEGORY_HATE_SPEECH", 114 | "threshold": "BLOCK_NONE", 115 | }, 116 | { 117 | "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", 118 | "threshold": "BLOCK_NONE", 119 | }, 120 | { 121 | "category": "HARM_CATEGORY_DANGEROUS_CONTENT", 122 | "threshold": "BLOCK_NONE", 123 | }, 124 | ] 125 | 126 | 127 | def request_gemini_engine(model, message, config): 128 | ret = None 129 | count = 0 130 | while ret is None: 131 | try: 132 | signal.signal(signal.SIGALRM, handler) 133 | signal.alarm(100) 134 | ret = model.generate_content( 135 | message, generation_config=config, safety_settings=safety_settings 136 | ) 137 | s = ret.text # check if response can be accessed. 138 | signal.alarm(0) 139 | except Exception as e: 140 | ret = None # reset 141 | print("Unknown error. Waiting...") 142 | count += 1 143 | print(e) 144 | # here we need to slightly increase temperature to combat weird gemini output of 145 | # The token generation was stopped as the response was flagged for unauthorized citations. 
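            # once more than 10 attempts have failed, each further retry bumps the
            # temperature by 0.1 (capped at 1.0); every retry also sleeps 20 seconds
            # before calling the API again.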
146 | if count > 10: 147 | config.temperature = min(config.temperature + 0.1, 1) 148 | signal.alarm(0) 149 | time.sleep(20) 150 | return ret 151 | 152 | 153 | def create_palm_config( 154 | message: str, 155 | max_tokens: int, 156 | temperature: float = 1, 157 | batch_size: int = 1, 158 | model: str = "models/text-bison-001", 159 | ) -> Dict: 160 | config = { 161 | "model": model, 162 | "prompt": message, 163 | "temperature": temperature, 164 | "max_output_tokens": max_tokens, 165 | "safety_settings": [ 166 | { 167 | "category": HarmCategory.HARM_CATEGORY_DEROGATORY, 168 | "threshold": HarmBlockThreshold.BLOCK_NONE, 169 | }, 170 | { 171 | "category": HarmCategory.HARM_CATEGORY_TOXICITY, 172 | "threshold": HarmBlockThreshold.BLOCK_NONE, 173 | }, 174 | { 175 | "category": HarmCategory.HARM_CATEGORY_SEXUAL, 176 | "threshold": HarmBlockThreshold.BLOCK_NONE, 177 | }, 178 | { 179 | "category": HarmCategory.HARM_CATEGORY_VIOLENCE, 180 | "threshold": HarmBlockThreshold.BLOCK_NONE, 181 | }, 182 | { 183 | "category": HarmCategory.HARM_CATEGORY_DANGEROUS, 184 | "threshold": HarmBlockThreshold.BLOCK_NONE, 185 | }, 186 | { 187 | "category": HarmCategory.HARM_CATEGORY_MEDICAL, 188 | "threshold": HarmBlockThreshold.BLOCK_NONE, 189 | }, 190 | ], 191 | } 192 | return config 193 | 194 | 195 | def request_palm_engine(model, config): 196 | ret = None 197 | count = 0 198 | while ret is None: 199 | try: 200 | signal.signal(signal.SIGALRM, handler) 201 | signal.alarm(100) 202 | ret = model.generate_text(**config) 203 | s = ret.result # check if response can be accessed. 204 | if s is None: 205 | config["temperature"] = min(config["temperature"] + 0.1, 1) 206 | count += 1 207 | if count > 100: 208 | ret.result = "" # just return empty string 209 | else: 210 | ret = None # reset 211 | signal.alarm(0) 212 | except Exception as e: 213 | ret = None # reset 214 | print("Unknown error. Waiting...") 215 | print(e) 216 | signal.alarm(0) 217 | time.sleep(20) 218 | return ret 219 | 220 | 221 | def create_anthropic_config( 222 | message: str, 223 | max_tokens: int, 224 | temperature: float = 1, 225 | batch_size: int = 1, 226 | model: str = "claude-2.1", 227 | ) -> Dict: 228 | if isinstance(message, list): 229 | config = { 230 | "model": model, 231 | "temperature": temperature, 232 | "max_tokens": max_tokens, 233 | "messages": message, 234 | } 235 | else: 236 | config = { 237 | "model": model, 238 | "temperature": temperature, 239 | "max_tokens": max_tokens, 240 | "messages": [{"role": "user", "content": message}], 241 | } 242 | return config 243 | 244 | 245 | def request_anthropic_engine(client, config): 246 | ret = None 247 | while ret is None: 248 | try: 249 | signal.signal(signal.SIGALRM, handler) 250 | signal.alarm(100) 251 | ret = client.messages.create(**config) 252 | signal.alarm(0) 253 | except Exception as e: 254 | print("Unknown error. 
Waiting...")
255 |             print(e)
256 |             signal.alarm(0)
257 |             time.sleep(10)
258 |     return ret
259 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"]
3 | build-backend = "setuptools.build_meta"
4 | 
5 | [tool.setuptools_scm]
6 | write_to = "evoeval/_version.py"
7 | version_scheme = "release-branch-semver"
8 | local_scheme = "no-local-version"
9 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | wget
2 | appdirs
3 | tempdir
4 | multipledispatch
5 | numpy
6 | tqdm
7 | termcolor
8 | evalplus @ git+https://github.com/evalplus/evalplus
--------------------------------------------------------------------------------
/resources/butterfly_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/evo-eval/evoeval/d5ca3104ec30b99f1076f51d4476eb4c3f29effa/resources/butterfly_dark.png
--------------------------------------------------------------------------------
/resources/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/evo-eval/evoeval/d5ca3104ec30b99f1076f51d4476eb4c3f29effa/resources/example.gif
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = evoeval
3 | description = "EvoEval: Evolving Coding Benchmarks via LLM"
4 | long_description = file: README.md
5 | long_description_content_type = text/markdown
6 | url = https://github.com/evo-eval/evoeval
7 | license = Apache-2.0
8 | license_file = LICENSE
9 | platform = any
10 | classifiers =
11 |     Operating System :: OS Independent
12 |     Programming Language :: Python :: 3
13 |     License :: OSI Approved :: Apache Software License
14 | 
15 | [options]
16 | packages = find:
17 | python_requires = >=3.9
18 | dependency_links =
19 | install_requires =
20 |     wget>=3.2
21 |     tempdir>=0.7.1
22 |     multipledispatch>=0.6.0
23 |     appdirs>=1.4.4
24 |     numpy>=1.19.5
25 |     tqdm>=4.56.0
26 |     termcolor>=2.0.0
27 |     evalplus>=0.2.0
28 | 
29 | [options.entry_points]
30 | console_scripts =
31 |     evoeval.evaluate = evoeval.evaluate:main
--------------------------------------------------------------------------------
/tool/sanitize.py:
--------------------------------------------------------------------------------
1 | # largely adopted from EvalPlus
2 | 
3 | import ast
4 | import os
5 | import pathlib
6 | 
7 | from evalplus.data import get_human_eval_plus
8 | from tqdm import tqdm
9 | 
10 | from evoeval.data import get_evo_eval
11 | 
12 | INCODER_EXTRA = ["<|endofmask|>", "<|", "</code>"]  # InCoder special tokens
13 | POLYCODER_EXTRA = ["\n//", "\n/*"]
14 | NON_CODE_EOFS = ["<|endoftext|>", "\n```", "\n</s>", "\n#"]  # generic end-of-generation markers
15 | 
16 | 
17 | def get_all_python_files(folder):
18 |     # return a list of full-path python files
19 |     py_files = []
20 |     for root, _, files in os.walk(folder):
21 |         for file in files:
22 |             if file.endswith(".py"):
23 |                 py_files.append(os.path.join(root, file))
24 |     return py_files
25 | 
26 | 
27 | def remove_unindented_lines(code, ok_starts):
28 |     new_code = ""
29 |     for line in code.splitlines():
30 |         if any([line.startswith(t) for t in ok_starts]) or line.strip() == "":
31 |             new_code += line + "\n"
32 |             continue
33 | 
34 |         lspace = len(line) - len(line.lstrip())
35 |         if lspace == 0:
36 |             continue
37 | 
38 |         new_code += line + "\n"
39 | 
40 |     return new_code
41 | 
42 | 
43 | def extract_function(code, target_func):
44 |     def remove_last_line_until_parse(code):
45 |         try:
46 |             tree = ast.parse(code)
47 |         except:
48 |             if "\n" in code:
49 |                 code = code.rsplit("\n", 1)[0]
50 |                 return remove_last_line_until_parse(code)
51 |             else:
52 |                 return None
53 |         return tree
54 | 
55 |     tree = remove_last_line_until_parse(code)
56 |     if tree is None:  # fail to parse
57 |         return ""
58 | 
59 |     # return the target function only
60 |     for node in tree.body:
61 |         if isinstance(node, ast.FunctionDef):
62 |             if node.name == target_func:
63 |                 return ast.unparse(node)
64 |     return ""
65 | 
66 | 
67 | def to_four_space_indents(old_code):
68 |     new_code = ""
69 |     for line in old_code.splitlines():
70 |         lspace = len(line) - len(line.lstrip())
71 |         if lspace == 3:
72 |             new_code += " "
73 |         new_code += line + "\n"
74 |     return new_code
75 | 
76 | 
77 | def sanitize_folder(args, folder):
78 |     # task_id -> entry_point
79 |     entry_point = {}
80 |     prompts = {}
81 | 
82 |     if args.dataset == "humaneval":
83 |         problems = get_human_eval_plus()
84 |     else:
85 |         problems = get_evo_eval(args.dataset)
86 | 
87 |     for task_id, problem in problems.items():
88 |         entry_point[task_id] = problem["entry_point"]
89 |         prompts[task_id] = problem["prompt"]
90 | 
91 |     # make a new folder with "-sanitized" suffix
92 |     old_folder = pathlib.Path(folder)
93 |     if args.inplace:
94 |         new_folder = old_folder
95 |     else:
96 |         new_folder = old_folder.parent / (old_folder.name + "-sanitized")
97 | 
98 |     nsan = 0
99 |     ntotal = 0
100 |     for pyf in tqdm(get_all_python_files(folder)):
101 |         # Get [?] from "[prefix]/HumanEval_[?]/[number].py":
102 |         task_id = pyf.split("/")[-2].replace("_", "/")
103 |         ntotal += 1
104 |         old_code = open(pyf).read()
105 | 
106 |         def_left = "def " + entry_point[task_id] + "("
107 | 
108 |         imports = prompts[task_id].split(def_left)[0]
109 |         def_right = def_left.join(prompts[task_id].split(def_left)[1:])
110 | 
111 |         new_code = imports + def_left + old_code.split(def_left)[-1]
112 |         chunks = new_code.split(def_left)  # imports + def_left + {def_right + impl}
113 | 
114 |         if len(chunks) == 2:
115 |             new_code = def_left + chunks[-1]  # fn + impl
116 | 
117 |         if "chatgpt" in folder:
118 |             tmp = ""
119 |             for line in new_code.splitlines():
120 |                 if line.strip() == "python":
121 |                     continue
122 |                 tmp += line + "\n"
123 |             new_code = tmp
124 | 
125 |         new_code = to_four_space_indents(new_code)
126 | 
127 |         if args.eof:
128 |             eof_strs = NON_CODE_EOFS
129 |             if "incoder" in folder:
130 |                 eof_strs = eof_strs + INCODER_EXTRA
131 |             if "polycoder" in folder:
132 |                 eof_strs = eof_strs + POLYCODER_EXTRA
133 |             if "mistral" in folder:
134 |                 eof_strs = eof_strs + [r"</s>"]
135 |             for eof in eof_strs:
136 |                 new_code = new_code.split(eof)[0]
137 | 
138 |         # extract the target function and remove lines that are not indented
139 |         new_code = extract_function(new_code, entry_point[task_id])
140 | 
141 |         if len(chunks) == 2:
142 |             new_code = chunks[0] + new_code
143 | 
144 |         # write to new folder
145 |         new_pyf = pyf.replace(str(old_folder), str(new_folder))
146 | 
147 |         if new_code.strip() != old_code.strip():
148 |             print("Sanitized: ", pyf, "->", new_pyf)
149 |             nsan += 1
150 | 
151 |         pathlib.Path(new_pyf).parent.mkdir(parents=True, exist_ok=True)
152 |         with open(new_pyf, "w") as f:
153 |             f.write(new_code)
154 | 
155 |     print(f"Sanitized {nsan} out of {ntotal} files.")
156 | 
157 | 
158 | def main():
159 |     import argparse
160 | 
161 |     parser = argparse.ArgumentParser()
162 |     parser.add_argument("--folder", type=str, required=True)
163 |     parser.add_argument("--dataset", type=str, required=True)
164 |     parser.add_argument("--eof", action="store_true")
165 |     parser.add_argument("--inplace", action="store_true")
166 |     parser.add_argument(
167 |         "--root_folder",
168 |         action="store_true",
169 |         help="Use if we want to sanitize all folders in the root folder.",
170 |     )
171 | 
172 |     args = parser.parse_args()
173 | 
174 |     assert not args.folder.endswith("/")
175 | 
176 |     if not args.root_folder:
177 |         sanitize_folder(args, args.folder)
178 |     else:
179 |         for folder in os.listdir(args.folder):
180 |             if os.path.isdir(f"{args.folder}/{folder}") and "sanitized" not in folder:
181 |                 sanitize_folder(args, f"{args.folder}/{folder}")
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     main()
186 | 
--------------------------------------------------------------------------------
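For reference, a minimal usage sketch for tool/sanitize.py above. The generation-folder name and the dataset name are assumptions inferred from sanitize_folder (one sub-directory per task, task_id with "/" replaced by "_"), not options documented in this repository:

# Hedged example: sanitize one folder of raw LLM generations.
# "generations/gpt-4_temp_0.0" and "EvoEval_difficult" are hypothetical placeholders.
import subprocess

subprocess.run(
    [
        "python", "tool/sanitize.py",
        "--folder", "generations/gpt-4_temp_0.0",  # one sub-dir per task_id ("/" -> "_"), each holding *.py samples
        "--dataset", "EvoEval_difficult",          # or "humaneval" to look up prompts via EvalPlus
        "--eof",                                   # also cut generations at known end-of-text markers
    ],
    check=True,
)
# Unless --inplace is given, sanitized files are written to generations/gpt-4_temp_0.0-sanitized/.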