├── .dockerignore ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── README.md ├── codegen ├── generate.py └── model.py ├── evoeval ├── __init__.py ├── data.py ├── eval_test │ ├── __init__.py │ ├── _creative_special_oracle.py │ ├── _difficult_special_oracle.py │ ├── _he_special_oracle.py │ └── _subtle_special_oracle.py ├── evaluate.py └── util │ └── api_request.py ├── pyproject.toml ├── requirements.txt ├── resources ├── butterfly_dark.png └── example.gif ├── setup.cfg └── tool └── sanitize.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | # nuclear option because steven uses PyCharm. 161 | .idea/ 162 | 163 | # VSCode 164 | .vscode/ 165 | *.jsonl 166 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | # nuclear option because steven uses PyCharm. 161 | .idea/ 162 | 163 | 164 | evoeval/_version.py 165 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pycqa/isort 3 | rev: 5.12.0 4 | hooks: 5 | - id: isort 6 | name: isort (python) 7 | args: ["--profile", "black"] 8 | - repo: https://github.com/psf/black 9 | rev: 22.6.0 10 | hooks: 11 | - id: black 12 | - repo: https://github.com/pre-commit/pre-commit-hooks 13 | rev: v4.3.0 14 | hooks: 15 | - id: check-yaml 16 | - id: end-of-file-fixer 17 | - id: trailing-whitespace 18 | exclude: (?x)^( 19 | resources/.*| 20 | README.* 21 | )$ 22 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # base env: py39 ubuntu20.04 2 | # 3.9 is needed for typing related stuff 3 | FROM python:3.9-slim-buster 4 | 5 | # install git 6 | RUN apt-get update && apt-get install -y git 7 | 8 | # upgrade to latest pip 9 | RUN pip install --upgrade pip 10 | 11 | COPY . /evoeval 12 | 13 | RUN cd /evoeval && ls -l && pip install . 14 | 15 | WORKDIR /app 16 | 17 | ENTRYPOINT ["python3", "-m", "evoeval.evaluate"] 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 
9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 
180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EvoEval: Evolving Coding Benchmarks via LLM 2 | 3 |

4 | 5 | 6 | 7 | 8 | 9 |

10 | 11 |

12 | ⚡Quick Start | 13 | 🔠Benchmarks | 14 | 🤖LLM Generated Code | 15 | 📝Citation | 16 | 🙏Acknowledgement 17 |

18 | 19 | ## About 20 | 21 | **EvoEval**¹ is a holistic benchmark suite created by _evolving_ **HumanEval** problems: 22 | - 🔥 Contains **828** new problems across **5** 🌠 semantic-altering and **2** ⭐ semantic-preserving benchmarks 23 | - 🔮 Allows evaluation/comparison across different **dimensions** and problem **types** (e.g., _Difficult_, _Creative_ or _Tool Use_ problems). See our [**visualization tool**](https://evo-eval.github.io/visualization.html) for ready-to-use comparisons 24 | - 🏆 Complete with [**leaderboard**](https://evo-eval.github.io/leaderboard.html), **groundtruth solutions**, **robust testcases** and **evaluation scripts** to easily fit into your evaluation pipeline 25 | - 🤖 Generated LLM code samples from **>50** different models to save you time in running experiments 26 | 27 | ¹ coincidentally similar in pronunciation to 😈 EvilEval 28 | 29 |

30 | 31 |

32 | 33 | Check out our 📃 [paper](https://arxiv.org/abs/2403.19114) and [webpage](https://evo-eval.github.io) for more details! 34 | 35 | 36 | 37 | ## ⚡ Quick Start 38 | 39 | Directly install the package: 40 | 41 | ```bash 42 | pip install evoeval --upgrade 43 | ``` 44 | 45 |
⏬ Nightly Version 46 |
47 | 48 | ```bash 49 | pip install "git+https://github.com/evo-eval/evoeval.git" --upgrade 50 | ``` 51 | 52 |
53 |
54 | 55 |
⏬ Local Repository 56 |
57 | 58 | ```bash 59 | git clone https://github.com/evo-eval/evoeval.git 60 | cd evoeval 61 | export PYTHONPATH=$PYTHONPATH:$(pwd) 62 | pip install -r requirements.txt 63 | ``` 64 | 65 |
66 |
67 | 68 | Now you are ready to download EvoEval benchmarks and perform evaluation! 69 | 70 | ### 🧑‍💻 Code generation 71 | 72 | To download our benchmarks, simply use the following code snippet: 73 | 74 | ```python 75 | from evoeval.data import get_evo_eval 76 | 77 | evoeval_benchmark = "EvoEval_difficult" # you can pick from 7 different benchmarks! 78 | 79 | problems = get_evo_eval(evoeval_benchmark) 80 | ``` 81 | 82 | For code generation and evaluation, we adopt the same style as [HumanEval+](https://github.com/evalplus/evalplus) and [HumanEval](https://github.com/openai/human-eval). 83 | 84 | Implement the `GEN_SOLUTION` function by calling the LLM to produce the complete solution (including the function header + code) and save the samples to `{benchmark}_samples.jsonl`: 85 | 86 | ```python 87 | from evoeval.data import get_evo_eval, write_jsonl 88 | 89 | evoeval_benchmark = "EvoEval_difficult" 90 | 91 | samples = [ 92 | dict(task_id=task_id, solution=GEN_SOLUTION(problem["prompt"])) 93 | for task_id, problem in get_evo_eval(evoeval_benchmark).items() 94 | ] 95 | write_jsonl(f"{evoeval_benchmark}_samples.jsonl", samples) 96 | ``` 97 | 98 | > [!TIP] 99 | > 100 | > EvoEval `samples.jsonl` expects the solution field to contain the **complete** code implementation; this is 101 | > slightly different from the original HumanEval, where the solution field only contains the function body. 102 | > 103 | > If you want to follow the HumanEval setup exactly, check out our 🤗 Huggingface [datasets](https://huggingface.co/evoeval), which can be directly run with the 104 | > HumanEval evaluation [script](https://huggingface.co/evoeval). 105 | 106 | ### 🕵️ Evaluation 107 | 108 | You can use our provided [docker](https://docs.docker.com/get-docker/) image: 109 | 110 | ```bash 111 | docker run --rm -v $(pwd):/app evoeval/evoeval:latest --dataset EvoEval_difficult --samples EvoEval_difficult_samples.jsonl 112 | ``` 113 | 114 | Or run it locally: 115 | 116 | ```bash 117 | evoeval.evaluate --dataset EvoEval_difficult --samples EvoEval_difficult_samples.jsonl 118 | ``` 119 | 120 | Or if you are using it as a local repository: 121 | 122 | ```bash 123 | export PYTHONPATH=$PYTHONPATH:$(pwd) 124 | python evoeval/evaluate.py --dataset EvoEval_difficult --samples EvoEval_difficult_samples.jsonl 125 | ``` 126 | 127 | You should expect to see the following output (when evaluated on GPT-4): 128 | ``` 129 | Computing expected output... 130 | Expected outputs computed in 11.24s 131 | Reading samples... 132 | 100it [00:00, 164.16it/s] 133 | 100%|████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 12.77it/s] 134 | EvoEval_difficult 135 | pass@1: 0.520 # for reference GPT-4 solves more than 80% of problems in HumanEval 136 | ``` 137 | This shows the pass@1 score for the EvoEval_difficult benchmark. You can use `--i-just-wanna-run` to recompute the evaluation result. 138 | 139 | > [!Note] 140 | > 141 | > You can also evaluate the LLM solutions in a folder format, with each subfolder containing 142 | > the LLM solution for each problem in the benchmark. 143 | > 144 | > For example, you can grab the [GPT-4 solutions](https://github.com/evo-eval/evoeval/releases/download/v0.1.0/gpt-4_temp_0.0.zip) in our [v0.1.0 release](https://github.com/evo-eval/evoeval/releases/tag/v0.1.0).
145 | > After unzipping, you can run the following command: 146 | > 147 | > ```bash 148 | > evoeval.evaluate --dataset EvoEval_difficult --samples gpt-4_temp_0.0/EvoEval_difficult 149 | > ``` 150 | > 151 | > to obtain the same result as above with the `.jsonl` samples. 152 | 153 | 154 | ## 🔠 Benchmarks 155 | 156 | **EvoEval** contains **7** different benchmarks, each with a unique set of problems 157 | evolved from the original **HumanEval** problems. 🌠 denotes semantic-altering benchmarks, 158 | while ⭐ denotes semantic-preserving benchmarks: 159 | 160 |
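As a quick orientation, here is a minimal sketch (an illustration only, assuming the dataset identifiers passed to `get_evo_eval` match the `EvoEval_*` names described below) that loads every benchmark and prints how many problems it contains:

```python
from evoeval.data import get_evo_eval

# The 7 EvoEval benchmarks (5 semantic-altering, 2 semantic-preserving).
BENCHMARKS = [
    "EvoEval_difficult",
    "EvoEval_creative",
    "EvoEval_subtle",
    "EvoEval_combine",
    "EvoEval_tool_use",
    "EvoEval_verbose",
    "EvoEval_concise",
]

for name in BENCHMARKS:
    problems = get_evo_eval(name)  # dict mapping task_id -> problem fields
    print(f"{name}: {len(problems)} problems")
```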
🌠EvoEval_difficult: 161 |
162 | 163 | > Introduce complexity by adding additional constraints and requirements, 164 | > replacing commonly used requirements with less common ones, or adding additional reasoning 165 | > steps to the original problem. 166 |
167 |
168 | 169 |
🌠EvoEval_creative: 170 |
171 | 172 | > Generate a more creative problem compared to the original through the use 173 | > of stories or uncommon narratives. 174 |
175 |
176 | 177 | 178 |
🌠EvoEval_subtle: 179 |
180 | 181 | > Make a subtle and minor change to the original problem such as inverting or 182 | > replacing a requirement. 183 |
184 |
185 | 186 | 187 |
🌠EvoEval_combine: 188 |
189 | 190 | > Combine two different problems by integrating the concepts from both problems. To select problems that make sense to combine, we apply a simple heuristic 191 | > that only combines problems of the same type, categorized based on the type of 192 | > input arguments in the original problem. 193 |
194 |
195 | 196 |
🌠EvoEval_tool_use: 197 |
198 | 199 | > Produce a new problem containing a main problem and one or more helper 200 | > functions which can be used to solve it. Each helper function is fully implemented and 201 | > provides hints or useful functionality for solving the main problem. The main problem 202 | > does not explicitly reference individual helper functions, and we do not require the model 203 | > to use the provided helpers. 204 |
205 |
206 | 207 | 208 |
⭐EvoEval_verbose: 209 |
210 | 211 | > Reword the original docstring to be more verbose. These verbose docstrings 212 | > can use more descriptive language to illustrate the problem, include detailed explanation 213 | > of the example output, and provide additional hints. 214 |
215 |
216 | 217 |
⭐EvoEval_concise: 218 |
219 | 220 | > Reword the original docstring to be more concise by removing unnecessary 221 | > details and using concise language. Furthermore, simple examples that are not required 222 | > to demonstrate edge cases may be removed. 223 | 224 |
225 |
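Coming back to the `GEN_SOLUTION` placeholder from the Code generation section above: it can be backed by any model you want to benchmark. Below is a minimal sketch assuming an OpenAI-style chat client; the model name, prompt wording, and code-fence stripping are illustrative assumptions, not part of EvoEval:

```python
import re

from openai import OpenAI  # assumption: benchmarking an OpenAI-hosted chat model

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def GEN_SOLUTION(prompt: str) -> str:
    """Return a complete implementation (function header + body) for one EvoEval prompt."""
    resp = client.chat.completions.create(
        model="gpt-4",  # illustrative model choice
        temperature=0.0,
        messages=[
            {
                "role": "user",
                "content": f"Please complete the following code snippet.\n```\n{prompt}\n```",
            }
        ],
    )
    text = resp.choices[0].message.content
    # Keep only the first fenced code block if the model wrapped its answer in one.
    match = re.search(r"```(?:python)?\n(.*?)```", text, re.DOTALL)
    return match.group(1) if match else text
```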
226 | 227 | For each problem in each **EvoEval** benchmark, we include the complete groundtruth as well as test cases for functional evaluation. 228 | 229 | > [!Note] 230 | > 231 | > **Problem Structure** 232 | > 233 | > ```json 234 | > { 235 | > "task_id": "identifier string for the task", 236 | > "entry_point": "name of the function", 237 | > "prompt": "function signature with docstring", 238 | > "canonical_solution": "groundtruth implementation", 239 | > "inputs": "test inputs for each problem", 240 | > "parent": "original HumanEval problem it evolved from", 241 | > "main": "special field of EvoEval_tool_use to show just the main problem description", 242 | > "helpers": "special field of EvoEval_tool_use to show the helper functions" 243 | > } 244 | > ``` 245 | 246 | ## 🤖 LLM Generated Code 247 | 248 | To view the performance of **>50** LLMs on the EvoEval benchmarks, 249 | we provide a complete [leaderboard](https://evo-eval.github.io/leaderboard.html) as well as a 250 | [visualization tool](https://evo-eval.github.io/visualization.html) to compare the performance of different models. 251 | 252 | Further, we also provide all code samples from LLMs on the **EvoEval** benchmarks: 253 | 254 | * See the attachment of our [v0.1.0 release](https://github.com/evo-eval/evoeval/releases/tag/v0.1.0). 255 | 256 | Each LLM generation is packaged in a zip file named like `{model_name}_temp_0.0.zip`. You can unzip the folder and obtain the 257 | LLM generation for each of our 7 benchmarks + the original HumanEval problems. Note that we only evaluate the greedy output for each LLM. 258 | 259 | ## 📝 Citation 260 | 261 | ```bibtex 262 | @article{evoeval, 263 | author = {Xia, Chunqiu Steven and Deng, Yinlin and Zhang, Lingming}, 264 | title = {Top Leaderboard Ranking = Top Coding Proficiency, Always? 
EvoEval: Evolving Coding Benchmarks via LLM}, 265 | year = {2024}, 266 | journal = {arXiv preprint}, 267 | } 268 | ``` 269 | 270 | > [!Note] 271 | > 272 | > The first two authors contributed equally to this work, with author order determined via [_Nigiri_](https://senseis.xmp.net/?Nigiri) 273 | 274 | ## 🙏 Acknowledgement 275 | 276 | * [HumanEval](https://github.com/openai/human-eval) 277 | * We especially thank [EvalPlus](https://github.com/evalplus/evalplus) 278 | 279 | 280 | 281 | -------------------------------------------------------------------------------- /codegen/generate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from os import PathLike 4 | 5 | from evalplus.data import get_human_eval_plus 6 | from model import DecoderBase, make_model 7 | from rich.progress import ( 8 | BarColumn, 9 | MofNCompleteColumn, 10 | Progress, 11 | TextColumn, 12 | TimeElapsedColumn, 13 | ) 14 | 15 | from evoeval.data import get_evo_eval 16 | 17 | 18 | def construct_contract_prompt(prompt: str, contract_type: str, contract: str) -> str: 19 | if contract_type == "no": 20 | return prompt 21 | elif contract_type == "docstring": 22 | # embed within the docstring 23 | sep = "" 24 | if '"""' in prompt: 25 | sep = '"""' 26 | elif "'''" in prompt: 27 | sep = "'''" 28 | assert sep != "" 29 | l = prompt.split(sep) 30 | contract = "\n".join([x.split("#")[0] for x in contract.splitlines()]) 31 | l[1] = ( 32 | l[1] + contract + "\n" + " " * (len(contract) - len(contract.lstrip()) - 1) 33 | ) 34 | return sep.join(l) 35 | elif contract_type == "code": 36 | # at the beginning of the function 37 | contract = "\n".join([x.split("#")[0] for x in contract.splitlines()]) 38 | return prompt + contract 39 | 40 | 41 | def code_generate(args, workdir: PathLike, model: DecoderBase, id_range=None): 42 | with Progress( 43 | TextColumn( 44 | f"{args.dataset} •" + "[progress.percentage]{task.percentage:>3.0f}%" 45 | ), 46 | BarColumn(), 47 | MofNCompleteColumn(), 48 | TextColumn("•"), 49 | TimeElapsedColumn(), 50 | ) as p: 51 | if args.dataset == "humaneval": 52 | dataset = get_human_eval_plus() 53 | else: 54 | dataset = get_evo_eval(args.dataset) 55 | 56 | for task_id, task in p.track(dataset.items()): 57 | if id_range is not None: 58 | id_num = int(task_id.split("/")[1]) 59 | low, high = id_range 60 | if id_num < low or id_num >= high: 61 | p.console.print(f"Skipping {task_id} as it is not in {id_range}") 62 | continue 63 | 64 | p_name = task_id.replace("/", "_") 65 | if args.use_contracts != "no" and task["contract"] == "": 66 | continue 67 | os.makedirs(os.path.join(workdir, p_name), exist_ok=True) 68 | log = f"Codegen: {p_name} @ {model}" 69 | n_existing = 0 70 | if args.resume: 71 | # count existing .py files 72 | n_existing = len( 73 | [ 74 | f 75 | for f in os.listdir(os.path.join(workdir, p_name)) 76 | if f.endswith(".py") 77 | ] 78 | ) 79 | if n_existing > 0: 80 | log += f" (resuming from {n_existing})" 81 | 82 | nsamples = args.n_samples - n_existing 83 | p.console.print(log) 84 | 85 | sidx = args.n_samples - nsamples 86 | while sidx < args.n_samples: 87 | outputs = model.codegen( 88 | construct_contract_prompt( 89 | task["prompt"], args.use_contracts, task["contract"] 90 | ), 91 | do_sample=not args.greedy, 92 | num_samples=args.n_samples - sidx, 93 | ) 94 | assert outputs, "No outputs from model!" 
95 | for impl in outputs: 96 | try: 97 | with open( 98 | os.path.join(workdir, p_name, f"{sidx}.py"), 99 | "w", 100 | encoding="utf-8", 101 | ) as f: 102 | if model.conversational: 103 | f.write(impl) 104 | else: 105 | f.write(task["prompt"] + impl) 106 | except UnicodeEncodeError: 107 | continue 108 | sidx += 1 109 | 110 | 111 | def main(): 112 | parser = argparse.ArgumentParser() 113 | parser.add_argument("--model", required=True, type=str) 114 | parser.add_argument("--bs", required=True, type=int) 115 | parser.add_argument("--temperature", required=True, type=float) 116 | parser.add_argument("--dataset", default="evileval", type=str) 117 | parser.add_argument("--root", type=str, required=True) 118 | parser.add_argument("--n_samples", default=200, type=int) 119 | parser.add_argument("--resume", action="store_true") 120 | parser.add_argument("--use_contracts", default="no", type=str) 121 | parser.add_argument("--greedy", action="store_true") 122 | # id_range is list 123 | parser.add_argument("--id-range", default=None, nargs="+", type=int) 124 | args = parser.parse_args() 125 | 126 | # if args.dataset not in ["evileval", "humaneval"]: 127 | # raise NotImplementedError("Unsupported dataset: {}".format(args.dataset)) 128 | 129 | if args.use_contracts not in ["no", "code", "docstring"]: 130 | raise NotImplementedError( 131 | "Unsupported contract usage: {}".format(args.use_contracts) 132 | ) 133 | if args.greedy and (args.temperature != 0 or args.bs != 1 or args.n_samples != 1): 134 | raise ValueError( 135 | f"Greedy decoding is only supported with temperature({args.temperature}) = 0, batch_size({args.bs}) = 1" 136 | f" and n_samples({args.n_samples}) = 1" 137 | ) 138 | 139 | if args.id_range is not None: 140 | assert len(args.id_range) == 2, "id_range must be a list of length 2" 141 | assert args.id_range[0] < args.id_range[1], "id_range must be increasing" 142 | args.id_range = tuple(args.id_range) 143 | 144 | # Make project dir 145 | os.makedirs(args.root, exist_ok=True) 146 | # Make dataset dir 147 | os.makedirs(os.path.join(args.root, args.dataset), exist_ok=True) 148 | # Make dir for codes generated by each model 149 | args.model = args.model.lower() 150 | model = make_model( 151 | name=args.model, batch_size=args.bs, temperature=args.temperature 152 | ) 153 | workdir = os.path.join( 154 | args.root, 155 | args.dataset, 156 | args.model 157 | + f"_temp_{args.temperature}" 158 | + ("" if args.use_contracts == "no" else f"-contract-{args.use_contracts}"), 159 | ) 160 | os.makedirs(workdir, exist_ok=True) 161 | 162 | with open(os.path.join(workdir, "args.txt"), "w") as f: 163 | f.write(str(args)) 164 | 165 | code_generate(args, workdir=workdir, model=model, id_range=args.id_range) 166 | 167 | 168 | if __name__ == "__main__": 169 | main() 170 | -------------------------------------------------------------------------------- /codegen/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC, abstractmethod 3 | from typing import List 4 | from warnings import warn 5 | 6 | # Communism 7 | os.environ["HF_HOME"] = os.environ.get("HF_HOME", "/ColossalTitan/huggingface/") 8 | 9 | import anthropic 10 | import google.generativeai as genai 11 | import openai 12 | import torch 13 | from transformers import ( 14 | AutoModelForCausalLM, 15 | AutoModelForSeq2SeqLM, 16 | AutoTokenizer, 17 | StoppingCriteria, 18 | StoppingCriteriaList, 19 | ) 20 | from vllm import LLM, SamplingParams 21 | 22 | from evoeval.util.api_request import ( 23 
| create_anthropic_config, 24 | create_chatgpt_config, 25 | create_gemini_config, 26 | create_palm_config, 27 | num_tokens_from_messages, 28 | request_anthropic_engine, 29 | request_chatgpt_engine, 30 | request_gemini_engine, 31 | request_palm_engine, 32 | ) 33 | 34 | HUMANEVAL_EOS = ["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif"] 35 | NON_CODE_EOS = ["<|endoftext|>", "\n```", "\n", "<|endofmask|>"] 36 | EOS = HUMANEVAL_EOS + NON_CODE_EOS 37 | 38 | 39 | # Adopted from https://github.com/huggingface/transformers/pull/14897 40 | class EndOfFunctionCriteria(StoppingCriteria): 41 | def __init__(self, start_length, eos, tokenizer, *args, **kwargs): 42 | super().__init__(*args, **kwargs) 43 | self.start_length = start_length 44 | self.eos = eos 45 | self.tokenizer = tokenizer 46 | self.end_length = {} 47 | 48 | def __call__(self, input_ids, scores, **kwargs): 49 | """Returns true if all generated sequences contain any of the end-of-function strings.""" 50 | decoded_generations = self.tokenizer.batch_decode( 51 | input_ids[:, self.start_length :] 52 | ) 53 | done = [] 54 | for index, decoded_generation in enumerate(decoded_generations): 55 | finished = any( 56 | [stop_string in decoded_generation for stop_string in self.eos] 57 | ) 58 | if ( 59 | finished and index not in self.end_length 60 | ): # ensures first time we see it 61 | for stop_string in self.eos: 62 | if stop_string in decoded_generation: 63 | self.end_length[index] = len( 64 | input_ids[ 65 | index, # get length of actual generation 66 | self.start_length : -len( 67 | self.tokenizer.encode( 68 | stop_string, 69 | add_special_tokens=False, 70 | return_tensors="pt", 71 | )[0] 72 | ), 73 | ] 74 | ) 75 | done.append(finished) 76 | return all(done) 77 | 78 | 79 | class DecoderBase(ABC): 80 | def __init__( 81 | self, 82 | name: str, 83 | batch_size: int = 1, 84 | temperature: float = 0.8, 85 | max_new_tokens: int = 512, 86 | conversational: bool = False, 87 | body: bool = False, 88 | ) -> None: 89 | print("Initializing a decoder model: {} ...".format(name)) 90 | self.name = name 91 | self.batch_size = batch_size 92 | self.temperature = temperature 93 | self.eos = EOS 94 | self.skip_special_tokens = False 95 | self.max_new_tokens = max_new_tokens 96 | self.conversational = conversational 97 | self.body = body 98 | 99 | @abstractmethod 100 | def codegen( 101 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 102 | ) -> List[str]: 103 | pass 104 | 105 | def __repr__(self) -> str: 106 | return self.name 107 | 108 | def __str__(self) -> str: 109 | return self.name 110 | 111 | 112 | class VLlmDecoder(DecoderBase): 113 | def __init__( 114 | self, 115 | name: str, 116 | batch_size: int = 1, 117 | temperature: float = 0.8, 118 | max_new_tokens: int = 512, 119 | conversational: bool = False, 120 | ) -> None: 121 | super().__init__(name, batch_size, temperature, max_new_tokens, conversational) 122 | kwargs = {"tensor_parallel_size": int(os.getenv("VLLM_N_GPUS", "1"))} 123 | 124 | if "CodeLlama" in name: 125 | kwargs["dtype"] = "bfloat16" 126 | elif "code-millenials" in name: 127 | kwargs["dtype"] = "float16" 128 | elif "uukuguy/speechless-code-mistral-7b-v1.0" == name: 129 | kwargs["dtype"] = "float16" 130 | elif "uukuguy/speechless-codellama-34b-v2.0" == name: 131 | kwargs["dtype"] = "float16" 132 | elif "CodeBooga" in name: 133 | kwargs["dtype"] = "float16" 134 | elif "WizardCoder" in name and "V1.1" in name: 135 | kwargs["dtype"] = "bfloat16" 136 | elif "WizardCoder" in name: 137 | kwargs["dtype"] = "float16" 138 | elif 
"deepseek" in name: 139 | kwargs["dtype"] = "bfloat16" 140 | elif "mixtral" in name.lower(): 141 | kwargs["dtype"] = "bfloat16" 142 | elif "solar" in name: 143 | kwargs["dtype"] = "float16" 144 | elif "mistral" in name.lower(): 145 | kwargs["dtype"] = "bfloat16" 146 | elif "phi" in name.lower(): 147 | kwargs["dtype"] = "float16" 148 | kwargs["trust_remote_code"] = True 149 | elif "openchat" in name.lower(): 150 | kwargs["dtype"] = "bfloat16" 151 | 152 | # reset the eos 153 | self.eos = [] 154 | self.llm = LLM(model=name, max_model_len=2048, **kwargs) 155 | 156 | def codegen( 157 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 158 | ) -> List[str]: 159 | if do_sample: 160 | assert self.temperature > 0, "Temperature must be greater than 0!" 161 | batch_size = min(self.batch_size, num_samples) 162 | 163 | vllm_outputs = self.llm.generate( 164 | [prompt] * batch_size, 165 | SamplingParams( 166 | temperature=self.temperature, 167 | max_tokens=self.max_new_tokens 168 | + len(self.llm.get_tokenizer().encode(prompt, return_tensors="pt")[0]), 169 | top_p=0.95 if do_sample else 1.0, 170 | stop=self.eos, 171 | ), 172 | use_tqdm=False, 173 | ) 174 | 175 | gen_strs = [x.outputs[0].text.replace("\t", " ") for x in vllm_outputs] 176 | 177 | return gen_strs 178 | 179 | 180 | class CodeLlamaInstructSmall(VLlmDecoder): 181 | def __init__(self, name: str, **kwargs) -> None: 182 | kwargs["conversational"] = True 183 | super().__init__(name, **kwargs) 184 | self.eos += ["\n```"] 185 | 186 | def codegen( 187 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 188 | ) -> List[str]: 189 | if do_sample: 190 | assert self.temperature > 0, "Temperature must be greater than 0!" 191 | 192 | input = f"""[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```: 193 | ```python 194 | {prompt} 195 | ``` 196 | [/INST] 197 | ```python 198 | """ 199 | 200 | return VLlmDecoder.codegen(self, input, do_sample, num_samples) 201 | 202 | 203 | class Alpaca(VLlmDecoder): 204 | def __init__(self, name: str, **kwargs) -> None: 205 | kwargs["conversational"] = True 206 | super().__init__(name, **kwargs) 207 | self.eos += ["\n```"] 208 | 209 | def codegen( 210 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 211 | ) -> List[str]: 212 | prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes request. 213 | 214 | ### Instruction: 215 | Create a Python script for this problem: 216 | {prompt} 217 | 218 | ### Response: 219 | ```python 220 | """ 221 | return VLlmDecoder.codegen(self, prompt, do_sample, num_samples) 222 | 223 | 224 | class OpenChat(VLlmDecoder): 225 | def __init__(self, name: str, **kwargs) -> None: 226 | kwargs["conversational"] = True 227 | super().__init__(name, **kwargs) 228 | self.eos += ["\n```"] 229 | 230 | def codegen( 231 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 232 | ) -> List[str]: 233 | if do_sample: 234 | assert self.temperature > 0, "Temperature must be greater than 0!" 235 | 236 | input = f"""GPT4 Correct User: Can you complete the following Python function? 
237 | ```python 238 | {prompt} 239 | ``` 240 | <|end_of_turn|>GPT4 Correct Assistant: 241 | ```python 242 | """ 243 | return VLlmDecoder.codegen(self, input, do_sample, num_samples) 244 | 245 | 246 | class WizardCoderDecoder(VLlmDecoder): 247 | def codegen( 248 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 249 | ) -> List[str]: 250 | if do_sample: 251 | assert self.temperature > 0, "Temperature must be greater than 0!" 252 | 253 | prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request. 254 | 255 | 256 | ### Instruction: 257 | Create a Python script for this problem: 258 | {prompt} 259 | 260 | ### Response:""" 261 | 262 | batch_size = min(self.batch_size, num_samples) 263 | 264 | num_of_tokens = len( 265 | self.llm.get_tokenizer().encode(prompt, return_tensors="pt")[0] 266 | ) 267 | 268 | vllm_outputs = self.llm.generate( 269 | [prompt] * batch_size, 270 | SamplingParams( 271 | temperature=self.temperature, 272 | max_tokens=num_of_tokens + self.max_new_tokens, 273 | top_p=0.95 if do_sample else 1.0, 274 | ), 275 | use_tqdm=False, 276 | ) 277 | 278 | return [x.outputs[0].text.replace("\t", " ") for x in vllm_outputs] 279 | 280 | 281 | class XwinCoder(VLlmDecoder): 282 | def __init__(self, name: str, **kwargs) -> None: 283 | kwargs["conversational"] = True 284 | super().__init__(name, **kwargs) 285 | self.eos += ["\n```"] 286 | 287 | def codegen( 288 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 289 | ) -> List[str]: 290 | 291 | prompt = f""": You are an AI coding assistant that helps people with programming. Write a response that appropriately completes the user's request. 292 | : Complete the following code for me and return a fully runable code. 
293 | ```python 294 | {prompt} 295 | ``` 296 | : 297 | ```python 298 | """ 299 | return VLlmDecoder.codegen(self, prompt, do_sample, num_samples) 300 | 301 | 302 | class HFTorchDecoder(DecoderBase): 303 | def __init__(self, name: str, **kwargs): 304 | super().__init__(name=name, **kwargs) 305 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 306 | kwargs = { 307 | "trust_remote_code": name 308 | in { 309 | "bigcode/santacoder", 310 | "Salesforce/codegen2-1B", 311 | "Salesforce/codegen2-3_7B", 312 | "Salesforce/codegen2-7B", 313 | "Salesforce/codegen2-16B", 314 | "deepseek-ai/deepseek-coder-6.7b-base", 315 | "deepseek-ai/deepseek-coder-33b-base", 316 | "stabilityai/stable-code-3b", 317 | "Qwen/Qwen-14B-Chat", 318 | "Qwen/Qwen-7B-Chat", 319 | } 320 | } 321 | 322 | if "codegen-" in name: # use fp16 for codegen models 323 | kwargs["torch_dtype"] = torch.float16 324 | if "codegen2-" in name: # avoid warning of trust remote code 325 | kwargs["revision"] = "main" 326 | if "16b" in name.lower(): 327 | kwargs["device_map"] = "auto" 328 | if "starcoder2" in name: 329 | kwargs["device_map"] = "auto" 330 | if "starcoder" in name: 331 | kwargs["torch_dtype"] = torch.bfloat16 332 | if "CodeLlama" in name: 333 | if "34b" in name.lower() or "70b" in name.lower(): 334 | kwargs["device_map"] = "auto" 335 | kwargs["torch_dtype"] = torch.bfloat16 336 | self.skip_special_tokens = True 337 | if "CodeBooga" in name: 338 | kwargs["torch_dtype"] = torch.float16 339 | kwargs["device_map"] = "auto" 340 | self.skip_special_tokens = True 341 | if "Mistral-7B-codealpaca-lora" == name: 342 | kwargs["torch_dtype"] = torch.float16 343 | self.skip_special_tokens = True 344 | elif "Mistral" in name or "zephyr-7b-beta" in name: 345 | kwargs["torch_dtype"] = torch.bfloat16 346 | if "Mixtral" in name: 347 | kwargs["torch_dtype"] = torch.bfloat16 348 | kwargs["device_map"] = "auto" 349 | if "deepseek" in name: 350 | kwargs["torch_dtype"] = torch.bfloat16 351 | if "33b" in name.lower(): 352 | kwargs["device_map"] = "auto" 353 | self.skip_special_tokens = True 354 | if "/phi" in name: 355 | kwargs["torch_dtype"] = torch.float16 356 | kwargs["trust_remote_code"] = True 357 | self.skip_special_tokens = True 358 | if "Qwen" in name: 359 | kwargs["torch_dtype"] = torch.bfloat16 360 | self.skip_special_tokens = True 361 | if "72B" in name: 362 | kwargs["device_map"] = "auto" 363 | if "Phind" in name: 364 | kwargs["torch_dtype"] = torch.bfloat16 365 | kwargs["device_map"] = "auto" 366 | if "gemma" in name: 367 | kwargs["torch_dtype"] = torch.bfloat16 368 | if "Magicoder" in name: 369 | kwargs["torch_dtype"] = torch.bfloat16 370 | kwargs["device_map"] = "auto" 371 | 372 | print(f"{kwargs = }") 373 | 374 | self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs) 375 | self.model = AutoModelForCausalLM.from_pretrained(name, **kwargs) 376 | if name in {"StabilityAI/stablelm-base-alpha-7b"}: 377 | print("Switching to float16 ...") 378 | self.model = self.model.half() 379 | self.skip_special_tokens = True 380 | 381 | if "device_map" not in kwargs: 382 | self.model = self.model.to(self.device) 383 | 384 | @torch.inference_mode() 385 | def codegen( 386 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 387 | ) -> List[str]: 388 | if self.temperature == 0: 389 | assert not do_sample 390 | assert num_samples == 1 391 | 392 | input_tokens = self.tokenizer.encode(prompt, return_tensors="pt").to( 393 | self.device 394 | ) 395 | scores = StoppingCriteriaList( 396 | [ 397 | EndOfFunctionCriteria( 398 
| start_length=len(input_tokens[0]), 399 | eos=self.eos, 400 | tokenizer=self.tokenizer, 401 | ) 402 | ] 403 | ) 404 | kwargs = {} 405 | if do_sample: 406 | kwargs["top_p"] = 0.95 407 | kwargs["temperature"] = self.temperature 408 | 409 | raw_outputs = self.model.generate( 410 | input_tokens, 411 | max_new_tokens=self.max_new_tokens, 412 | stopping_criteria=scores, 413 | do_sample=do_sample, 414 | output_scores=True, 415 | return_dict_in_generate=True, 416 | num_return_sequences=min(self.batch_size, num_samples), 417 | pad_token_id=self.tokenizer.eos_token_id, 418 | **kwargs, 419 | ) # remove warning 420 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :] 421 | gen_strs = self.tokenizer.batch_decode( 422 | gen_seqs, skip_special_tokens=self.skip_special_tokens 423 | ) 424 | outputs = [] 425 | # removes eos tokens. 426 | for output in gen_strs: 427 | min_index = 10000 428 | for eos in self.eos: 429 | if eos in output: 430 | # could be multiple eos in outputs, better pick minimum one 431 | min_index = min(min_index, output.index(eos)) 432 | outputs.append(output[:min_index]) 433 | return outputs 434 | 435 | 436 | class CodeLlamaInstructLarge(HFTorchDecoder): 437 | def __init__(self, name: str, **kwargs) -> None: 438 | kwargs["conversational"] = True 439 | super().__init__(name, **kwargs) 440 | self.eos = ["\n```"] 441 | 442 | def codegen( 443 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 444 | ) -> List[str]: 445 | if do_sample: 446 | assert self.temperature > 0, "Temperature must be greater than 0!" 447 | 448 | input = f"""'Source: system 449 | 450 | You are a helpful and honest code assistant expert in Python. Please, provide all answers to programming questions in Python. 451 | Source: user 452 | 453 | Provide a self-contained Python script that solves the following problem: 454 | ```python 455 | {prompt} 456 | ``` 457 | Source: assistant 458 | 459 | Here is a Python script that solves the problem: 460 | ```python 461 | """ 462 | 463 | input_tokens = self.tokenizer.encode(input, return_tensors="pt").to(self.device) 464 | scores = StoppingCriteriaList( 465 | [ 466 | EndOfFunctionCriteria( 467 | start_length=len(input_tokens[0]), 468 | eos=self.eos, 469 | tokenizer=self.tokenizer, 470 | ) 471 | ] 472 | ) 473 | kwargs = {} 474 | if do_sample: 475 | kwargs["top_p"] = 0.95 476 | kwargs["temperature"] = self.temperature 477 | 478 | max_new_tokens = self.max_new_tokens + len( 479 | self.tokenizer.encode(prompt, return_tensors="pt")[0] 480 | ) 481 | 482 | raw_outputs = self.model.generate( 483 | input_tokens, 484 | max_new_tokens=max_new_tokens, 485 | stopping_criteria=scores, 486 | do_sample=do_sample, 487 | output_scores=True, 488 | return_dict_in_generate=True, 489 | num_return_sequences=min(self.batch_size, num_samples), 490 | pad_token_id=self.tokenizer.eos_token_id, 491 | **kwargs, 492 | ) # remove warning 493 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :] 494 | gen_strs = self.tokenizer.batch_decode( 495 | gen_seqs, skip_special_tokens=self.skip_special_tokens 496 | ) 497 | outputs = [] 498 | # removes eos tokens. 
499 | for output in gen_strs: 500 | min_index = 10000 501 | for eos in self.eos: 502 | if eos in output: 503 | # could be multiple eos in outputs, better pick minimum one 504 | min_index = min(min_index, output.index(eos)) 505 | outputs.append(output[:min_index]) 506 | return outputs 507 | 508 | 509 | class QwenInstruct(HFTorchDecoder): 510 | 511 | generation_template = "Please implement the following Python function in a markdown style code block:\n\n```python\n{prompt}\n```\n" 512 | incorrect_code_template = "```python\n{incorrect_solution}\n```\n" 513 | feedback_template = "{feedback}" 514 | 515 | @torch.inference_mode() 516 | def codegen( 517 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 518 | ) -> List[str]: 519 | if self.temperature == 0: 520 | assert not do_sample 521 | assert num_samples == 1 522 | content = self.generation_template.format(prompt=prompt) 523 | 524 | input_tokens = self.tokenizer.apply_chat_template( 525 | [ 526 | { 527 | "role": "user", 528 | "content": content, 529 | } 530 | ], 531 | add_generation_prompt=True, 532 | return_tensors="pt", 533 | ).to(self.device) 534 | 535 | max_token = len(input_tokens[0]) + self.max_new_tokens 536 | 537 | kwargs = {} 538 | if do_sample: 539 | kwargs["top_p"] = 0.95 540 | kwargs["temperature"] = self.temperature 541 | 542 | raw_outputs = self.model.generate( 543 | input_tokens, 544 | max_new_tokens=max_token, 545 | do_sample=do_sample, 546 | output_scores=True, 547 | return_dict_in_generate=True, 548 | top_k=50, 549 | num_return_sequences=min(self.batch_size, num_samples), 550 | pad_token_id=self.tokenizer.eos_token_id, 551 | **kwargs, 552 | ) # remove warning 553 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :] 554 | gen_strs = self.tokenizer.batch_decode( 555 | gen_seqs, skip_special_tokens=self.skip_special_tokens 556 | ) 557 | return gen_strs 558 | 559 | 560 | class DeepSeekInstruct(HFTorchDecoder): 561 | 562 | generation_template = "Please implement the following Python function in a markdown style code block:\n\n```python\n{prompt}\n```\n" 563 | incorrect_code_template = "```python\n{incorrect_solution}\n```\n" 564 | feedback_template = "{feedback}" 565 | 566 | @torch.inference_mode() 567 | def codegen( 568 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 569 | ) -> List[str]: 570 | if self.temperature == 0: 571 | assert not do_sample 572 | assert num_samples == 1 573 | content = self.generation_template.format(prompt=prompt) 574 | 575 | input_tokens = self.tokenizer.apply_chat_template( 576 | [ 577 | { 578 | "role": "user", 579 | "content": content, 580 | } 581 | ], 582 | add_generation_prompt=True, 583 | return_tensors="pt", 584 | ).to(self.device) 585 | 586 | # set instruction model to have more max_tokens TODO: for all models 587 | max_token = len(input_tokens[0]) + self.max_new_tokens 588 | 589 | kwargs = {} 590 | if do_sample: 591 | kwargs["top_p"] = 0.95 592 | kwargs["temperature"] = self.temperature 593 | 594 | raw_outputs = self.model.generate( 595 | input_tokens, 596 | max_new_tokens=max_token, 597 | do_sample=do_sample, 598 | output_scores=True, 599 | return_dict_in_generate=True, 600 | top_k=50, 601 | num_return_sequences=min(self.batch_size, num_samples), 602 | pad_token_id=self.tokenizer.eos_token_id, 603 | eos_token_id=32021, 604 | **kwargs, 605 | ) # remove warning 606 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :] 607 | gen_strs = self.tokenizer.batch_decode( 608 | gen_seqs, skip_special_tokens=self.skip_special_tokens 609 | ) 610 | return 
gen_strs 611 | # return [x.split("```python")[-1].split("```")[0] for x in gen_strs] 612 | 613 | 614 | class MistralInstruct(DeepSeekInstruct): 615 | pass # just use the same as DeepSeekInstruct 616 | 617 | 618 | class MixtralSPMXInstruct(DeepSeekInstruct): 619 | pass # just use the same as DeepSeekInstruct 620 | 621 | 622 | class GemmaInstruct(QwenInstruct): 623 | pass # just use the same as QwenInstruct 624 | 625 | 626 | class MagicCoderInstruct(DeepSeekInstruct): 627 | 628 | generation_template = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n@@ Instruction\nWrite a solution to the following problem:\n```python\n{prompt}\n```\n\n@@ Response\n""" 629 | 630 | @torch.inference_mode() 631 | def codegen( 632 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 633 | ) -> List[str]: 634 | if self.temperature == 0: 635 | assert not do_sample 636 | assert num_samples == 1 637 | content = self.generation_template.format(prompt=prompt) 638 | 639 | input_tokens = self.tokenizer.encode(content, return_tensors="pt").to( 640 | self.device 641 | ) 642 | 643 | max_token = len(input_tokens[0]) + self.max_new_tokens 644 | 645 | kwargs = {} 646 | if do_sample: 647 | kwargs["top_p"] = 0.95 648 | kwargs["temperature"] = self.temperature 649 | 650 | raw_outputs = self.model.generate( 651 | input_tokens, 652 | max_new_tokens=max_token, 653 | do_sample=do_sample, 654 | output_scores=True, 655 | return_dict_in_generate=True, 656 | top_k=50, 657 | num_return_sequences=min(self.batch_size, num_samples), 658 | pad_token_id=self.tokenizer.eos_token_id, 659 | eos_token_id=self.tokenizer.eos_token_id, 660 | **kwargs, 661 | ) # remove warning 662 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :] 663 | gen_strs = self.tokenizer.batch_decode( 664 | gen_seqs, skip_special_tokens=self.skip_special_tokens 665 | ) 666 | return gen_strs 667 | 668 | 669 | class AnthropicDecoder(DecoderBase): 670 | generation_template = ( 671 | "Please complete the following code snippet.\n```\n{prompt}\n```" 672 | ) 673 | 674 | def __init__(self, name: str, model_name: str = "gpt-3.5-turbo", **kwargs) -> None: 675 | super().__init__(name, **kwargs) 676 | self.model_name = model_name 677 | self.client = anthropic.Anthropic( 678 | api_key=os.getenv("ANTHROPIC_API_KEY", "dummy") 679 | ) 680 | 681 | def _anthrophic_parse(self, ret, prompt, body=False): 682 | outputs = [] 683 | for returns in ret.content: 684 | raw_o = returns.text 685 | outputs.append(raw_o) 686 | return outputs 687 | 688 | def codegen( 689 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 690 | ) -> List[str]: 691 | if do_sample: 692 | assert self.temperature > 0, "Temperature must be positive for sampling" 693 | 694 | batch_size = min(self.batch_size, num_samples) 695 | assert batch_size <= 20, "Use larger batch size could blow up the memory!" 
696 | 697 | message = self.generation_template.format(prompt=prompt.strip()) 698 | 699 | # estimation 700 | num_tokens = num_tokens_from_messages(message, self.model_name) 701 | 702 | config = create_anthropic_config( 703 | message=message, 704 | max_tokens=num_tokens + self.max_new_tokens, 705 | temperature=self.temperature, 706 | batch_size=batch_size, 707 | model=self.model_name, 708 | ) 709 | ret = request_anthropic_engine(self.client, config) 710 | return self._anthrophic_parse(ret, prompt.strip(), body=self.body) 711 | 712 | 713 | class PalmDecoder(DecoderBase): 714 | generation_template = ( 715 | "Please complete the following code snippet.\n```\n{prompt}\n```" 716 | ) 717 | 718 | def __init__(self, name: str, model_name: str = "palm", **kwargs) -> None: 719 | super().__init__(name, **kwargs) 720 | genai.configure(api_key=os.getenv("GOOGLE_API_KEY", "dummy")) 721 | self.model_name = model_name 722 | 723 | def _palm_parse(self, ret, prompt): 724 | outputs = [] 725 | raw_o = ret.result 726 | outputs.append(raw_o) 727 | return outputs 728 | 729 | def codegen( 730 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 731 | ) -> List[str]: 732 | if do_sample: 733 | assert self.temperature > 0, "Temperature must be positive for sampling" 734 | 735 | batch_size = min(self.batch_size, num_samples) 736 | assert batch_size <= 20, "Use larger batch size could blow up the memory!" 737 | 738 | message = self.generation_template.format(prompt=prompt.strip()) 739 | 740 | # approximate ge 741 | num_tokens = num_tokens_from_messages(message, self.model_name) 742 | 743 | config = create_palm_config( 744 | message=message, 745 | max_tokens=num_tokens + self.max_new_tokens, 746 | temperature=self.temperature, 747 | batch_size=batch_size, 748 | model=self.model_name, 749 | ) 750 | ret = request_palm_engine(genai, config) 751 | # if "gpt-3.5" in self.model_name: 752 | return self._palm_parse(ret, prompt.strip()) 753 | 754 | 755 | class GeminiChatDecoder(DecoderBase): 756 | generation_template = ( 757 | "Please complete the following code snippet.\n```\n{prompt}\n```" 758 | ) 759 | 760 | def __init__( 761 | self, name: str, model_name: str = "models/gemini-pro", **kwargs 762 | ) -> None: 763 | super().__init__(name, **kwargs) 764 | self.model_name = model_name 765 | genai.configure(api_key=os.getenv("GOOGLE_API_KEY", "dummy")) 766 | self.model = genai.GenerativeModel(self.model_name) 767 | 768 | @staticmethod 769 | def _find_gen_func_sig(prompt): 770 | func_sig = "" 771 | for x in prompt.splitlines(): 772 | if x.startswith("def ") and x.endswith(":"): 773 | # always pick the last one, since there could pre-defined functions. 774 | func_sig = x 775 | return func_sig 776 | 777 | @staticmethod 778 | def _remove_eos(gen): 779 | min_index = 100000000 780 | for eos in EOS: 781 | if eos in gen: 782 | min_index = min(min_index, gen.index(eos)) 783 | return gen[:min_index] 784 | 785 | def _gemini_parse(self, ret, prompt): 786 | outputs = [] 787 | raw_o = ret.text 788 | outputs.append(raw_o) 789 | return outputs 790 | 791 | def codegen( 792 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 793 | ) -> List[str]: 794 | if do_sample: 795 | assert self.temperature > 0, "Temperature must be positive for sampling" 796 | 797 | batch_size = min(self.batch_size, num_samples) 798 | assert batch_size <= 20, "Use larger batch size could blow up the memory!" 
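        # The request below goes through create_gemini_config (a GenerationConfig
        # with candidate_count=batch_size) and request_gemini_engine, which applies
        # permissive safety settings and gradually raises the temperature when the
        # response keeps getting blocked; _gemini_parse then returns only ret.text.
        # The _find_gen_func_sig and _remove_eos helpers above are not used on this
        # code path.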
799 | 800 | message = self.generation_template.format(prompt=prompt.strip()) 801 | 802 | # approximate ge 803 | num_tokens = num_tokens_from_messages(message, self.model_name) 804 | 805 | config = create_gemini_config( 806 | max_tokens=num_tokens + self.max_new_tokens, 807 | temperature=self.temperature, 808 | batch_size=batch_size, 809 | ) 810 | ret = request_gemini_engine(self.model, message, config) 811 | # if "gpt-3.5" in self.model_name: 812 | return self._gemini_parse(ret, prompt.strip()) 813 | 814 | 815 | class OpenAIChatDecoder(DecoderBase): 816 | generation_template = ( 817 | "Please complete the following code snippet.\n```\n{prompt}\n```" 818 | ) 819 | 820 | def __init__(self, name: str, model_name: str = "gpt-3.5-turbo", **kwargs) -> None: 821 | super().__init__(name, **kwargs) 822 | self.model_name = model_name 823 | openai.api_key = os.environ.get("OPENAI_API_KEY", "dummy") 824 | 825 | def codegen( 826 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 827 | ) -> List[str]: 828 | if do_sample: 829 | assert self.temperature > 0, "Temperature must be positive for sampling" 830 | 831 | batch_size = min(self.batch_size, num_samples) 832 | assert batch_size <= 20, "Use larger batch size could blow up the memory!" 833 | 834 | # construct prompt 835 | # if "gpt-3.5" in self.model_name: # chatgpt 836 | message = self.generation_template.format(prompt=prompt.strip()) 837 | 838 | num_tokens = num_tokens_from_messages(message, self.model_name) 839 | 840 | config = create_chatgpt_config( 841 | message=message, 842 | max_tokens=num_tokens + self.max_new_tokens, 843 | temperature=self.temperature, 844 | batch_size=batch_size, 845 | model=self.model_name, 846 | ) 847 | ret = request_chatgpt_engine(config) 848 | outputs = [] 849 | for returns in ret.choices: 850 | outputs.append(returns.message.content) 851 | return outputs 852 | 853 | 854 | class StarCoder2(HFTorchDecoder): 855 | def codegen( 856 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 857 | ) -> List[str]: 858 | prompt = prompt.strip() # starcoder2 needs this, bad 859 | return HFTorchDecoder.codegen(self, prompt, do_sample, num_samples) 860 | 861 | 862 | class StarCoderInfill(HFTorchDecoder): 863 | def __init__(self, name: str, **kwargs) -> None: 864 | super().__init__(name, **kwargs) 865 | self.prefix_token = "" 866 | self.suffix_token = "" 867 | 868 | def codegen( 869 | self, prompt: str, do_sample: bool = True, num_samples: int = 200 870 | ) -> List[str]: 871 | if self.temperature == 0: 872 | assert not do_sample 873 | assert num_samples == 1 874 | 875 | input = self.prefix_token + prompt + self.suffix_token 876 | input_tokens = self.tokenizer.encode(input, return_tensors="pt").to(self.device) 877 | scores = StoppingCriteriaList( 878 | [ 879 | EndOfFunctionCriteria( 880 | start_length=len(input_tokens[0]), 881 | eos=self.eos, 882 | tokenizer=self.tokenizer, 883 | ) 884 | ] 885 | ) 886 | temperature = max(self.temperature, 1e-2) 887 | raw_outputs = self.model.generate( 888 | input_tokens, 889 | max_new_tokens=self.max_new_tokens, 890 | stopping_criteria=scores, 891 | do_sample=do_sample, 892 | top_p=0.95, 893 | top_k=None, 894 | temperature=temperature, 895 | num_return_sequences=min(self.batch_size, num_samples), 896 | output_scores=True, 897 | return_dict_in_generate=True, 898 | repetition_penalty=1.0, 899 | pad_token_id=self.tokenizer.eos_token_id, 900 | ) 901 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :] 902 | gen_strs = self.tokenizer.batch_decode( 903 | gen_seqs, 
skip_special_tokens=self.skip_special_tokens 904 | ) 905 | outputs = [] 906 | # removes eos tokens. 907 | for output in gen_strs: 908 | min_index = 10000 909 | for eos in self.eos: 910 | if eos in output: 911 | min_index = min(min_index, output.index(eos)) 912 | outputs.append(output[:min_index]) 913 | return outputs 914 | 915 | 916 | def make_model(name: str, batch_size: int = 1, temperature: float = 0.8): 917 | if name == "claude-3": 918 | return AnthropicDecoder( 919 | batch_size=batch_size, 920 | name="claude", 921 | temperature=temperature, 922 | model_name="claude-3-opus-20240229", 923 | conversational=True, 924 | ) 925 | elif name == "claude-3-haiku": # cheaper model 926 | return AnthropicDecoder( 927 | batch_size=batch_size, 928 | name="claude", 929 | temperature=temperature, 930 | model_name="claude-3-haiku-20240307", 931 | conversational=True, 932 | ) 933 | elif name == "claude-2": 934 | return AnthropicDecoder( 935 | batch_size=batch_size, 936 | name="claude", 937 | temperature=temperature, 938 | model_name="claude-2.1", 939 | conversational=True, 940 | ) 941 | elif name == "gemini-pro": 942 | return GeminiChatDecoder( 943 | batch_size=batch_size, 944 | name="gemini-pro", 945 | temperature=temperature, 946 | model_name="models/gemini-pro", 947 | conversational=True, 948 | ) 949 | elif name == "palm": 950 | return PalmDecoder( 951 | batch_size=batch_size, 952 | name="palm", 953 | temperature=temperature, 954 | model_name="models/text-bison-001", 955 | conversational=True, 956 | ) 957 | elif name == "chatgpt": 958 | return OpenAIChatDecoder( 959 | batch_size=batch_size, 960 | name="ChatGPT", 961 | temperature=temperature, 962 | model_name="gpt-3.5-turbo", 963 | conversational=True, 964 | ) 965 | elif name == "gpt-4-turbo": 966 | return OpenAIChatDecoder( 967 | batch_size=batch_size, 968 | name="GPT4", 969 | temperature=temperature, 970 | model_name="gpt-4-turbo-preview", 971 | conversational=True, 972 | ) 973 | elif name in ["gpt-4", "gpt-4-1106-preview"]: 974 | return OpenAIChatDecoder( 975 | batch_size=batch_size, 976 | name="GPT4", 977 | temperature=temperature, 978 | model_name=name, 979 | conversational=True, 980 | ) 981 | elif name.startswith("starcoder2"): 982 | import re 983 | 984 | pattern = re.compile(r"starcoder2-(\d+)b") 985 | matches = pattern.findall(name) 986 | nb = int(matches[0]) 987 | assert float(nb) > 0 988 | return StarCoder2( 989 | batch_size=batch_size, 990 | name=f"bigcode/{name}", 991 | temperature=temperature, 992 | ) 993 | elif name.startswith("starcoder"): 994 | return StarCoderInfill( 995 | batch_size=batch_size, name=f"bigcode/{name}", temperature=temperature 996 | ) 997 | elif name.startswith("code-llama-"): 998 | import re 999 | 1000 | pattern = re.compile(r"code-llama-(\d+\.?\d*)b(.*)") 1001 | matches = pattern.findall(name)[0] 1002 | nb = matches[0] 1003 | assert float(nb) > 0 1004 | 1005 | if "instruct" in name: 1006 | if float(nb) < 69: # nice 1007 | return CodeLlamaInstructSmall( 1008 | batch_size=batch_size, 1009 | name=f"codellama/CodeLlama-{nb}b-Instruct-hf", 1010 | temperature=temperature, 1011 | ) 1012 | else: 1013 | return CodeLlamaInstructLarge( 1014 | batch_size=batch_size, 1015 | name=f"codellama/CodeLlama-{nb}b-Instruct-hf", 1016 | temperature=temperature, 1017 | ) 1018 | elif "python" in name: 1019 | return HFTorchDecoder( 1020 | batch_size=batch_size, 1021 | name=f"codellama/CodeLlama-{nb}b-Python-hf", 1022 | temperature=temperature, 1023 | ) 1024 | else: 1025 | return VLlmDecoder( 1026 | batch_size=batch_size, 1027 | 
name=f"codellama/CodeLlama-{nb}b-hf", 1028 | temperature=temperature, 1029 | ) 1030 | elif name.startswith("deepseek-coder"): 1031 | import re 1032 | 1033 | # format deepseek-coder-{nb}b* 1034 | pattern = re.compile(r"deepseek-coder-(\d+\.?\d*)b(.*)") 1035 | matches = pattern.findall(name)[0] 1036 | nb = matches[0] 1037 | assert float(nb) > 0 1038 | 1039 | if "instruct" in name: 1040 | return DeepSeekInstruct( 1041 | batch_size=batch_size, 1042 | name=f"deepseek-ai/{name}", 1043 | temperature=temperature, 1044 | conversational=True, 1045 | ) 1046 | else: 1047 | return HFTorchDecoder( 1048 | batch_size=batch_size, 1049 | name=f"deepseek-ai/deepseek-coder-{nb}b-base", 1050 | temperature=temperature, 1051 | ) 1052 | elif name == "magicoder-s-ds-6.7b": 1053 | return MagicCoderInstruct( 1054 | batch_size=batch_size, 1055 | name=f"ise-uiuc/Magicoder-S-DS-6.7B", 1056 | temperature=temperature, 1057 | conversational=True, 1058 | ) 1059 | elif name == "magicoder-s-cl-7b": 1060 | return MagicCoderInstruct( 1061 | batch_size=batch_size, 1062 | name=f"ise-uiuc/Magicoder-S-CL-7B", 1063 | temperature=temperature, 1064 | conversational=True, 1065 | ) 1066 | elif name.startswith("wizardcoder-34b"): 1067 | return WizardCoderDecoder( 1068 | batch_size=batch_size, 1069 | name=f"WizardLM/WizardCoder-Python-34B-V1.0", 1070 | temperature=temperature, 1071 | conversational=True, 1072 | ) 1073 | elif name.startswith("wizardcoder-33b-1.1"): 1074 | return WizardCoderDecoder( 1075 | batch_size=batch_size, 1076 | name=f"WizardLM/WizardCoder-33B-V1.1", 1077 | temperature=temperature, 1078 | conversational=True, 1079 | ) 1080 | elif name == "phind-code-llama-34b-v2": 1081 | return HFTorchDecoder( 1082 | batch_size=batch_size, 1083 | name="Phind/Phind-CodeLlama-34B-v2", 1084 | temperature=temperature, 1085 | ) 1086 | elif name.startswith("mistral-7b"): 1087 | if "instruct" in name: 1088 | if name.endswith("-v02"): 1089 | return MistralInstruct( 1090 | batch_size=batch_size, 1091 | name="mistralai/Mistral-7B-Instruct-v0.2", 1092 | temperature=temperature, 1093 | conversational=True, 1094 | ) 1095 | else: 1096 | return MistralInstruct( 1097 | batch_size=batch_size, 1098 | name="mistralai/Mistral-7B-Instruct-v0.1", 1099 | temperature=temperature, 1100 | conversational=True, 1101 | ) 1102 | else: 1103 | return HFTorchDecoder( 1104 | batch_size=batch_size, 1105 | name="mistralai/Mistral-7B-v0.1", 1106 | temperature=temperature, 1107 | ) 1108 | elif name.startswith("mixtral-8x7b"): 1109 | if "instruct" in name: 1110 | return MixtralSPMXInstruct( 1111 | batch_size=batch_size, 1112 | name="mistralai/Mixtral-8x7B-Instruct-v0.1", 1113 | temperature=temperature, 1114 | conversational=True, 1115 | ) 1116 | else: 1117 | return HFTorchDecoder( 1118 | batch_size=batch_size, 1119 | name="mistralai/Mixtral-8x7B-v0.1", 1120 | temperature=temperature, 1121 | ) 1122 | elif name == "stable-code-3b": 1123 | return HFTorchDecoder( 1124 | batch_size=batch_size, 1125 | name="stabilityai/stable-code-3b", 1126 | temperature=temperature, 1127 | ) 1128 | elif name == "speechless-codellama-34b": 1129 | return Alpaca( 1130 | batch_size=batch_size, 1131 | name="uukuguy/speechless-codellama-34b-v2.0", 1132 | temperature=temperature, 1133 | ) 1134 | elif name == "openchat": 1135 | return OpenChat( 1136 | batch_size=batch_size, 1137 | name="openchat/openchat-3.5-0106", 1138 | temperature=temperature, 1139 | ) 1140 | elif name.startswith("code-millenials-34b"): 1141 | return Alpaca( 1142 | batch_size=batch_size, 1143 | 
name="budecosystem/code-millenials-34b", 1144 | temperature=temperature, 1145 | conversational=True, 1146 | ) 1147 | elif name == "phi-2": 1148 | return VLlmDecoder( 1149 | batch_size=batch_size, 1150 | name="microsoft/phi-2", 1151 | temperature=temperature, 1152 | ) 1153 | elif name.startswith("qwen"): 1154 | # format deepseek-coder-{nb}b* 1155 | import re 1156 | 1157 | pattern = re.compile(r"qwen-(\d+\.?\d*)b(.*)") 1158 | matches = pattern.findall(name)[0] 1159 | nb = matches[0] 1160 | assert float(nb) > 0 1161 | 1162 | if "1.5" in name: 1163 | return QwenInstruct( 1164 | batch_size=batch_size, 1165 | name=f"Qwen/Qwen1.5-{nb}B-Chat", 1166 | temperature=temperature, 1167 | conversational=True, 1168 | ) 1169 | else: 1170 | return QwenInstruct( 1171 | batch_size=batch_size, 1172 | name=f"Qwen/Qwen-{nb}B-Chat", 1173 | temperature=temperature, 1174 | conversational=True, 1175 | ) 1176 | elif name.startswith("xwincoder-34b"): 1177 | return XwinCoder( 1178 | batch_size=batch_size, name="Xwin-LM/XwinCoder-34B", temperature=temperature 1179 | ) 1180 | elif name.startswith("gemma"): 1181 | import re 1182 | 1183 | pattern = re.compile(r"gemma-(\d+\.?\d*)b(.*)") 1184 | matches = pattern.findall(name)[0] 1185 | nb = matches[0] 1186 | assert float(nb) > 0 1187 | if "instruct" in name: 1188 | return GemmaInstruct( 1189 | batch_size=batch_size, 1190 | name=f"google/gemma-{nb}b-it", 1191 | temperature=temperature, 1192 | conversational=True, 1193 | ) 1194 | else: 1195 | return HFTorchDecoder( 1196 | batch_size=batch_size, 1197 | name=f"google/gemma-{nb}b", 1198 | temperature=temperature, 1199 | ) 1200 | 1201 | raise ValueError(f"Invalid model name: {name}") 1202 | -------------------------------------------------------------------------------- /evoeval/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from evoeval._version import __version__, __version_tuple__ 3 | except ImportError: 4 | __version__ = "local-dev" 5 | -------------------------------------------------------------------------------- /evoeval/data.py: -------------------------------------------------------------------------------- 1 | # largely adopted from EvalPlus 2 | import gzip 3 | import hashlib 4 | import json 5 | import os 6 | from typing import Dict, Iterable 7 | 8 | import tempdir 9 | import wget 10 | from appdirs import user_cache_dir 11 | 12 | CACHE_DIR = user_cache_dir("evoeval") 13 | 14 | 15 | EVOEVAL_VERSION = "v0.1.0" 16 | EVOEVAL_OVERRIDE_PATH = os.environ.get("EVOEVAL_OVERRIDE_PATH", None) 17 | 18 | 19 | def write_jsonl( 20 | filename: str, data: Iterable[Dict], append: bool = False, drop_builtin: bool = True 21 | ): 22 | """ 23 | Writes an iterable of dictionaries to jsonl 24 | """ 25 | if append: 26 | mode = "ab" 27 | else: 28 | mode = "wb" 29 | filename = os.path.expanduser(filename) 30 | if filename.endswith(".gz"): 31 | with open(filename, mode) as fp: 32 | with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp: 33 | for x in data: 34 | if drop_builtin: 35 | x = {k: v for k, v in x.items() if not k.startswith("_")} 36 | gzfp.write((json.dumps(x) + "\n").encode("utf-8")) 37 | else: 38 | with open(filename, mode) as fp: 39 | for x in data: 40 | if drop_builtin: 41 | x = {k: v for k, v in x.items() if not k.startswith("_")} 42 | fp.write((json.dumps(x) + "\n").encode("utf-8")) 43 | 44 | 45 | def make_cache(gzip_url, cache_path, dataset_name): 46 | # Check if human eval file exists in CACHE_DIR 47 | if not os.path.exists(cache_path): 48 | # Install HumanEval dataset and 
parse as jsonl 49 | print(f"Downloading dataset from {gzip_url}") 50 | with tempdir.TempDir() as tmpdir: 51 | # TODO need to test this. 52 | evoeval_gz_path = os.path.join(tmpdir, f"{dataset_name}-data.jsonl.gz") 53 | wget.download(gzip_url, evoeval_gz_path) 54 | 55 | with gzip.open(evoeval_gz_path, "rb") as f: 56 | evoeval = f.read().decode("utf-8") 57 | 58 | # create CACHE_DIR if not exists 59 | if not os.path.exists(CACHE_DIR): 60 | os.makedirs(CACHE_DIR) 61 | 62 | # Write the original human eval file to CACHE_DIR 63 | with open(cache_path, "w") as f: 64 | f.write(evoeval) 65 | 66 | 67 | def get_dataset_metadata(name: str, version: str): 68 | assert name in [ 69 | "EvoEval_difficult", 70 | "EvoEval_creative", 71 | "EvoEval_subtle", 72 | "EvoEval_combine", 73 | "EvoEval_tool_use", 74 | "EvoEval_verbose", 75 | "EvoEval_concise", 76 | ], f"Unknown/unsupported dataset: {name}" 77 | url = f"https://github.com/evo-eval/evoeval_release/releases/download/{version}/{name}.jsonl.gz" 78 | cache_path = os.path.join(CACHE_DIR, f"{name}-{version}.jsonl") 79 | return url, cache_path 80 | 81 | 82 | def _ready_evo_eval_path(dataset_name: str) -> str: 83 | if EVOEVAL_OVERRIDE_PATH is not None: 84 | # create CACHE_DIR if not exists 85 | if not os.path.exists(CACHE_DIR): 86 | os.makedirs(CACHE_DIR) 87 | return f"{EVOEVAL_OVERRIDE_PATH}/{dataset_name}.jsonl" 88 | 89 | url, cache_path = get_dataset_metadata(dataset_name, EVOEVAL_VERSION) 90 | make_cache(url, cache_path, dataset_name) 91 | 92 | return cache_path 93 | 94 | 95 | def get_evo_eval_plus_hash(dataset_name: str) -> str: 96 | evoeval_path = _ready_evo_eval_path(dataset_name) 97 | with open(evoeval_path, "rb") as f: 98 | evoeval = f.read() 99 | return hashlib.md5(evoeval).hexdigest() 100 | 101 | 102 | def get_evo_eval(dataset_name: str): 103 | evoeval_path = _ready_evo_eval_path(dataset_name) 104 | with open(evoeval_path, "r") as f: 105 | data = {json.loads(task)["task_id"]: json.loads(task) for task in f.readlines()} 106 | 107 | return data 108 | -------------------------------------------------------------------------------- /evoeval/eval_test/__init__.py: -------------------------------------------------------------------------------- 1 | # largely adopted from https://github.com/evalplus/evalplus 2 | 3 | import itertools 4 | import json 5 | import multiprocessing 6 | import os 7 | import time 8 | from enum import IntEnum, auto 9 | from multiprocessing import Array, Value 10 | from typing import Any, Dict, List, Tuple, Union 11 | 12 | import numpy as np 13 | from evalplus.eval.utils import ( 14 | create_tempdir, 15 | reliability_guard, 16 | swallow_io, 17 | time_limit, 18 | ) 19 | 20 | from evoeval.eval_test._creative_special_oracle import ( 21 | _check_maze, 22 | _check_path, 23 | _check_product, 24 | ) 25 | from evoeval.eval_test._difficult_special_oracle import ( 26 | _check_difficult_poly, 27 | _check_insensitive_palindrome, 28 | ) 29 | from evoeval.eval_test._he_special_oracle import _poly 30 | from evoeval.eval_test._subtle_special_oracle import _check_poly 31 | 32 | 33 | class CustomEncoder(json.JSONEncoder): 34 | def default(self, obj): 35 | if isinstance(obj, set): 36 | return list(obj) 37 | if isinstance(obj, object): 38 | return str(obj) 39 | return json.JSONEncoder.default(self, obj) 40 | 41 | 42 | def compatible_eval_result(results: Dict) -> Dict: 43 | # compatibility 44 | for task_results in results["eval"].values(): 45 | # update the "files" field to "nfiles" 46 | if "files" in task_results and "nfiles" not in task_results: 47 | 
task_results["nfiles"] = len(task_results.pop("files")) 48 | return results 49 | 50 | 51 | # unbiased estimator from https://github.com/openai/human-eval 52 | def estimate_pass_at_k( 53 | num_samples: Union[int, List[int], np.ndarray], 54 | num_correct: Union[List[int], np.ndarray], 55 | k: int, 56 | ) -> np.ndarray: 57 | """ 58 | Estimates pass@k of each problem and returns them in an array. 59 | """ 60 | 61 | def estimator(n: int, c: int, k: int) -> float: 62 | """ 63 | Calculates 1 - comb(n - c, k) / comb(n, k). 64 | """ 65 | if n - c < k: 66 | return 1.0 67 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 68 | 69 | if isinstance(num_samples, int): 70 | num_samples_it = itertools.repeat(num_samples, len(num_correct)) 71 | else: 72 | assert len(num_samples) == len(num_correct) 73 | num_samples_it = iter(num_samples) 74 | 75 | return np.array( 76 | [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)] 77 | ) 78 | 79 | 80 | PASS = "pass" 81 | FAIL = "fail" 82 | TIMEOUT = "timeout" 83 | 84 | _SUCCESS = 0 85 | _FAILED = 1 86 | _TIMEOUT = 2 87 | _UNKNOWN = 3 88 | 89 | _mapping = {_SUCCESS: PASS, _FAILED: FAIL, _TIMEOUT: TIMEOUT, _UNKNOWN: None} 90 | 91 | 92 | def is_floats(x) -> bool: 93 | # check if it is float; List[float]; Tuple[float] 94 | # TODO: search for any close floats? (other data structures) 95 | if isinstance(x, float): 96 | return True 97 | if isinstance(x, (list, tuple)): 98 | return all(isinstance(i, float) for i in x) 99 | if isinstance(x, np.ndarray): 100 | return x.dtype == np.float64 or x.dtype == np.float32 101 | return False 102 | 103 | 104 | class DataType(IntEnum): 105 | Float = auto() 106 | Bool = auto() 107 | Int = auto() 108 | Str = auto() 109 | Null = auto() 110 | Tuple = auto() 111 | List = auto() 112 | Dict = auto() 113 | Set = auto() 114 | Type = auto() 115 | Unknown = auto() 116 | 117 | 118 | def get_type(x): 119 | if x is None: 120 | return DataType.Null 121 | elif isinstance(x, bool): 122 | return DataType.Bool 123 | elif isinstance(x, int): 124 | return DataType.Int 125 | elif isinstance(x, str): 126 | return DataType.Str 127 | elif is_floats(x): 128 | return DataType.Float 129 | elif isinstance(x, tuple): 130 | return DataType.Tuple 131 | elif isinstance(x, list): 132 | return DataType.List 133 | elif isinstance(x, dict): 134 | return DataType.Dict 135 | elif isinstance(x, set): 136 | return DataType.Set 137 | elif isinstance(x, type): 138 | return DataType.Type 139 | else: 140 | return DataType.Unknown 141 | 142 | 143 | def is_equal(x, y) -> tuple[bool, str]: 144 | x_type, y_type = get_type(x), get_type(y) 145 | if x_type != y_type: 146 | return False, "Type mismatch: {} vs {}".format(str(x_type), str(y_type)) 147 | 148 | if x_type in [ 149 | DataType.Int, 150 | DataType.Bool, 151 | DataType.Null, 152 | DataType.Str, 153 | DataType.Set, 154 | DataType.Type, 155 | ]: 156 | if x == y: 157 | return True, None 158 | try: 159 | error_msg = "INT/BOOL/NULL/ Value mismatch: {} vs {}".format( 160 | repr(x)[:300], repr(y)[:300] 161 | ) 162 | except: 163 | error_msg = "Value mismatch: too large for display" 164 | return False, error_msg 165 | elif x_type == DataType.Float: 166 | if np.allclose(x, y, equal_nan=True, atol=1e-6): # guard against nan 167 | return True, None 168 | else: 169 | return False, "FLOAT Value mismatch: {} vs {}".format(x, y) 170 | elif x_type in [DataType.List, DataType.Tuple]: 171 | if len(x) != len(y): 172 | return False, "Length mismatch: {} vs {}".format(len(x), len(y)) 173 | for i in range(len(x)): 174 | 
equal, msg = is_equal(x[i], y[i]) 175 | if not equal: 176 | return False, msg 177 | return True, None 178 | elif x_type == DataType.Dict: 179 | if len(x) != len(y): 180 | return False, "Length mismatch: {} vs {}".format(len(x), len(y)) 181 | for k, v in x.items(): 182 | if k not in y: 183 | return False, "DICT Value mismatch: key {} in {} but not in {}".format( 184 | k, x, y 185 | ) 186 | equal, msg = is_equal(v, y[k]) 187 | if not equal: 188 | return False, msg 189 | return True, None 190 | else: 191 | # from IPython import embed 192 | # embed() 193 | try: 194 | if x == y: # e.g., object comparison 195 | return True, None 196 | else: 197 | return False, "ELSE Value mismatch: {} vs {}".format(x, y) 198 | except: 199 | return False, "Unsupported type: {} <-- {}".format(x_type, type(x)) 200 | 201 | 202 | def unsafe_execute( 203 | dataset: str, 204 | entry_point: str, 205 | task_id: str, 206 | code: str, 207 | inputs, 208 | expected: List, 209 | time_limits, 210 | atol, 211 | fast_check, 212 | stat: Value, 213 | details: Array, 214 | progress: Value, 215 | ): 216 | with create_tempdir(): 217 | # These system calls are needed when cleaning up tempdir. 218 | import os 219 | import shutil 220 | 221 | rmtree = shutil.rmtree 222 | rmdir = os.rmdir 223 | chdir = os.chdir 224 | # Disable functionalities that can make destructive changes to the test. 225 | # allow only 4GB memory usage 226 | maximum_memory_bytes = 4 * 1024 * 1024 * 1024 227 | reliability_guard(maximum_memory_bytes=maximum_memory_bytes) 228 | exec_globals = {} 229 | try: 230 | with swallow_io(): 231 | exec(code, exec_globals) 232 | fn = exec_globals[entry_point] 233 | for i, inp in enumerate(inputs): 234 | try: 235 | with time_limit(time_limits[i]): 236 | out = fn(*inp) 237 | exp = expected[i] 238 | # TODO, for special oracles, think about how to deal with case where 239 | # the function has side affect and changes the input ... 
240 | # this is true especially for some grid checking stuff 241 | # ================================================ # 242 | # ============== special oracles ================= # 243 | # use task_id and dataset to determine the oracle 244 | if ( 245 | dataset == "humaneval" 246 | or "verbose" in dataset 247 | or "concise" in dataset 248 | ) and task_id == "HumanEval/32": 249 | assert abs(_poly(*out, inp)) <= 1e-6 250 | 251 | # =================== Difficult ================== # 252 | elif "difficult" in dataset and task_id == "EvoEval/10": 253 | _check_insensitive_palindrome(out, *inp, exp) 254 | elif "difficult" in dataset and task_id == "EvoEval/32": 255 | _check_difficult_poly(*inp, out, exp) 256 | 257 | # =================== Creative =================== # 258 | elif "creative" in dataset and task_id == "EvoEval/26": 259 | _check_maze(*inp, out, exp) 260 | elif "creative" in dataset and task_id == "EvoEval/30": 261 | _check_path(*inp, out, exp) 262 | elif "creative" in dataset and task_id == "EvoEval/69": 263 | _check_product(*inp, out, exp) 264 | 265 | # =================== Subtle ===================== # 266 | elif "subtle" in dataset and task_id == "EvoEval/32": 267 | _check_poly(*inp, out) 268 | 269 | # =================== Combine ==================== # 270 | 271 | # =================== Tool Using ================= # 272 | 273 | # ============== special oracles ================= # 274 | # ================================================ # 275 | else: 276 | exact_match, _ = is_equal(exp, out) 277 | assert exact_match 278 | except BaseException: 279 | details[i] = False 280 | progress.value += 1 281 | if fast_check: 282 | raise 283 | continue 284 | 285 | details[i] = True 286 | progress.value += 1 287 | stat.value = _SUCCESS 288 | except BaseException: 289 | stat.value = _FAILED 290 | # Needed for cleaning up. 
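        # reliability_guard replaces destructive os/shutil functions before the
        # untrusted code runs; restoring the saved references here lets
        # create_tempdir delete the temporary working directory on exit.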
291 | shutil.rmtree = rmtree 292 | os.rmdir = rmdir 293 | os.chdir = chdir 294 | 295 | 296 | def untrusted_check( 297 | dataset: str, 298 | code: str, 299 | inputs: List[Any], 300 | entry_point: str, 301 | task_id: str, 302 | expected, 303 | atol, 304 | ref_time: List[float], 305 | fast_check: bool = False, 306 | min_time_limit: float = 0.1, 307 | gt_time_limit_factor: float = 2.0, 308 | ) -> Tuple[str, np.ndarray]: 309 | time_limits = [max(min_time_limit, gt_time_limit_factor * t) for t in ref_time] 310 | timeout = min(os.getenv("EVOEVAL_TIMEOUT_PER_TASK", 60), sum(time_limits)) + 1 311 | if not fast_check: 312 | timeout += 1 # extra time for data collection 313 | 314 | # shared memory objects 315 | progress = Value("i", 0) 316 | stat = Value("i", _UNKNOWN) 317 | details = Array("b", [False for _ in range(len(inputs))]) 318 | p = multiprocessing.Process( 319 | target=unsafe_execute, 320 | args=( 321 | dataset, 322 | entry_point, 323 | task_id, 324 | code, 325 | inputs, 326 | expected, 327 | time_limits, 328 | atol, 329 | fast_check, 330 | # return values 331 | stat, 332 | details, 333 | progress, 334 | ), 335 | ) 336 | p.start() 337 | p.join(timeout=timeout + 1) 338 | if p.is_alive(): 339 | p.terminate() 340 | time.sleep(0.1) 341 | if p.is_alive(): 342 | p.kill() 343 | time.sleep(0.1) 344 | 345 | stat = _mapping[stat.value] 346 | details = details[: progress.value] 347 | 348 | if not stat: 349 | stat = TIMEOUT 350 | 351 | if stat == PASS: 352 | if len(details) != len(inputs) or not all(details): 353 | stat = FAIL 354 | 355 | return stat, details 356 | 357 | 358 | def evaluateb_files( 359 | dataset: str, 360 | files: List[str], 361 | inputs: List, 362 | expected: List, 363 | entry_point: str, 364 | atol: float, 365 | ref_time: List[float], 366 | fast_check: bool = False, 367 | min_time_limit: float = 0.1, 368 | gt_time_limit_factor: float = 2.0, 369 | ) -> List[Tuple[str, List[bool]]]: 370 | ret = [] 371 | # sort files by the id in name (i.e., "../n.py") 372 | files = sorted(files, key=lambda x: int(x.split("/")[-1].split(".")[0])) 373 | for file in files: 374 | code = open(file, "r").read() 375 | stat, det = untrusted_check( 376 | dataset, 377 | code, 378 | inputs, 379 | entry_point, 380 | expected=expected, 381 | atol=atol, 382 | ref_time=ref_time, 383 | fast_check=fast_check, 384 | min_time_limit=min_time_limit, 385 | gt_time_limit_factor=gt_time_limit_factor, 386 | ) 387 | ret.append((stat, det.tolist())) 388 | return ret 389 | -------------------------------------------------------------------------------- /evoeval/eval_test/_creative_special_oracle.py: -------------------------------------------------------------------------------- 1 | # oracle for EvoEval/51 in creative 2 | def _check_maze(maze, start, end, solution_path, gt_path): 3 | if not gt_path: 4 | assert solution_path == [] 5 | else: 6 | # check the path according to solution reaches from start to end 7 | move_to_direction = { 8 | "right": (0, 1), 9 | "left": (0, -1), 10 | "up": (-1, 0), 11 | "down": (1, 0), 12 | } 13 | current_position = start 14 | for move in solution_path: 15 | current_position = ( 16 | current_position[0] + move_to_direction[move][0], 17 | current_position[1] + move_to_direction[move][1], 18 | ) 19 | assert maze[current_position[0]][current_position[1]] != 1 20 | 21 | assert current_position == end 22 | 23 | 24 | # oracle for EvoEval/55 in creative 25 | def _check_path(maze, start, end, solution_path, gt_path): 26 | if not gt_path: 27 | assert solution_path == [] 28 | else: 29 | # check the path 
according to solution reaches from start to end 30 | assert solution_path[0] == start 31 | assert solution_path[-1] == end 32 | assert maze[start[0]][start[0]] != 0 33 | for i in range(1, len(solution_path)): 34 | prev_x, prev_y = solution_path[i - 1] 35 | curr_x, curr_y = solution_path[i] 36 | assert maze[curr_x][curr_y] != 0 # not a wall 37 | assert abs(curr_x - prev_x) + abs(curr_y - prev_y) == 1 # adjacent 38 | 39 | 40 | # oracle for EvoEval/110 in creative 41 | def _check_product(arr, target, solution, gt): 42 | if gt == "No magic today": 43 | assert gt == solution 44 | else: 45 | assert isinstance(solution, tuple) 46 | i, j = solution 47 | assert 0 <= i < j < len(arr) # don't allow negative indexing 48 | assert arr[i] * arr[j] == target 49 | -------------------------------------------------------------------------------- /evoeval/eval_test/_difficult_special_oracle.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | # oracle for EvoEval/10 in difficult 5 | def _check_insensitive_palindrome(check_palindrome, string, gt_palindrome): 6 | assert len(check_palindrome) == len(gt_palindrome) 7 | assert check_palindrome.startswith(string) 8 | assert check_palindrome.lower() == check_palindrome[::-1].lower() 9 | 10 | 11 | def _poly(xs: list, x: float): 12 | """ 13 | Evaluates polynomial with coefficients xs at point x. 14 | return xs[0] + xs[1] * x + xs[1] * x^2 + .... xs[n] * x^n 15 | """ 16 | return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)]) 17 | 18 | 19 | # oracle for EvoEval/32 in difficult 20 | def _check_difficult_poly(xs, interval, solution, gt_solution): 21 | if gt_solution is None: 22 | assert solution is None 23 | return 24 | 25 | start, end = interval 26 | assert start <= solution <= end 27 | assert abs(_poly(xs, solution)) <= 2e-2 28 | -------------------------------------------------------------------------------- /evoeval/eval_test/_he_special_oracle.py: -------------------------------------------------------------------------------- 1 | # Adopted from EvalPlus 2 | import math 3 | 4 | 5 | # oracle for HumanEval/032 6 | def _poly(xs: list, x: float): 7 | """ 8 | Evaluates polynomial with coefficients xs at point x. 9 | return xs[0] + xs[1] * x + xs[1] * x^2 + .... xs[n] * x^n 10 | """ 11 | return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)]) 12 | -------------------------------------------------------------------------------- /evoeval/eval_test/_subtle_special_oracle.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | # oracle for EvoEval/32 in subtle 5 | def _poly(xs: list, x: float): 6 | """ 7 | Evaluates polynomial with coefficients xs at point x. 8 | return xs[0] + xs[1] * x + xs[1] * x^2 + .... 
xs[n] * x^n 9 | """ 10 | return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)]) 11 | 12 | 13 | def _check_poly(xs, solution): 14 | full_xs = [xs[0], xs[1]] 15 | for i in range(2, len(xs)): 16 | full_xs.extend([0, xs[i]]) 17 | assert abs(_poly(full_xs, solution)) <= 1e-6 18 | -------------------------------------------------------------------------------- /evoeval/evaluate.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/evalplus/evalplus 2 | import argparse 3 | import contextlib 4 | import json 5 | import multiprocessing 6 | import os 7 | import pickle 8 | import threading 9 | import time 10 | from collections import Counter, defaultdict 11 | from concurrent.futures import ProcessPoolExecutor, as_completed 12 | from typing import Any, Dict, List, Tuple 13 | from warnings import warn 14 | 15 | import numpy as np 16 | from evalplus.data import get_human_eval_plus 17 | from evalplus.data.utils import load_solutions 18 | from evalplus.gen.util import trusted_exec 19 | from termcolor import cprint 20 | from tqdm import tqdm 21 | 22 | from evoeval.data import CACHE_DIR, get_evo_eval, get_evo_eval_plus_hash 23 | from evoeval.eval_test import ( 24 | FAIL, 25 | PASS, 26 | CustomEncoder, 27 | compatible_eval_result, 28 | estimate_pass_at_k, 29 | untrusted_check, 30 | ) 31 | 32 | # 1st item: the status 33 | # 2nd item (optional): the detailed pass/fail boolean for each input 34 | Result = Tuple[str, List[bool]] 35 | 36 | 37 | def get_groundtruth( 38 | problems, hashcode, use_raw_inputs=False, compute_plus_inputs=False 39 | ) -> Dict[str, Any]: 40 | if hashcode is not None: 41 | cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl") 42 | if os.path.exists(cache_file): 43 | print(f"Load from ground-truth from {cache_file}") 44 | with open(cache_file, "rb") as f: 45 | return pickle.load(f) 46 | 47 | print("Computing expected output...") 48 | tbegin = time.time() 49 | expected_output = {} 50 | for task_id, problem in problems.items(): 51 | oracle = {} 52 | with contextlib.redirect_stdout(None): 53 | oracle["base"], oracle["base_time"] = trusted_exec( 54 | problem["prompt"] + "\n" + problem["canonical_solution"], 55 | problem["base_input"] 56 | if use_raw_inputs 57 | else [ 58 | eval(f"[{i}]") for i in problem["inputs"] 59 | ], # why do we do this? we have more complex input types. 
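                    # each entry in problem["inputs"] is a serialized argument
                    # string; eval(f"[{i}]") rebuilds it into a concrete Python
                    # argument list, which is why the more complex input types
                    # mentioned above survive the round trip.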
60 | problem["entry_point"], 61 | record_time=True, 62 | output_not_none=False, 63 | ) 64 | expected_output[task_id] = oracle 65 | 66 | if compute_plus_inputs: 67 | oracle["plus"], oracle["plus_time"] = trusted_exec( 68 | problem["prompt"] + "\n" + problem["canonical_solution"], 69 | problem["plus_input"], # assumption: we have plus_input 70 | problem["entry_point"], 71 | record_time=True, 72 | output_not_none=False, 73 | ) 74 | expected_output[task_id] = oracle 75 | 76 | # print(expected_output) 77 | print(f"Expected outputs computed in {time.time() - tbegin:.2f}s") 78 | 79 | if hashcode is not None: 80 | with open(cache_file, "wb") as f: 81 | pickle.dump(expected_output, f) 82 | 83 | return expected_output 84 | 85 | 86 | def check_correctness( 87 | dataset: str, 88 | completion_id: int, 89 | problem: Dict[str, Any], 90 | solution: str, 91 | expected_output: Dict[str, List], 92 | fast_check=False, 93 | identifier=None, 94 | min_time_limit: float = 0.1, 95 | gt_time_limit_factor: float = 2.0, 96 | use_raw_inputs=False, 97 | compute_plus_inputs=False, 98 | ) -> Dict[str, Result]: # {...}, "base" | "plus" -> (status, details) 99 | 100 | ret = { 101 | "completion_id": completion_id, 102 | "task_id": problem["task_id"], 103 | "_identifier": identifier, 104 | "solution": solution, 105 | } 106 | 107 | ret["result"] = untrusted_check( 108 | dataset, 109 | solution, 110 | problem["base_input"] 111 | if use_raw_inputs 112 | else [eval(f"[{i}]") for i in problem["inputs"]], 113 | problem["entry_point"], 114 | task_id=problem["task_id"], 115 | expected=expected_output["base"], 116 | atol=0, # TODO check 117 | ref_time=expected_output["base_time"], 118 | fast_check=fast_check, 119 | min_time_limit=min_time_limit, 120 | gt_time_limit_factor=gt_time_limit_factor, 121 | ) 122 | 123 | if compute_plus_inputs: 124 | ret["plus"] = untrusted_check( 125 | dataset, 126 | solution, 127 | problem["plus_input"], 128 | problem["entry_point"], 129 | task_id=problem["task_id"], 130 | expected=expected_output["plus"], 131 | atol=0, # TODO check 132 | ref_time=expected_output["plus_time"], 133 | fast_check=fast_check, 134 | min_time_limit=min_time_limit, 135 | gt_time_limit_factor=gt_time_limit_factor, 136 | ) 137 | 138 | return ret 139 | 140 | 141 | def evaluate(flags): 142 | if flags.parallel is None: 143 | n_workers = max(1, multiprocessing.cpu_count() // 2) 144 | else: 145 | n_workers = flags.parallel 146 | 147 | if os.path.isdir(flags.samples): 148 | result_path = os.path.join(flags.samples, "eval_results.json") 149 | else: 150 | assert flags.samples.endswith(".jsonl") 151 | result_path = flags.samples.replace(".jsonl", "_eval_results.json") 152 | 153 | compute_plus_inputs = False 154 | 155 | if os.path.isfile(result_path) and not flags.i_just_wanna_run: 156 | print(f"Load from previous results from {result_path}") 157 | with open(result_path, "r") as f: 158 | results = json.load(f) 159 | 160 | results = compatible_eval_result(results) 161 | else: 162 | use_raw_inputs = False 163 | if flags.dataset == "humaneval": 164 | use_raw_inputs = True 165 | compute_plus_inputs = True 166 | problems = get_human_eval_plus() 167 | expected_output = get_groundtruth( 168 | problems, 169 | None, 170 | use_raw_inputs=use_raw_inputs, 171 | compute_plus_inputs=compute_plus_inputs, 172 | ) 173 | elif "verbose" in flags.dataset or "concise" in flags.dataset: 174 | use_raw_inputs = True 175 | compute_plus_inputs = True 176 | problems = get_evo_eval(flags.dataset) 177 | expected_output = get_groundtruth( 178 | problems, 179 | None, 180 
| use_raw_inputs=use_raw_inputs, 181 | compute_plus_inputs=compute_plus_inputs, 182 | ) 183 | else: 184 | problems = get_evo_eval(flags.dataset) 185 | dataset_hash = get_evo_eval_plus_hash(flags.dataset) 186 | expected_output = get_groundtruth( 187 | problems, 188 | dataset_hash, 189 | use_raw_inputs=use_raw_inputs, 190 | compute_plus_inputs=compute_plus_inputs, 191 | ) 192 | 193 | results = { 194 | "eval": {}, 195 | } 196 | 197 | with ProcessPoolExecutor(max_workers=n_workers) as executor: 198 | futures = [] 199 | completion_id = Counter() 200 | n_samples = 0 201 | eval_results = defaultdict(list) # task_id -> 202 | remainings = set() 203 | 204 | print("Reading samples...") 205 | for sample in tqdm(load_solutions(flags.samples)): 206 | task_id = sample["task_id"] 207 | solution = ( 208 | sample["solution"] 209 | if "solution" in sample 210 | else problems[task_id]["prompt"] + sample["completion"] 211 | ) 212 | remainings.add(sample["_identifier"]) 213 | args = ( 214 | flags.dataset, 215 | completion_id[task_id], 216 | problems[task_id], 217 | solution, 218 | expected_output[task_id], 219 | not flags.test_details, # fast_check 220 | sample["_identifier"], 221 | flags.min_time_limit, 222 | flags.gt_time_limit_factor, 223 | use_raw_inputs, 224 | compute_plus_inputs, 225 | ) 226 | futures.append(executor.submit(check_correctness, *args)) 227 | completion_id[task_id] += 1 228 | n_samples += 1 229 | 230 | assert n_samples == len(remainings), "Missing problems in unfinished" 231 | assert len(completion_id) == len(problems), "Missing problems in samples" 232 | 233 | def stucking_checker(): 234 | while remainings: 235 | last_size = len(remainings) 236 | time.sleep(20) 237 | if last_size != len(remainings) or len(remainings) == 0: 238 | continue 239 | # Potentially stuck 240 | warn("No samples had finished testing in the last 20s") 241 | warn(f"{len(remainings)} samples to be tested: {remainings}") 242 | 243 | threading.Thread(target=stucking_checker).start() 244 | 245 | for future in tqdm(as_completed(futures), total=n_samples): 246 | result = future.result() 247 | remainings.remove(result["_identifier"]) 248 | eval_results[result["task_id"]].append(result) 249 | 250 | # sort the results for each problem by completion_id 251 | for task_id, task_results in eval_results.items(): 252 | task_results.sort(key=lambda x: x["completion_id"]) 253 | results["eval"][task_id] = [] 254 | for res in task_results: 255 | 256 | def get_failed_tests(stat, details, inputs) -> List[Any]: 257 | if stat == PASS or not details: 258 | return [] 259 | 260 | # if flags.test_details: 261 | return [inputs[i] for i in range(len(details)) if not details[i]] 262 | 263 | base_stat, base_details = res["result"] 264 | base_fail_tests = get_failed_tests( 265 | base_stat, 266 | base_details, 267 | problems[task_id]["base_input"] 268 | if use_raw_inputs 269 | else [eval(f"[{i}]") for i in problems[task_id]["inputs"]], 270 | ) 271 | 272 | # initialize plus tests 273 | plus_stat = None 274 | plus_fail_tests = [] 275 | 276 | # with plus tests 277 | if not flags.base_only and compute_plus_inputs: 278 | plus_stat, plus_details = res["plus"] 279 | plus_fail_tests = get_failed_tests( 280 | plus_stat, plus_details, problems[task_id]["plus_input"] 281 | ) 282 | 283 | results["eval"][task_id].append( 284 | { 285 | "task_id": task_id, 286 | "solution": res["solution"], 287 | "base_status": base_stat, 288 | "plus_status": plus_stat, 289 | "base_fail_tests": base_fail_tests, 290 | "plus_fail_tests": plus_fail_tests, 291 | } 292 | ) 293 | 294 | if 
os.path.isfile(result_path) and flags.i_just_wanna_run: 295 | decision = "" 296 | while decision.lower() not in ["y", "n"]: 297 | print(f"{result_path} already exists. Press [Y/N] to overwrite or exit...") 298 | decision = input() 299 | 300 | if decision.lower() == "y": 301 | # mv the file to a backup 302 | new_path = result_path + ".bak" 303 | while os.path.isfile(new_path): 304 | new_path += ".bak" 305 | os.rename(result_path, new_path) 306 | print(f"Backup {result_path} to {new_path}") 307 | 308 | if not os.path.isfile(result_path): 309 | with open(result_path, "w") as f: 310 | json.dump( 311 | results, f, cls=CustomEncoder 312 | ) # handle some unique cases where failure inputs are sets 313 | 314 | # Calculate pass@k. 315 | total = np.array([len(r) for r in results["eval"].values()]) 316 | correct = [] 317 | plus_correct = [] 318 | 319 | for res in results["eval"].values(): 320 | bc = sum([r["base_status"] == PASS for r in res]) 321 | correct.append(bc) 322 | if not flags.base_only and compute_plus_inputs: 323 | plus_correct.append( 324 | sum( 325 | [ 326 | res[i]["base_status"] == res[i]["plus_status"] == PASS 327 | for i in range(len(res)) 328 | ] 329 | ) 330 | ) 331 | 332 | correct = np.array(correct) 333 | pass_at_k = { 334 | f"pass@{k}": estimate_pass_at_k(total, correct, k).mean() 335 | for k in [1, 10, 100] 336 | if total.min() >= k 337 | } 338 | cprint(f"{flags.dataset}", "red") 339 | for k, v in pass_at_k.items(): 340 | cprint(f"{k}:\t{v:.3f}", "red") 341 | 342 | if plus_correct: 343 | cprint(f"{flags.dataset}+ (base + extra tests)", "green") 344 | pass_at_k = { 345 | f"pass@{k}": estimate_pass_at_k(total, np.array(plus_correct), k).mean() 346 | for k in [1, 10, 100] 347 | if (total >= k).all() 348 | } 349 | for k, v in pass_at_k.items(): 350 | cprint(f"{k}:\t{v:.3f}", "green") 351 | 352 | 353 | def main(): 354 | parser = argparse.ArgumentParser(description="Evaluator") 355 | parser.add_argument("--dataset", required=True, type=str) 356 | parser.add_argument("--samples", required=True, type=str) 357 | parser.add_argument("--base-only", action="store_true") 358 | parser.add_argument("--parallel", default=None, type=int) 359 | parser.add_argument("--i-just-wanna-run", action="store_true") 360 | parser.add_argument("--test-details", action="store_true") 361 | parser.add_argument("--min-time-limit", default=1, type=float) 362 | parser.add_argument("--gt-time-limit-factor", default=4.0, type=float) 363 | parser.add_argument("--mini", action="store_true") 364 | parser.add_argument( 365 | "--noextreme", action="store_true", help="Omit extreme test inputs" 366 | ) 367 | args = parser.parse_args() 368 | 369 | evaluate(args) 370 | 371 | 372 | if __name__ == "__main__": 373 | main() 374 | -------------------------------------------------------------------------------- /evoeval/util/api_request.py: -------------------------------------------------------------------------------- 1 | import signal 2 | import time 3 | from typing import Dict, Union 4 | 5 | import openai 6 | import tiktoken 7 | from google.generativeai import GenerationConfig 8 | from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory 9 | 10 | client = openai.OpenAI() 11 | 12 | 13 | def num_tokens_from_messages(message, model="gpt-3.5-turbo-0301"): 14 | """Returns the number of tokens used by a list of messages.""" 15 | try: 16 | encoding = tiktoken.encoding_for_model(model) 17 | except KeyError: 18 | encoding = tiktoken.get_encoding("cl100k_base") 19 | if isinstance(message, list): 20 | # use last 
message. 21 | num_tokens = len(encoding.encode(message[0]["content"])) 22 | else: 23 | num_tokens = len(encoding.encode(message)) 24 | return num_tokens 25 | 26 | 27 | def create_chatgpt_config( 28 | message: Union[str, list], 29 | max_tokens: int, 30 | temperature: float = 1, 31 | batch_size: int = 1, 32 | system_message: str = "You are a helpful assistant.", 33 | model: str = "gpt-3.5-turbo", 34 | ) -> Dict: 35 | if isinstance(message, list): 36 | config = { 37 | "model": model, 38 | "max_tokens": max_tokens, 39 | "temperature": temperature, 40 | "n": batch_size, 41 | "messages": [{"role": "system", "content": system_message}] + message, 42 | } 43 | else: 44 | config = { 45 | "model": model, 46 | "max_tokens": max_tokens, 47 | "temperature": temperature, 48 | "n": batch_size, 49 | "messages": [ 50 | {"role": "system", "content": system_message}, 51 | {"role": "user", "content": message}, 52 | ], 53 | } 54 | return config 55 | 56 | 57 | def handler(signum, frame): 58 | # swallow signum and frame 59 | raise Exception("end of time") 60 | 61 | 62 | def request_chatgpt_engine(config): 63 | ret = None 64 | while ret is None: 65 | try: 66 | signal.signal(signal.SIGALRM, handler) 67 | signal.alarm(100) 68 | ret = client.chat.completions.create(**config) 69 | signal.alarm(0) 70 | except openai._exceptions.BadRequestError as e: 71 | print(e) 72 | signal.alarm(0) 73 | except openai._exceptions.RateLimitError as e: 74 | print("Rate limit exceeded. Waiting...") 75 | print(e) 76 | signal.alarm(0) 77 | time.sleep(5) 78 | except openai._exceptions.APIConnectionError as e: 79 | print("API connection error. Waiting...") 80 | signal.alarm(0) 81 | time.sleep(5) 82 | except Exception as e: 83 | print("Unknown error. Waiting...") 84 | print(e) 85 | signal.alarm(0) 86 | time.sleep(1) 87 | return ret 88 | 89 | 90 | def create_gemini_config( 91 | max_tokens: int, 92 | temperature: float = 1, 93 | batch_size: int = 1, 94 | ) -> Dict: 95 | config = GenerationConfig( 96 | candidate_count=batch_size, 97 | max_output_tokens=max_tokens, 98 | temperature=temperature, 99 | ) 100 | return config 101 | 102 | 103 | safety_settings = [ 104 | { 105 | "category": "HARM_CATEGORY_DANGEROUS", 106 | "threshold": "BLOCK_NONE", 107 | }, 108 | { 109 | "category": "HARM_CATEGORY_HARASSMENT", 110 | "threshold": "BLOCK_NONE", 111 | }, 112 | { 113 | "category": "HARM_CATEGORY_HATE_SPEECH", 114 | "threshold": "BLOCK_NONE", 115 | }, 116 | { 117 | "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", 118 | "threshold": "BLOCK_NONE", 119 | }, 120 | { 121 | "category": "HARM_CATEGORY_DANGEROUS_CONTENT", 122 | "threshold": "BLOCK_NONE", 123 | }, 124 | ] 125 | 126 | 127 | def request_gemini_engine(model, message, config): 128 | ret = None 129 | count = 0 130 | while ret is None: 131 | try: 132 | signal.signal(signal.SIGALRM, handler) 133 | signal.alarm(100) 134 | ret = model.generate_content( 135 | message, generation_config=config, safety_settings=safety_settings 136 | ) 137 | s = ret.text # check if response can be accessed. 138 | signal.alarm(0) 139 | except Exception as e: 140 | ret = None # reset 141 | print("Unknown error. Waiting...") 142 | count += 1 143 | print(e) 144 | # here we need to slightly increase temperature to combat weird gemini output of 145 | # The token generation was stopped as the response was flagged for unauthorized citations. 
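            # once more than 10 attempts have failed, each further retry bumps the
            # temperature by 0.1 (capped at 1.0); every retry also sleeps 20 seconds
            # before calling the API again.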
146 | if count > 10: 147 | config.temperature = min(config.temperature + 0.1, 1) 148 | signal.alarm(0) 149 | time.sleep(20) 150 | return ret 151 | 152 | 153 | def create_palm_config( 154 | message: str, 155 | max_tokens: int, 156 | temperature: float = 1, 157 | batch_size: int = 1, 158 | model: str = "models/text-bison-001", 159 | ) -> Dict: 160 | config = { 161 | "model": model, 162 | "prompt": message, 163 | "temperature": temperature, 164 | "max_output_tokens": max_tokens, 165 | "safety_settings": [ 166 | { 167 | "category": HarmCategory.HARM_CATEGORY_DEROGATORY, 168 | "threshold": HarmBlockThreshold.BLOCK_NONE, 169 | }, 170 | { 171 | "category": HarmCategory.HARM_CATEGORY_TOXICITY, 172 | "threshold": HarmBlockThreshold.BLOCK_NONE, 173 | }, 174 | { 175 | "category": HarmCategory.HARM_CATEGORY_SEXUAL, 176 | "threshold": HarmBlockThreshold.BLOCK_NONE, 177 | }, 178 | { 179 | "category": HarmCategory.HARM_CATEGORY_VIOLENCE, 180 | "threshold": HarmBlockThreshold.BLOCK_NONE, 181 | }, 182 | { 183 | "category": HarmCategory.HARM_CATEGORY_DANGEROUS, 184 | "threshold": HarmBlockThreshold.BLOCK_NONE, 185 | }, 186 | { 187 | "category": HarmCategory.HARM_CATEGORY_MEDICAL, 188 | "threshold": HarmBlockThreshold.BLOCK_NONE, 189 | }, 190 | ], 191 | } 192 | return config 193 | 194 | 195 | def request_palm_engine(model, config): 196 | ret = None 197 | count = 0 198 | while ret is None: 199 | try: 200 | signal.signal(signal.SIGALRM, handler) 201 | signal.alarm(100) 202 | ret = model.generate_text(**config) 203 | s = ret.result # check if response can be accessed. 204 | if s is None: 205 | config["temperature"] = min(config["temperature"] + 0.1, 1) 206 | count += 1 207 | if count > 100: 208 | ret.result = "" # just return empty string 209 | else: 210 | ret = None # reset 211 | signal.alarm(0) 212 | except Exception as e: 213 | ret = None # reset 214 | print("Unknown error. Waiting...") 215 | print(e) 216 | signal.alarm(0) 217 | time.sleep(20) 218 | return ret 219 | 220 | 221 | def create_anthropic_config( 222 | message: str, 223 | max_tokens: int, 224 | temperature: float = 1, 225 | batch_size: int = 1, 226 | model: str = "claude-2.1", 227 | ) -> Dict: 228 | if isinstance(message, list): 229 | config = { 230 | "model": model, 231 | "temperature": temperature, 232 | "max_tokens": max_tokens, 233 | "messages": message, 234 | } 235 | else: 236 | config = { 237 | "model": model, 238 | "temperature": temperature, 239 | "max_tokens": max_tokens, 240 | "messages": [{"role": "user", "content": message}], 241 | } 242 | return config 243 | 244 | 245 | def request_anthropic_engine(client, config): 246 | ret = None 247 | while ret is None: 248 | try: 249 | signal.signal(signal.SIGALRM, handler) 250 | signal.alarm(100) 251 | ret = client.messages.create(**config) 252 | signal.alarm(0) 253 | except Exception as e: 254 | print("Unknown error. 
Waiting...")
255 |             print(e)
256 |             signal.alarm(0)
257 |             time.sleep(10)
258 |     return ret
259 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"]
3 | build-backend = "setuptools.build_meta"
4 | 
5 | [tool.setuptools_scm]
6 | write_to = "evoeval/_version.py"
7 | version_scheme = "release-branch-semver"
8 | local_scheme = "no-local-version"
9 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | wget
2 | appdirs
3 | tempdir
4 | multipledispatch
5 | numpy
6 | tqdm
7 | termcolor
8 | evalplus @ git+https://github.com/evalplus/evalplus
--------------------------------------------------------------------------------
/resources/butterfly_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/evo-eval/evoeval/d5ca3104ec30b99f1076f51d4476eb4c3f29effa/resources/butterfly_dark.png
--------------------------------------------------------------------------------
/resources/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/evo-eval/evoeval/d5ca3104ec30b99f1076f51d4476eb4c3f29effa/resources/example.gif
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = evoeval
3 | description = "EvoEval: Evolving Coding Benchmarks via LLM"
4 | long_description = file: README.md
5 | long_description_content_type = text/markdown
6 | url = https://github.com/evo-eval/evoeval
7 | license = Apache-2.0
8 | license_file = LICENSE
9 | platform = any
10 | classifiers =
11 |     Operating System :: OS Independent
12 |     Programming Language :: Python :: 3
13 |     License :: OSI Approved :: Apache Software License
14 | 
15 | [options]
16 | packages = find:
17 | python_requires = >=3.9
18 | dependency_links =
19 | install_requires =
20 |     wget>=3.2
21 |     tempdir>=0.7.1
22 |     multipledispatch>=0.6.0
23 |     appdirs>=1.4.4
24 |     numpy>=1.19.5
25 |     tqdm>=4.56.0
26 |     termcolor>=2.0.0
27 |     evalplus>=0.2.0
28 | 
29 | [options.entry_points]
30 | console_scripts =
31 |     evoeval.evaluate = evoeval.evaluate:main
--------------------------------------------------------------------------------
/tool/sanitize.py:
--------------------------------------------------------------------------------
1 | # largely adopted from EvalPlus
2 | 
3 | import ast
4 | import os
5 | import pathlib
6 | 
7 | from evalplus.data import get_human_eval_plus
8 | from tqdm import tqdm
9 | 
10 | from evoeval.data import get_evo_eval
11 | 
12 | INCODER_EXTRA = ["<|endofmask|>", "<|", "</code>"]  # InCoder special tokens
13 | POLYCODER_EXTRA = ["\n//", "\n/*"]
14 | NON_CODE_EOFS = ["<|endoftext|>", "\n```", "\n</s>", "\n#"]  # generic end-of-generation markers
15 | 
16 | 
17 | def get_all_python_files(folder):
18 |     # return a list of full-path python files
19 |     py_files = []
20 |     for root, _, files in os.walk(folder):
21 |         for file in files:
22 |             if file.endswith(".py"):
23 |                 py_files.append(os.path.join(root, file))
24 |     return py_files
25 | 
26 | 
27 | def remove_unindented_lines(code, ok_starts):
28 |     new_code = ""
29 |     for line in code.splitlines():
30 |         if any([line.startswith(t) for t in ok_starts]) or line.strip() == "":
31 |             new_code += line + "\n"
32 |             continue
33 | 
34 |         lspace = len(line) - len(line.lstrip())
35 |         if lspace == 0:
36 |             continue
37 | 
38 |         new_code += line + "\n"
39 | 
40 |     return new_code
41 | 
42 | 
43 | def extract_function(code, target_func):
44 |     def remove_last_line_until_parse(code):
45 |         try:
46 |             tree = ast.parse(code)
47 |         except:
48 |             if "\n" in code:
49 |                 code = code.rsplit("\n", 1)[0]
50 |                 return remove_last_line_until_parse(code)
51 |             else:
52 |                 return None
53 |         return tree
54 | 
55 |     tree = remove_last_line_until_parse(code)
56 |     if tree is None:  # fail to parse
57 |         return ""
58 | 
59 |     # return the target function only
60 |     for node in tree.body:
61 |         if isinstance(node, ast.FunctionDef):
62 |             if node.name == target_func:
63 |                 return ast.unparse(node)
64 |     return ""
65 | 
66 | 
67 | def to_four_space_indents(old_code):
68 |     new_code = ""
69 |     for line in old_code.splitlines():
70 |         lspace = len(line) - len(line.lstrip())
71 |         if lspace == 3:
72 |             new_code += " "
73 |         new_code += line + "\n"
74 |     return new_code
75 | 
76 | 
77 | def sanitize_folder(args, folder):
78 |     # task_id -> entry_point
79 |     entry_point = {}
80 |     prompts = {}
81 | 
82 |     if args.dataset == "humaneval":
83 |         problems = get_human_eval_plus()
84 |     else:
85 |         problems = get_evo_eval(args.dataset)
86 | 
87 |     for task_id, problem in problems.items():
88 |         entry_point[task_id] = problem["entry_point"]
89 |         prompts[task_id] = problem["prompt"]
90 | 
91 |     # make a new folder with "-sanitized" suffix
92 |     old_folder = pathlib.Path(folder)
93 |     if args.inplace:
94 |         new_folder = old_folder
95 |     else:
96 |         new_folder = old_folder.parent / (old_folder.name + "-sanitized")
97 | 
98 |     nsan = 0
99 |     ntotal = 0
100 |     for pyf in tqdm(get_all_python_files(folder)):
101 |         # Get [?] from "[prefix]/HumanEval_[?]/[number].py":
102 |         task_id = pyf.split("/")[-2].replace("_", "/")
103 |         ntotal += 1
104 |         old_code = open(pyf).read()
105 | 
106 |         def_left = "def " + entry_point[task_id] + "("
107 | 
108 |         imports = prompts[task_id].split(def_left)[0]
109 |         def_right = def_left.join(prompts[task_id].split(def_left)[1:])
110 | 
111 |         new_code = imports + def_left + old_code.split(def_left)[-1]
112 |         chunks = new_code.split(def_left)  # imports + def_left + {def_right + impl}
113 | 
114 |         if len(chunks) == 2:
115 |             new_code = def_left + chunks[-1]  # fn + impl
116 | 
117 |         if "chatgpt" in folder:
118 |             tmp = ""
119 |             for line in new_code.splitlines():
120 |                 if line.strip() == "python":
121 |                     continue
122 |                 tmp += line + "\n"
123 |             new_code = tmp
124 | 
125 |         new_code = to_four_space_indents(new_code)
126 | 
127 |         if args.eof:
128 |             eof_strs = NON_CODE_EOFS
129 |             if "incoder" in folder:
130 |                 eof_strs = eof_strs + INCODER_EXTRA
131 |             if "polycoder" in folder:
132 |                 eof_strs = eof_strs + POLYCODER_EXTRA
133 |             if "mistral" in folder:
134 |                 eof_strs = eof_strs + [r"</s>"]
135 |             for eof in eof_strs:
136 |                 new_code = new_code.split(eof)[0]
137 | 
138 |         # extract the target function and remove lines that are not indented
139 |         new_code = extract_function(new_code, entry_point[task_id])
140 | 
141 |         if len(chunks) == 2:
142 |             new_code = chunks[0] + new_code
143 | 
144 |         # write to new folder
145 |         new_pyf = pyf.replace(str(old_folder), str(new_folder))
146 | 
147 |         if new_code.strip() != old_code.strip():
148 |             print("Sanitized: ", pyf, "->", new_pyf)
149 |             nsan += 1
150 | 
151 |         pathlib.Path(new_pyf).parent.mkdir(parents=True, exist_ok=True)
152 |         with open(new_pyf, "w") as f:
153 |             f.write(new_code)
154 | 
155 |     print(f"Sanitized {nsan} out of {ntotal} files.")
156 | 
157 | 
158 | def main():
159 |     import argparse
160 | 
161 |     parser = argparse.ArgumentParser()
162 |     parser.add_argument("--folder", type=str, required=True)
163 |     parser.add_argument("--dataset", type=str, required=True)
164 |     parser.add_argument("--eof", action="store_true")
165 |     parser.add_argument("--inplace", action="store_true")
166 |     parser.add_argument(
167 |         "--root_folder",
168 |         action="store_true",
169 |         help="Use if we want to sanitize all folders in the root folder.",
170 |     )
171 | 
172 |     args = parser.parse_args()
173 | 
174 |     assert not args.folder.endswith("/")
175 | 
176 |     if not args.root_folder:
177 |         sanitize_folder(args, args.folder)
178 |     else:
179 |         for folder in os.listdir(args.folder):
180 |             if os.path.isdir(f"{args.folder}/{folder}") and "sanitized" not in folder:
181 |                 sanitize_folder(args, f"{args.folder}/{folder}")
182 | 
183 | 
184 | if __name__ == "__main__":
185 |     main()
186 | 
--------------------------------------------------------------------------------
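For reference, a minimal usage sketch for tool/sanitize.py above. The generation-folder name and the dataset name are assumptions inferred from sanitize_folder (one sub-directory per task, task_id with "/" replaced by "_"), not options documented in this repository:

# Hedged example: sanitize one folder of raw LLM generations.
# "generations/gpt-4_temp_0.0" and "EvoEval_difficult" are hypothetical placeholders.
import subprocess

subprocess.run(
    [
        "python", "tool/sanitize.py",
        "--folder", "generations/gpt-4_temp_0.0",  # one sub-dir per task_id ("/" -> "_"), each holding *.py samples
        "--dataset", "EvoEval_difficult",          # or "humaneval" to look up prompts via EvalPlus
        "--eof",                                   # also cut generations at known end-of-text markers
    ],
    check=True,
)
# Unless --inplace is given, sanitized files are written to generations/gpt-4_temp_0.0-sanitized/.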