├── .dockerignore
├── .gitignore
├── .pre-commit-config.yaml
├── Dockerfile
├── LICENSE
├── README.md
├── codegen
│   ├── generate.py
│   └── model.py
├── evoeval
│   ├── __init__.py
│   ├── data.py
│   ├── eval_test
│   │   ├── __init__.py
│   │   ├── _creative_special_oracle.py
│   │   ├── _difficult_special_oracle.py
│   │   ├── _he_special_oracle.py
│   │   └── _subtle_special_oracle.py
│   ├── evaluate.py
│   └── util
│       └── api_request.py
├── pyproject.toml
├── requirements.txt
├── resources
│   ├── butterfly_dark.png
│   └── example.gif
├── setup.cfg
└── tool
    └── sanitize.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | # nuclear option because steven uses PyCharm.
161 | .idea/
162 |
163 | # VSCode
164 | .vscode/
165 | *.jsonl
166 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | # nuclear option because steven uses PyCharm.
161 | .idea/
162 |
163 |
164 | evoeval/_version.py
165 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pycqa/isort
3 | rev: 5.12.0
4 | hooks:
5 | - id: isort
6 | name: isort (python)
7 | args: ["--profile", "black"]
8 | - repo: https://github.com/psf/black
9 | rev: 22.6.0
10 | hooks:
11 | - id: black
12 | - repo: https://github.com/pre-commit/pre-commit-hooks
13 | rev: v4.3.0
14 | hooks:
15 | - id: check-yaml
16 | - id: end-of-file-fixer
17 | - id: trailing-whitespace
18 | exclude: (?x)^(
19 | resources/.*|
20 | README.*
21 | )$
22 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # base env: Python 3.9 on Debian (slim-buster)
2 | # 3.9 is needed for typing-related features
3 | FROM python:3.9-slim-buster
4 |
5 | # install git
6 | RUN apt-get update && apt-get install -y git
7 |
8 | # upgrade to latest pip
9 | RUN pip install --upgrade pip
10 |
11 | COPY . /evoeval
12 |
13 | RUN cd /evoeval && ls -l && pip install .
14 |
15 | WORKDIR /app
16 |
17 | ENTRYPOINT ["python3", "-m", "evoeval.evaluate"]
18 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # EvoEval: Evolving Coding Benchmarks via LLM
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | ⚡Quick Start |
13 | 🔠Benchmarks |
14 | 🤖LLM Generated Code |
15 | 📝Citation |
16 | 🙏Acknowledgement
17 |
18 |
19 | ## About
20 |
21 | **EvoEval**¹ is a holistic benchmark suite created by _evolving_ **HumanEval** problems:
22 | - 🔥 Contains **828** new problems across **5** 🌠 semantic-altering and **2** ⭐ semantic-preserving benchmarks
23 | - 🔮 Allows evaluation/comparison across different **dimensions** and problem **types** (i.e., _Difficult_, _Creative_ or _Tool Use_ problems). See our [**visualization tool**](https://evo-eval.github.io/visualization.html) for ready-to-use comparison
24 | - 🏆 Complete with [**leaderboard**](https://evo-eval.github.io/leaderboard.html), **groundtruth solutions**, **robust testcases** and **evaluation scripts** to easily fit into your evaluation pipeline
25 | - 🤖 Generated LLM code samples from **>50** different models to save you time in running experiments
26 |
27 | ¹ Coincidentally pronounced similarly to 😈 EvilEval
28 |
29 |
30 |
31 |
32 |
33 | Check out our 📃 [paper](https://arxiv.org/abs/2403.19114) and [webpage](https://evo-eval.github.io) for more details!
34 |
35 |
36 |
37 | ## ⚡ Quick Start
38 |
39 | Directly install the package:
40 |
41 | ```bash
42 | pip install evoeval --upgrade
43 | ```
44 |
45 | **⏬ Nightly Version**
46 |
47 |
48 | ```bash
49 | pip install "git+https://github.com/evo-eval/evoeval.git" --upgrade
50 | ```
51 |
52 |
53 |
54 |
55 | **⏬ Local Repository**
56 |
57 |
58 | ```bash
59 | git clone https://github.com/evo-eval/evoeval.git
60 | cd evoeval
61 | export PYTHONPATH=$PYTHONPATH:$(pwd)
62 | pip install -r requirements.txt
63 | ```
64 |
65 |
66 |
67 |
68 | Now you are ready to download the EvoEval benchmarks and perform evaluation!
69 |
70 | ### 🧑💻 Code generation
71 |
72 | To download our benchmarks, simply use the following code snippet:
73 |
74 | ```python
75 | from evoeval.data import get_evo_eval
76 |
77 | evoeval_benchmark = "EvoEval_difficult" # you can pick from 7 different benchmarks!
78 |
79 | problems = get_evo_eval(evoeval_benchmark)
80 | ```
81 |
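For reference, `problems` is a dictionary keyed by `task_id`; a quick way to inspect one entry (field names follow the Problem Structure note below):

```python
for task_id, problem in problems.items():
    print(task_id)            # identifier string for the task
    print(problem["prompt"])  # function signature with docstring
    break
```
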
82 | For code generation and evaluation, we adopt the same style as [HumanEval+](https://github.com/evalplus/evalplus) and [HumanEval](https://github.com/openai/human-eval).
83 |
84 | Implement the `GEN_SOLUTION` function by calling the LLM to produce the complete solution (including the function header + code) and save the samples to `{benchmark}_samples.jsonl`:
85 |
86 | ```python
87 | from evoeval.data import get_evo_eval, write_jsonl
88 |
89 | evoeval_benchmark = "EvoEval_difficult"
90 |
91 | samples = [
92 | dict(task_id=task_id, solution=GEN_SOLUTION(problem["prompt"]))
93 | for task_id, problem in get_evo_eval(evoeval_benchmark).items()
94 | ]
95 | write_jsonl(f"{evoeval_benchmark}_samples.jsonl", samples)
96 | ```
97 |
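`GEN_SOLUTION` here stands for whatever wrapper you use around your model of choice. As a minimal sketch, assuming the `openai` Python client and an `OPENAI_API_KEY` environment variable (neither is part of EvoEval):

```python
import os

from openai import OpenAI

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


def GEN_SOLUTION(prompt: str) -> str:
    # Ask for the complete implementation (function header + body), since EvoEval
    # expects the full code in the solution field.
    response = client.chat.completions.create(
        model="gpt-4",  # any chat model you have access to
        temperature=0.0,
        messages=[
            {
                "role": "user",
                "content": "Complete the following Python function and reply with "
                "the full implementation only (no surrounding text):\n" + prompt,
            }
        ],
    )
    content = response.choices[0].message.content
    # Best-effort cleanup in case the model still wraps its answer in markdown fences.
    if "```" in content:
        content = content.split("```python")[-1].split("```")[0]
    return content
```

Depending on the model, additional post-processing (e.g., trimming explanatory prose around the code) may still be needed before evaluation.
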
98 | > [!TIP]
99 | >
100 | > EvoEval `samples.jsonl` expects the `solution` field to contain the **complete** code implementation; this is
101 | > slightly different from the original HumanEval, where the `solution` field only contains the function body.
102 | >
103 | > If you want to follow the original HumanEval setup exactly, check out our 🤗 Hugging Face [datasets](https://huggingface.co/evoeval), which can be run directly with the
104 | > HumanEval evaluation [script](https://huggingface.co/evoeval).
105 |
106 | ### 🕵️ Evaluation
107 |
108 | You can use our provided [docker](https://docs.docker.com/get-docker/) image:
109 |
110 | ```bash
111 | docker run --rm -v $(pwd):/app evoeval/evoeval:latest --dataset EvoEval_difficult --samples EvoEval_difficult_samples.jsonl
112 | ```
113 |
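The `-v $(pwd):/app` mount makes `EvoEval_difficult_samples.jsonl` visible inside the container, whose working directory is `/app` (see the Dockerfile).
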
114 | Or run it locally:
115 |
116 | ```bash
117 | evoeval.evaluate --dataset EvoEval_difficult --samples EvoEval_difficult_samples.jsonl
118 | ```
119 |
120 | Or if you are using it as a local repository:
121 |
122 | ```bash
123 | export PYTHONPATH=$PYTHONPATH:$(pwd)
124 | python evoeval/evaluate.py --dataset EvoEval_difficult --samples EvoEval_difficult_samples.jsonl
125 | ```
126 |
127 | You should expect to see the following output (when evaluated on GPT-4):
128 | ```
129 | Computing expected output...
130 | Expected outputs computed in 11.24s
131 | Reading samples...
132 | 100it [00:00, 164.16it/s]
133 | 100%|████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 12.77it/s]
134 | EvoEval_difficult
135 | pass@1: 0.520 # for reference GPT-4 solves more than 80% of problems in HumanEval
136 | ```
137 | This shows the pass@1 score for the EvoEval_difficult benchmark. You can use `--i-just-wanna-run` to force recomputation of the evaluation results.
138 |
139 | > [!Note]
140 | >
141 | > You can also evaluate LLM solutions stored in a folder format, where each subfolder contains
142 | > the LLM solutions for one problem in the benchmark.
143 | >
144 | > For example, you can grab the [GPT-4 solutions](https://github.com/evo-eval/evoeval/releases/download/v0.1.0/gpt-4_temp_0.0.zip) in our [v0.1.0 release](https://github.com/evo-eval/evoeval/releases/tag/v0.1.0).
145 | > After unzipping, you can run the following command:
146 | >
147 | > ```bash
148 | > evoeval.evaluate --dataset EvoEval_difficult --samples gpt-4_temp_0.0/EvoEval_difficult
149 | > ```
150 | >
151 | > to obtain the same result as above with the `.jsonl` file.
152 |
153 |
154 | ## 🔠 Benchmarks
155 |
156 | **EvoEval** contains **7** different benchmarks, each with a unique set of problems
157 | evolved from the original **HumanEval** problems. 🌠 denotes semantic-altering benchmarks,
158 | while ⭐ denotes semantic-preserving benchmarks:
159 |
160 | 🌠EvoEval_difficult:
161 |
162 |
163 | > Introduce complexity by adding additional constraints and requirements,
164 | > replacing commonly used requirements with less common ones, or adding additional reasoning
165 | > steps to the original problem.
166 |
167 |
168 |
169 | 🌠EvoEval_creative:
170 |
171 |
172 | > Generate a more creative problem compared to the original through the use
173 | > of stories or uncommon narratives.
174 |
175 |
176 |
177 |
178 | 🌠EvoEval_subtle:
179 |
180 |
181 | > Make a subtle and minor change to the original problem such as inverting or
182 | > replacing a requirement.
183 |
184 |
185 |
186 |
187 | 🌠EvoEval_combine:
188 |
189 |
190 | > Combine two different problems by integrating the concepts from both. To select problems that make sense to combine, we apply a simple heuristic:
191 | > only problems of the same type are combined, where the type is determined by the kinds of
192 | > input arguments in the original problem.
193 |
194 |
195 |
196 | 🌠EvoEval_tool_use:
197 |
198 |
199 | > Produce a new problem containing a main problem and one or more helper
200 | > functions which can be used to solve it. Each helper function is fully implemented and
201 | > provides hints or useful functionality for solving the main problem. The main problem
202 | > does not explicitly reference individual helper functions, and we do not require the model
203 | > to use the provided helpers.
204 |
205 |
206 |
207 |
208 | ⭐EvoEval_verbose:
209 |
210 |
211 | > Reword the original docstring to be more verbose. These verbose docstrings
212 | > can use more descriptive language to illustrate the problem, include detailed explanations
213 | > of the example output, and provide additional hints.
214 |
215 |
216 |
217 | ⭐EvoEval_concise:
218 |
219 |
220 | > Reword the original docstring to be more concise by removing unnecessary
221 | > details and using concise language. Furthermore, simple examples that are not required
222 | > to demonstrate edge cases may be removed.
223 |
224 |
225 |
226 |
227 | For each problem in each **EvoEval** benchmark, we include the complete groundtruth solution as well as test cases for functional evaluation.
228 |
229 | > [!Note]
230 | >
231 | > **Problem Structure**
232 | >
233 | > ```json
234 | > {
235 | > "task_id": "identifier string for the task",
236 | > "entry_point": "name of the function",
237 | > "prompt": "function signature with docstring",
238 | > "canonical_solution": "groundtruth implementation",
239 | > "inputs": "test inputs for each problem",
240 | > "parent": "original HumanEval problem it evolved from",
241 | > "main": "special field of EvoEval_tool_use to show just the main problem description",
242 | > "helpers": "special field of EvoEval_tool_use to show the helper functions"
243 | > }
244 | > ```
245 |
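For example, the two `EvoEval_tool_use`-specific fields can be read directly from a downloaded problem (a small sketch using only the documented field names):

```python
from evoeval.data import get_evo_eval

problems = get_evo_eval("EvoEval_tool_use")
task_id, problem = next(iter(problems.items()))
print(problem["main"])     # the main problem description on its own
print(problem["helpers"])  # the fully implemented helper functions
```
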
246 | ## 🤖 LLM Generated Code
247 |
248 | To view the performance of **>50** LLMs on the EvoEval benchmarks,
249 | we provide a complete [leaderboard](https://evo-eval.github.io/leaderboard.html) as well as a
250 | [visualization tool](https://evo-eval.github.io/visualization.html) to compare the performance of different models.
251 |
252 | Further, we also provide all code samples from LLMs on the **EvoEval** benchmarks:
253 |
254 | * See the attachment of our [v0.1.0 release](https://github.com/evo-eval/evoeval/releases/tag/v0.1.0).
255 |
256 | Each LLM's generations are packaged in a zip file of the form `{model_name}_temp_0.0.zip`. Unzip it to obtain the
257 | LLM generations for each of our 7 benchmarks + the original HumanEval problems. Note that we only evaluate the greedy output of each LLM.
258 |
259 | ## 📝 Citation
260 |
261 | ```bibtex
262 | @article{evoeval,
263 | author = {Xia, Chunqiu Steven and Deng, Yinlin and Zhang, Lingming},
264 | title = {Top Leaderboard Ranking = Top Coding Proficiency, Always? EvoEval: Evolving Coding Benchmarks via LLM},
265 | year = {2024},
266 | journal = {arXiv preprint},
267 | }
268 | ```
269 |
270 | > [!Note]
271 | >
272 | > The first two authors contributed equally to this work, with author order determined via [_Nigiri_](https://senseis.xmp.net/?Nigiri)
273 |
274 | ## 🙏 Acknowledgement
275 |
276 | * [HumanEval](https://github.com/openai/human-eval)
277 | * We especially thank [EvalPlus](https://github.com/evalplus/evalplus)
278 |
279 |
280 |
281 |
--------------------------------------------------------------------------------
/codegen/generate.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | from os import PathLike
4 |
5 | from evalplus.data import get_human_eval_plus
6 | from model import DecoderBase, make_model
7 | from rich.progress import (
8 | BarColumn,
9 | MofNCompleteColumn,
10 | Progress,
11 | TextColumn,
12 | TimeElapsedColumn,
13 | )
14 |
15 | from evoeval.data import get_evo_eval
16 |
17 |
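# Embeds the task's "contract" (input-validity assertions shipped with the dataset, if any)
# into the prompt: either appended inside the docstring or inserted as code right after the
# signature; trailing "#" comments in the contract are stripped first.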
18 | def construct_contract_prompt(prompt: str, contract_type: str, contract: str) -> str:
19 | if contract_type == "no":
20 | return prompt
21 | elif contract_type == "docstring":
22 | # embed within the docstring
23 | sep = ""
24 | if '"""' in prompt:
25 | sep = '"""'
26 | elif "'''" in prompt:
27 | sep = "'''"
28 | assert sep != ""
29 | l = prompt.split(sep)
30 | contract = "\n".join([x.split("#")[0] for x in contract.splitlines()])
31 | l[1] = (
32 | l[1] + contract + "\n" + " " * (len(contract) - len(contract.lstrip()) - 1)
33 | )
34 | return sep.join(l)
35 | elif contract_type == "code":
36 | # at the beginning of the function
37 | contract = "\n".join([x.split("#")[0] for x in contract.splitlines()])
38 | return prompt + contract
39 |
40 |
41 | def code_generate(args, workdir: PathLike, model: DecoderBase, id_range=None):
42 | with Progress(
43 | TextColumn(
44 | f"{args.dataset} •" + "[progress.percentage]{task.percentage:>3.0f}%"
45 | ),
46 | BarColumn(),
47 | MofNCompleteColumn(),
48 | TextColumn("•"),
49 | TimeElapsedColumn(),
50 | ) as p:
51 | if args.dataset == "humaneval":
52 | dataset = get_human_eval_plus()
53 | else:
54 | dataset = get_evo_eval(args.dataset)
55 |
56 | for task_id, task in p.track(dataset.items()):
57 | if id_range is not None:
58 | id_num = int(task_id.split("/")[1])
59 | low, high = id_range
60 | if id_num < low or id_num >= high:
61 | p.console.print(f"Skipping {task_id} as it is not in {id_range}")
62 | continue
63 |
64 | p_name = task_id.replace("/", "_")
65 | if args.use_contracts != "no" and task["contract"] == "":
66 | continue
67 | os.makedirs(os.path.join(workdir, p_name), exist_ok=True)
68 | log = f"Codegen: {p_name} @ {model}"
69 | n_existing = 0
70 | if args.resume:
71 | # count existing .py files
72 | n_existing = len(
73 | [
74 | f
75 | for f in os.listdir(os.path.join(workdir, p_name))
76 | if f.endswith(".py")
77 | ]
78 | )
79 | if n_existing > 0:
80 | log += f" (resuming from {n_existing})"
81 |
82 | nsamples = args.n_samples - n_existing
83 | p.console.print(log)
84 |
85 | sidx = args.n_samples - nsamples
86 | while sidx < args.n_samples:
87 | outputs = model.codegen(
88 | construct_contract_prompt(
89 | task["prompt"], args.use_contracts, task["contract"]
90 | ),
91 | do_sample=not args.greedy,
92 | num_samples=args.n_samples - sidx,
93 | )
94 | assert outputs, "No outputs from model!"
95 | for impl in outputs:
96 | try:
97 | with open(
98 | os.path.join(workdir, p_name, f"{sidx}.py"),
99 | "w",
100 | encoding="utf-8",
101 | ) as f:
102 | if model.conversational:
103 | f.write(impl)
104 | else:
105 | f.write(task["prompt"] + impl)
106 | except UnicodeEncodeError:
107 | continue
108 | sidx += 1
109 |
110 |
111 | def main():
112 | parser = argparse.ArgumentParser()
113 | parser.add_argument("--model", required=True, type=str)
114 | parser.add_argument("--bs", required=True, type=int)
115 | parser.add_argument("--temperature", required=True, type=float)
116 | parser.add_argument("--dataset", default="evileval", type=str)
117 | parser.add_argument("--root", type=str, required=True)
118 | parser.add_argument("--n_samples", default=200, type=int)
119 | parser.add_argument("--resume", action="store_true")
120 | parser.add_argument("--use_contracts", default="no", type=str)
121 | parser.add_argument("--greedy", action="store_true")
122 | # id_range is list
123 | parser.add_argument("--id-range", default=None, nargs="+", type=int)
124 | args = parser.parse_args()
125 |
126 | # if args.dataset not in ["evileval", "humaneval"]:
127 | # raise NotImplementedError("Unsupported dataset: {}".format(args.dataset))
128 |
129 | if args.use_contracts not in ["no", "code", "docstring"]:
130 | raise NotImplementedError(
131 | "Unsupported contract usage: {}".format(args.use_contracts)
132 | )
133 | if args.greedy and (args.temperature != 0 or args.bs != 1 or args.n_samples != 1):
134 | raise ValueError(
135 | f"Greedy decoding is only supported with temperature({args.temperature}) = 0, batch_size({args.bs}) = 1"
136 | f" and n_samples({args.n_samples}) = 1"
137 | )
138 |
139 | if args.id_range is not None:
140 | assert len(args.id_range) == 2, "id_range must be a list of length 2"
141 | assert args.id_range[0] < args.id_range[1], "id_range must be increasing"
142 | args.id_range = tuple(args.id_range)
143 |
144 | # Make project dir
145 | os.makedirs(args.root, exist_ok=True)
146 | # Make dataset dir
147 | os.makedirs(os.path.join(args.root, args.dataset), exist_ok=True)
148 | # Make dir for codes generated by each model
149 | args.model = args.model.lower()
150 | model = make_model(
151 | name=args.model, batch_size=args.bs, temperature=args.temperature
152 | )
153 | workdir = os.path.join(
154 | args.root,
155 | args.dataset,
156 | args.model
157 | + f"_temp_{args.temperature}"
158 | + ("" if args.use_contracts == "no" else f"-contract-{args.use_contracts}"),
159 | )
160 | os.makedirs(workdir, exist_ok=True)
161 |
162 | with open(os.path.join(workdir, "args.txt"), "w") as f:
163 | f.write(str(args))
164 |
165 | code_generate(args, workdir=workdir, model=model, id_range=args.id_range)
166 |
167 |
168 | if __name__ == "__main__":
169 | main()
170 |
--------------------------------------------------------------------------------
/codegen/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | from abc import ABC, abstractmethod
3 | from typing import List
4 | from warnings import warn
5 |
6 | # Communism
7 | os.environ["HF_HOME"] = os.environ.get("HF_HOME", "/ColossalTitan/huggingface/")
8 |
9 | import anthropic
10 | import google.generativeai as genai
11 | import openai
12 | import torch
13 | from transformers import (
14 | AutoModelForCausalLM,
15 | AutoModelForSeq2SeqLM,
16 | AutoTokenizer,
17 | StoppingCriteria,
18 | StoppingCriteriaList,
19 | )
20 | from vllm import LLM, SamplingParams
21 |
22 | from evoeval.util.api_request import (
23 | create_anthropic_config,
24 | create_chatgpt_config,
25 | create_gemini_config,
26 | create_palm_config,
27 | num_tokens_from_messages,
28 | request_anthropic_engine,
29 | request_chatgpt_engine,
30 | request_gemini_engine,
31 | request_palm_engine,
32 | )
33 |
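# Stop strings for open-ended completion: HUMANEVAL_EOS marks the start of a new
# top-level construct after the generated function body, while NON_CODE_EOS covers
# special end-of-text tokens and markdown fences emitted by some models.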
34 | HUMANEVAL_EOS = ["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif"]
35 | NON_CODE_EOS = ["<|endoftext|>", "\n```", "\n", "<|endofmask|>"]
36 | EOS = HUMANEVAL_EOS + NON_CODE_EOS
37 |
38 |
39 | # Adopted from https://github.com/huggingface/transformers/pull/14897
40 | class EndOfFunctionCriteria(StoppingCriteria):
41 | def __init__(self, start_length, eos, tokenizer, *args, **kwargs):
42 | super().__init__(*args, **kwargs)
43 | self.start_length = start_length
44 | self.eos = eos
45 | self.tokenizer = tokenizer
46 | self.end_length = {}
47 |
48 | def __call__(self, input_ids, scores, **kwargs):
49 | """Returns true if all generated sequences contain any of the end-of-function strings."""
50 | decoded_generations = self.tokenizer.batch_decode(
51 | input_ids[:, self.start_length :]
52 | )
53 | done = []
54 | for index, decoded_generation in enumerate(decoded_generations):
55 | finished = any(
56 | [stop_string in decoded_generation for stop_string in self.eos]
57 | )
58 | if (
59 | finished and index not in self.end_length
60 | ): # ensures first time we see it
61 | for stop_string in self.eos:
62 | if stop_string in decoded_generation:
63 | self.end_length[index] = len(
64 | input_ids[
65 | index, # get length of actual generation
66 | self.start_length : -len(
67 | self.tokenizer.encode(
68 | stop_string,
69 | add_special_tokens=False,
70 | return_tensors="pt",
71 | )[0]
72 | ),
73 | ]
74 | )
75 | done.append(finished)
76 | return all(done)
77 |
78 |
79 | class DecoderBase(ABC):
80 | def __init__(
81 | self,
82 | name: str,
83 | batch_size: int = 1,
84 | temperature: float = 0.8,
85 | max_new_tokens: int = 512,
86 | conversational: bool = False,
87 | body: bool = False,
88 | ) -> None:
89 | print("Initializing a decoder model: {} ...".format(name))
90 | self.name = name
91 | self.batch_size = batch_size
92 | self.temperature = temperature
93 | self.eos = EOS
94 | self.skip_special_tokens = False
95 | self.max_new_tokens = max_new_tokens
96 | self.conversational = conversational
97 | self.body = body
98 |
99 | @abstractmethod
100 | def codegen(
101 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
102 | ) -> List[str]:
103 | pass
104 |
105 | def __repr__(self) -> str:
106 | return self.name
107 |
108 | def __str__(self) -> str:
109 | return self.name
110 |
111 |
112 | class VLlmDecoder(DecoderBase):
113 | def __init__(
114 | self,
115 | name: str,
116 | batch_size: int = 1,
117 | temperature: float = 0.8,
118 | max_new_tokens: int = 512,
119 | conversational: bool = False,
120 | ) -> None:
121 | super().__init__(name, batch_size, temperature, max_new_tokens, conversational)
122 | kwargs = {"tensor_parallel_size": int(os.getenv("VLLM_N_GPUS", "1"))}
123 |
124 | if "CodeLlama" in name:
125 | kwargs["dtype"] = "bfloat16"
126 | elif "code-millenials" in name:
127 | kwargs["dtype"] = "float16"
128 | elif "uukuguy/speechless-code-mistral-7b-v1.0" == name:
129 | kwargs["dtype"] = "float16"
130 | elif "uukuguy/speechless-codellama-34b-v2.0" == name:
131 | kwargs["dtype"] = "float16"
132 | elif "CodeBooga" in name:
133 | kwargs["dtype"] = "float16"
134 | elif "WizardCoder" in name and "V1.1" in name:
135 | kwargs["dtype"] = "bfloat16"
136 | elif "WizardCoder" in name:
137 | kwargs["dtype"] = "float16"
138 | elif "deepseek" in name:
139 | kwargs["dtype"] = "bfloat16"
140 | elif "mixtral" in name.lower():
141 | kwargs["dtype"] = "bfloat16"
142 | elif "solar" in name:
143 | kwargs["dtype"] = "float16"
144 | elif "mistral" in name.lower():
145 | kwargs["dtype"] = "bfloat16"
146 | elif "phi" in name.lower():
147 | kwargs["dtype"] = "float16"
148 | kwargs["trust_remote_code"] = True
149 | elif "openchat" in name.lower():
150 | kwargs["dtype"] = "bfloat16"
151 |
152 | # reset the eos
153 | self.eos = []
154 | self.llm = LLM(model=name, max_model_len=2048, **kwargs)
155 |
156 | def codegen(
157 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
158 | ) -> List[str]:
159 | if do_sample:
160 | assert self.temperature > 0, "Temperature must be greater than 0!"
161 | batch_size = min(self.batch_size, num_samples)
162 |
163 | vllm_outputs = self.llm.generate(
164 | [prompt] * batch_size,
165 | SamplingParams(
166 | temperature=self.temperature,
167 | max_tokens=self.max_new_tokens
168 | + len(self.llm.get_tokenizer().encode(prompt, return_tensors="pt")[0]),
169 | top_p=0.95 if do_sample else 1.0,
170 | stop=self.eos,
171 | ),
172 | use_tqdm=False,
173 | )
174 |
175 | gen_strs = [x.outputs[0].text.replace("\t", " ") for x in vllm_outputs]
176 |
177 | return gen_strs
178 |
179 |
180 | class CodeLlamaInstructSmall(VLlmDecoder):
181 | def __init__(self, name: str, **kwargs) -> None:
182 | kwargs["conversational"] = True
183 | super().__init__(name, **kwargs)
184 | self.eos += ["\n```"]
185 |
186 | def codegen(
187 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
188 | ) -> List[str]:
189 | if do_sample:
190 | assert self.temperature > 0, "Temperature must be greater than 0!"
191 |
192 | input = f"""[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:
193 | ```python
194 | {prompt}
195 | ```
196 | [/INST]
197 | ```python
198 | """
199 |
200 | return VLlmDecoder.codegen(self, input, do_sample, num_samples)
201 |
202 |
203 | class Alpaca(VLlmDecoder):
204 | def __init__(self, name: str, **kwargs) -> None:
205 | kwargs["conversational"] = True
206 | super().__init__(name, **kwargs)
207 | self.eos += ["\n```"]
208 |
209 | def codegen(
210 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
211 | ) -> List[str]:
212 |         prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
213 |
214 | ### Instruction:
215 | Create a Python script for this problem:
216 | {prompt}
217 |
218 | ### Response:
219 | ```python
220 | """
221 | return VLlmDecoder.codegen(self, prompt, do_sample, num_samples)
222 |
223 |
224 | class OpenChat(VLlmDecoder):
225 | def __init__(self, name: str, **kwargs) -> None:
226 | kwargs["conversational"] = True
227 | super().__init__(name, **kwargs)
228 | self.eos += ["\n```"]
229 |
230 | def codegen(
231 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
232 | ) -> List[str]:
233 | if do_sample:
234 | assert self.temperature > 0, "Temperature must be greater than 0!"
235 |
236 | input = f"""GPT4 Correct User: Can you complete the following Python function?
237 | ```python
238 | {prompt}
239 | ```
240 | <|end_of_turn|>GPT4 Correct Assistant:
241 | ```python
242 | """
243 | return VLlmDecoder.codegen(self, input, do_sample, num_samples)
244 |
245 |
246 | class WizardCoderDecoder(VLlmDecoder):
247 | def codegen(
248 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
249 | ) -> List[str]:
250 | if do_sample:
251 | assert self.temperature > 0, "Temperature must be greater than 0!"
252 |
253 | prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
254 |
255 |
256 | ### Instruction:
257 | Create a Python script for this problem:
258 | {prompt}
259 |
260 | ### Response:"""
261 |
262 | batch_size = min(self.batch_size, num_samples)
263 |
264 | num_of_tokens = len(
265 | self.llm.get_tokenizer().encode(prompt, return_tensors="pt")[0]
266 | )
267 |
268 | vllm_outputs = self.llm.generate(
269 | [prompt] * batch_size,
270 | SamplingParams(
271 | temperature=self.temperature,
272 | max_tokens=num_of_tokens + self.max_new_tokens,
273 | top_p=0.95 if do_sample else 1.0,
274 | ),
275 | use_tqdm=False,
276 | )
277 |
278 | return [x.outputs[0].text.replace("\t", " ") for x in vllm_outputs]
279 |
280 |
281 | class XwinCoder(VLlmDecoder):
282 | def __init__(self, name: str, **kwargs) -> None:
283 | kwargs["conversational"] = True
284 | super().__init__(name, **kwargs)
285 | self.eos += ["\n```"]
286 |
287 | def codegen(
288 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
289 | ) -> List[str]:
290 |
291 |         prompt = f"""<system>: You are an AI coding assistant that helps people with programming. Write a response that appropriately completes the user's request.
292 | <user>: Complete the following code for me and return a fully runnable code.
293 | ```python
294 | {prompt}
295 | ```
296 | <AI>:
297 | ```python
298 | """
299 | return VLlmDecoder.codegen(self, prompt, do_sample, num_samples)
300 |
301 |
302 | class HFTorchDecoder(DecoderBase):
303 | def __init__(self, name: str, **kwargs):
304 | super().__init__(name=name, **kwargs)
305 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
306 | kwargs = {
307 | "trust_remote_code": name
308 | in {
309 | "bigcode/santacoder",
310 | "Salesforce/codegen2-1B",
311 | "Salesforce/codegen2-3_7B",
312 | "Salesforce/codegen2-7B",
313 | "Salesforce/codegen2-16B",
314 | "deepseek-ai/deepseek-coder-6.7b-base",
315 | "deepseek-ai/deepseek-coder-33b-base",
316 | "stabilityai/stable-code-3b",
317 | "Qwen/Qwen-14B-Chat",
318 | "Qwen/Qwen-7B-Chat",
319 | }
320 | }
321 |
322 | if "codegen-" in name: # use fp16 for codegen models
323 | kwargs["torch_dtype"] = torch.float16
324 | if "codegen2-" in name: # avoid warning of trust remote code
325 | kwargs["revision"] = "main"
326 | if "16b" in name.lower():
327 | kwargs["device_map"] = "auto"
328 | if "starcoder2" in name:
329 | kwargs["device_map"] = "auto"
330 | if "starcoder" in name:
331 | kwargs["torch_dtype"] = torch.bfloat16
332 | if "CodeLlama" in name:
333 | if "34b" in name.lower() or "70b" in name.lower():
334 | kwargs["device_map"] = "auto"
335 | kwargs["torch_dtype"] = torch.bfloat16
336 | self.skip_special_tokens = True
337 | if "CodeBooga" in name:
338 | kwargs["torch_dtype"] = torch.float16
339 | kwargs["device_map"] = "auto"
340 | self.skip_special_tokens = True
341 | if "Mistral-7B-codealpaca-lora" == name:
342 | kwargs["torch_dtype"] = torch.float16
343 | self.skip_special_tokens = True
344 | elif "Mistral" in name or "zephyr-7b-beta" in name:
345 | kwargs["torch_dtype"] = torch.bfloat16
346 | if "Mixtral" in name:
347 | kwargs["torch_dtype"] = torch.bfloat16
348 | kwargs["device_map"] = "auto"
349 | if "deepseek" in name:
350 | kwargs["torch_dtype"] = torch.bfloat16
351 | if "33b" in name.lower():
352 | kwargs["device_map"] = "auto"
353 | self.skip_special_tokens = True
354 | if "/phi" in name:
355 | kwargs["torch_dtype"] = torch.float16
356 | kwargs["trust_remote_code"] = True
357 | self.skip_special_tokens = True
358 | if "Qwen" in name:
359 | kwargs["torch_dtype"] = torch.bfloat16
360 | self.skip_special_tokens = True
361 | if "72B" in name:
362 | kwargs["device_map"] = "auto"
363 | if "Phind" in name:
364 | kwargs["torch_dtype"] = torch.bfloat16
365 | kwargs["device_map"] = "auto"
366 | if "gemma" in name:
367 | kwargs["torch_dtype"] = torch.bfloat16
368 | if "Magicoder" in name:
369 | kwargs["torch_dtype"] = torch.bfloat16
370 | kwargs["device_map"] = "auto"
371 |
372 | print(f"{kwargs = }")
373 |
374 | self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs)
375 | self.model = AutoModelForCausalLM.from_pretrained(name, **kwargs)
376 | if name in {"StabilityAI/stablelm-base-alpha-7b"}:
377 | print("Switching to float16 ...")
378 | self.model = self.model.half()
379 | self.skip_special_tokens = True
380 |
381 | if "device_map" not in kwargs:
382 | self.model = self.model.to(self.device)
383 |
384 | @torch.inference_mode()
385 | def codegen(
386 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
387 | ) -> List[str]:
388 | if self.temperature == 0:
389 | assert not do_sample
390 | assert num_samples == 1
391 |
392 | input_tokens = self.tokenizer.encode(prompt, return_tensors="pt").to(
393 | self.device
394 | )
395 | scores = StoppingCriteriaList(
396 | [
397 | EndOfFunctionCriteria(
398 | start_length=len(input_tokens[0]),
399 | eos=self.eos,
400 | tokenizer=self.tokenizer,
401 | )
402 | ]
403 | )
404 | kwargs = {}
405 | if do_sample:
406 | kwargs["top_p"] = 0.95
407 | kwargs["temperature"] = self.temperature
408 |
409 | raw_outputs = self.model.generate(
410 | input_tokens,
411 | max_new_tokens=self.max_new_tokens,
412 | stopping_criteria=scores,
413 | do_sample=do_sample,
414 | output_scores=True,
415 | return_dict_in_generate=True,
416 | num_return_sequences=min(self.batch_size, num_samples),
417 | pad_token_id=self.tokenizer.eos_token_id,
418 | **kwargs,
419 | ) # remove warning
420 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :]
421 | gen_strs = self.tokenizer.batch_decode(
422 | gen_seqs, skip_special_tokens=self.skip_special_tokens
423 | )
424 | outputs = []
425 | # removes eos tokens.
426 | for output in gen_strs:
427 | min_index = 10000
428 | for eos in self.eos:
429 | if eos in output:
430 | # could be multiple eos in outputs, better pick minimum one
431 | min_index = min(min_index, output.index(eos))
432 | outputs.append(output[:min_index])
433 | return outputs
434 |
435 |
436 | class CodeLlamaInstructLarge(HFTorchDecoder):
437 | def __init__(self, name: str, **kwargs) -> None:
438 | kwargs["conversational"] = True
439 | super().__init__(name, **kwargs)
440 | self.eos = ["\n```"]
441 |
442 | def codegen(
443 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
444 | ) -> List[str]:
445 | if do_sample:
446 | assert self.temperature > 0, "Temperature must be greater than 0!"
447 |
448 | input = f"""'Source: system
449 |
450 | You are a helpful and honest code assistant expert in Python. Please, provide all answers to programming questions in Python.
451 | Source: user
452 |
453 | Provide a self-contained Python script that solves the following problem:
454 | ```python
455 | {prompt}
456 | ```
457 | Source: assistant
458 |
459 | Here is a Python script that solves the problem:
460 | ```python
461 | """
462 |
463 | input_tokens = self.tokenizer.encode(input, return_tensors="pt").to(self.device)
464 | scores = StoppingCriteriaList(
465 | [
466 | EndOfFunctionCriteria(
467 | start_length=len(input_tokens[0]),
468 | eos=self.eos,
469 | tokenizer=self.tokenizer,
470 | )
471 | ]
472 | )
473 | kwargs = {}
474 | if do_sample:
475 | kwargs["top_p"] = 0.95
476 | kwargs["temperature"] = self.temperature
477 |
478 | max_new_tokens = self.max_new_tokens + len(
479 | self.tokenizer.encode(prompt, return_tensors="pt")[0]
480 | )
481 |
482 | raw_outputs = self.model.generate(
483 | input_tokens,
484 | max_new_tokens=max_new_tokens,
485 | stopping_criteria=scores,
486 | do_sample=do_sample,
487 | output_scores=True,
488 | return_dict_in_generate=True,
489 | num_return_sequences=min(self.batch_size, num_samples),
490 | pad_token_id=self.tokenizer.eos_token_id,
491 | **kwargs,
492 | ) # remove warning
493 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :]
494 | gen_strs = self.tokenizer.batch_decode(
495 | gen_seqs, skip_special_tokens=self.skip_special_tokens
496 | )
497 | outputs = []
498 | # removes eos tokens.
499 | for output in gen_strs:
500 | min_index = 10000
501 | for eos in self.eos:
502 | if eos in output:
503 | # could be multiple eos in outputs, better pick minimum one
504 | min_index = min(min_index, output.index(eos))
505 | outputs.append(output[:min_index])
506 | return outputs
507 |
508 |
509 | class QwenInstruct(HFTorchDecoder):
510 |
511 | generation_template = "Please implement the following Python function in a markdown style code block:\n\n```python\n{prompt}\n```\n"
512 | incorrect_code_template = "```python\n{incorrect_solution}\n```\n"
513 | feedback_template = "{feedback}"
514 |
515 | @torch.inference_mode()
516 | def codegen(
517 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
518 | ) -> List[str]:
519 | if self.temperature == 0:
520 | assert not do_sample
521 | assert num_samples == 1
522 | content = self.generation_template.format(prompt=prompt)
523 |
524 | input_tokens = self.tokenizer.apply_chat_template(
525 | [
526 | {
527 | "role": "user",
528 | "content": content,
529 | }
530 | ],
531 | add_generation_prompt=True,
532 | return_tensors="pt",
533 | ).to(self.device)
534 |
535 | max_token = len(input_tokens[0]) + self.max_new_tokens
536 |
537 | kwargs = {}
538 | if do_sample:
539 | kwargs["top_p"] = 0.95
540 | kwargs["temperature"] = self.temperature
541 |
542 | raw_outputs = self.model.generate(
543 | input_tokens,
544 | max_new_tokens=max_token,
545 | do_sample=do_sample,
546 | output_scores=True,
547 | return_dict_in_generate=True,
548 | top_k=50,
549 | num_return_sequences=min(self.batch_size, num_samples),
550 | pad_token_id=self.tokenizer.eos_token_id,
551 | **kwargs,
552 | ) # remove warning
553 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :]
554 | gen_strs = self.tokenizer.batch_decode(
555 | gen_seqs, skip_special_tokens=self.skip_special_tokens
556 | )
557 | return gen_strs
558 |
559 |
560 | class DeepSeekInstruct(HFTorchDecoder):
561 |
562 | generation_template = "Please implement the following Python function in a markdown style code block:\n\n```python\n{prompt}\n```\n"
563 | incorrect_code_template = "```python\n{incorrect_solution}\n```\n"
564 | feedback_template = "{feedback}"
565 |
566 | @torch.inference_mode()
567 | def codegen(
568 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
569 | ) -> List[str]:
570 | if self.temperature == 0:
571 | assert not do_sample
572 | assert num_samples == 1
573 | content = self.generation_template.format(prompt=prompt)
574 |
575 | input_tokens = self.tokenizer.apply_chat_template(
576 | [
577 | {
578 | "role": "user",
579 | "content": content,
580 | }
581 | ],
582 | add_generation_prompt=True,
583 | return_tensors="pt",
584 | ).to(self.device)
585 |
586 | # set instruction model to have more max_tokens TODO: for all models
587 | max_token = len(input_tokens[0]) + self.max_new_tokens
588 |
589 | kwargs = {}
590 | if do_sample:
591 | kwargs["top_p"] = 0.95
592 | kwargs["temperature"] = self.temperature
593 |
594 | raw_outputs = self.model.generate(
595 | input_tokens,
596 | max_new_tokens=max_token,
597 | do_sample=do_sample,
598 | output_scores=True,
599 | return_dict_in_generate=True,
600 | top_k=50,
601 | num_return_sequences=min(self.batch_size, num_samples),
602 | pad_token_id=self.tokenizer.eos_token_id,
603 | eos_token_id=32021,
604 | **kwargs,
605 | ) # remove warning
606 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :]
607 | gen_strs = self.tokenizer.batch_decode(
608 | gen_seqs, skip_special_tokens=self.skip_special_tokens
609 | )
610 | return gen_strs
611 | # return [x.split("```python")[-1].split("```")[0] for x in gen_strs]
612 |
613 |
614 | class MistralInstruct(DeepSeekInstruct):
615 | pass # just use the same as DeepSeekInstruct
616 |
617 |
618 | class MixtralSPMXInstruct(DeepSeekInstruct):
619 | pass # just use the same as DeepSeekInstruct
620 |
621 |
622 | class GemmaInstruct(QwenInstruct):
623 | pass # just use the same as QwenInstruct
624 |
625 |
626 | class MagicCoderInstruct(DeepSeekInstruct):
627 |
628 | generation_template = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.\n\n@@ Instruction\nWrite a solution to the following problem:\n```python\n{prompt}\n```\n\n@@ Response\n"""
629 |
630 | @torch.inference_mode()
631 | def codegen(
632 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
633 | ) -> List[str]:
634 | if self.temperature == 0:
635 | assert not do_sample
636 | assert num_samples == 1
637 | content = self.generation_template.format(prompt=prompt)
638 |
639 | input_tokens = self.tokenizer.encode(content, return_tensors="pt").to(
640 | self.device
641 | )
642 |
643 | max_token = len(input_tokens[0]) + self.max_new_tokens
644 |
645 | kwargs = {}
646 | if do_sample:
647 | kwargs["top_p"] = 0.95
648 | kwargs["temperature"] = self.temperature
649 |
650 | raw_outputs = self.model.generate(
651 | input_tokens,
652 | max_new_tokens=max_token,
653 | do_sample=do_sample,
654 | output_scores=True,
655 | return_dict_in_generate=True,
656 | top_k=50,
657 | num_return_sequences=min(self.batch_size, num_samples),
658 | pad_token_id=self.tokenizer.eos_token_id,
659 | eos_token_id=self.tokenizer.eos_token_id,
660 | **kwargs,
661 | ) # remove warning
662 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :]
663 | gen_strs = self.tokenizer.batch_decode(
664 | gen_seqs, skip_special_tokens=self.skip_special_tokens
665 | )
666 | return gen_strs
667 |
668 |
669 | class AnthropicDecoder(DecoderBase):
670 | generation_template = (
671 | "Please complete the following code snippet.\n```\n{prompt}\n```"
672 | )
673 |
674 |     def __init__(self, name: str, model_name: str = "claude-2.1", **kwargs) -> None:
675 | super().__init__(name, **kwargs)
676 | self.model_name = model_name
677 | self.client = anthropic.Anthropic(
678 | api_key=os.getenv("ANTHROPIC_API_KEY", "dummy")
679 | )
680 |
681 |     def _anthropic_parse(self, ret, prompt, body=False):
682 | outputs = []
683 | for returns in ret.content:
684 | raw_o = returns.text
685 | outputs.append(raw_o)
686 | return outputs
687 |
688 | def codegen(
689 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
690 | ) -> List[str]:
691 | if do_sample:
692 | assert self.temperature > 0, "Temperature must be positive for sampling"
693 |
694 | batch_size = min(self.batch_size, num_samples)
695 |         assert batch_size <= 20, "Using a larger batch size could blow up the memory!"
696 |
697 | message = self.generation_template.format(prompt=prompt.strip())
698 |
699 |         # estimate the prompt token count
700 | num_tokens = num_tokens_from_messages(message, self.model_name)
701 |
702 | config = create_anthropic_config(
703 | message=message,
704 | max_tokens=num_tokens + self.max_new_tokens,
705 | temperature=self.temperature,
706 | batch_size=batch_size,
707 | model=self.model_name,
708 | )
709 | ret = request_anthropic_engine(self.client, config)
710 |         return self._anthropic_parse(ret, prompt.strip(), body=self.body)
711 |
712 |
713 | class PalmDecoder(DecoderBase):
714 | generation_template = (
715 | "Please complete the following code snippet.\n```\n{prompt}\n```"
716 | )
717 |
718 | def __init__(self, name: str, model_name: str = "palm", **kwargs) -> None:
719 | super().__init__(name, **kwargs)
720 | genai.configure(api_key=os.getenv("GOOGLE_API_KEY", "dummy"))
721 | self.model_name = model_name
722 |
723 | def _palm_parse(self, ret, prompt):
724 | outputs = []
725 | raw_o = ret.result
726 | outputs.append(raw_o)
727 | return outputs
728 |
729 | def codegen(
730 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
731 | ) -> List[str]:
732 | if do_sample:
733 | assert self.temperature > 0, "Temperature must be positive for sampling"
734 |
735 | batch_size = min(self.batch_size, num_samples)
736 |         assert batch_size <= 20, "Using a larger batch size could blow up the memory!"
737 |
738 | message = self.generation_template.format(prompt=prompt.strip())
739 |
740 |         # approximate estimate of the prompt token count
741 | num_tokens = num_tokens_from_messages(message, self.model_name)
742 |
743 | config = create_palm_config(
744 | message=message,
745 | max_tokens=num_tokens + self.max_new_tokens,
746 | temperature=self.temperature,
747 | batch_size=batch_size,
748 | model=self.model_name,
749 | )
750 | ret = request_palm_engine(genai, config)
751 | # if "gpt-3.5" in self.model_name:
752 | return self._palm_parse(ret, prompt.strip())
753 |
754 |
755 | class GeminiChatDecoder(DecoderBase):
756 | generation_template = (
757 | "Please complete the following code snippet.\n```\n{prompt}\n```"
758 | )
759 |
760 | def __init__(
761 | self, name: str, model_name: str = "models/gemini-pro", **kwargs
762 | ) -> None:
763 | super().__init__(name, **kwargs)
764 | self.model_name = model_name
765 | genai.configure(api_key=os.getenv("GOOGLE_API_KEY", "dummy"))
766 | self.model = genai.GenerativeModel(self.model_name)
767 |
768 | @staticmethod
769 | def _find_gen_func_sig(prompt):
770 | func_sig = ""
771 | for x in prompt.splitlines():
772 | if x.startswith("def ") and x.endswith(":"):
773 |                 # always pick the last one, since there could be pre-defined helper functions.
774 | func_sig = x
775 | return func_sig
776 |
777 | @staticmethod
778 | def _remove_eos(gen):
779 | min_index = 100000000
780 | for eos in EOS:
781 | if eos in gen:
782 | min_index = min(min_index, gen.index(eos))
783 | return gen[:min_index]
784 |
785 | def _gemini_parse(self, ret, prompt):
786 | outputs = []
787 | raw_o = ret.text
788 | outputs.append(raw_o)
789 | return outputs
790 |
791 | def codegen(
792 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
793 | ) -> List[str]:
794 | if do_sample:
795 | assert self.temperature > 0, "Temperature must be positive for sampling"
796 |
797 | batch_size = min(self.batch_size, num_samples)
798 |         assert batch_size <= 20, "Using a larger batch size could blow up the memory!"
799 |
800 | message = self.generation_template.format(prompt=prompt.strip())
801 |
802 |         # approximate estimate of the prompt token count
803 | num_tokens = num_tokens_from_messages(message, self.model_name)
804 |
805 | config = create_gemini_config(
806 | max_tokens=num_tokens + self.max_new_tokens,
807 | temperature=self.temperature,
808 | batch_size=batch_size,
809 | )
810 | ret = request_gemini_engine(self.model, message, config)
811 | # if "gpt-3.5" in self.model_name:
812 | return self._gemini_parse(ret, prompt.strip())
813 |
814 |
815 | class OpenAIChatDecoder(DecoderBase):
816 | generation_template = (
817 | "Please complete the following code snippet.\n```\n{prompt}\n```"
818 | )
819 |
820 | def __init__(self, name: str, model_name: str = "gpt-3.5-turbo", **kwargs) -> None:
821 | super().__init__(name, **kwargs)
822 | self.model_name = model_name
823 | openai.api_key = os.environ.get("OPENAI_API_KEY", "dummy")
824 |
825 | def codegen(
826 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
827 | ) -> List[str]:
828 | if do_sample:
829 | assert self.temperature > 0, "Temperature must be positive for sampling"
830 |
831 | batch_size = min(self.batch_size, num_samples)
832 |         assert batch_size <= 20, "Using a larger batch size could blow up the memory!"
833 |
834 | # construct prompt
835 | # if "gpt-3.5" in self.model_name: # chatgpt
836 | message = self.generation_template.format(prompt=prompt.strip())
837 |
838 | num_tokens = num_tokens_from_messages(message, self.model_name)
839 |
840 | config = create_chatgpt_config(
841 | message=message,
842 | max_tokens=num_tokens + self.max_new_tokens,
843 | temperature=self.temperature,
844 | batch_size=batch_size,
845 | model=self.model_name,
846 | )
847 | ret = request_chatgpt_engine(config)
848 | outputs = []
849 | for returns in ret.choices:
850 | outputs.append(returns.message.content)
851 | return outputs
852 |
853 |
854 | class StarCoder2(HFTorchDecoder):
855 | def codegen(
856 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
857 | ) -> List[str]:
858 |         prompt = prompt.strip()  # starcoder2 needs the prompt stripped, unfortunately
859 | return HFTorchDecoder.codegen(self, prompt, do_sample, num_samples)
860 |
861 |
862 | class StarCoderInfill(HFTorchDecoder):
863 | def __init__(self, name: str, **kwargs) -> None:
864 | super().__init__(name, **kwargs)
865 | self.prefix_token = ""
866 | self.suffix_token = ""
867 |
868 | def codegen(
869 | self, prompt: str, do_sample: bool = True, num_samples: int = 200
870 | ) -> List[str]:
871 | if self.temperature == 0:
872 | assert not do_sample
873 | assert num_samples == 1
874 |
875 | input = self.prefix_token + prompt + self.suffix_token
876 | input_tokens = self.tokenizer.encode(input, return_tensors="pt").to(self.device)
877 | scores = StoppingCriteriaList(
878 | [
879 | EndOfFunctionCriteria(
880 | start_length=len(input_tokens[0]),
881 | eos=self.eos,
882 | tokenizer=self.tokenizer,
883 | )
884 | ]
885 | )
886 | temperature = max(self.temperature, 1e-2)
887 | raw_outputs = self.model.generate(
888 | input_tokens,
889 | max_new_tokens=self.max_new_tokens,
890 | stopping_criteria=scores,
891 | do_sample=do_sample,
892 | top_p=0.95,
893 | top_k=None,
894 | temperature=temperature,
895 | num_return_sequences=min(self.batch_size, num_samples),
896 | output_scores=True,
897 | return_dict_in_generate=True,
898 | repetition_penalty=1.0,
899 | pad_token_id=self.tokenizer.eos_token_id,
900 | )
901 | gen_seqs = raw_outputs.sequences[:, len(input_tokens[0]) :]
902 | gen_strs = self.tokenizer.batch_decode(
903 | gen_seqs, skip_special_tokens=self.skip_special_tokens
904 | )
905 | outputs = []
906 |         # truncate each output at the earliest eos marker
907 | for output in gen_strs:
908 | min_index = 10000
909 | for eos in self.eos:
910 | if eos in output:
911 | min_index = min(min_index, output.index(eos))
912 | outputs.append(output[:min_index])
913 | return outputs
914 |
915 |
916 | def make_model(name: str, batch_size: int = 1, temperature: float = 0.8):
917 | if name == "claude-3":
918 | return AnthropicDecoder(
919 | batch_size=batch_size,
920 | name="claude",
921 | temperature=temperature,
922 | model_name="claude-3-opus-20240229",
923 | conversational=True,
924 | )
925 | elif name == "claude-3-haiku": # cheaper model
926 | return AnthropicDecoder(
927 | batch_size=batch_size,
928 | name="claude",
929 | temperature=temperature,
930 | model_name="claude-3-haiku-20240307",
931 | conversational=True,
932 | )
933 | elif name == "claude-2":
934 | return AnthropicDecoder(
935 | batch_size=batch_size,
936 | name="claude",
937 | temperature=temperature,
938 | model_name="claude-2.1",
939 | conversational=True,
940 | )
941 | elif name == "gemini-pro":
942 | return GeminiChatDecoder(
943 | batch_size=batch_size,
944 | name="gemini-pro",
945 | temperature=temperature,
946 | model_name="models/gemini-pro",
947 | conversational=True,
948 | )
949 | elif name == "palm":
950 | return PalmDecoder(
951 | batch_size=batch_size,
952 | name="palm",
953 | temperature=temperature,
954 | model_name="models/text-bison-001",
955 | conversational=True,
956 | )
957 | elif name == "chatgpt":
958 | return OpenAIChatDecoder(
959 | batch_size=batch_size,
960 | name="ChatGPT",
961 | temperature=temperature,
962 | model_name="gpt-3.5-turbo",
963 | conversational=True,
964 | )
965 | elif name == "gpt-4-turbo":
966 | return OpenAIChatDecoder(
967 | batch_size=batch_size,
968 | name="GPT4",
969 | temperature=temperature,
970 | model_name="gpt-4-turbo-preview",
971 | conversational=True,
972 | )
973 | elif name in ["gpt-4", "gpt-4-1106-preview"]:
974 | return OpenAIChatDecoder(
975 | batch_size=batch_size,
976 | name="GPT4",
977 | temperature=temperature,
978 | model_name=name,
979 | conversational=True,
980 | )
981 | elif name.startswith("starcoder2"):
982 | import re
983 |
984 | pattern = re.compile(r"starcoder2-(\d+)b")
985 | matches = pattern.findall(name)
986 | nb = int(matches[0])
987 | assert float(nb) > 0
988 | return StarCoder2(
989 | batch_size=batch_size,
990 | name=f"bigcode/{name}",
991 | temperature=temperature,
992 | )
993 | elif name.startswith("starcoder"):
994 | return StarCoderInfill(
995 | batch_size=batch_size, name=f"bigcode/{name}", temperature=temperature
996 | )
997 | elif name.startswith("code-llama-"):
998 | import re
999 |
1000 | pattern = re.compile(r"code-llama-(\d+\.?\d*)b(.*)")
1001 | matches = pattern.findall(name)[0]
1002 | nb = matches[0]
1003 | assert float(nb) > 0
1004 |
1005 | if "instruct" in name:
1006 | if float(nb) < 69: # nice
1007 | return CodeLlamaInstructSmall(
1008 | batch_size=batch_size,
1009 | name=f"codellama/CodeLlama-{nb}b-Instruct-hf",
1010 | temperature=temperature,
1011 | )
1012 | else:
1013 | return CodeLlamaInstructLarge(
1014 | batch_size=batch_size,
1015 | name=f"codellama/CodeLlama-{nb}b-Instruct-hf",
1016 | temperature=temperature,
1017 | )
1018 | elif "python" in name:
1019 | return HFTorchDecoder(
1020 | batch_size=batch_size,
1021 | name=f"codellama/CodeLlama-{nb}b-Python-hf",
1022 | temperature=temperature,
1023 | )
1024 | else:
1025 | return VLlmDecoder(
1026 | batch_size=batch_size,
1027 | name=f"codellama/CodeLlama-{nb}b-hf",
1028 | temperature=temperature,
1029 | )
1030 | elif name.startswith("deepseek-coder"):
1031 | import re
1032 |
1033 | # format deepseek-coder-{nb}b*
1034 | pattern = re.compile(r"deepseek-coder-(\d+\.?\d*)b(.*)")
1035 | matches = pattern.findall(name)[0]
1036 | nb = matches[0]
1037 | assert float(nb) > 0
1038 |
1039 | if "instruct" in name:
1040 | return DeepSeekInstruct(
1041 | batch_size=batch_size,
1042 | name=f"deepseek-ai/{name}",
1043 | temperature=temperature,
1044 | conversational=True,
1045 | )
1046 | else:
1047 | return HFTorchDecoder(
1048 | batch_size=batch_size,
1049 | name=f"deepseek-ai/deepseek-coder-{nb}b-base",
1050 | temperature=temperature,
1051 | )
1052 | elif name == "magicoder-s-ds-6.7b":
1053 | return MagicCoderInstruct(
1054 | batch_size=batch_size,
1055 |             name="ise-uiuc/Magicoder-S-DS-6.7B",
1056 | temperature=temperature,
1057 | conversational=True,
1058 | )
1059 | elif name == "magicoder-s-cl-7b":
1060 | return MagicCoderInstruct(
1061 | batch_size=batch_size,
1062 |             name="ise-uiuc/Magicoder-S-CL-7B",
1063 | temperature=temperature,
1064 | conversational=True,
1065 | )
1066 | elif name.startswith("wizardcoder-34b"):
1067 | return WizardCoderDecoder(
1068 | batch_size=batch_size,
1069 |             name="WizardLM/WizardCoder-Python-34B-V1.0",
1070 | temperature=temperature,
1071 | conversational=True,
1072 | )
1073 | elif name.startswith("wizardcoder-33b-1.1"):
1074 | return WizardCoderDecoder(
1075 | batch_size=batch_size,
1076 |             name="WizardLM/WizardCoder-33B-V1.1",
1077 | temperature=temperature,
1078 | conversational=True,
1079 | )
1080 | elif name == "phind-code-llama-34b-v2":
1081 | return HFTorchDecoder(
1082 | batch_size=batch_size,
1083 | name="Phind/Phind-CodeLlama-34B-v2",
1084 | temperature=temperature,
1085 | )
1086 | elif name.startswith("mistral-7b"):
1087 | if "instruct" in name:
1088 | if name.endswith("-v02"):
1089 | return MistralInstruct(
1090 | batch_size=batch_size,
1091 | name="mistralai/Mistral-7B-Instruct-v0.2",
1092 | temperature=temperature,
1093 | conversational=True,
1094 | )
1095 | else:
1096 | return MistralInstruct(
1097 | batch_size=batch_size,
1098 | name="mistralai/Mistral-7B-Instruct-v0.1",
1099 | temperature=temperature,
1100 | conversational=True,
1101 | )
1102 | else:
1103 | return HFTorchDecoder(
1104 | batch_size=batch_size,
1105 | name="mistralai/Mistral-7B-v0.1",
1106 | temperature=temperature,
1107 | )
1108 | elif name.startswith("mixtral-8x7b"):
1109 | if "instruct" in name:
1110 | return MixtralSPMXInstruct(
1111 | batch_size=batch_size,
1112 | name="mistralai/Mixtral-8x7B-Instruct-v0.1",
1113 | temperature=temperature,
1114 | conversational=True,
1115 | )
1116 | else:
1117 | return HFTorchDecoder(
1118 | batch_size=batch_size,
1119 | name="mistralai/Mixtral-8x7B-v0.1",
1120 | temperature=temperature,
1121 | )
1122 | elif name == "stable-code-3b":
1123 | return HFTorchDecoder(
1124 | batch_size=batch_size,
1125 | name="stabilityai/stable-code-3b",
1126 | temperature=temperature,
1127 | )
1128 | elif name == "speechless-codellama-34b":
1129 | return Alpaca(
1130 | batch_size=batch_size,
1131 | name="uukuguy/speechless-codellama-34b-v2.0",
1132 | temperature=temperature,
1133 | )
1134 | elif name == "openchat":
1135 | return OpenChat(
1136 | batch_size=batch_size,
1137 | name="openchat/openchat-3.5-0106",
1138 | temperature=temperature,
1139 | )
1140 | elif name.startswith("code-millenials-34b"):
1141 | return Alpaca(
1142 | batch_size=batch_size,
1143 | name="budecosystem/code-millenials-34b",
1144 | temperature=temperature,
1145 | conversational=True,
1146 | )
1147 | elif name == "phi-2":
1148 | return VLlmDecoder(
1149 | batch_size=batch_size,
1150 | name="microsoft/phi-2",
1151 | temperature=temperature,
1152 | )
1153 | elif name.startswith("qwen"):
1154 |         # format qwen-{nb}b*
1155 | import re
1156 |
1157 | pattern = re.compile(r"qwen-(\d+\.?\d*)b(.*)")
1158 | matches = pattern.findall(name)[0]
1159 | nb = matches[0]
1160 | assert float(nb) > 0
1161 |
1162 | if "1.5" in name:
1163 | return QwenInstruct(
1164 | batch_size=batch_size,
1165 | name=f"Qwen/Qwen1.5-{nb}B-Chat",
1166 | temperature=temperature,
1167 | conversational=True,
1168 | )
1169 | else:
1170 | return QwenInstruct(
1171 | batch_size=batch_size,
1172 | name=f"Qwen/Qwen-{nb}B-Chat",
1173 | temperature=temperature,
1174 | conversational=True,
1175 | )
1176 | elif name.startswith("xwincoder-34b"):
1177 | return XwinCoder(
1178 | batch_size=batch_size, name="Xwin-LM/XwinCoder-34B", temperature=temperature
1179 | )
1180 | elif name.startswith("gemma"):
1181 | import re
1182 |
1183 | pattern = re.compile(r"gemma-(\d+\.?\d*)b(.*)")
1184 | matches = pattern.findall(name)[0]
1185 | nb = matches[0]
1186 | assert float(nb) > 0
1187 | if "instruct" in name:
1188 | return GemmaInstruct(
1189 | batch_size=batch_size,
1190 | name=f"google/gemma-{nb}b-it",
1191 | temperature=temperature,
1192 | conversational=True,
1193 | )
1194 | else:
1195 | return HFTorchDecoder(
1196 | batch_size=batch_size,
1197 | name=f"google/gemma-{nb}b",
1198 | temperature=temperature,
1199 | )
1200 |
1201 | raise ValueError(f"Invalid model name: {name}")
1202 |
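# Minimal usage sketch (the model name, prompt, and sampling settings below are
# placeholders, not values prescribed by this module): build a decoder by name and
# request a single greedy completion. API-backed decoders such as "chatgpt" also
# need the corresponding API key in the environment.
def _make_model_example():
    decoder = make_model("chatgpt", batch_size=1, temperature=0)
    [completion] = decoder.codegen(
        "def add(a: int, b: int) -> int:\n", do_sample=False, num_samples=1
    )
    return completion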
--------------------------------------------------------------------------------
/evoeval/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 | from evoeval._version import __version__, __version_tuple__
3 | except ImportError:
4 | __version__ = "local-dev"
5 |
--------------------------------------------------------------------------------
/evoeval/data.py:
--------------------------------------------------------------------------------
1 | # largely adapted from EvalPlus
2 | import gzip
3 | import hashlib
4 | import json
5 | import os
6 | from typing import Dict, Iterable
7 |
8 | import tempdir
9 | import wget
10 | from appdirs import user_cache_dir
11 |
12 | CACHE_DIR = user_cache_dir("evoeval")
13 |
14 |
15 | EVOEVAL_VERSION = "v0.1.0"
16 | EVOEVAL_OVERRIDE_PATH = os.environ.get("EVOEVAL_OVERRIDE_PATH", None)
17 |
18 |
19 | def write_jsonl(
20 | filename: str, data: Iterable[Dict], append: bool = False, drop_builtin: bool = True
21 | ):
22 | """
23 | Writes an iterable of dictionaries to jsonl
24 | """
25 | if append:
26 | mode = "ab"
27 | else:
28 | mode = "wb"
29 | filename = os.path.expanduser(filename)
30 | if filename.endswith(".gz"):
31 | with open(filename, mode) as fp:
32 | with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp:
33 | for x in data:
34 | if drop_builtin:
35 | x = {k: v for k, v in x.items() if not k.startswith("_")}
36 | gzfp.write((json.dumps(x) + "\n").encode("utf-8"))
37 | else:
38 | with open(filename, mode) as fp:
39 | for x in data:
40 | if drop_builtin:
41 | x = {k: v for k, v in x.items() if not k.startswith("_")}
42 | fp.write((json.dumps(x) + "\n").encode("utf-8"))
43 |
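# Minimal usage sketch (paths and record contents below are placeholders): keys starting
# with an underscore are dropped because drop_builtin defaults to True, and a ".gz"
# suffix switches to gzip-compressed output.
def _write_jsonl_example():
    rows = [{"task_id": "EvoEval/0", "solution": "pass", "_identifier": "tmp"}]
    write_jsonl("/tmp/samples.jsonl", rows)
    write_jsonl("/tmp/samples.jsonl.gz", rows)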
44 |
45 | def make_cache(gzip_url, cache_path, dataset_name):
46 |     # Check if the dataset file already exists in CACHE_DIR
47 | if not os.path.exists(cache_path):
48 |         # Download the dataset and store it as jsonl
49 | print(f"Downloading dataset from {gzip_url}")
50 | with tempdir.TempDir() as tmpdir:
51 | # TODO need to test this.
52 | evoeval_gz_path = os.path.join(tmpdir, f"{dataset_name}-data.jsonl.gz")
53 | wget.download(gzip_url, evoeval_gz_path)
54 |
55 | with gzip.open(evoeval_gz_path, "rb") as f:
56 | evoeval = f.read().decode("utf-8")
57 |
58 | # create CACHE_DIR if not exists
59 | if not os.path.exists(CACHE_DIR):
60 | os.makedirs(CACHE_DIR)
61 |
62 |             # Write the downloaded dataset file to CACHE_DIR
63 | with open(cache_path, "w") as f:
64 | f.write(evoeval)
65 |
66 |
67 | def get_dataset_metadata(name: str, version: str):
68 | assert name in [
69 | "EvoEval_difficult",
70 | "EvoEval_creative",
71 | "EvoEval_subtle",
72 | "EvoEval_combine",
73 | "EvoEval_tool_use",
74 | "EvoEval_verbose",
75 | "EvoEval_concise",
76 | ], f"Unknown/unsupported dataset: {name}"
77 | url = f"https://github.com/evo-eval/evoeval_release/releases/download/{version}/{name}.jsonl.gz"
78 | cache_path = os.path.join(CACHE_DIR, f"{name}-{version}.jsonl")
79 | return url, cache_path
80 |
81 |
82 | def _ready_evo_eval_path(dataset_name: str) -> str:
83 | if EVOEVAL_OVERRIDE_PATH is not None:
84 | # create CACHE_DIR if not exists
85 | if not os.path.exists(CACHE_DIR):
86 | os.makedirs(CACHE_DIR)
87 | return f"{EVOEVAL_OVERRIDE_PATH}/{dataset_name}.jsonl"
88 |
89 | url, cache_path = get_dataset_metadata(dataset_name, EVOEVAL_VERSION)
90 | make_cache(url, cache_path, dataset_name)
91 |
92 | return cache_path
93 |
94 |
95 | def get_evo_eval_plus_hash(dataset_name: str) -> str:
96 | evoeval_path = _ready_evo_eval_path(dataset_name)
97 | with open(evoeval_path, "rb") as f:
98 | evoeval = f.read()
99 | return hashlib.md5(evoeval).hexdigest()
100 |
101 |
102 | def get_evo_eval(dataset_name: str):
103 | evoeval_path = _ready_evo_eval_path(dataset_name)
104 | with open(evoeval_path, "r") as f:
105 | data = {json.loads(task)["task_id"]: json.loads(task) for task in f.readlines()}
106 |
107 | return data
108 |
--------------------------------------------------------------------------------
/evoeval/eval_test/__init__.py:
--------------------------------------------------------------------------------
1 | # largely adapted from https://github.com/evalplus/evalplus
2 |
3 | import itertools
4 | import json
5 | import multiprocessing
6 | import os
7 | import time
8 | from enum import IntEnum, auto
9 | from multiprocessing import Array, Value
10 | from typing import Any, Dict, List, Tuple, Union
11 |
12 | import numpy as np
13 | from evalplus.eval.utils import (
14 | create_tempdir,
15 | reliability_guard,
16 | swallow_io,
17 | time_limit,
18 | )
19 |
20 | from evoeval.eval_test._creative_special_oracle import (
21 | _check_maze,
22 | _check_path,
23 | _check_product,
24 | )
25 | from evoeval.eval_test._difficult_special_oracle import (
26 | _check_difficult_poly,
27 | _check_insensitive_palindrome,
28 | )
29 | from evoeval.eval_test._he_special_oracle import _poly
30 | from evoeval.eval_test._subtle_special_oracle import _check_poly
31 |
32 |
33 | class CustomEncoder(json.JSONEncoder):
34 | def default(self, obj):
35 | if isinstance(obj, set):
36 | return list(obj)
37 |         if isinstance(obj, object):  # fallback: stringify any remaining object
38 | return str(obj)
39 | return json.JSONEncoder.default(self, obj)
40 |
41 |
42 | def compatible_eval_result(results: Dict) -> Dict:
43 | # compatibility
44 | for task_results in results["eval"].values():
45 | # update the "files" field to "nfiles"
46 | if "files" in task_results and "nfiles" not in task_results:
47 | task_results["nfiles"] = len(task_results.pop("files"))
48 | return results
49 |
50 |
51 | # unbiased estimator from https://github.com/openai/human-eval
52 | def estimate_pass_at_k(
53 | num_samples: Union[int, List[int], np.ndarray],
54 | num_correct: Union[List[int], np.ndarray],
55 | k: int,
56 | ) -> np.ndarray:
57 | """
58 | Estimates pass@k of each problem and returns them in an array.
59 | """
60 |
61 | def estimator(n: int, c: int, k: int) -> float:
62 | """
63 | Calculates 1 - comb(n - c, k) / comb(n, k).
64 | """
65 | if n - c < k:
66 | return 1.0
67 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
68 |
69 | if isinstance(num_samples, int):
70 | num_samples_it = itertools.repeat(num_samples, len(num_correct))
71 | else:
72 | assert len(num_samples) == len(num_correct)
73 | num_samples_it = iter(num_samples)
74 |
75 | return np.array(
76 | [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]
77 | )
78 |
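# A small worked example of the estimator above (illustrative values only): with n = 10
# samples of which c = 3 pass, pass@1 = 1 - C(7, 1)/C(10, 1) = 0.3 and
# pass@5 = 1 - C(7, 5)/C(10, 5) = 11/12 ≈ 0.917.
def _pass_at_k_example():
    assert np.isclose(estimate_pass_at_k(10, [3], 1)[0], 0.3)
    assert np.isclose(estimate_pass_at_k(10, [3], 5)[0], 11 / 12)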
79 |
80 | PASS = "pass"
81 | FAIL = "fail"
82 | TIMEOUT = "timeout"
83 |
84 | _SUCCESS = 0
85 | _FAILED = 1
86 | _TIMEOUT = 2
87 | _UNKNOWN = 3
88 |
89 | _mapping = {_SUCCESS: PASS, _FAILED: FAIL, _TIMEOUT: TIMEOUT, _UNKNOWN: None}
90 |
91 |
92 | def is_floats(x) -> bool:
93 | # check if it is float; List[float]; Tuple[float]
94 | # TODO: search for any close floats? (other data structures)
95 | if isinstance(x, float):
96 | return True
97 | if isinstance(x, (list, tuple)):
98 | return all(isinstance(i, float) for i in x)
99 | if isinstance(x, np.ndarray):
100 | return x.dtype == np.float64 or x.dtype == np.float32
101 | return False
102 |
103 |
104 | class DataType(IntEnum):
105 | Float = auto()
106 | Bool = auto()
107 | Int = auto()
108 | Str = auto()
109 | Null = auto()
110 | Tuple = auto()
111 | List = auto()
112 | Dict = auto()
113 | Set = auto()
114 | Type = auto()
115 | Unknown = auto()
116 |
117 |
118 | def get_type(x):
119 | if x is None:
120 | return DataType.Null
121 | elif isinstance(x, bool):
122 | return DataType.Bool
123 | elif isinstance(x, int):
124 | return DataType.Int
125 | elif isinstance(x, str):
126 | return DataType.Str
127 | elif is_floats(x):
128 | return DataType.Float
129 | elif isinstance(x, tuple):
130 | return DataType.Tuple
131 | elif isinstance(x, list):
132 | return DataType.List
133 | elif isinstance(x, dict):
134 | return DataType.Dict
135 | elif isinstance(x, set):
136 | return DataType.Set
137 | elif isinstance(x, type):
138 | return DataType.Type
139 | else:
140 | return DataType.Unknown
141 |
142 |
143 | def is_equal(x, y) -> tuple[bool, str]:
144 | x_type, y_type = get_type(x), get_type(y)
145 | if x_type != y_type:
146 | return False, "Type mismatch: {} vs {}".format(str(x_type), str(y_type))
147 |
148 | if x_type in [
149 | DataType.Int,
150 | DataType.Bool,
151 | DataType.Null,
152 | DataType.Str,
153 | DataType.Set,
154 | DataType.Type,
155 | ]:
156 | if x == y:
157 | return True, None
158 | try:
159 |                 error_msg = "INT/BOOL/NULL/STR/SET/TYPE value mismatch: {} vs {}".format(
160 | repr(x)[:300], repr(y)[:300]
161 | )
162 | except:
163 | error_msg = "Value mismatch: too large for display"
164 | return False, error_msg
165 | elif x_type == DataType.Float:
166 | if np.allclose(x, y, equal_nan=True, atol=1e-6): # guard against nan
167 | return True, None
168 | else:
169 | return False, "FLOAT Value mismatch: {} vs {}".format(x, y)
170 | elif x_type in [DataType.List, DataType.Tuple]:
171 | if len(x) != len(y):
172 | return False, "Length mismatch: {} vs {}".format(len(x), len(y))
173 | for i in range(len(x)):
174 | equal, msg = is_equal(x[i], y[i])
175 | if not equal:
176 | return False, msg
177 | return True, None
178 | elif x_type == DataType.Dict:
179 | if len(x) != len(y):
180 | return False, "Length mismatch: {} vs {}".format(len(x), len(y))
181 | for k, v in x.items():
182 | if k not in y:
183 | return False, "DICT Value mismatch: key {} in {} but not in {}".format(
184 | k, x, y
185 | )
186 | equal, msg = is_equal(v, y[k])
187 | if not equal:
188 | return False, msg
189 | return True, None
190 | else:
191 | # from IPython import embed
192 | # embed()
193 | try:
194 | if x == y: # e.g., object comparison
195 | return True, None
196 | else:
197 | return False, "ELSE Value mismatch: {} vs {}".format(x, y)
198 | except:
199 | return False, "Unsupported type: {} <-- {}".format(x_type, type(x))
200 |
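# A small illustrative sketch of the comparison above (made-up values): floats are
# compared with np.allclose, containers are compared element-wise, and the returned
# message describes the first mismatch found.
def _is_equal_example():
    ok, _ = is_equal([1, 2.0000001, "a"], [1, 2.0, "a"])
    assert ok  # float difference is within atol=1e-6
    ok, msg = is_equal({"k": (1, 2)}, {"k": (1, 3)})
    assert not ok and "mismatch" in msg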
201 |
202 | def unsafe_execute(
203 | dataset: str,
204 | entry_point: str,
205 | task_id: str,
206 | code: str,
207 | inputs,
208 | expected: List,
209 | time_limits,
210 | atol,
211 | fast_check,
212 | stat: Value,
213 | details: Array,
214 | progress: Value,
215 | ):
216 | with create_tempdir():
217 | # These system calls are needed when cleaning up tempdir.
218 | import os
219 | import shutil
220 |
221 | rmtree = shutil.rmtree
222 | rmdir = os.rmdir
223 | chdir = os.chdir
224 | # Disable functionalities that can make destructive changes to the test.
225 | # allow only 4GB memory usage
226 | maximum_memory_bytes = 4 * 1024 * 1024 * 1024
227 | reliability_guard(maximum_memory_bytes=maximum_memory_bytes)
228 | exec_globals = {}
229 | try:
230 | with swallow_io():
231 | exec(code, exec_globals)
232 | fn = exec_globals[entry_point]
233 | for i, inp in enumerate(inputs):
234 | try:
235 | with time_limit(time_limits[i]):
236 | out = fn(*inp)
237 | exp = expected[i]
238 |                         # TODO: for special oracles, think about how to handle the case where
239 |                         # the function has side effects and mutates the input ...
240 |                         # this is especially true for some of the grid-checking tasks
241 | # ================================================ #
242 | # ============== special oracles ================= #
243 | # use task_id and dataset to determine the oracle
244 | if (
245 | dataset == "humaneval"
246 | or "verbose" in dataset
247 | or "concise" in dataset
248 | ) and task_id == "HumanEval/32":
249 | assert abs(_poly(*out, inp)) <= 1e-6
250 |
251 | # =================== Difficult ================== #
252 | elif "difficult" in dataset and task_id == "EvoEval/10":
253 | _check_insensitive_palindrome(out, *inp, exp)
254 | elif "difficult" in dataset and task_id == "EvoEval/32":
255 | _check_difficult_poly(*inp, out, exp)
256 |
257 | # =================== Creative =================== #
258 | elif "creative" in dataset and task_id == "EvoEval/26":
259 | _check_maze(*inp, out, exp)
260 | elif "creative" in dataset and task_id == "EvoEval/30":
261 | _check_path(*inp, out, exp)
262 | elif "creative" in dataset and task_id == "EvoEval/69":
263 | _check_product(*inp, out, exp)
264 |
265 | # =================== Subtle ===================== #
266 | elif "subtle" in dataset and task_id == "EvoEval/32":
267 | _check_poly(*inp, out)
268 |
269 | # =================== Combine ==================== #
270 |
271 | # =================== Tool Using ================= #
272 |
273 | # ============== special oracles ================= #
274 | # ================================================ #
275 | else:
276 | exact_match, _ = is_equal(exp, out)
277 | assert exact_match
278 | except BaseException:
279 | details[i] = False
280 | progress.value += 1
281 | if fast_check:
282 | raise
283 | continue
284 |
285 | details[i] = True
286 | progress.value += 1
287 | stat.value = _SUCCESS
288 | except BaseException:
289 | stat.value = _FAILED
290 | # Needed for cleaning up.
291 | shutil.rmtree = rmtree
292 | os.rmdir = rmdir
293 | os.chdir = chdir
294 |
295 |
296 | def untrusted_check(
297 | dataset: str,
298 | code: str,
299 | inputs: List[Any],
300 | entry_point: str,
301 | task_id: str,
302 | expected,
303 | atol,
304 | ref_time: List[float],
305 | fast_check: bool = False,
306 | min_time_limit: float = 0.1,
307 | gt_time_limit_factor: float = 2.0,
308 | ) -> Tuple[str, np.ndarray]:
309 | time_limits = [max(min_time_limit, gt_time_limit_factor * t) for t in ref_time]
310 |     timeout = min(int(os.getenv("EVOEVAL_TIMEOUT_PER_TASK", 60)), sum(time_limits)) + 1  # env values are strings
311 | if not fast_check:
312 | timeout += 1 # extra time for data collection
313 |
314 | # shared memory objects
315 | progress = Value("i", 0)
316 | stat = Value("i", _UNKNOWN)
317 | details = Array("b", [False for _ in range(len(inputs))])
318 | p = multiprocessing.Process(
319 | target=unsafe_execute,
320 | args=(
321 | dataset,
322 | entry_point,
323 | task_id,
324 | code,
325 | inputs,
326 | expected,
327 | time_limits,
328 | atol,
329 | fast_check,
330 | # return values
331 | stat,
332 | details,
333 | progress,
334 | ),
335 | )
336 | p.start()
337 | p.join(timeout=timeout + 1)
338 | if p.is_alive():
339 | p.terminate()
340 | time.sleep(0.1)
341 | if p.is_alive():
342 | p.kill()
343 | time.sleep(0.1)
344 |
345 | stat = _mapping[stat.value]
346 | details = details[: progress.value]
347 |
348 | if not stat:
349 | stat = TIMEOUT
350 |
351 | if stat == PASS:
352 | if len(details) != len(inputs) or not all(details):
353 | stat = FAIL
354 |
355 | return stat, details
356 |
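# Minimal usage sketch (all values below are made up): run one candidate solution in a
# sandboxed subprocess against two inputs and their expected outputs. Real callers pass
# problem data loaded from the benchmark files; on platforms that spawn subprocesses this
# should be invoked from under a __main__ guard.
def _untrusted_check_example():
    code = "def add(a, b):\n    return a + b\n"
    stat, details = untrusted_check(
        dataset="EvoEval_difficult",
        code=code,
        inputs=[[1, 2], [3, 4]],
        entry_point="add",
        task_id="EvoEval/0",
        expected=[3, 7],
        atol=0,
        ref_time=[0.05, 0.05],
    )
    return stat, list(details)  # expected: ("pass", [1, 1])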
357 |
358 | def evaluate_files(
359 | dataset: str,
360 | files: List[str],
361 | inputs: List,
362 | expected: List,
363 | entry_point: str,
364 | atol: float,
365 | ref_time: List[float],
366 | fast_check: bool = False,
367 | min_time_limit: float = 0.1,
368 | gt_time_limit_factor: float = 2.0,
369 | ) -> List[Tuple[str, List[bool]]]:
370 | ret = []
371 | # sort files by the id in name (i.e., "../n.py")
372 | files = sorted(files, key=lambda x: int(x.split("/")[-1].split(".")[0]))
373 | for file in files:
374 | code = open(file, "r").read()
375 | stat, det = untrusted_check(
376 | dataset,
377 | code,
378 | inputs,
379 | entry_point,
380 | expected=expected,
381 | atol=atol,
382 | ref_time=ref_time,
383 | fast_check=fast_check,
384 | min_time_limit=min_time_limit,
385 | gt_time_limit_factor=gt_time_limit_factor,
386 | )
387 | ret.append((stat, det.tolist()))
388 | return ret
389 |
--------------------------------------------------------------------------------
/evoeval/eval_test/_creative_special_oracle.py:
--------------------------------------------------------------------------------
1 | # oracle for EvoEval/51 in creative
2 | def _check_maze(maze, start, end, solution_path, gt_path):
3 | if not gt_path:
4 | assert solution_path == []
5 | else:
6 | # check the path according to solution reaches from start to end
7 | move_to_direction = {
8 | "right": (0, 1),
9 | "left": (0, -1),
10 | "up": (-1, 0),
11 | "down": (1, 0),
12 | }
13 | current_position = start
14 | for move in solution_path:
15 | current_position = (
16 | current_position[0] + move_to_direction[move][0],
17 | current_position[1] + move_to_direction[move][1],
18 | )
19 | assert maze[current_position[0]][current_position[1]] != 1
20 |
21 | assert current_position == end
22 |
23 |
24 | # oracle for EvoEval/55 in creative
25 | def _check_path(maze, start, end, solution_path, gt_path):
26 | if not gt_path:
27 | assert solution_path == []
28 | else:
29 | # check the path according to solution reaches from start to end
30 | assert solution_path[0] == start
31 | assert solution_path[-1] == end
32 |         assert maze[start[0]][start[1]] != 0  # start cell is not a wall
33 | for i in range(1, len(solution_path)):
34 | prev_x, prev_y = solution_path[i - 1]
35 | curr_x, curr_y = solution_path[i]
36 | assert maze[curr_x][curr_y] != 0 # not a wall
37 | assert abs(curr_x - prev_x) + abs(curr_y - prev_y) == 1 # adjacent
38 |
39 |
40 | # oracle for EvoEval/110 in creative
41 | def _check_product(arr, target, solution, gt):
42 | if gt == "No magic today":
43 | assert gt == solution
44 | else:
45 | assert isinstance(solution, tuple)
46 | i, j = solution
47 | assert 0 <= i < j < len(arr) # don't allow negative indexing
48 | assert arr[i] * arr[j] == target
49 |
--------------------------------------------------------------------------------
/evoeval/eval_test/_difficult_special_oracle.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 |
4 | # oracle for EvoEval/10 in difficult
5 | def _check_insensitive_palindrome(check_palindrome, string, gt_palindrome):
6 | assert len(check_palindrome) == len(gt_palindrome)
7 | assert check_palindrome.startswith(string)
8 | assert check_palindrome.lower() == check_palindrome[::-1].lower()
9 |
10 |
11 | def _poly(xs: list, x: float):
12 | """
13 | Evaluates polynomial with coefficients xs at point x.
14 |     return xs[0] + xs[1] * x + xs[2] * x^2 + ... + xs[n] * x^n
15 | """
16 | return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)])
17 |
18 |
19 | # oracle for EvoEval/32 in difficult
20 | def _check_difficult_poly(xs, interval, solution, gt_solution):
21 | if gt_solution is None:
22 | assert solution is None
23 | return
24 |
25 | start, end = interval
26 | assert start <= solution <= end
27 | assert abs(_poly(xs, solution)) <= 2e-2
28 |
--------------------------------------------------------------------------------
/evoeval/eval_test/_he_special_oracle.py:
--------------------------------------------------------------------------------
1 | # Adapted from EvalPlus
2 | import math
3 |
4 |
5 | # oracle for HumanEval/032
6 | def _poly(xs: list, x: float):
7 | """
8 | Evaluates polynomial with coefficients xs at point x.
9 |     return xs[0] + xs[1] * x + xs[2] * x^2 + ... + xs[n] * x^n
10 | """
11 | return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)])
12 |
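# Worked example (illustrative): with coefficients [1, 2, 3] the polynomial is
# 1 + 2*x + 3*x**2, so _poly([1, 2, 3], 2) == 1 + 4 + 12 == 17.
def _poly_example():
    assert _poly([1, 2, 3], 2) == 17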
--------------------------------------------------------------------------------
/evoeval/eval_test/_subtle_special_oracle.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 |
4 | # oracle for EvoEval/32 in subtle
5 | def _poly(xs: list, x: float):
6 | """
7 | Evaluates polynomial with coefficients xs at point x.
8 |     return xs[0] + xs[1] * x + xs[2] * x^2 + ... + xs[n] * x^n
9 | """
10 | return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)])
11 |
12 |
13 | def _check_poly(xs, solution):
14 | full_xs = [xs[0], xs[1]]
15 | for i in range(2, len(xs)):
16 | full_xs.extend([0, xs[i]])
17 | assert abs(_poly(full_xs, solution)) <= 1e-6
18 |
--------------------------------------------------------------------------------
/evoeval/evaluate.py:
--------------------------------------------------------------------------------
1 | # Adapted from https://github.com/evalplus/evalplus
2 | import argparse
3 | import contextlib
4 | import json
5 | import multiprocessing
6 | import os
7 | import pickle
8 | import threading
9 | import time
10 | from collections import Counter, defaultdict
11 | from concurrent.futures import ProcessPoolExecutor, as_completed
12 | from typing import Any, Dict, List, Tuple
13 | from warnings import warn
14 |
15 | import numpy as np
16 | from evalplus.data import get_human_eval_plus
17 | from evalplus.data.utils import load_solutions
18 | from evalplus.gen.util import trusted_exec
19 | from termcolor import cprint
20 | from tqdm import tqdm
21 |
22 | from evoeval.data import CACHE_DIR, get_evo_eval, get_evo_eval_plus_hash
23 | from evoeval.eval_test import (
24 | FAIL,
25 | PASS,
26 | CustomEncoder,
27 | compatible_eval_result,
28 | estimate_pass_at_k,
29 | untrusted_check,
30 | )
31 |
32 | # 1st item: the status
33 | # 2nd item (optional): the detailed pass/fail boolean for each input
34 | Result = Tuple[str, List[bool]]
35 |
36 |
37 | def get_groundtruth(
38 | problems, hashcode, use_raw_inputs=False, compute_plus_inputs=False
39 | ) -> Dict[str, Any]:
40 | if hashcode is not None:
41 | cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl")
42 | if os.path.exists(cache_file):
43 |             print(f"Loading ground-truth from {cache_file}")
44 | with open(cache_file, "rb") as f:
45 | return pickle.load(f)
46 |
47 | print("Computing expected output...")
48 | tbegin = time.time()
49 | expected_output = {}
50 | for task_id, problem in problems.items():
51 | oracle = {}
52 | with contextlib.redirect_stdout(None):
53 | oracle["base"], oracle["base_time"] = trusted_exec(
54 | problem["prompt"] + "\n" + problem["canonical_solution"],
55 | problem["base_input"]
56 | if use_raw_inputs
57 | else [
58 | eval(f"[{i}]") for i in problem["inputs"]
59 |             ],  # inputs are stored as strings since they can contain complex types; eval reconstructs them
60 | problem["entry_point"],
61 | record_time=True,
62 | output_not_none=False,
63 | )
64 | expected_output[task_id] = oracle
65 |
66 | if compute_plus_inputs:
67 | oracle["plus"], oracle["plus_time"] = trusted_exec(
68 | problem["prompt"] + "\n" + problem["canonical_solution"],
69 | problem["plus_input"], # assumption: we have plus_input
70 | problem["entry_point"],
71 | record_time=True,
72 | output_not_none=False,
73 | )
74 | expected_output[task_id] = oracle
75 |
76 | # print(expected_output)
77 | print(f"Expected outputs computed in {time.time() - tbegin:.2f}s")
78 |
79 | if hashcode is not None:
80 | with open(cache_file, "wb") as f:
81 | pickle.dump(expected_output, f)
82 |
83 | return expected_output
84 |
85 |
86 | def check_correctness(
87 | dataset: str,
88 | completion_id: int,
89 | problem: Dict[str, Any],
90 | solution: str,
91 | expected_output: Dict[str, List],
92 | fast_check=False,
93 | identifier=None,
94 | min_time_limit: float = 0.1,
95 | gt_time_limit_factor: float = 2.0,
96 | use_raw_inputs=False,
97 | compute_plus_inputs=False,
98 | ) -> Dict[str, Result]: # {...}, "base" | "plus" -> (status, details)
99 |
100 | ret = {
101 | "completion_id": completion_id,
102 | "task_id": problem["task_id"],
103 | "_identifier": identifier,
104 | "solution": solution,
105 | }
106 |
107 | ret["result"] = untrusted_check(
108 | dataset,
109 | solution,
110 | problem["base_input"]
111 | if use_raw_inputs
112 | else [eval(f"[{i}]") for i in problem["inputs"]],
113 | problem["entry_point"],
114 | task_id=problem["task_id"],
115 | expected=expected_output["base"],
116 | atol=0, # TODO check
117 | ref_time=expected_output["base_time"],
118 | fast_check=fast_check,
119 | min_time_limit=min_time_limit,
120 | gt_time_limit_factor=gt_time_limit_factor,
121 | )
122 |
123 | if compute_plus_inputs:
124 | ret["plus"] = untrusted_check(
125 | dataset,
126 | solution,
127 | problem["plus_input"],
128 | problem["entry_point"],
129 | task_id=problem["task_id"],
130 | expected=expected_output["plus"],
131 | atol=0, # TODO check
132 | ref_time=expected_output["plus_time"],
133 | fast_check=fast_check,
134 | min_time_limit=min_time_limit,
135 | gt_time_limit_factor=gt_time_limit_factor,
136 | )
137 |
138 | return ret
139 |
140 |
141 | def evaluate(flags):
142 | if flags.parallel is None:
143 | n_workers = max(1, multiprocessing.cpu_count() // 2)
144 | else:
145 | n_workers = flags.parallel
146 |
147 | if os.path.isdir(flags.samples):
148 | result_path = os.path.join(flags.samples, "eval_results.json")
149 | else:
150 | assert flags.samples.endswith(".jsonl")
151 | result_path = flags.samples.replace(".jsonl", "_eval_results.json")
152 |
153 | compute_plus_inputs = False
154 |
155 | if os.path.isfile(result_path) and not flags.i_just_wanna_run:
156 |         print(f"Loading previous results from {result_path}")
157 | with open(result_path, "r") as f:
158 | results = json.load(f)
159 |
160 | results = compatible_eval_result(results)
161 | else:
162 | use_raw_inputs = False
163 | if flags.dataset == "humaneval":
164 | use_raw_inputs = True
165 | compute_plus_inputs = True
166 | problems = get_human_eval_plus()
167 | expected_output = get_groundtruth(
168 | problems,
169 | None,
170 | use_raw_inputs=use_raw_inputs,
171 | compute_plus_inputs=compute_plus_inputs,
172 | )
173 | elif "verbose" in flags.dataset or "concise" in flags.dataset:
174 | use_raw_inputs = True
175 | compute_plus_inputs = True
176 | problems = get_evo_eval(flags.dataset)
177 | expected_output = get_groundtruth(
178 | problems,
179 | None,
180 | use_raw_inputs=use_raw_inputs,
181 | compute_plus_inputs=compute_plus_inputs,
182 | )
183 | else:
184 | problems = get_evo_eval(flags.dataset)
185 | dataset_hash = get_evo_eval_plus_hash(flags.dataset)
186 | expected_output = get_groundtruth(
187 | problems,
188 | dataset_hash,
189 | use_raw_inputs=use_raw_inputs,
190 | compute_plus_inputs=compute_plus_inputs,
191 | )
192 |
193 | results = {
194 | "eval": {},
195 | }
196 |
197 | with ProcessPoolExecutor(max_workers=n_workers) as executor:
198 | futures = []
199 | completion_id = Counter()
200 | n_samples = 0
201 |             eval_results = defaultdict(list)  # task_id -> list of per-sample results
202 | remainings = set()
203 |
204 | print("Reading samples...")
205 | for sample in tqdm(load_solutions(flags.samples)):
206 | task_id = sample["task_id"]
207 | solution = (
208 | sample["solution"]
209 | if "solution" in sample
210 | else problems[task_id]["prompt"] + sample["completion"]
211 | )
212 | remainings.add(sample["_identifier"])
213 | args = (
214 | flags.dataset,
215 | completion_id[task_id],
216 | problems[task_id],
217 | solution,
218 | expected_output[task_id],
219 | not flags.test_details, # fast_check
220 | sample["_identifier"],
221 | flags.min_time_limit,
222 | flags.gt_time_limit_factor,
223 | use_raw_inputs,
224 | compute_plus_inputs,
225 | )
226 | futures.append(executor.submit(check_correctness, *args))
227 | completion_id[task_id] += 1
228 | n_samples += 1
229 |
230 | assert n_samples == len(remainings), "Missing problems in unfinished"
231 | assert len(completion_id) == len(problems), "Missing problems in samples"
232 |
233 | def stucking_checker():
234 | while remainings:
235 | last_size = len(remainings)
236 | time.sleep(20)
237 | if last_size != len(remainings) or len(remainings) == 0:
238 | continue
239 | # Potentially stuck
240 |                 warn("No samples have finished testing in the last 20s")
241 | warn(f"{len(remainings)} samples to be tested: {remainings}")
242 |
243 | threading.Thread(target=stucking_checker).start()
244 |
245 | for future in tqdm(as_completed(futures), total=n_samples):
246 | result = future.result()
247 | remainings.remove(result["_identifier"])
248 | eval_results[result["task_id"]].append(result)
249 |
250 | # sort the results for each problem by completion_id
251 | for task_id, task_results in eval_results.items():
252 | task_results.sort(key=lambda x: x["completion_id"])
253 | results["eval"][task_id] = []
254 | for res in task_results:
255 |
256 | def get_failed_tests(stat, details, inputs) -> List[Any]:
257 | if stat == PASS or not details:
258 | return []
259 |
260 | # if flags.test_details:
261 | return [inputs[i] for i in range(len(details)) if not details[i]]
262 |
263 | base_stat, base_details = res["result"]
264 | base_fail_tests = get_failed_tests(
265 | base_stat,
266 | base_details,
267 | problems[task_id]["base_input"]
268 | if use_raw_inputs
269 | else [eval(f"[{i}]") for i in problems[task_id]["inputs"]],
270 | )
271 |
272 | # initialize plus tests
273 | plus_stat = None
274 | plus_fail_tests = []
275 |
276 | # with plus tests
277 | if not flags.base_only and compute_plus_inputs:
278 | plus_stat, plus_details = res["plus"]
279 | plus_fail_tests = get_failed_tests(
280 | plus_stat, plus_details, problems[task_id]["plus_input"]
281 | )
282 |
283 | results["eval"][task_id].append(
284 | {
285 | "task_id": task_id,
286 | "solution": res["solution"],
287 | "base_status": base_stat,
288 | "plus_status": plus_stat,
289 | "base_fail_tests": base_fail_tests,
290 | "plus_fail_tests": plus_fail_tests,
291 | }
292 | )
293 |
294 | if os.path.isfile(result_path) and flags.i_just_wanna_run:
295 | decision = ""
296 | while decision.lower() not in ["y", "n"]:
297 | print(f"{result_path} already exists. Press [Y/N] to overwrite or exit...")
298 | decision = input()
299 |
300 | if decision.lower() == "y":
301 | # mv the file to a backup
302 | new_path = result_path + ".bak"
303 | while os.path.isfile(new_path):
304 | new_path += ".bak"
305 | os.rename(result_path, new_path)
306 | print(f"Backup {result_path} to {new_path}")
307 |
308 | if not os.path.isfile(result_path):
309 | with open(result_path, "w") as f:
310 | json.dump(
311 | results, f, cls=CustomEncoder
312 | ) # handle some unique cases where failure inputs are sets
313 |
314 | # Calculate pass@k.
315 | total = np.array([len(r) for r in results["eval"].values()])
316 | correct = []
317 | plus_correct = []
318 |
319 | for res in results["eval"].values():
320 | bc = sum([r["base_status"] == PASS for r in res])
321 | correct.append(bc)
322 | if not flags.base_only and compute_plus_inputs:
323 | plus_correct.append(
324 | sum(
325 | [
326 | res[i]["base_status"] == res[i]["plus_status"] == PASS
327 | for i in range(len(res))
328 | ]
329 | )
330 | )
331 |
332 | correct = np.array(correct)
333 | pass_at_k = {
334 | f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
335 | for k in [1, 10, 100]
336 | if total.min() >= k
337 | }
338 | cprint(f"{flags.dataset}", "red")
339 | for k, v in pass_at_k.items():
340 | cprint(f"{k}:\t{v:.3f}", "red")
341 |
342 | if plus_correct:
343 | cprint(f"{flags.dataset}+ (base + extra tests)", "green")
344 | pass_at_k = {
345 | f"pass@{k}": estimate_pass_at_k(total, np.array(plus_correct), k).mean()
346 | for k in [1, 10, 100]
347 | if (total >= k).all()
348 | }
349 | for k, v in pass_at_k.items():
350 | cprint(f"{k}:\t{v:.3f}", "green")
351 |
352 |
353 | def main():
354 | parser = argparse.ArgumentParser(description="Evaluator")
355 | parser.add_argument("--dataset", required=True, type=str)
356 | parser.add_argument("--samples", required=True, type=str)
357 | parser.add_argument("--base-only", action="store_true")
358 | parser.add_argument("--parallel", default=None, type=int)
359 | parser.add_argument("--i-just-wanna-run", action="store_true")
360 | parser.add_argument("--test-details", action="store_true")
361 | parser.add_argument("--min-time-limit", default=1, type=float)
362 | parser.add_argument("--gt-time-limit-factor", default=4.0, type=float)
363 | parser.add_argument("--mini", action="store_true")
364 | parser.add_argument(
365 | "--noextreme", action="store_true", help="Omit extreme test inputs"
366 | )
367 | args = parser.parse_args()
368 |
369 | evaluate(args)
370 |
371 |
372 | if __name__ == "__main__":
373 | main()
374 |
--------------------------------------------------------------------------------
/evoeval/util/api_request.py:
--------------------------------------------------------------------------------
1 | import signal
2 | import time
3 | from typing import Dict, Union
4 |
5 | import openai
6 | import tiktoken
7 | from google.generativeai import GenerationConfig
8 | from google.generativeai.types.safety_types import HarmBlockThreshold, HarmCategory
9 |
10 | client = openai.OpenAI()
11 |
12 |
13 | def num_tokens_from_messages(message, model="gpt-3.5-turbo-0301"):
14 |     """Returns the approximate number of tokens used by a message or list of messages."""
15 | try:
16 | encoding = tiktoken.encoding_for_model(model)
17 | except KeyError:
18 | encoding = tiktoken.get_encoding("cl100k_base")
19 | if isinstance(message, list):
20 |         # only count the first message.
21 | num_tokens = len(encoding.encode(message[0]["content"]))
22 | else:
23 | num_tokens = len(encoding.encode(message))
24 | return num_tokens
25 |
26 |
27 | def create_chatgpt_config(
28 | message: Union[str, list],
29 | max_tokens: int,
30 | temperature: float = 1,
31 | batch_size: int = 1,
32 | system_message: str = "You are a helpful assistant.",
33 | model: str = "gpt-3.5-turbo",
34 | ) -> Dict:
35 | if isinstance(message, list):
36 | config = {
37 | "model": model,
38 | "max_tokens": max_tokens,
39 | "temperature": temperature,
40 | "n": batch_size,
41 | "messages": [{"role": "system", "content": system_message}] + message,
42 | }
43 | else:
44 | config = {
45 | "model": model,
46 | "max_tokens": max_tokens,
47 | "temperature": temperature,
48 | "n": batch_size,
49 | "messages": [
50 | {"role": "system", "content": system_message},
51 | {"role": "user", "content": message},
52 | ],
53 | }
54 | return config
55 |
56 |
57 | def handler(signum, frame):
58 | # swallow signum and frame
59 | raise Exception("end of time")
60 |
61 |
62 | def request_chatgpt_engine(config):
63 | ret = None
64 | while ret is None:
65 | try:
66 | signal.signal(signal.SIGALRM, handler)
67 | signal.alarm(100)
68 | ret = client.chat.completions.create(**config)
69 | signal.alarm(0)
70 | except openai._exceptions.BadRequestError as e:
71 | print(e)
72 | signal.alarm(0)
73 | except openai._exceptions.RateLimitError as e:
74 | print("Rate limit exceeded. Waiting...")
75 | print(e)
76 | signal.alarm(0)
77 | time.sleep(5)
78 | except openai._exceptions.APIConnectionError as e:
79 | print("API connection error. Waiting...")
80 | signal.alarm(0)
81 | time.sleep(5)
82 | except Exception as e:
83 | print("Unknown error. Waiting...")
84 | print(e)
85 | signal.alarm(0)
86 | time.sleep(1)
87 | return ret
88 |
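# Minimal usage sketch (the prompt and token budget are placeholders; assumes
# OPENAI_API_KEY is set): estimate the prompt length, build a config, and issue one chat
# completion request through the retry loop above.
def _chatgpt_request_example():
    prompt = "Write a Python function that reverses a string."
    max_tokens = num_tokens_from_messages(prompt) + 256
    config = create_chatgpt_config(prompt, max_tokens=max_tokens, temperature=0.2)
    ret = request_chatgpt_engine(config)
    return ret.choices[0].message.content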
89 |
90 | def create_gemini_config(
91 | max_tokens: int,
92 | temperature: float = 1,
93 | batch_size: int = 1,
94 | ) -> Dict:
95 | config = GenerationConfig(
96 | candidate_count=batch_size,
97 | max_output_tokens=max_tokens,
98 | temperature=temperature,
99 | )
100 | return config
101 |
102 |
103 | safety_settings = [
104 | {
105 | "category": "HARM_CATEGORY_DANGEROUS",
106 | "threshold": "BLOCK_NONE",
107 | },
108 | {
109 | "category": "HARM_CATEGORY_HARASSMENT",
110 | "threshold": "BLOCK_NONE",
111 | },
112 | {
113 | "category": "HARM_CATEGORY_HATE_SPEECH",
114 | "threshold": "BLOCK_NONE",
115 | },
116 | {
117 | "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
118 | "threshold": "BLOCK_NONE",
119 | },
120 | {
121 | "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
122 | "threshold": "BLOCK_NONE",
123 | },
124 | ]
125 |
126 |
127 | def request_gemini_engine(model, message, config):
128 | ret = None
129 | count = 0
130 | while ret is None:
131 | try:
132 | signal.signal(signal.SIGALRM, handler)
133 | signal.alarm(100)
134 | ret = model.generate_content(
135 | message, generation_config=config, safety_settings=safety_settings
136 | )
137 | s = ret.text # check if response can be accessed.
138 | signal.alarm(0)
139 | except Exception as e:
140 | ret = None # reset
141 | print("Unknown error. Waiting...")
142 | count += 1
143 | print(e)
144 |             # slightly increase the temperature to work around Gemini occasionally failing with
145 |             # "The token generation was stopped as the response was flagged for unauthorized citations."
146 | if count > 10:
147 | config.temperature = min(config.temperature + 0.1, 1)
148 | signal.alarm(0)
149 | time.sleep(20)
150 | return ret
151 |
152 |
153 | def create_palm_config(
154 | message: str,
155 | max_tokens: int,
156 | temperature: float = 1,
157 | batch_size: int = 1,
158 | model: str = "models/text-bison-001",
159 | ) -> Dict:
160 | config = {
161 | "model": model,
162 | "prompt": message,
163 | "temperature": temperature,
164 | "max_output_tokens": max_tokens,
165 | "safety_settings": [
166 | {
167 | "category": HarmCategory.HARM_CATEGORY_DEROGATORY,
168 | "threshold": HarmBlockThreshold.BLOCK_NONE,
169 | },
170 | {
171 | "category": HarmCategory.HARM_CATEGORY_TOXICITY,
172 | "threshold": HarmBlockThreshold.BLOCK_NONE,
173 | },
174 | {
175 | "category": HarmCategory.HARM_CATEGORY_SEXUAL,
176 | "threshold": HarmBlockThreshold.BLOCK_NONE,
177 | },
178 | {
179 | "category": HarmCategory.HARM_CATEGORY_VIOLENCE,
180 | "threshold": HarmBlockThreshold.BLOCK_NONE,
181 | },
182 | {
183 | "category": HarmCategory.HARM_CATEGORY_DANGEROUS,
184 | "threshold": HarmBlockThreshold.BLOCK_NONE,
185 | },
186 | {
187 | "category": HarmCategory.HARM_CATEGORY_MEDICAL,
188 | "threshold": HarmBlockThreshold.BLOCK_NONE,
189 | },
190 | ],
191 | }
192 | return config
193 |
194 |
195 | def request_palm_engine(model, config):
196 | ret = None
197 | count = 0
198 | while ret is None:
199 | try:
200 | signal.signal(signal.SIGALRM, handler)
201 | signal.alarm(100)
202 | ret = model.generate_text(**config)
203 | s = ret.result # check if response can be accessed.
204 | if s is None:
205 | config["temperature"] = min(config["temperature"] + 0.1, 1)
206 | count += 1
207 | if count > 100:
208 | ret.result = "" # just return empty string
209 | else:
210 | ret = None # reset
211 | signal.alarm(0)
212 | except Exception as e:
213 | ret = None # reset
214 | print("Unknown error. Waiting...")
215 | print(e)
216 | signal.alarm(0)
217 | time.sleep(20)
218 | return ret
219 |
220 |
221 | def create_anthropic_config(
222 | message: str,
223 | max_tokens: int,
224 | temperature: float = 1,
225 | batch_size: int = 1,
226 | model: str = "claude-2.1",
227 | ) -> Dict:
228 | if isinstance(message, list):
229 | config = {
230 | "model": model,
231 | "temperature": temperature,
232 | "max_tokens": max_tokens,
233 | "messages": message,
234 | }
235 | else:
236 | config = {
237 | "model": model,
238 | "temperature": temperature,
239 | "max_tokens": max_tokens,
240 | "messages": [{"role": "user", "content": message}],
241 | }
242 | return config
243 |
244 |
245 | def request_anthropic_engine(client, config):
246 | ret = None
247 | while ret is None:
248 | try:
249 | signal.signal(signal.SIGALRM, handler)
250 | signal.alarm(100)
251 | ret = client.messages.create(**config)
252 | signal.alarm(0)
253 | except Exception as e:
254 | print("Unknown error. Waiting...")
255 | print(e)
256 | signal.alarm(0)
257 | time.sleep(10)
258 | return ret
259 |
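# Usage sketch for the Anthropic helpers above, assuming the `anthropic` Python SDK and
# an API key in the environment; the prompt and token budget are illustrative placeholders.
# client.messages.create returns a Message whose generated text is in resp.content[0].text.
import os

import anthropic

anthropic_client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
anthropic_config = create_anthropic_config("Implement quicksort in Python.", max_tokens=1024)
resp = request_anthropic_engine(anthropic_client, anthropic_config)
print(resp.content[0].text)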
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [tool.setuptools_scm]
6 | write_to = "evoeval/_version.py"
7 | version_scheme = "release-branch-semver"
8 | local_scheme = "no-local-version"
9 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | wget
2 | appdirs
3 | tempdir
4 | multipledispatch
5 | numpy
6 | tqdm
7 | termcolor
8 | evalplus @ git+https://github.com/evalplus/evalplus
9 |
--------------------------------------------------------------------------------
/resources/butterfly_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/evo-eval/evoeval/d5ca3104ec30b99f1076f51d4476eb4c3f29effa/resources/butterfly_dark.png
--------------------------------------------------------------------------------
/resources/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/evo-eval/evoeval/d5ca3104ec30b99f1076f51d4476eb4c3f29effa/resources/example.gif
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = evoeval
3 | description = "EvoEval: Evolving Coding Benchmarks via LLM"
4 | long_description = file: README.md
5 | long_description_content_type = text/markdown
6 | url = https://github.com/evo-eval/evoeval
7 | license = Apache-2.0
8 | license_file = LICENSE
9 | platform = any
10 | classifiers =
11 | Operating System :: OS Independent
12 | Programming Language :: Python :: 3
13 | License :: OSI Approved :: Apache Software License
14 |
15 | [options]
16 | packages = find:
17 | python_requires = >=3.9
18 | dependency_links =
19 | install_requires =
20 | wget>=3.2
21 | tempdir>=0.7.1
22 | multipledispatch>=0.6.0
23 | appdirs>=1.4.4
24 | numpy>=1.19.5
25 | tqdm>=4.56.0
26 | termcolor>=2.0.0
27 | evalplus>=0.2.0
28 |
29 | [options.entry_points]
30 | console_scripts =
31 | evoeval.evaluate = evoeval.evaluate:main
32 |
--------------------------------------------------------------------------------
/tool/sanitize.py:
--------------------------------------------------------------------------------
1 | # largely adopted from EvalPlus
2 |
3 | import ast
4 | import os
5 | import pathlib
6 |
7 | from evalplus.data import get_human_eval_plus
8 | from tqdm import tqdm
9 |
10 | from evoeval.data import get_evo_eval
11 |
12 | INCODER_EXTRA = ["<|endofmask|>", "<|", "</code>"]
13 | POLYCODER_EXTRA = ["\n//", "\n/*"]
14 | NON_CODE_EOFS = ["<|endoftext|>", "\n```", "\n</", "\n#"]
15 |
16 |
17 | def get_all_python_files(folder):
18 | # return a list of full-path python files
19 | py_files = []
20 | for root, _, files in os.walk(folder):
21 | for file in files:
22 | if file.endswith(".py"):
23 | py_files.append(os.path.join(root, file))
24 | return py_files
25 |
26 |
27 | def remove_unindented_lines(code, ok_starts):
28 | new_code = ""
29 | for line in code.splitlines():
30 | if any([line.startswith(t) for t in ok_starts]) or line.strip() == "":
31 | new_code += line + "\n"
32 | continue
33 |
34 | lspace = len(line) - len(line.lstrip())
35 | if lspace == 0:
36 | continue
37 |
38 | new_code += line + "\n"
39 |
40 | return new_code
41 |
42 |
43 | def extract_function(code, target_func):
44 | def remove_last_line_until_parse(code):
45 | try:
46 | tree = ast.parse(code)
47 | except:
48 | if "\n" in code:
49 | code = code.rsplit("\n", 1)[0]
50 | return remove_last_line_until_parse(code)
51 | else:
52 | return None
53 | return tree
54 |
55 | tree = remove_last_line_until_parse(code)
56 | if tree is None: # fail to parse
57 | return ""
58 |
59 | # return the target function only
60 | for node in tree.body:
61 | if isinstance(node, ast.FunctionDef):
62 | if node.name == target_func:
63 | return ast.unparse(node)
64 | return ""
65 |
66 |
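# Illustrative check of extract_function: trailing unparsable text is dropped line by
# line until ast.parse succeeds, and only the requested top-level function is returned.
_snippet = "def add(a, b):\n    return a + b\nSome trailing chatter that is not Python"
assert extract_function(_snippet, "add") == "def add(a, b):\n    return a + b"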
67 | def to_four_space_indents(old_code):
68 | new_code = ""
69 | for line in old_code.splitlines():
70 | lspace = len(line) - len(line.lstrip())
71 | if lspace == 3:
72 | new_code += " "
73 | new_code += line + "\n"
74 | return new_code
75 |
76 |
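# Illustrative check of to_four_space_indents: lines indented with exactly three spaces
# (a common model output quirk) are padded to four; other lines are left untouched.
assert to_four_space_indents("def f():\n   return 1\n") == "def f():\n    return 1\n"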
77 | def sanitize_folder(args, folder):
78 | # task_id -> entry_point
79 | entry_point = {}
80 | prompts = {}
81 |
82 | if args.dataset == "humaneval":
83 | problems = get_human_eval_plus()
84 | else:
85 | problems = get_evo_eval(args.dataset)
86 |
87 | for task_id, problem in problems.items():
88 | entry_point[task_id] = problem["entry_point"]
89 | prompts[task_id] = problem["prompt"]
90 |
91 | # make a new folder with "-sanitized" suffix
92 | old_folder = pathlib.Path(folder)
93 | if args.inplace:
94 | new_folder = old_folder
95 | else:
96 | new_folder = old_folder.parent / (old_folder.name + "-sanitized")
97 |
98 | nsan = 0
99 | ntotal = 0
100 | for pyf in tqdm(get_all_python_files(folder)):
101 | # Get [?] from "[prefix]/HumanEval_[?]/[number].py":
102 | task_id = pyf.split("/")[-2].replace("_", "/")
103 | ntotal += 1
104 | old_code = open(pyf).read()
105 |
106 | def_left = "def " + entry_point[task_id] + "("
107 |
108 | imports = prompts[task_id].split(def_left)[0]
109 | def_right = def_left.join(prompts[task_id].split(def_left)[1:])
110 |
111 | new_code = imports + def_left + old_code.split(def_left)[-1]
112 | chunks = new_code.split(def_left) # imports + def_left + {def_right + impl}
113 |
114 | if len(chunks) == 2:
115 | new_code = def_left + chunks[-1] # fn + impl
116 |
117 | if "chatgpt" in folder:
118 | tmp = ""
119 | for line in new_code.splitlines():
120 | if line.strip() == "python":
121 | continue
122 | tmp += line + "\n"
123 | new_code = tmp
124 |
125 | new_code = to_four_space_indents(new_code)
126 |
127 | if args.eof:
128 | eof_strs = NON_CODE_EOFS
129 | if "incoder" in folder:
130 | eof_strs = eof_strs + INCODER_EXTRA
131 | if "polycoder" in folder:
132 | eof_strs = eof_strs + POLYCODER_EXTRA
133 | if "mistral" in folder:
134 |             eof_strs = eof_strs + ["</s>"]  # Mistral end-of-sequence token
135 | for eof in eof_strs:
136 | new_code = new_code.split(eof)[0]
137 |
138 |         # keep only the target function, dropping trailing lines that fail to parse
139 | new_code = extract_function(new_code, entry_point[task_id])
140 |
141 | if len(chunks) == 2:
142 | new_code = chunks[0] + new_code
143 |
144 | # write to new folder
145 | new_pyf = pyf.replace(str(old_folder), str(new_folder))
146 |
147 | if new_code.strip() != old_code.strip():
148 | print("Sanitized: ", pyf, "->", new_pyf)
149 | nsan += 1
150 |
151 | pathlib.Path(new_pyf).parent.mkdir(parents=True, exist_ok=True)
152 | with open(new_pyf, "w") as f:
153 | f.write(new_code)
154 |
155 | print(f"Sanitized {nsan} out of {ntotal} files.")
156 |
157 |
158 | def main():
159 | import argparse
160 |
161 | parser = argparse.ArgumentParser()
162 | parser.add_argument("--folder", type=str, required=True)
163 | parser.add_argument("--dataset", type=str, required=True)
164 | parser.add_argument("--eof", action="store_true")
165 | parser.add_argument("--inplace", action="store_true")
166 | parser.add_argument(
167 | "--root_folder",
168 | action="store_true",
169 | help="Use if we want to sanitize all folders in the root folder.",
170 | )
171 |
172 | args = parser.parse_args()
173 |
174 | assert not args.folder.endswith("/")
175 |
176 | if not args.root_folder:
177 | sanitize_folder(args, args.folder)
178 | else:
179 | for folder in os.listdir(args.folder):
180 | if os.path.isdir(f"{args.folder}/{folder}") and "sanitized" not in folder:
181 | sanitize_folder(args, f"{args.folder}/{folder}")
182 |
183 |
184 | if __name__ == "__main__":
185 | main()
186 |
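# Usage sketch: the programmatic equivalent of
#     python tool/sanitize.py --folder <generations_dir> --dataset humaneval --eof
# The folder path below is a hypothetical placeholder; "humaneval" selects the
# get_human_eval_plus branch of sanitize_folder above.
import argparse

example_args = argparse.Namespace(
    folder="generations/humaneval-samples",  # hypothetical layout: <folder>/HumanEval_<id>/<n>.py
    dataset="humaneval",
    eof=True,
    inplace=False,
    root_folder=False,
)
sanitize_folder(example_args, example_args.folder)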
--------------------------------------------------------------------------------