├── .dockerignore ├── .github └── ISSUE_TEMPLATE │ ├── buggy_contract.yml │ ├── buggy_test.yml │ ├── config.yml │ └── model_eval_request.yml ├── .gitignore ├── .pre-commit-config.yaml ├── ADVANCED_USAGE.md ├── CITATION.cff ├── Docker ├── Evaluate.Dockerfile ├── Generate.Dockerfile └── Gradio.Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── Requirements ├── requirements-eval.txt └── requirements.txt ├── analysis ├── bcb_subset.py ├── get_results.py ├── lib2domain.json ├── task2domain.json └── utils.py ├── bigcodebench ├── __init__.py ├── data │ ├── __init__.py │ ├── bigcodebench.py │ └── utils.py ├── eval │ ├── __init__.py │ ├── _special_oracle.py │ └── utils.py ├── evaluate.py ├── gen │ ├── __init__.py │ └── util │ │ ├── __init__.py │ │ ├── anthropic_request.py │ │ ├── google_request.py │ │ ├── hf_inference_request.py │ │ ├── mistral_request.py │ │ └── openai_request.py ├── generate.py ├── inspect.py ├── provider │ ├── __init__.py │ ├── anthropic.py │ ├── base.py │ ├── google.py │ ├── hf.py │ ├── hf_inference.py │ ├── mistral.py │ ├── openai.py │ ├── utility.py │ └── vllm.py ├── sanitize.py └── syncheck.py ├── decontamination ├── n_gram_check.py ├── odex_10_overlap.txt ├── odex_13_overlap.txt ├── stackoverflow_10_overlap.txt ├── stackoverflow_13_overlap.txt └── starcoderdata_10_overlap.txt ├── pyproject.toml ├── release.sh ├── release_docker.sh ├── run.sh ├── sandbox-templates ├── e2b.Dockerfile └── e2b.toml ├── setup.cfg ├── tests ├── requirements.txt ├── test_legacy_sanitizer.py └── test_treesitter_sanitizer.py └── tools ├── fix_v019.py ├── fix_v020.py ├── fix_v022.py ├── fix_v023.py └── fix_v025.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | # nuclear option because steven uses PyCharm. 161 | .idea/ 162 | 163 | # VSCode 164 | .vscode/ 165 | backup/ 166 | passrate.p* 167 | min_cov_dir/ 168 | bigcodebench/_version.py 169 | inspect/ 170 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/buggy_contract.yml: -------------------------------------------------------------------------------- 1 | name: "🐛 Report Bad Task" 2 | description: Report to us that certain programming task should be repaired. 3 | title: "🐛 [TaskRemoval/TaskRepair] - " 4 | labels: ["programming task"] 5 | body: 6 | - type: input 7 | id: version 8 | attributes: 9 | label: "BigCodeBench version" 10 | description: What is the version of BigCodeBench? You can find it by running `pip show bigcodebench`. 11 | placeholder: For example, 0.1.5 12 | validations: 13 | required: true 14 | - type: input 15 | id: cache 16 | attributes: 17 | label: "Output of running `ls ~/.cache/bigcodebench`" 18 | validations: 19 | required: true 20 | - type: input 21 | id: task_id 22 | attributes: 23 | label: "Task ID of the programming task" 24 | placeholder: BigCodeBench/[??] 
25 | validations: 26 | required: true 27 | - type: textarea 28 | id: original 29 | attributes: 30 | label: "The original complete prompt" 31 | description: You can run `python -c "from bigcodebench.data import get_bigcodebench; print(get_bigcodebench()['BigCodeBench/❓']['complete_prompt'])"` 32 | render: python 33 | validations: 34 | required: true 35 | - type: textarea 36 | id: new 37 | attributes: 38 | label: "Your proposed new complete prompt" 39 | render: python 40 | validations: 41 | required: true 42 | - type: textarea 43 | id: other 44 | attributes: 45 | label: "Other context" 46 | description: (Optional) Anything else the maintainer should notice? 47 | validations: 48 | required: false 49 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/buggy_test.yml: -------------------------------------------------------------------------------- 1 | name: "🐛 Report Bad Test Inputs" 2 | description: Report to us that certain test inputs should be removed. 3 | title: "🐛 [TestRemoval/TestRepair] - " 4 | labels: ["bug"] 5 | body: 6 | - type: input 7 | id: version 8 | attributes: 9 | label: "BigCodeBench version" 10 | description: What is the version of BigCodeBench? You can find it by running `pip show bigcodebench`. 11 | placeholder: For example, 0.1.0 12 | validations: 13 | required: true 14 | - type: input 15 | id: cache 16 | attributes: 17 | label: "Output of running `ls ~/.cache/bigcodebench`" 18 | validations: 19 | required: true 20 | - type: input 21 | id: task_id 22 | attributes: 23 | label: "Task ID of the programming task" 24 | placeholder: BigCodeBench/[??] 25 | validations: 26 | required: true 27 | - type: textarea 28 | id: original 29 | attributes: 30 | label: "The original test" 31 | description: You can run `python -c "from bigcodebench.data import get_bigcodebench; print(get_bigcodebench()['BigCodeBench/❓']['test'])"` 32 | render: python 33 | validations: 34 | required: true 35 | - type: textarea 36 | id: new 37 | attributes: 38 | label: "Your proposed new test" 39 | render: python 40 | validations: 41 | required: true 42 | - type: textarea 43 | id: description 44 | attributes: 45 | label: "Description" 46 | description: An explicit description of why you think this test should be removed 47 | placeholder: Here is a correct solution but it is incorrectly falsified by the test because ... 48 | validations: 49 | required: true 50 | - type: textarea 51 | id: other 52 | attributes: 53 | label: "Other context" 54 | description: (Optional) Anything else the maintainer should notice? 55 | validations: 56 | required: false 57 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/model_eval_request.yml: -------------------------------------------------------------------------------- 1 | name: "🤗 Model Evaluation Request" 2 | description: Request BigCodeBench maintainers to evaluate your model independently and update it on our leaderboard. 3 | title: "🤗 [REQUEST] - " 4 | labels: ["model eval"] 5 | body: 6 | - type: textarea 7 | id: about 8 | attributes: 9 | label: "Model introduction" 10 | description: Provide a brief introduction to the model. 11 | placeholder: The model is created by ... and is used for ...
12 | validations: 13 | required: true 14 | - type: input 15 | id: url 16 | attributes: 17 | label: "Model URL" 18 | description: Indicate the URL (e.g., huggingface or other release pages) of the model 19 | placeholder: https://huggingface.co/[???]/[???] 20 | validations: 21 | required: true 22 | - type: textarea 23 | id: other 24 | attributes: 25 | label: "Additional instructions (Optional)" 26 | description: Special steps indicating how to run the model with preferably scripts/codes. 27 | placeholder: What data type precision should be used? What is the minimal hardware requirement? Can it be accelerated by tools such as vLLM? 28 | validations: 29 | required: false 30 | - type: dropdown 31 | id: author 32 | attributes: 33 | label: "Author" 34 | description: "Are you (one of) the author(s) of the model?" 35 | multiple: false 36 | options: 37 | - "Yes" 38 | - "No" 39 | validations: 40 | required: true 41 | - type: checkboxes 42 | id: security 43 | attributes: 44 | label: "Security" 45 | options: 46 | - label: "I confirm that the model is safe to run which does not contain any malicious code or content." 47 | required: true 48 | - type: checkboxes 49 | id: integrity 50 | attributes: 51 | label: "Integrity" 52 | options: 53 | - label: "I confirm that the model comes from unique and original work and does not contain any plagiarism." 54 | required: true 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 
95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | # nuclear option because steven uses PyCharm. 
161 | .idea/ 162 | 163 | # VSCode 164 | .vscode/ 165 | OpenPlus/ 166 | backup/ 167 | passrate.p* 168 | min_cov_dir/ 169 | bigcodebench/_version.py 170 | *.jsonl 171 | inspect/ 172 | *.zip 173 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pycqa/isort 3 | rev: 5.12.0 4 | hooks: 5 | - id: isort 6 | name: isort (python) 7 | args: ["--profile", "black"] 8 | - repo: https://github.com/psf/black 9 | rev: 22.6.0 10 | hooks: 11 | - id: black 12 | - repo: https://github.com/pre-commit/pre-commit-hooks 13 | rev: v4.3.0 14 | hooks: 15 | - id: check-yaml 16 | - id: end-of-file-fixer 17 | - id: trailing-whitespace 18 | exclude: (?x)^( 19 | groundtruth/.* 20 | )$ 21 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this work and love it, consider citing it as below \U0001F917" 3 | title: BigCodeBench 4 | authors: 5 | - family-names: BigCodeBench Team 6 | url: https://github.com/bigcode-project/bigcodebench 7 | doi: 8 | date-released: 2024-06-18 9 | license: Apache-2.0 10 | preferred-citation: 11 | type: article 12 | title: "Benchmarking Code Generation with Diverse Function Calls and Complex Instructions" 13 | authors: 14 | - family-names: BigCodeBench Team 15 | year: 2024 16 | journal: 17 | doi: 18 | url: -------------------------------------------------------------------------------- /Docker/Evaluate.Dockerfile: -------------------------------------------------------------------------------- 1 | # Better use newer Python as generated code can use new features 2 | FROM python:3.10-slim 3 | 4 | # install git, g++ and python3-tk 5 | RUN apt-get update && apt-get install -y \ 6 | git \ 7 | g++ \ 8 | python3-tk \ 9 | zip \ 10 | unzip \ 11 | procps \ 12 | r-base \ 13 | libgdal-dev \ 14 | # Add these new dependencies for matplotlib 15 | libfreetype6-dev \ 16 | libpng-dev \ 17 | pkg-config \ 18 | python3-dev \ 19 | python3-matplotlib \ 20 | && rm -rf /var/lib/apt/lists/* 21 | 22 | # upgrade to latest pip 23 | RUN pip install --upgrade pip 24 | 25 | # Add a new user "bigcodebenchuser" 26 | RUN adduser --disabled-password --gecos "" bigcodebenchuser 27 | 28 | RUN rm -rf /bigcodebench 29 | 30 | # Acquire benchmark code to local 31 | ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit 32 | RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench 33 | 34 | RUN pip install numpy==1.24.3 pyarrow==14.0.1 35 | 36 | RUN cd /bigcodebench && \ 37 | pip install . 
--no-deps 38 | 39 | RUN pip install \ 40 | appdirs \ 41 | fire \ 42 | multipledispatch \ 43 | pqdm \ 44 | tempdir \ 45 | termcolor \ 46 | tqdm \ 47 | tree_sitter \ 48 | tree-sitter-python \ 49 | wget \ 50 | transformers \ 51 | datasets \ 52 | gradio-client \ 53 | numpy \ 54 | rich \ 55 | accelerate \ 56 | anthropic \ 57 | google-genai \ 58 | mistralai \ 59 | openai \ 60 | e2b 61 | 62 | RUN pip install -I --timeout 2000 -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/refs/heads/main/Requirements/requirements-eval.txt 63 | 64 | # Ensure the numpy version is compatible with the datasets version 65 | RUN pip install datasets==2.17.0 66 | 67 | # Pre-install the dataset 68 | RUN python3 -c "from bigcodebench.data import get_bigcodebench; get_bigcodebench(subset='full'); get_bigcodebench(subset='hard')" 69 | 70 | WORKDIR /app 71 | 72 | RUN chown -R bigcodebenchuser:bigcodebenchuser /app 73 | 74 | RUN chmod -R 777 /app 75 | 76 | USER bigcodebenchuser 77 | 78 | ENTRYPOINT ["python3", "-m", "bigcodebench.evaluate"] 79 | 80 | CMD ["sh", "-c", "pids=$(ps -u $(id -u) -o pid,comm | grep 'bigcodebench' | awk '{print $1}'); if [ -n \"$pids\" ]; then echo $pids | xargs -r kill; fi; rm -rf /tmp/*"] -------------------------------------------------------------------------------- /Docker/Generate.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 2 | 3 | SHELL ["/bin/bash", "-c"] 4 | 5 | # Setup Environment Variables 6 | ENV CUDA_HOME=/usr/local/cuda \ 7 | PYTHONUNBUFFERED=1 \ 8 | TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" 9 | 10 | # Setup System Utilities 11 | RUN apt-get update --yes --quiet \ 12 | && apt-get upgrade --yes --quiet \ 13 | && DEBIAN_FRONTEND=noninteractive apt-get install --yes --quiet --no-install-recommends \ 14 | apt-utils \ 15 | autoconf \ 16 | automake \ 17 | bc \ 18 | build-essential \ 19 | ca-certificates \ 20 | check \ 21 | cmake \ 22 | curl \ 23 | dmidecode \ 24 | emacs \ 25 | g++\ 26 | gcc \ 27 | git \ 28 | iproute2 \ 29 | jq \ 30 | kmod \ 31 | libaio-dev \ 32 | libcurl4-openssl-dev \ 33 | libgl1-mesa-glx \ 34 | libglib2.0-0 \ 35 | libgomp1 \ 36 | libibverbs-dev \ 37 | libnuma-dev \ 38 | libnuma1 \ 39 | libomp-dev \ 40 | libsm6 \ 41 | libssl-dev \ 42 | libsubunit-dev \ 43 | libsubunit0 \ 44 | libtool \ 45 | libxext6 \ 46 | libxrender-dev \ 47 | make \ 48 | moreutils \ 49 | net-tools \ 50 | ninja-build \ 51 | openssh-client \ 52 | openssh-server \ 53 | openssl \ 54 | pkg-config \ 55 | python3-dev \ 56 | software-properties-common \ 57 | sudo \ 58 | unzip \ 59 | util-linux \ 60 | vim \ 61 | wget \ 62 | zlib1g-dev \ 63 | && apt-get autoremove \ 64 | && apt-get clean \ 65 | && rm -rf /var/lib/apt/lists/ 66 | 67 | # Setup base Python to bootstrap Mamba 68 | RUN add-apt-repository --yes ppa:deadsnakes/ppa \ 69 | && apt-get update --yes --quiet 70 | RUN DEBIAN_FRONTEND=noninteractive apt-get install --yes --quiet --no-install-recommends \ 71 | python3.11 \ 72 | python3.11-dev \ 73 | python3.11-distutils \ 74 | python3.11-lib2to3 \ 75 | python3.11-gdbm \ 76 | python3.11-tk \ 77 | pip 78 | RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 999 \ 79 | && update-alternatives --config python3 \ 80 | && ln -s /usr/bin/python3 /usr/bin/python 81 | RUN pip install --upgrade pip 82 | 83 | # Setup optimized Mamba environment with required PyTorch dependencies 84 | RUN wget -O /tmp/Miniforge.sh 
https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Mambaforge-24.3.0-0-Linux-x86_64.sh \ 85 | && bash /tmp/Miniforge.sh -b -p /Miniforge \ 86 | && echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/compat/" >> /Miniforge/etc/profile.d/mamba.sh \ 87 | && source /Miniforge/etc/profile.d/conda.sh \ 88 | && source /Miniforge/etc/profile.d/mamba.sh \ 89 | && mamba update -y -q -n base -c defaults mamba \ 90 | && mamba create -y -q -n BigCodeBench python=3.11 setuptools=69.5.1 \ 91 | && mamba activate BigCodeBench \ 92 | && mamba install -y -q -c conda-forge \ 93 | charset-normalizer \ 94 | gputil \ 95 | ipython \ 96 | numpy \ 97 | pandas \ 98 | scikit-learn \ 99 | wandb \ 100 | && mamba install -y -q -c intel \ 101 | "mkl==2023" \ 102 | "mkl-static==2023" \ 103 | "mkl-include==2023" \ 104 | && mamba install -y -q -c pytorch magma-cuda121 \ 105 | && mamba clean -a -f -y 106 | 107 | # Install VLLM precompiled with appropriate CUDA and ensure PyTorch is installed form the same version channel 108 | RUN source /Miniforge/etc/profile.d/conda.sh \ 109 | && source /Miniforge/etc/profile.d/mamba.sh \ 110 | && mamba activate BigCodeBench 111 | 112 | RUN rm -rf /bigcodebench 113 | 114 | # Acquire benchmark code to local 115 | ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit 116 | RUN git clone https://github.com/bigcode-project/BigCodeBench.git /bigcodebench 117 | 118 | # Install BigCodeBench and pre-load the dataset 119 | RUN source /Miniforge/etc/profile.d/conda.sh \ 120 | && source /Miniforge/etc/profile.d/mamba.sh \ 121 | && mamba activate BigCodeBench \ 122 | && cd /bigcodebench && pip install .[generate] \ 123 | && python -c "from bigcodebench.data import get_bigcodebench; get_bigcodebench()" \ 124 | && export MAX_JOBS=$(($(nproc) - 2)) \ 125 | && pip install --no-cache-dir ninja packaging psutil \ 126 | && pip install flash-attn==2.5.8 --no-build-isolation 127 | 128 | WORKDIR /app 129 | 130 | ENTRYPOINT ["/Miniforge/envs/BigCodeBench/bin/python", "-m", "bigcodebench.generate"] -------------------------------------------------------------------------------- /Docker/Gradio.Dockerfile: -------------------------------------------------------------------------------- 1 | # Better use newer Python as generated code can use new features 2 | FROM python:3.10-slim 3 | 4 | # install git, g++ and python3-tk 5 | RUN apt-get update && apt-get install -y \ 6 | git \ 7 | g++ \ 8 | python3-tk \ 9 | zip \ 10 | unzip \ 11 | procps \ 12 | r-base \ 13 | libgdal-dev \ 14 | # Add these new dependencies for matplotlib 15 | libfreetype6-dev \ 16 | libpng-dev \ 17 | pkg-config \ 18 | python3-dev \ 19 | python3-matplotlib \ 20 | && rm -rf /var/lib/apt/lists/* 21 | # upgrade to latest pip 22 | RUN pip install --upgrade pip 23 | 24 | RUN pip install APScheduler==3.10.1 black==23.11.0 click==8.1.3 huggingface-hub>=0.18.0 plotly python-dateutil==2.8.2 gradio-space-ci@git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 isort ruff gradio[oauth] schedule==1.2.2 25 | 26 | # Add a new user "bigcodebenchuser" 27 | RUN adduser --disabled-password --gecos "" bigcodebenchuser 28 | 29 | RUN rm -rf /bigcodebench 30 | 31 | # Acquire benchmark code to local 32 | ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit 33 | RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench 34 | 35 | 36 | RUN pip install numpy==1.24.3 pyarrow==14.0.1 37 | 38 | RUN cd /bigcodebench && \ 39 | pip 
install . --no-deps && \ 40 | pip install \ 41 | appdirs>=1.4.4 \ 42 | fire>=0.6.0 \ 43 | multipledispatch>=0.6.0 \ 44 | pqdm>=0.2.0 \ 45 | tempdir>=0.7.1 \ 46 | termcolor>=2.0.0 \ 47 | tqdm>=4.56.0 \ 48 | tree_sitter_languages>=1.10.2 \ 49 | tree-sitter==0.21.3 \ 50 | wget>=3.2 \ 51 | gradio-client \ 52 | rich 53 | 54 | RUN pip install -I --timeout 2000 -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/refs/heads/main/Requirements/requirements-eval.txt 55 | 56 | # Ensure the numpy version is compatible with the datasets version 57 | RUN pip install datasets==2.17.0 58 | 59 | # Pre-install the dataset 60 | RUN python3 -c "from bigcodebench.data import get_bigcodebench; get_bigcodebench(subset='full'); get_bigcodebench(subset='hard')" 61 | 62 | RUN apt-get update && \ 63 | apt-get install -y \ 64 | bash \ 65 | git git-lfs \ 66 | wget curl procps \ 67 | htop vim nano && \ 68 | rm -rf /var/lib/apt/lists/* 69 | 70 | 71 | WORKDIR /app 72 | 73 | RUN chown -R bigcodebenchuser:bigcodebenchuser /app 74 | 75 | RUN chmod -R 777 /app 76 | 77 | USER bigcodebenchuser 78 | 79 | # ENTRYPOINT ["python", "app.py"] 80 | 81 | # CMD ["sh", "-c", "pids=$(ps -u $(id -u) -o pid,comm | grep 'bigcodebench' | awk '{print $1}'); if [ -n \"$pids\" ]; then echo $pids | xargs -r kill; fi; rm -rf /tmp/*"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | ------------------------------------------------------------------------------- 204 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | exclude bigcodebench/_experimental/**/*.py 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BigCodeBench 2 |
3 | BigCodeBench 4 |
18 | 💥 Impact • 19 | 📰 News • 20 | 🔥 Quick Start • 21 | 🚀 Remote Evaluation • 22 | 💻 LLM-generated Code • 23 | 🧑 Advanced Usage • 24 | 📰 Result Submission • 25 | 📜 Citation 26 |

27 | 28 |
29 |

🎉 Check out our latest work!
30 | 🌟 SWE Arena 🌟
31 | 🚀 Open Evaluation Platform on AI for Software Engineering 🚀
32 | ✨ 100% free to use the latest frontier models! ✨

33 |
34 | 35 | ## 💥 Impact 36 | BigCodeBench has been trusted by many LLM teams including: 37 | - Zhipu AI 38 | - Alibaba Qwen 39 | - DeepSeek 40 | - Amazon AWS AI 41 | - Snowflake AI Research 42 | - ServiceNow Research 43 | - Meta AI 44 | - Cohere AI 45 | - Sakana AI 46 | - Allen Institute for Artificial Intelligence (AI2) 47 | 48 | ## 📰 News 49 | - **[2025-01-22]** We are releasing `bigcodebench==v0.2.2.dev2`, with 163 models evaluated! 50 | - **[2024-10-06]** We are releasing `bigcodebench==v0.2.0`! 51 | - **[2024-10-05]** We create a public code execution API on the [Hugging Face space](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator). 52 | - **[2024-10-01]** We have evaluated 139 models on BigCodeBench-Hard so far. Take a look at the [leaderboard](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard)! 53 | - **[2024-08-19]** To make the evaluation fully reproducible, we add a real-time code execution session to the leaderboard. It can be viewed [here](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard). 54 | - **[2024-08-02]** We release `bigcodebench==v0.1.9`. 55 | 56 |
More News :: click to expand :: 57 |
58 | 59 | - **[2024-07-18]** We announce a subset of BigCodeBench, BigCodeBench-Hard, which includes 148 tasks that are more aligned with the real-world programming tasks. The details are available [in this blog post](https://huggingface.co/blog/terryyz/bigcodebench-hard). The dataset is available [here](https://huggingface.co/datasets/bigcode/bigcodebench-hard). The new release is `bigcodebench==v0.1.8`. 60 | - **[2024-06-28]** We release `bigcodebench==v0.1.7`. 61 | - **[2024-06-27]** We release `bigcodebench==v0.1.6`. 62 | - **[2024-06-19]** We start the Hugging Face BigCodeBench Leaderboard! The leaderboard is available [here](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard). 63 | - **[2024-06-18]** We release BigCodeBench, a new benchmark for code generation with 1140 software-engineering-oriented programming tasks. Preprint is available [here](https://arxiv.org/abs/2406.15877). PyPI package is available [here](https://pypi.org/project/bigcodebench/) with the version `0.1.5`. 64 | 65 |
66 |
67 | 68 | ## 🌸 About 69 | 70 | ### BigCodeBench 71 | 72 | BigCodeBench is an **_easy-to-use_** benchmark for solving **_practical_** and **_challenging_** tasks via code. It aims to evaluate the true programming capabilities of large language models (LLMs) in a more realistic setting. The benchmark is designed for HumanEval-like function-level code generation tasks, but with much more complex instructions and diverse function calls. 73 | 74 | There are two splits in BigCodeBench: 75 | - `Complete`: This split is designed for code completion based on comprehensive docstrings. 76 | - `Instruct`: This split works for instruction-tuned and chat models only, where the models are asked to generate a code snippet based on natural language instructions. The instructions only contain the necessary information and require more complex reasoning. 77 | 78 | ### Why BigCodeBench? 79 | 80 | BigCodeBench focuses on task automation via code generation with *diverse function calls* and *complex instructions*, with: 81 | 82 | * ✨ **Precise evaluation & ranking**: See [our leaderboard](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard) for the latest LLM rankings before & after rigorous evaluation. 83 | * ✨ **Pre-generated samples**: BigCodeBench accelerates code intelligence research by open-sourcing [LLM-generated samples](#-LLM-generated-code) for various models -- no need to re-run the expensive benchmarks! 84 | 85 | ## 🔥 Quick Start 86 | 87 | To get started, please first set up the environment: 88 | 89 | ```bash 90 | # By default, you will use the remote evaluation API to execute the output samples. 91 | pip install bigcodebench --upgrade 92 | 93 | # We suggest using `flash-attn` for generating code samples. 94 | pip install packaging ninja 95 | pip install flash-attn --no-build-isolation 96 | # Note: if you have installation problems, consider using pre-built 97 | # wheels from https://github.com/Dao-AILab/flash-attention/releases 98 | ``` 99 | 100 |
⏬ Install nightly version :: click to expand :: 101 |
102 | 103 | ```bash 104 | # Install to use bigcodebench.generate 105 | pip install "git+https://github.com/bigcode-project/bigcodebench.git" --upgrade 106 | ``` 107 | 108 |
109 |
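If you only want to browse the benchmark tasks locally, you can load them with the same helper that the Docker images use to pre-download the dataset. The snippet below is a minimal sketch, not part of the official CLI: it assumes `get_bigcodebench(subset=...)` returns a mapping from task IDs (e.g. `BigCodeBench/0`) to task records with fields such as `complete_prompt`, `instruct_prompt`, `test`, `entry_point`, and `libs` (the field names used elsewhere in this repository), with the data cached under `~/.cache/bigcodebench`.

```python
# Minimal sketch: load BigCodeBench locally and inspect a single task.
# Assumes the data helpers from the installed `bigcodebench` package.
from bigcodebench.data import get_bigcodebench

# "full" has 1140 tasks, "hard" has 148; the first call downloads and caches the data.
tasks = get_bigcodebench(subset="full")

task = tasks["BigCodeBench/0"]   # task IDs follow the BigCodeBench/<n> pattern
print(task["complete_prompt"])   # docstring-style prompt used by the Complete split
print(task["instruct_prompt"])   # natural-language prompt used by the Instruct split
print(task["entry_point"])       # name of the function the unit tests call
print(task["libs"])              # libraries exercised by the canonical solution
```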
110 | 111 | 112 | ## 🚀 Remote Evaluation 113 | 114 | We use greedy decoding as an example to show how to evaluate the generated code samples via the remote API. 115 | > [!Warning] 116 | > 117 | > To speed up generation, we use batch inference by default. However, batch inference results can vary across *batch sizes* and *versions*, at least for the vLLM backend. If you want more deterministic results for greedy decoding, please set `--bs` to `1`. 118 | 119 | > [!Note] 120 | > 121 | > The `gradio` backend typically takes 6-7 minutes on `BigCodeBench-Full` and 4-5 minutes on `BigCodeBench-Hard`. 122 | > The `e2b` backend with the default machine typically takes 25-30 minutes on `BigCodeBench-Full` and 15-20 minutes on `BigCodeBench-Hard`. 123 | 124 | ```bash 125 | bigcodebench.evaluate \ 126 | --model meta-llama/Meta-Llama-3.1-8B-Instruct \ 127 | --execution [e2b|gradio|local] \ 128 | --split [complete|instruct] \ 129 | --subset [full|hard] \ 130 | --backend [vllm|openai|anthropic|google|mistral|hf|hf-inference] 131 | ``` 132 | 133 | - All the resulting files will be stored in a folder named `bcb_results`. 134 | - The generated code samples will be stored in a file named `[model_name]--bigcodebench-[instruct|complete]--[backend]-[temp]-[n_samples]-sanitized_calibrated.jsonl`. 135 | - The evaluation results will be stored in a file named `[model_name]--bigcodebench-[instruct|complete]--[backend]-[temp]-[n_samples]-sanitized_calibrated_eval_results.json`. 136 | - The pass@k results will be stored in a file named `[model_name]--bigcodebench-[instruct|complete]--[backend]-[temp]-[n_samples]-sanitized_calibrated_pass_at_k.json`. 137 | 138 | > [!Note] 139 | > 140 | > The `gradio` backend is hosted on the [Hugging Face space](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) by default. 141 | > The default space can sometimes be slow, so we recommend using the `gradio` backend with a cloned [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) endpoint for faster evaluation. 142 | > Alternatively, you can use the `e2b` sandbox for evaluation, which is also fairly slow on the default machine. 143 | 144 | > [!Note] 145 | > 146 | > BigCodeBench uses different prompts for base and chat models. 147 | > By default, this is detected via `tokenizer.chat_template` when using `hf`/`vllm` as the backend. 148 | > For other backends, only chat mode is allowed. 149 | > 150 | > Therefore, if your base model comes with a `tokenizer.chat_template`, 151 | > please add `--direct_completion` to avoid being evaluated 152 | > in chat mode. 153 | 154 | To use E2B, you need to set up an account and get an API key from [E2B](https://e2b.dev/). 
155 | 156 | ```bash 157 | export E2B_API_KEY= 158 | ``` 159 | 160 | Access OpenAI APIs from [OpenAI Console](https://platform.openai.com/) 161 | ```bash 162 | export OPENAI_API_KEY= 163 | ``` 164 | 165 | Access Anthropic APIs from [Anthropic Console](https://console.anthropic.com/) 166 | ```bash 167 | export ANTHROPIC_API_KEY= 168 | ``` 169 | 170 | Access Mistral APIs from [Mistral Console](https://console.mistral.ai/) 171 | ```bash 172 | export MISTRAL_API_KEY= 173 | ``` 174 | 175 | Access Gemini APIs from [Google AI Studio](https://aistudio.google.com/) 176 | ```bash 177 | export GOOGLE_API_KEY= 178 | ``` 179 | 180 | Access the [Hugging Face Serverless Inference API](https://huggingface.co/docs/api-inference/en/index) 181 | ```bash 182 | export HF_INFERENCE_API_KEY= 183 | ``` 184 | 185 | Please make sure your HF access token has the `Make calls to inference providers` permission. 186 | 187 | ## 💻 LLM-generated Code 188 | 189 | We share pre-generated code samples from LLMs we have [evaluated](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard) on the full set: 190 | * See the attachment of our [v0.2.4](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.4). We include `sanitized_samples_calibrated.zip` for your convenience. 191 | 192 | ## 🧑 Advanced Usage 193 | 194 | Please refer to the [ADVANCED USAGE](https://github.com/bigcode-project/bigcodebench/blob/main/ADVANCED_USAGE.md) for more details. 195 | 196 | ## 📰 Result Submission 197 | 198 | Please email both the generated code samples and the execution results to [terry.zhuo@monash.edu](mailto:terry.zhuo@monash.edu) if you would like to contribute your model to the leaderboard. Note that the file names should be in the format of `[model_name]--[revision]--[bigcodebench|bigcodebench-hard]-[instruct|complete]--[backend]-[temp]-[n_samples]-sanitized_calibrated.jsonl` and `[model_name]--[revision]--[bigcodebench|bigcodebench-hard]-[instruct|complete]--[backend]-[temp]-[n_samples]-sanitized_calibrated_eval_results.json`. You can [file an issue](https://github.com/bigcode-project/bigcodebench/issues/new/choose) to remind us if we do not respond to your email within 3 days. 
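For instance, a greedy-decoding run (temperature `0`, a single sample per task) of the model from the example above with the `vllm` backend on the hard subset would be submitted with file names along the following lines; slashes in the model name are replaced by `--`, and the `main` revision shown here is only a hypothetical placeholder:

```
meta-llama--Meta-Llama-3.1-8B-Instruct--main--bigcodebench-hard-instruct--vllm-0-1-sanitized_calibrated.jsonl
meta-llama--Meta-Llama-3.1-8B-Instruct--main--bigcodebench-hard-instruct--vllm-0-1-sanitized_calibrated_eval_results.json
```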
199 | 200 | ## 📜 Citation 201 | 202 | ```bibtex 203 | @article{zhuo2024bigcodebench, 204 | title={BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions}, 205 | author={Zhuo, Terry Yue and Vu, Minh Chien and Chim, Jenny and Hu, Han and Yu, Wenhao and Widyasari, Ratnadira and Yusuf, Imam Nur Bani and Zhan, Haolan and He, Junda and Paul, Indraneil and others}, 206 | journal={arXiv preprint arXiv:2406.15877}, 207 | year={2024} 208 | } 209 | ``` 210 | 211 | ## 🙏 Acknowledgement 212 | 213 | - [EvalPlus](https://github.com/evalplus/evalplus) 214 | -------------------------------------------------------------------------------- /Requirements/requirements-eval.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.8.2 2 | blake3==0.4.1 3 | chardet==5.2.0 4 | cryptography==38.0.0 5 | datetime==5.5 6 | Django==4.2.7 7 | dnspython==2.6.1 8 | docxtpl==0.11.5 9 | Faker==20.1.0 10 | flask_login==0.6.3 11 | flask_restful==0.3.10 12 | flask_wtf==1.2.1 13 | Flask-Mail==0.9.1 14 | flask==3.0.3 15 | folium==0.16.0 16 | gensim==4.3.2 17 | geopandas==0.13.2 18 | geopy==2.4.1 19 | holidays==0.29 20 | keras==2.11.0 21 | Levenshtein==0.25.0 22 | librosa==0.10.1 23 | lxml==4.9.3 24 | matplotlib==3.7.0 25 | mechanize==0.4.9 26 | natsort==7.1.1 27 | networkx==2.6.3 28 | nltk==3.8 29 | numba==0.55.0 30 | numpy==1.21.2 31 | opencv-python-headless==4.9.0.80 32 | openpyxl==3.1.2 33 | pandas==2.0.3 34 | Pillow==10.3.0 35 | prettytable==3.10.0 36 | psutil==5.9.5 37 | pycryptodome==3.14.1 38 | pyfakefs==5.4.1 39 | pyquery==1.4.3 40 | pytesseract==0.3.10 41 | pytest==8.2.0 42 | python_http_client==3.3.7 43 | python-dateutil==2.9.0 44 | python-docx==1.1.0 45 | python-Levenshtein-wheels 46 | pytz==2023.3.post1 47 | PyYAML==6.0.1 48 | requests_mock==1.11.0 49 | requests==2.31.0 50 | Requests==2.31.0 51 | rsa==4.9 52 | scikit-image==0.18.0 53 | scikit-learn==1.3.1 54 | scipy==1.7.2 55 | seaborn==0.13.2 56 | selenium==4.15 57 | sendgrid==6.11.0 58 | shapely==2.0.4 59 | soundfile==0.12.1 60 | statsmodels==0.14.0 61 | statsmodels==0.14.0 62 | sympy==1.12 63 | tensorflow==2.11.0 64 | textblob==0.18.0 65 | texttable==1.7.0 66 | Werkzeug==3.0.1 67 | wikipedia==1.4.0 68 | wordcloud==1.9.3 69 | wordninja==2.0.0 70 | WTForms==3.1.2 71 | xlrd==2.0.1 72 | xlrd==2.0.1 73 | xlwt==1.3.0 74 | xmltodict==0.13.0 -------------------------------------------------------------------------------- /Requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs>=1.4.4 2 | fire>=0.6.0 3 | multipledispatch>=0.6.0 4 | pqdm>=0.2.0 5 | tempdir>=0.7.1 6 | termcolor>=2.0.0 7 | tqdm>=4.56.0 8 | tree_sitter_languages>=1.10.2 9 | tree-sitter==0.21.3 10 | wget>=3.2 11 | -------------------------------------------------------------------------------- /analysis/bcb_subset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import json 3 | import numpy as np 4 | from tqdm import tqdm 5 | from ast import literal_eval 6 | from glob import glob 7 | from sentence_transformers import SentenceTransformer, util 8 | import matplotlib.pyplot as plt 9 | from transformers import AutoTokenizer 10 | from datasets import load_dataset, Dataset, Features, Value, Sequence, DatasetDict 11 | 12 | from utils import * 13 | 14 | VERSION = "v0.1.0_hf" 15 | 16 | def update_model_info(model_info): 17 | for model, info in model_info.items(): 18 | if "https://huggingface.co/" in 
info["link"]: 19 | hf_model = info["link"].split("https://huggingface.co/")[-1] 20 | print(hf_model) 21 | tokenizer = AutoTokenizer.from_pretrained(hf_model, trust_remote_code=True) 22 | if tokenizer.chat_template is None: 23 | model_info[model]["direct_complete"] = True 24 | else: 25 | model_info[model]["direct_complete"] = False 26 | else: 27 | model_info[model]["direct_complete"] = False 28 | 29 | return model_info 30 | 31 | 32 | def embed_sentences(data, col_name, id_name, model, save_path, push_to_hub=False): 33 | pool = model.start_multi_process_pool() 34 | embeddings = model.encode_multi_process(data[col_name], pool=pool) 35 | qids = data[id_name] 36 | features = Features({id_name: Value(dtype='string'), 'embeddings': Sequence(Value('float32'))}) 37 | embed_dict = { 38 | id_name: qids, 39 | "embeddings": embeddings 40 | } 41 | embed_ds = Dataset.from_dict(embed_dict, features=features) 42 | if push_to_hub: 43 | embed_ds.push_to_hub(f"bigcode/{save_path}") 44 | else: 45 | embed_ds.save_to_disk(save_path) 46 | return embed_ds 47 | 48 | 49 | def get_top_docs(query_embs, doc_emb, docs): 50 | scores = np.dot(query_embs, doc_emb.T) 51 | top_doc_indices = np.argmax(scores, axis=1) 52 | top_scores = scores[np.arange(len(scores)), top_doc_indices] 53 | results = [(i, docs[doc_idx], score) for i, (doc_idx, score) in tqdm(enumerate(zip(top_doc_indices, top_scores)))] 54 | 55 | return results 56 | 57 | 58 | def filter_top_k_percent(results, k_percent): 59 | all_scores = [score for _, score in results] 60 | threshold = np.percentile(all_scores, 100 - k_percent) 61 | filtered_results = [(i, doc, score) for i, doc, score in results if score > threshold] 62 | return filtered_results 63 | 64 | 65 | def filter_top_threshold(results, threshold): 66 | filtered_results = [(i, doc, score) for i, doc, score in results if score > threshold] 67 | return filtered_results 68 | 69 | 70 | def read_task_perf(tids, task="complete"): 71 | model_results = dict() 72 | result_files = [] 73 | for model, info in model_info.items(): 74 | if task == "instruct" and (not info["prompted"] or info["name"] in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]): 75 | continue 76 | task_perf = {f"BigCodeBench/{task_id}": 0 for task_id in range(1140)} 77 | model = model.replace("/", "--") 78 | try: 79 | if info["prompted"] and not info["direct_complete"]: 80 | files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json") 81 | if files: 82 | file = files[0] 83 | else: 84 | file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0] 85 | else: 86 | file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0] 87 | except: 88 | continue 89 | with open(file, "r") as f: 90 | data = json.load(f) 91 | for task_id, perfs in data["eval"].items(): 92 | status = 1 if perfs[0]["status"] == "pass" else 0 93 | task_perf[task_id] = status 94 | model_results[info["name"]] = np.mean([status for tid, status in task_perf.items() if tid in tids]) 95 | return sorted(model_results.items(), key=lambda x: x[1], reverse=True) 96 | 97 | 98 | if __name__ == "__main__": 99 | bcb = load_dataset("bigcode/bigcodebench", trust_remote_code=True, split=VERSION) 100 | se = load_dataset("bigcode/stack-exchange-preferences-20230914-clean-anonymization", trust_remote_code=True, split="train") 101 | model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2") 102 | 103 | model_info = update_model_info(model_info) 104 | 105 | se_embed = embed_sentences(se, 
"question", "qid", model, "stack-exchange-embeddings-20230914", push_to_hub=True) 106 | bcb_embed = embed_sentences(bcb, "complete_prompt", "task_id", model, "bigcodebench-doc-embeddings", push_to_hub=True) 107 | 108 | solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", trust_remote_code=True, split="complete") 109 | 110 | query_embs = np.array(se_embed["embeddings"]) 111 | doc_emb = np.array(bcb_embed["embeddings"]) 112 | docs = bcb_embed["task_id"] 113 | retrieval_results = get_top_docs(query_embs, doc_emb, docs) 114 | 115 | Dataset.from_dict({"qid": [i for i, _, _ in retrieval_results], "tid": [doc for _, doc, _ in retrieval_results], "score": [score for _, _, score in retrieval_results]}).push_to_hub("bigcode/se_bcb_results") 116 | 117 | retrieval_ds = load_dataset("bigcode/se_bcb_results", trust_remote_code=True, split="train") 118 | 119 | top_results = dict() 120 | for sample in tqdm(retrieval_ds): 121 | i, doc, score = sample["qid"], sample["tid"], sample["score"] 122 | if score > 0.7: 123 | if doc not in top_results: 124 | top_results[doc] = (i, doc, score) 125 | else: 126 | if score > top_results[doc][2]: 127 | top_results[doc] = (i, doc, score) 128 | 129 | top_id = {task_id: (qid, score) for qid, task_id, score in top_results.values()} 130 | 131 | hard_lib_filter = {sample["task_id"] for sample in bcb if len(literal_eval(sample["libs"])) > 2} 132 | hard_length_filter = {sample["task_id"] for sample in bcb if len(sample["canonical_solution"]) > 426} 133 | hard_rate_filter = {task["task_id"]: task["solve_rate"] for task in solve_rate if task["solve_rate"] < 50} 134 | 135 | hard_tid = top_id.keys() & hard_length_filter & hard_rate_filter.keys() & hard_lib_filter 136 | 137 | hard_bcb = bcb.filter(lambda x: x["task_id"] in hard_tid) 138 | hard_bcb_tid = bcb.filter(lambda x: x["task_id"] in hard_tid)["task_id"] 139 | hard_se_qid = [top_id[_id][0] for _id in hard_bcb_tid] 140 | hard_se_q = se.select(hard_se_qid) 141 | hard_se_scores = [top_id[_id][1] for _id in hard_bcb_tid] 142 | hard_bcb_dict = { 143 | "task_id": hard_bcb_tid, 144 | "complete_prompt": hard_bcb["complete_prompt"], 145 | "instruct_prompt": hard_bcb["instruct_prompt"], 146 | "canonical_solution": hard_bcb["canonical_solution"], 147 | "code_prompt": hard_bcb["code_prompt"], 148 | "test": hard_bcb["test"], 149 | "entry_point": hard_bcb["entry_point"], 150 | "doc_struct": hard_bcb["doc_struct"], 151 | "libs": hard_bcb["libs"], 152 | "q_idx": hard_se_qid, 153 | "question": hard_se_q["question"], 154 | "score": hard_se_scores, 155 | "_id": hard_bcb_tid 156 | } 157 | hard_bcb = Dataset.from_dict(hard_bcb_dict) 158 | DatasetDict({VERSION: hard_bcb}).push_to_hub("bigcode/bigcodebench-hard") 159 | 160 | hard_complete_results = read_task_perf(hard_tid) 161 | hard_instruct_results = read_task_perf(hard_tid, task="instruct") 162 | 163 | complete_res_dict = {model: score for model, score in hard_complete_results} 164 | instruct_res_dict = {model: score for model, score in hard_instruct_results} 165 | avg_res_dict = {model: (complete_res_dict[model] + instruct_res_dict[model]) / 2 for model in complete_res_dict if model in instruct_res_dict} 166 | 167 | for model, score in sorted(avg_res_dict.items(), key=lambda x: x[1], reverse=True): 168 | print(model, round(score*100, 1)) -------------------------------------------------------------------------------- /analysis/get_results.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import numpy as 
np 5 | from numpy import mean 6 | from glob import glob 7 | from utils import model_info 8 | from tqdm import tqdm 9 | import pandas as pd 10 | import itertools 11 | import math 12 | from datasets import Dataset, DatasetDict, load_dataset 13 | from transformers import AutoTokenizer 14 | 15 | def update_model_info(model_info): 16 | for model, info in model_info.items(): 17 | if "https://huggingface.co/" in info["link"]: 18 | hf_model = info["link"].split("https://huggingface.co/")[-1] 19 | print(hf_model) 20 | try: 21 | tokenizer = AutoTokenizer.from_pretrained(hf_model, trust_remote_code=True) 22 | 23 | if tokenizer.chat_template is None: 24 | model_info[model]["direct_complete"] = True 25 | else: 26 | model_info[model]["direct_complete"] = False 27 | except: 28 | model_info[model]["direct_complete"] = True 29 | else: 30 | model_info[model]["direct_complete"] = False 31 | 32 | return model_info 33 | 34 | 35 | def get_results(tids): 36 | results = {} 37 | for model, info in model_info.items(): 38 | results[info["name"]] = { 39 | "link": info["link"], 40 | "open-data": info["open-data"], 41 | "pass@1": { 42 | "complete": None, 43 | "instruct": None, 44 | "complete-cal": None, 45 | "instruct-cal": None, 46 | }, 47 | "prompted": info["prompted"], 48 | "moe": info["moe"], 49 | "size": info["size"], 50 | "act_param": info["act_param"], 51 | "date": info.get("date", None), 52 | "prefill": info.get("prefill", False), 53 | # "direct_complete": info["direct_complete"], 54 | } 55 | 56 | for model, info in model_info.items(): 57 | model = model.replace("/", "--") 58 | hf_model = "" 59 | files = glob(f"results/{model}--bigcodebench-*_eval_results.json") 60 | assert files, f"No files found for results/{model}--bigcodebench-*_eval_results.json" 61 | for file in files: 62 | try: 63 | _, suffix = os.path.basename(file).split("--bigcodebench-hard-") 64 | with open("results/"+model+"--bigcodebench-hard-"+suffix, "r") as f: 65 | data = json.load(f) 66 | except: 67 | _, suffix = os.path.basename(file).split("--bigcodebench-") 68 | with open("results/"+model+"--bigcodebench-"+suffix, "r") as f: 69 | data = json.load(f) 70 | status = [] 71 | 72 | if len(data["eval"]) < len(tids): 73 | continue 74 | for key, value in data["eval"].items(): 75 | if key not in tids: 76 | continue 77 | if value[0]["status"] == "pass": 78 | status.append(1) 79 | else: 80 | status.append(0) 81 | if suffix.startswith("complete"): 82 | task = "complete" 83 | elif suffix.startswith("instruct"): 84 | task = "instruct" 85 | else: 86 | raise ValueError("Unknown task") 87 | 88 | mode = "" 89 | if "calibrated" in file: 90 | mode = "-cal" 91 | 92 | results[info["name"]][f"pass@1"][f"{task}{mode}"] = round(mean(status)*100,1) 93 | if not info["prompted"]:# or info["direct_complete"]: 94 | results[info["name"]][f"pass@1"][f"{task}-cal"] = round(mean(status)*100,1) 95 | 96 | for model, result in results.items(): 97 | for task in ["complete"]: 98 | origin = result["pass@1"].pop(task) 99 | # assert origin, f"Missing original complete results for {model}" 100 | calibrate = result["pass@1"].pop(f"{task}-cal") 101 | if calibrate: 102 | # if calibrate - origin > 1: 103 | # results[model]["lazy"] = True 104 | # else: 105 | # results[model]["lazy"] = False 106 | results[model]["pass@1"][task] = calibrate 107 | else: 108 | # results[model]["lazy"] = False 109 | results[model]["pass@1"][task] = origin 110 | calibrate_instruct = result["pass@1"].pop(f"instruct-cal") 111 | result["pass@1"]["instruct"] = calibrate_instruct 112 | return results 113 | 114 | 115 
| def check_valid(results): 116 | for model, result in results.items(): 117 | if result["prompted"] and model not in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]: 118 | assert result["pass@1"]["instruct"], model 119 | assert result["pass@1"]["complete"] 120 | 121 | 122 | def split_gen(): 123 | shutil.rmtree("sanitized_calibrated_samples", ignore_errors=True) 124 | os.makedirs("sanitized_calibrated_samples/hard/complete", exist_ok=True) 125 | os.makedirs("sanitized_calibrated_samples/hard/instruct", exist_ok=True) 126 | os.makedirs("sanitized_calibrated_samples/full/complete", exist_ok=True) 127 | os.makedirs("sanitized_calibrated_samples/full/instruct", exist_ok=True) 128 | 129 | for model, info in model_info.items(): 130 | model = model.replace("/", "--") 131 | files = glob(f"results/{model}--bigcodebench-*.jsonl") 132 | if info["link"].startswith("https://huggingface.co/"): 133 | model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--") 134 | 135 | for file in files: 136 | if "-sanitized" not in file or "calibrated" not in file: 137 | continue 138 | 139 | _, suffix = os.path.basename(file).split("--bigcodebench-") 140 | with open(file, "r") as f: 141 | data = f.readlines() 142 | 143 | split_type = "hard" if "-hard-" in file else "full" 144 | if info["prompted"]: 145 | if suffix.startswith("complete") or suffix.startswith("hard-complete"): 146 | with open(f"sanitized_calibrated_samples/{split_type}/complete/{model}--bigcodebench-{suffix}", "w") as f: 147 | f.writelines(data) 148 | else: 149 | with open(f"sanitized_calibrated_samples/{split_type}/instruct/{model}--bigcodebench-{suffix}", "w") as f: 150 | f.writelines(data) 151 | 152 | def read_task_perf(tids, task="complete"): 153 | model_results = dict() 154 | result_files = [] 155 | for model, info in model_info.items(): 156 | if task == "instruct" and (not info["prompted"] or info["name"] in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]): 157 | continue 158 | 159 | task_perf = dict() 160 | model = model.replace("/", "--") 161 | try: 162 | try: 163 | try: 164 | if info["prompted"]: 165 | files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized*calibrated_eval_results.json") 166 | if files: 167 | file = files[0] 168 | else: 169 | file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0] 170 | else: 171 | file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0] 172 | except: 173 | if info["prompted"]:# and not info["direct_complete"]: 174 | files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized*calibrated_hard_eval_results.json") 175 | if files: 176 | file = files[0] 177 | else: 178 | file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0] 179 | else: 180 | file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0] 181 | except: 182 | try: 183 | if info["prompted"]:# and not info["direct_complete"]: 184 | files = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized*calibrated_hard_eval_results.json") 185 | if files: 186 | file = files[0] 187 | else: 188 | file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_hard_eval_results.json")[0] 189 | else: 190 | file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_hard_eval_results.json")[0] 191 | except: 192 | if info["prompted"]: 193 | files = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized*calibrated_eval_results.json") 194 | if files: 
195 | file = files[0] 196 | else: 197 | file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_eval_results.json")[0] 198 | else: 199 | file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_eval_results.json")[0] 200 | except: 201 | continue 202 | 203 | result_files.append(file) 204 | with open(file, "r") as f: 205 | data = json.load(f) 206 | 207 | if len(data["eval"]) < len(tids): 208 | continue 209 | for task_id, perfs in data["eval"].items(): 210 | if task_id in tids: 211 | status = 1 if perfs[0]["status"] == "pass" else 0 212 | task_perf[task_id] = status 213 | model_results[info["name"]] = task_perf 214 | return model_results, result_files 215 | 216 | 217 | def get_domain_perf(data_dict, task2domain): 218 | domain_perfs = { 219 | "Model": [], 220 | "Computation": [], 221 | "General": [], 222 | "Visualization": [], 223 | "System": [], 224 | "Time": [], 225 | "Network": [], 226 | "Cryptography": [] 227 | } 228 | for model, task_perf in data_dict.items(): 229 | model_domain = {"Computation": [], "General": [], "Visualization": [], "System": [], "Time": [], "Network": [], "Cryptography": []} 230 | for task_id, status in task_perf.items(): 231 | domains = task2domain[task_id] 232 | for domain in domains: 233 | model_domain[domain].append(status) 234 | domain_perf = {domain: round(np.mean(perfs)*100, 1) for domain, perfs in model_domain.items()} 235 | domain_perfs["Model"].append(model) 236 | for domain in model_domain.keys(): 237 | domain_perfs[domain].append(domain_perf[domain]) 238 | return Dataset.from_dict(domain_perfs) 239 | 240 | 241 | def get_solve_rate(data_dict, task="complete"): 242 | task_solve_count = dict() 243 | for model, task_perf in data_dict.items(): 244 | for task_id, score in task_perf.items(): 245 | if task_id not in task_solve_count: 246 | task_solve_count[task_id] = [] 247 | task_solve_count[task_id].append(score) 248 | solve_rate = {task_id: round(np.mean(perfs) * 100, 1) for task_id, perfs in task_solve_count.items()} 249 | return Dataset.from_dict({"task_id": list(solve_rate.keys()), "solve_rate": list(solve_rate.values())}) 250 | 251 | 252 | def get_hf_ds(results): 253 | hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [], #"lazy": [],# "direct_complete": [], 254 | "complete": [], "instruct": [], "date": [], "prefill": []} 255 | 256 | for model, result in results.items(): 257 | hf_dataset["model"].append(model) 258 | hf_dataset["link"].append(result["link"]) 259 | hf_dataset["moe"].append(result["moe"]) 260 | hf_dataset["size"].append(result["size"]) 261 | hf_dataset["act_param"].append(result["act_param"]) 262 | hf_dataset["type"].append("🔶" if result["prompted"] else "🟢") 263 | # hf_dataset["lazy"].append(result["lazy"]) 264 | hf_dataset["complete"].append(result["pass@1"]["complete"]) 265 | hf_dataset["instruct"].append(result["pass@1"]["instruct"]) 266 | hf_dataset["date"].append(result["date"]) 267 | hf_dataset["prefill"].append(result["prefill"]) 268 | # hf_dataset["direct_complete"].append(result["direct_complete"]) 269 | 270 | return Dataset.from_dict(hf_dataset) 271 | 272 | def get_bootstrap_scores(df): 273 | bars = pd.DataFrame(dict( 274 | lower = df.quantile(.025), 275 | rating = df.quantile(.5), 276 | upper = df.quantile(.975))).reset_index(names="model").sort_values("rating", ascending=False) 277 | 278 | bars['error_y'] = bars['upper'] - bars["rating"] 279 | bars['error_y_minus'] = bars['rating'] - bars["lower"] 280 | bars['rating_rounded'] = np.round(bars['rating'], 2) 281 | 
return Dataset.from_pandas(bars) 282 | 283 | 284 | def push_ds(ds, path, local=False): 285 | if local: 286 | ds.save_to_disk(path) 287 | else: 288 | ds.push_to_hub(path) 289 | 290 | 291 | def get_perf_df(data_dict): 292 | perfs = {"Model": []} 293 | for task_id in data_dict[list(data_dict.keys())[0]]: 294 | perfs[task_id] = [] 295 | for model, task_perf in data_dict.items(): 296 | perfs["Model"].append(model) 297 | for task_id, status in task_perf.items(): 298 | perfs[task_id].append(status) 299 | return pd.DataFrame(perfs) 300 | 301 | 302 | if __name__ == "__main__": 303 | split_gen() 304 | bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.1") 305 | bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.1") 306 | bcb_config = { 307 | "": bcb_orig, 308 | "-hard": bcb_hard, 309 | } 310 | for suffix, bcb in bcb_config.items(): 311 | results = get_results(bcb["task_id"]) 312 | files = [] 313 | complete_data, complete_files = read_task_perf(bcb["task_id"], "complete") 314 | instruct_data, instruct_files = read_task_perf(bcb["task_id"], "instruct") 315 | complete_df = get_perf_df(complete_data) 316 | instruct_df = get_perf_df(instruct_data) 317 | 318 | push_ds(DatasetDict({"complete": Dataset.from_pandas(complete_df), "instruct": Dataset.from_pandas(instruct_df)}), f"bigcode/bigcodebench{suffix}-perf") 319 | 320 | with open("task2domain.json", "r") as f: 321 | task2domain = json.load(f) 322 | domain_complete = get_domain_perf(complete_data, task2domain) 323 | domain_instruct = get_domain_perf(instruct_data, task2domain) 324 | DatasetDict({"complete": domain_complete, "instruct": domain_instruct}).push_to_hub(f"bigcode/bigcodebench{suffix}-domain") 325 | 326 | files.extend(complete_files) 327 | files.extend(instruct_files) 328 | shutil.rmtree("eval_results", ignore_errors=True) 329 | os.makedirs("eval_results", exist_ok=True) 330 | for file in files: 331 | shutil.copy(file, "eval_results") 332 | 333 | complete_solve_rate = get_solve_rate(complete_data, task="complete") 334 | instruct_solve_rate = get_solve_rate(instruct_data, task="instruct") 335 | solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate}) 336 | push_ds(solve_rate_ds, f"bigcode/bigcodebench{suffix}-solve-rate") 337 | 338 | with open(f"results{suffix}.json", "w") as f: 339 | json.dump(results, f, indent=4) 340 | ds = get_hf_ds(results) 341 | push_ds(ds, f"bigcode/bigcodebench{suffix}-results") -------------------------------------------------------------------------------- /analysis/lib2domain.json: -------------------------------------------------------------------------------- 1 | { 2 | "Crypto": "Cryptography", 3 | "PIL": "Visualization", 4 | "array": "General", 5 | "base64": "Cryptography", 6 | "binascii": "Cryptography", 7 | "bisect": "General", 8 | "blake3": "Cryptography", 9 | "bs4": "Network", 10 | "calendar": "Time", 11 | "cgi": "Network", 12 | "chardet": "Network", 13 | "cmath": "Computation", 14 | "codecs": "Cryptography", 15 | "collections": "General", 16 | "cryptography": "Cryptography", 17 | "csv": "System", 18 | "ctypes": "System", 19 | "datetime": "Time", 20 | "dateutil": "Time", 21 | "difflib": "General", 22 | "django": "Network", 23 | "docx": "System", 24 | "email": "Network", 25 | "faker": "General", 26 | "flask": "Network", 27 | "flask_login": "Network", 28 | "flask_mail": "Network", 29 | "flask_restful": "Network", 30 | "fnmatch": "General", 31 | "folium": "Visualization", 32 | "functools": "General", 33 | "geopy": "Network", 34 | "getpass": "System", 
35 | "glob": "System", 36 | "gzip": "System", 37 | "hashlib": "Cryptography", 38 | "heapq": "General", 39 | "hmac": "Cryptography", 40 | "html": "Network", 41 | "http": "Network", 42 | "importlib": "General", 43 | "inspect": "General", 44 | "io": "System", 45 | "ipaddress": "Network", 46 | "itertools": "General", 47 | "json": "System", 48 | "keras": "Computation", 49 | "librosa": "Computation", 50 | "logging": "System", 51 | "lxml": "Network", 52 | "math": "Computation", 53 | "matplotlib": "Visualization", 54 | "mechanize": "Network", 55 | "mimetypes": "Network", 56 | "multiprocessing": "System", 57 | "nltk": "Computation", 58 | "numpy": "Computation", 59 | "openpyxl": "System", 60 | "operator": "General", 61 | "os": "System", 62 | "pandas": "Computation", 63 | "pathlib": "System", 64 | "pickle": "System", 65 | "pkgutil": "General", 66 | "platform": "System", 67 | "prettytable": "General", 68 | "psutil": "System", 69 | "pytesseract": "Computation", 70 | "pytz": "Time", 71 | "queue": "General", 72 | "random": "General", 73 | "re": "General", 74 | "requests": "Network", 75 | "rsa": "Cryptography", 76 | "scipy": "Computation", 77 | "seaborn": "Visualization", 78 | "secrets": "Cryptography", 79 | "select": "System", 80 | "sendgrid": "Network", 81 | "shutil": "System", 82 | "sklearn": "Computation", 83 | "smtplib": "Network", 84 | "socket": "Network", 85 | "soundfile": "Computation", 86 | "sqlite3": "System", 87 | "ssl": "Network", 88 | "statistics": "Computation", 89 | "statsmodels": "Computation", 90 | "string": "General", 91 | "struct": "System", 92 | "subprocess": "System", 93 | "sys": "System", 94 | "tarfile": "System", 95 | "tensorflow": "Computation", 96 | "texttable": "General", 97 | "textwrap": "General", 98 | "threading": "System", 99 | "time": "Time", 100 | "turtle": "Visualization", 101 | "types": "General", 102 | "unicodedata": "General", 103 | "urllib": "Network", 104 | "uuid": "General", 105 | "warnings": "General", 106 | "werkzeug": "Network", 107 | "wordninja": "Computation", 108 | "wtforms": "Network", 109 | "xlwt": "System", 110 | "xml": "Network", 111 | "xmltodict": "Network", 112 | "yaml": "System", 113 | "zipfile": "System", 114 | "Levenshtein": "Computation", 115 | "ast": "General", 116 | "configparser": "System", 117 | "cv2": "Computation", 118 | "decimal": "General", 119 | "enum": "General", 120 | "errno": "System", 121 | "flask_wtf": "Network", 122 | "ftplib": "Network", 123 | "gensim": "Computation", 124 | "geopandas": "Computation", 125 | "holidays": "Time", 126 | "mpl_toolkits": "Visualization", 127 | "natsort": "General", 128 | "pyquery": "Network", 129 | "python_http_client": "Network", 130 | "regex": "General", 131 | "shapely": "Computation", 132 | "shlex": "System", 133 | "signal": "System", 134 | "skimage": "Computation", 135 | "sympy": "Computation", 136 | "textblob": "Computation", 137 | "typing": "General", 138 | "wikipedia": "Network", 139 | "wordcloud": "Visualization", 140 | "zlib": "System", 141 | "aspose": "System", 142 | "builtins": "General", 143 | "locale": "System", 144 | "imp": "System", 145 | "docxtpl": "System", 146 | "selenium": "Network", 147 | "IPython": "Computation", 148 | "filecmp": "System", 149 | "multidict": "General", 150 | "sqlalchemy": "System", 151 | "obspy": "Computation", 152 | "pprint": "General", 153 | "xlrd": "System", 154 | "argparse": "General", 155 | "torch": "Computation", 156 | "copy": "General" 157 | } -------------------------------------------------------------------------------- /bigcodebench/__init__.py: 
-------------------------------------------------------------------------------- 1 | try: 2 | from bigcodebench._version import __version__, __version_tuple__ 3 | except ImportError: 4 | __version__ = "local-dev" 5 | -------------------------------------------------------------------------------- /bigcodebench/data/__init__.py: -------------------------------------------------------------------------------- 1 | from bigcodebench.data.bigcodebench import get_bigcodebench, get_bigcodebench_hash 2 | from bigcodebench.data.utils import load_solutions, write_directory, write_jsonl 3 | -------------------------------------------------------------------------------- /bigcodebench/data/bigcodebench.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | import os 4 | from typing import Dict 5 | 6 | from bigcodebench.data.utils import ( 7 | CACHE_DIR, 8 | completeness_check, 9 | get_dataset_metadata, 10 | make_cache, 11 | stream_jsonl, 12 | ) 13 | from datasets import load_dataset 14 | 15 | BIGCODEBENCH_OVERRIDE_PATH = os.environ.get("BIGCODEBENCH_OVERRIDE_PATH", None) 16 | BIGCODEBENCH_HF = "bigcode/bigcodebench" 17 | BIGCODEBENCH_VERSION = "v0.1.4" 18 | 19 | def _ready_bigcodebench_path(subset="full", version="default") -> str: 20 | if BIGCODEBENCH_OVERRIDE_PATH: 21 | return BIGCODEBENCH_OVERRIDE_PATH 22 | 23 | version = BIGCODEBENCH_VERSION if version == "default" else version 24 | url, path = get_dataset_metadata( 25 | version, subset 26 | ) 27 | 28 | extra = "-" + subset if subset != "full" else "" 29 | dataset = load_dataset(BIGCODEBENCH_HF+extra, split=version) 30 | make_cache(url, dataset, path) 31 | 32 | return path 33 | 34 | 35 | def get_bigcodebench( 36 | err_incomplete=True, subset="full", version="default" 37 | ) -> Dict[str, Dict]: 38 | """Get BigCodeBench from the Hugging Face Hub (or the local cache) and return it as a dict of parsed tasks keyed by task_id. 39 | 40 | Returns: 41 | Dict[str, Dict]: Mapping from "task_id" to a task dict with keys "complete_prompt", "instruct_prompt", "canonical_solution", "code_prompt", "test", "entry_point" 42 | 43 | Notes: 44 | "task_id" is the identifier string for the task. 45 | "complete_prompt" is the prompt to be used for BigCodeBench-Complete. 46 | "instruct_prompt" is the prompt to be used for BigCodeBench-Instruct. 47 | "canonical_solution" is the ground-truth implementation. 48 | "test" is the `unittest.TestCase` class. 49 | "entry_point" is the name of the function. 50 | """ 51 | # Check if the cached dataset file exists in CACHE_DIR 52 | data_path = _ready_bigcodebench_path( 53 | subset=subset, version=version 54 | ) 55 | data = {task["task_id"]: task for task in stream_jsonl(data_path)} 56 | if err_incomplete: 57 | completeness_check("BigCodeBench", data) 58 | return data 59 | 60 | def get_bigcodebench_hash(subset="full", version="default") -> str: 61 | """Get the hash of BigCodeBench. 
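The hash is the MD5 digest of the locally cached JSONL file, so it changes whenever the cached copy of the dataset is refreshed.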
62 | Returns: 63 | str: The hash of BigCodeBench 64 | """ 65 | data_path = _ready_bigcodebench_path(subset, version="default") 66 | with open(data_path, "rb") as f: 67 | data = f.read() 68 | return hashlib.md5(data).hexdigest() 69 | -------------------------------------------------------------------------------- /bigcodebench/data/utils.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import os 4 | from os import PathLike 5 | from typing import Dict, Iterable 6 | 7 | import tempdir 8 | import wget 9 | from appdirs import user_cache_dir 10 | 11 | CACHE_DIR = user_cache_dir("bigcodebench") 12 | 13 | 14 | def get_dataset_metadata(version: str, subset: str="full"): 15 | extra = "-" + subset.capitalize() if subset != "full" else "" 16 | url = f"https://github.com/bigcode-project/bigcodebench-annotation/releases/download/{version}/BigCodeBench{extra}.jsonl.gz" 17 | cache_path = os.path.join(CACHE_DIR, f"BigCodeBench{extra}-{version}.jsonl") 18 | return url, cache_path 19 | 20 | 21 | def make_cache(gzip_url, hf_data, cache_path, gh=False): 22 | # Check if open eval file exists in CACHE_DIR 23 | 24 | if not os.path.exists(cache_path): 25 | if gh: 26 | # Install BigCodeBench dataset and parse as jsonl 27 | print(f"Downloading dataset from {gzip_url}") 28 | with tempdir.TempDir() as tmpdir: 29 | gz_path = os.path.join(tmpdir, f"data.jsonl.gz") 30 | wget.download(gzip_url, gz_path) 31 | 32 | with gzip.open(gz_path, "rb") as f: 33 | data = f.read().decode("utf-8") 34 | 35 | # create CACHE_DIR if not exists 36 | if not os.path.exists(CACHE_DIR): 37 | os.makedirs(CACHE_DIR) 38 | 39 | # Write the original open eval file to CACHE_DIR 40 | with open(cache_path, "w") as f: 41 | f.write(data) 42 | else: 43 | hf_data.to_json(cache_path) 44 | 45 | 46 | def write_jsonl( 47 | filename: str, data: Iterable[Dict], append: bool = False, drop_builtin: bool = True 48 | ): 49 | """ 50 | Writes an iterable of dictionaries to jsonl 51 | """ 52 | if append: 53 | mode = "ab" 54 | else: 55 | mode = "wb" 56 | filename = os.path.expanduser(filename) 57 | if filename.endswith(".gz"): 58 | with open(filename, mode) as fp: 59 | with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp: 60 | for x in data: 61 | if drop_builtin: 62 | x = {k: v for k, v in x.items() if not k.startswith("_")} 63 | gzfp.write((json.dumps(x) + "\n").encode("utf-8")) 64 | else: 65 | with open(filename, mode) as fp: 66 | for x in data: 67 | if drop_builtin: 68 | x = {k: v for k, v in x.items() if not k.startswith("_")} 69 | fp.write((json.dumps(x) + "\n").encode("utf-8")) 70 | 71 | 72 | def stream_jsonl(filename: str) -> Iterable[Dict]: 73 | """ 74 | Parses each jsonl line and yields it as a dictionary 75 | """ 76 | if filename.endswith(".gz"): 77 | with open(filename, "rb") as gzfp: 78 | with gzip.open(gzfp, "rt") as fp: 79 | for line in fp: 80 | if any(not x.isspace() for x in line): 81 | yield json.loads(line) 82 | else: 83 | with open(filename, "r") as fp: 84 | for line in fp: 85 | if any(not x.isspace() for x in line): 86 | yield json.loads(line) 87 | 88 | 89 | def load_solutions(sample_path: PathLike) -> Iterable[Dict]: 90 | """We accept two formats of inputs. 91 | + `sample.jsonl` which is the format from BigCodeBench, i.e., {task_id, completion or solution}. 92 | + A folder which contains sub-folders named after the task_id. Each sub-folder 93 | contains samples named in `[?].py` where `?` is the solution id starting with 0. 
94 | Different from `sample.jsonl`, the solutions must be complete (with prompt prefix). 95 | """ 96 | 97 | # if it is a file 98 | if os.path.isfile(sample_path): 99 | for i, sample in enumerate(stream_jsonl(sample_path)): 100 | assert ( 101 | "completion" in sample or "solution" in sample 102 | ), "No completion or solution found in sample!" 103 | assert "solution" not in sample or isinstance( 104 | sample["solution"], str 105 | ), "Solution must be a string! If you have multiple solutions, please repeat the task_id." 106 | assert "completion" not in sample or isinstance( 107 | sample["completion"], str 108 | ), "Completion must be a string! If you have multiple solutions, please repeat the task_id." 109 | 110 | sample["_identifier"] = ( 111 | sample["task_id"] + f" (line {i+1} in {sample_path})" 112 | ) 113 | yield sample 114 | else: 115 | # if it is a folder 116 | for task_id in os.listdir(sample_path): 117 | task_path = os.path.join(sample_path, task_id) 118 | if not os.path.isdir(task_path): 119 | continue 120 | 121 | for solution_id in os.listdir(task_path): 122 | solution_path = os.path.join(task_path, solution_id) 123 | if os.path.isfile(solution_path) and solution_path.endswith(".py"): 124 | with open(solution_path, "r") as f: 125 | completion = f.read() 126 | yield { 127 | "_identifier": solution_path, 128 | "_path": solution_path, 129 | "task_id": task_id.replace("_", "/"), 130 | "solution": completion, 131 | } 132 | 133 | 134 | def write_directory(directory: PathLike, data: Iterable[Dict]): 135 | os.makedirs(directory, exist_ok=True) 136 | counters = {} 137 | for sample in data: 138 | assert "solution" in sample, "Samples must come with `solution` field!" 139 | task_id = sample["task_id"].replace("/", "_") 140 | task_dir = os.path.join(directory, task_id) 141 | os.makedirs(task_dir, exist_ok=True) 142 | if task_id not in counters: 143 | counters[task_id] = 0 144 | sample_id = counters[task_id] 145 | with open(os.path.join(task_dir, f"{sample_id}.py"), "w") as f: 146 | f.write(sample["solution"]) 147 | counters[task_id] += 1 148 | 149 | 150 | def completeness_check(name, data): 151 | for task_id, task in data.items(): 152 | for key in [ 153 | "complete_prompt", 154 | "instruct_prompt", 155 | "canonical_solution", 156 | "code_prompt", 157 | "test", 158 | "entry_point" 159 | ]: 160 | assert key in task, f"{key} not found in {name} #{task_id}!" 161 | 162 | 163 | def to_raw(string): 164 | return string.encode("unicode-escape").decode().replace("\\\\", "\\") 165 | -------------------------------------------------------------------------------- /bigcodebench/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # The MIT License 2 | # 3 | # Copyright (c) OpenAI (https://openai.com) 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import itertools 24 | import multiprocessing 25 | import os 26 | import sys 27 | import time 28 | import types 29 | import unittest 30 | from multiprocessing import Array, Value, Manager 31 | from typing import Any, Dict, List, Tuple, Union 32 | 33 | import numpy as np 34 | 35 | from bigcodebench.eval._special_oracle import ( 36 | _poly, 37 | ) 38 | from bigcodebench.eval.utils import ( 39 | create_tempdir, 40 | reliability_guard, 41 | swallow_io, 42 | time_limit, 43 | safe_environment, 44 | TIMEOUT_LIMIT, 45 | ) 46 | 47 | 48 | def compatible_eval_result(results: Dict) -> Dict: 49 | # compatibility 50 | for task_results in results["eval"].values(): 51 | # update the "files" field to "nfiles" 52 | if "files" in task_results and "nfiles" not in task_results: 53 | task_results["nfiles"] = len(task_results.pop("files")) 54 | return results 55 | 56 | 57 | # unbiased estimator from https://github.com/openai/human-eval 58 | def estimate_pass_at_k( 59 | num_samples: Union[int, List[int], np.ndarray], 60 | num_correct: Union[List[int], np.ndarray], 61 | k: int, 62 | ) -> np.ndarray: 63 | """ 64 | Estimates pass@k of each problem and returns them in an array. 65 | """ 66 | 67 | def estimator(n: int, c: int, k: int) -> float: 68 | """ 69 | Calculates 1 - comb(n - c, k) / comb(n, k). 70 | """ 71 | if n - c < k: 72 | return 1.0 73 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 74 | 75 | if isinstance(num_samples, int): 76 | num_samples_it = itertools.repeat(num_samples, len(num_correct)) 77 | else: 78 | assert len(num_samples) == len(num_correct) 79 | num_samples_it = iter(num_samples) 80 | 81 | return np.array( 82 | [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)] 83 | ) 84 | 85 | 86 | PASS = "pass" 87 | FAIL = "fail" 88 | TIMEOUT = "timeout" 89 | 90 | _SUCCESS = 0 91 | _FAILED = 1 92 | _TIMEOUT = 2 93 | _UNKNOWN = 3 94 | 95 | _mapping = {_SUCCESS: PASS, _FAILED: FAIL, _TIMEOUT: TIMEOUT, _UNKNOWN: None} 96 | 97 | 98 | def is_floats(x) -> bool: 99 | # check if it is float; List[float]; Tuple[float] 100 | if isinstance(x, float): 101 | return True 102 | if isinstance(x, (list, tuple)): 103 | return all(isinstance(i, float) for i in x) 104 | if isinstance(x, np.ndarray): 105 | return x.dtype == np.float64 or x.dtype == np.float32 106 | return False 107 | 108 | 109 | def unsafe_execute( 110 | entry_point: str, 111 | code: str, 112 | test_code: str, 113 | timeout: float, 114 | max_as_limit: float, 115 | max_data_limit: float, 116 | max_stack_limit: float, 117 | stat, # Value 118 | details, # Array 119 | ): 120 | with safe_environment(), create_tempdir(): 121 | # These system calls are needed when cleaning up tempdir. 122 | import os 123 | import shutil 124 | import builtins 125 | 126 | rmtree = shutil.rmtree 127 | rmdir = os.rmdir 128 | chdir = os.chdir 129 | # Disable functionalities that can make destructive changes to the test. 
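# The block below then builds a throw-away "__test__" module, exec()s the sample code together with its
# test code under swallow_io(), runs the generated `TestCases` suite within time_limit(), and records any
# failure/error tracebacks in `details` and the overall outcome in `stat`.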
130 | reliability_guard(max_as_limit, max_data_limit, max_stack_limit) 131 | module_name = "__test__" 132 | new_module = types.ModuleType(module_name) 133 | # Set necessary attributes for the module 134 | new_module.__dict__.update({ 135 | '__builtins__': builtins, 136 | '__file__': f"{module_name}.py", 137 | '__package__': None, 138 | '__doc__': None, 139 | 'sys': sys, 140 | 'os': os, 141 | 'environ': os.environ, 142 | }) 143 | 144 | try: 145 | full_code = code + "\n" + test_code 146 | 147 | with swallow_io(): 148 | exec(compile(full_code, f"{module_name}.py", 'exec'), new_module.__dict__) 149 | sys.modules[module_name] = new_module 150 | TestCases = getattr(new_module, 'TestCases') 151 | loader = unittest.TestLoader() 152 | suite = loader.loadTestsFromTestCase(TestCases) 153 | test_result = unittest.TestResult() 154 | start_time = time.time() 155 | with time_limit(timeout): 156 | suite.run(test_result) 157 | 158 | issues = test_result.failures + test_result.errors 159 | for test, trace in issues: 160 | details[test.id().split(".")[-1]] = trace 161 | stat.value = _SUCCESS 162 | except BaseException as e: 163 | details["ALL"] = str(e) 164 | stat.value = _FAILED 165 | # Needed for cleaning up. 166 | shutil.rmtree = rmtree 167 | os.rmdir = rmdir 168 | os.chdir = chdir 169 | 170 | 171 | def untrusted_check( 172 | code: str, 173 | test_code: str, 174 | entry_point: str, 175 | max_as_limit: float, 176 | max_data_limit: float, 177 | max_stack_limit: float, 178 | min_time_limit: float = 10, 179 | gt_time_limit: float = 60 180 | ) -> Tuple[str, np.ndarray]: 181 | min_time_limit = max(min_time_limit, gt_time_limit) 182 | timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", TIMEOUT_LIMIT), min_time_limit) + 1 183 | # shared memory objects 184 | stat = Value("i", _UNKNOWN) 185 | manager = Manager() 186 | details = manager.dict() 187 | 188 | p = multiprocessing.Process( 189 | target=unsafe_execute, 190 | args=( 191 | entry_point, 192 | code, 193 | test_code, 194 | timeout, 195 | max_as_limit, 196 | max_data_limit, 197 | max_stack_limit, 198 | stat, 199 | details, 200 | ), 201 | ) 202 | p.start() 203 | p.join(timeout=timeout+1) 204 | if p.is_alive(): 205 | p.terminate() 206 | time.sleep(0.1) 207 | if p.is_alive(): 208 | p.kill() 209 | time.sleep(0.1) 210 | 211 | stat = _mapping[stat.value] 212 | # convert details to a dict 213 | details = dict(details) 214 | 215 | if not stat: 216 | stat = TIMEOUT 217 | if stat == PASS: 218 | if details: 219 | stat = FAIL 220 | 221 | return stat, details 222 | 223 | 224 | def evaluate_files( 225 | files: List[str], 226 | inputs: List, 227 | entry_point: str, 228 | min_time_limit: float = 0.1, 229 | gt_time_limit_factor: float = 2.0, 230 | ) -> List[Tuple[str, List[bool]]]: 231 | ret = [] 232 | # sort files by the id in name (i.e., "../n.py") 233 | files = sorted(files, key=lambda x: int(x.split("/")[-1].split(".")[0])) 234 | for file in files: 235 | code = open(file, "r").read() 236 | stat, det = untrusted_check( 237 | code, 238 | inputs, 239 | entry_point, 240 | ) 241 | ret.append((stat, det.tolist())) 242 | return ret 243 | -------------------------------------------------------------------------------- /bigcodebench/eval/_special_oracle.py: -------------------------------------------------------------------------------- 1 | """Special oracle handlings for problems where direct differential testing is not applicable.""" 2 | 3 | import math 4 | 5 | # oracle for HumaneEval/032 6 | def _poly(xs: list, x: float): 7 | """ 8 | Evaluates polynomial with coefficients xs 
at point x. 9 | return xs[0] + xs[1] * x + xs[1] * x^2 + .... xs[n] * x^n 10 | """ 11 | return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)]) 12 | -------------------------------------------------------------------------------- /bigcodebench/eval/utils.py: -------------------------------------------------------------------------------- 1 | # The MIT License 2 | # 3 | # Copyright (c) OpenAI (https://openai.com) 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import contextlib 24 | import faulthandler 25 | import io 26 | import os 27 | import platform 28 | import signal 29 | import tempfile 30 | import subprocess 31 | import multiprocessing 32 | import time 33 | from typing import Optional 34 | 35 | TIMEOUT_LIMIT=240.0 36 | 37 | @contextlib.contextmanager 38 | def swallow_subprocess_output(): 39 | """Context manager to swallow stdout and stderr for subprocesses.""" 40 | original_popen = subprocess.Popen 41 | original_run = subprocess.run 42 | 43 | def _popen_patch(*args, **kwargs): 44 | if 'capture_output' in kwargs and kwargs['capture_output']: 45 | # Avoid setting stdout or stderr if capture_output is True 46 | kwargs.pop('stdout', None) 47 | kwargs.pop('stderr', None) 48 | else: 49 | kwargs.setdefault('stdout', subprocess.PIPE) 50 | kwargs.setdefault('stderr', subprocess.PIPE) 51 | return original_popen(*args, **kwargs) 52 | 53 | def _run_patch(*args, **kwargs): 54 | if 'capture_output' in kwargs and kwargs['capture_output']: 55 | # Avoid setting stdout or stderr if capture_output is True 56 | kwargs.pop('stdout', None) 57 | kwargs.pop('stderr', None) 58 | else: 59 | kwargs.setdefault('stdout', subprocess.PIPE) 60 | kwargs.setdefault('stderr', subprocess.PIPE) 61 | return original_run(*args, **kwargs) 62 | 63 | subprocess.Popen = _popen_patch 64 | subprocess.run = _run_patch 65 | try: 66 | yield 67 | finally: 68 | subprocess.Popen = original_popen 69 | subprocess.run = original_run 70 | 71 | @contextlib.contextmanager 72 | def swallow_io(): 73 | stream = WriteOnlyStringIO() 74 | with contextlib.redirect_stdout(stream): 75 | with contextlib.redirect_stderr(stream): 76 | with redirect_stdin(stream): 77 | with swallow_subprocess_output(): 78 | yield 79 | 80 | 81 | @contextlib.contextmanager 82 | def time_limit(seconds: float): 83 | def signal_handler(signum, frame): 84 | raise TimeoutException("Timed out!") 85 | 86 | signal.setitimer(signal.ITIMER_REAL, seconds) 87 | signal.signal(signal.SIGALRM, 
signal_handler) 88 | try: 89 | yield 90 | finally: 91 | signal.setitimer(signal.ITIMER_REAL, 0) 92 | 93 | 94 | @contextlib.contextmanager 95 | def create_tempdir(): 96 | with tempfile.TemporaryDirectory() as dirname: 97 | with chdir(dirname): 98 | yield dirname 99 | 100 | 101 | @contextlib.contextmanager 102 | def chdir(root): 103 | if root == ".": 104 | yield 105 | return 106 | cwd = os.getcwd() 107 | os.chdir(root) 108 | try: 109 | yield 110 | except BaseException as exc: 111 | raise exc 112 | finally: 113 | os.chdir(cwd) 114 | 115 | 116 | @contextlib.contextmanager 117 | def safe_environment(): 118 | # Save original functions 119 | original_kill = os.kill 120 | original_killpg = os.killpg 121 | original_system = os.system 122 | original_subprocess_call = subprocess.call 123 | original_subprocess_check_output = subprocess.check_output 124 | original_subprocess_run = subprocess.run 125 | original_subprocess_popen = subprocess.Popen 126 | original_os_popen = os.popen 127 | original_os_execv = os.execv 128 | original_os_execvp = os.execvp 129 | original_os_execvpe = os.execvpe 130 | 131 | current_pid = os.getpid() 132 | current_pgid = os.getpgid(current_pid) 133 | manager = multiprocessing.Manager() 134 | child_pids = manager.list() 135 | 136 | def safe_kill(pid, sig): 137 | try: 138 | pgid = os.getpgid(pid) 139 | if pid == current_pid or pid in child_pids: 140 | original_kill(pid, sig) 141 | else: 142 | print(f"Prevented attempt to kill PID {pid} with signal {sig}") 143 | except ProcessLookupError: 144 | pass 145 | 146 | def safe_killpg(pgid, sig): 147 | if pgid == current_pgid or pgid in {os.getpgid(pid) for pid in child_pids}: 148 | original_killpg(pgid, sig) 149 | else: 150 | print(f"Prevented attempt to kill PGID {pgid} with signal {sig}") 151 | 152 | def safe_system(command): 153 | print(f"Intercepted system command: {command}") 154 | if 'kill' in command or 'killall' in command: 155 | return 0 # Simulate successful execution without doing anything 156 | return original_system(command) 157 | 158 | def safe_subprocess_call(command, *args, **kwargs): 159 | print(f"Intercepted subprocess call: {command}") 160 | if 'kill' in command or 'killall' in command: 161 | return 0 # Simulate successful execution without doing anything 162 | return original_subprocess_call(command, *args, **kwargs) 163 | 164 | def safe_subprocess_check_output(command, *args, **kwargs): 165 | print(f"Intercepted command: {command}") 166 | if 'ps' in command: 167 | return b"" # Simulate no processes found 168 | return original_subprocess_check_output(command, *args, **kwargs) 169 | 170 | def safe_subprocess_run(*args, **kwargs): 171 | print(f"Intercepted subprocess run command: {args}") 172 | if 'kill' in args[0] or 'killall' in args[0]: 173 | return subprocess.CompletedProcess(args, 0, b'', b'') # Simulate successful execution 174 | return original_subprocess_run(*args, **kwargs) 175 | 176 | class SafePopen(subprocess.Popen): 177 | def __init__(self, *args, **kwargs): 178 | print(f"Intercepted Popen command: {args}") 179 | kwargs['preexec_fn'] = os.setsid # Start the process in a new session 180 | super().__init__(*args, **kwargs) 181 | child_pids.append(self.pid) 182 | 183 | def communicate(self, *args, **kwargs): 184 | try: 185 | return super().communicate(*args, **kwargs) 186 | except subprocess.TimeoutExpired: 187 | print("Timeout expired, intercepted and returning None") 188 | return None, None 189 | 190 | def kill(self): 191 | print(f"Intercepted kill call for PID {self.pid}") 192 | safe_kill(self.pid, 
signal.SIGTERM) 193 | 194 | def terminate(self): 195 | print(f"Intercepted terminate call for PID {self.pid}") 196 | safe_kill(self.pid, signal.SIGTERM) 197 | 198 | def safe_os_popen(command): 199 | print(f"Intercepted os.popen command: {command}") 200 | if 'kill' in command or 'killall' in command: 201 | return os.popen('echo Intercepted') 202 | return original_os_popen(command) 203 | 204 | def safe_exec(*args, **kwargs): 205 | print(f"Intercepted exec command: {args}") 206 | 207 | # Override the risky functions with the safe versions 208 | os.kill = safe_kill 209 | os.killpg = safe_killpg 210 | os.system = safe_system 211 | subprocess.call = safe_subprocess_call 212 | subprocess.check_output = safe_subprocess_check_output 213 | subprocess.run = safe_subprocess_run 214 | subprocess.Popen = SafePopen 215 | os.popen = safe_os_popen 216 | os.execv = safe_exec 217 | os.execvp = safe_exec 218 | os.execvpe = safe_exec 219 | 220 | try: 221 | yield 222 | finally: 223 | for pid in child_pids: 224 | try: 225 | os.kill(pid, signal.SIGTERM) 226 | for _ in range(10): 227 | time.sleep(0.1) 228 | try: 229 | os.kill(pid, 0) 230 | except ProcessLookupError: 231 | break 232 | else: 233 | os.kill(pid, signal.SIGKILL) 234 | except ProcessLookupError: 235 | pass 236 | except Exception as e: 237 | print(f"Error handling process {pid}: {e}") 238 | 239 | os.kill = original_kill 240 | os.killpg = original_killpg 241 | os.system = original_system 242 | subprocess.call = original_subprocess_call 243 | subprocess.check_output = original_subprocess_check_output 244 | subprocess.run = original_subprocess_run 245 | subprocess.Popen = original_subprocess_popen 246 | os.popen = original_os_popen 247 | os.execv = original_os_execv 248 | os.execvp = original_os_execvp 249 | os.execvpe = original_os_execvpe 250 | 251 | 252 | class TimeoutException(Exception): 253 | pass 254 | 255 | 256 | class WriteOnlyStringIO(io.StringIO): 257 | """StringIO that throws an exception when it's read from""" 258 | 259 | def read(self, *args, **kwargs): 260 | raise IOError 261 | 262 | def readline(self, *args, **kwargs): 263 | raise IOError 264 | 265 | def readlines(self, *args, **kwargs): 266 | raise IOError 267 | 268 | def readable(self, *args, **kwargs): 269 | """Returns True if the IO object can be read.""" 270 | return False 271 | 272 | 273 | class redirect_stdin(contextlib._RedirectStream): # type: ignore 274 | _stream = "stdin" 275 | 276 | 277 | def reliability_guard(max_as_limit, max_data_limit, max_stack_limit): 278 | """ 279 | This disables various destructive functions and prevents the generated code 280 | from interfering with the test (e.g. fork bomb, killing other processes, 281 | removing filesystem files, etc.) 282 | 283 | WARNING 284 | This function is NOT a security sandbox. Untrusted code, including, model- 285 | generated code, should not be blindly executed outside of one. See the 286 | Codex paper for more information about OpenAI's code sandbox, and proceed 287 | with caution. 
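Concretely, the guard caps the address-space, data and (except on macOS) stack rlimits, with the limits
interpreted as MiB, pins TZ=UTC and OMP_NUM_THREADS=1, disables faulthandler and the exit()/quit()
builtins, and closes any open matplotlib figures.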
288 | """ 289 | 290 | import os 291 | import time 292 | from datetime import datetime 293 | 294 | os.environ['TZ'] = 'UTC' 295 | time.tzset() 296 | 297 | os.environ["OMP_NUM_THREADS"] = "1" 298 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" 299 | os.environ['TF_ENABLE_ONEDNN_OPTS'] = "0" 300 | 301 | if max_as_limit and max_data_limit and max_stack_limit: 302 | import resource 303 | 304 | max_as_limit = max_as_limit * 1024 * 1024 305 | max_data_limit = max_data_limit * 1024 * 1024 306 | max_stack_limit = max_stack_limit * 1024 * 1024 307 | 308 | resource.setrlimit( 309 | resource.RLIMIT_AS, (max_as_limit, max_as_limit) 310 | ) 311 | resource.setrlimit( 312 | resource.RLIMIT_DATA, (max_data_limit, max_data_limit) 313 | ) 314 | if not platform.uname().system == "Darwin": 315 | resource.setrlimit( 316 | resource.RLIMIT_STACK, (max_stack_limit, max_stack_limit) 317 | ) 318 | 319 | faulthandler.disable() 320 | 321 | import builtins 322 | 323 | builtins.exit = None 324 | builtins.quit = None 325 | 326 | import matplotlib.pyplot as plt 327 | plt.close('all') 328 | -------------------------------------------------------------------------------- /bigcodebench/gen/__init__.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Any, List 3 | 4 | 5 | class BaseGen(object): 6 | def __init__(self, inputs: List[Any], entry_point: str, contract: str): 7 | """Initializing a input mutator. 8 | 9 | Args: 10 | inputs (List[Any]): The set of initial inputs (i.e., seeds) 11 | entry_point (str): The function name to invoke with the input 12 | contract (str): The contract to verify input validity 13 | """ 14 | self.contract = contract 15 | self.entry_point = entry_point 16 | self.seed_pool: List[Any] = copy.deepcopy(inputs) 17 | self.new_inputs = [] 18 | self.seed_hash = set([hash(str(x)) for x in self.seed_pool]) 19 | 20 | def generate(self, num: int) -> List[Any]: 21 | raise NotImplementedError 22 | -------------------------------------------------------------------------------- /bigcodebench/gen/util/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import sys 4 | import types 5 | import unittest 6 | import tempfile 7 | import multiprocessing 8 | from multiprocessing import Array, Value, Manager 9 | from bigcodebench.eval.utils import ( 10 | create_tempdir, 11 | reliability_guard, 12 | swallow_io, 13 | time_limit, 14 | safe_environment, 15 | TIMEOUT_LIMIT, 16 | ) 17 | 18 | 19 | def trusted_exec(code, test_code, task_id, max_as_limit, max_data_limit, max_stack_limit, times): 20 | """Execute trusted code in place.""" 21 | # Specify a unique cache dir by modifying XDG_CONFIG_HOME 22 | old_xdg = os.environ.get("XDG_CONFIG_HOME") 23 | temp_xdg = tempfile.mkdtemp(prefix="xdg_config_") 24 | os.environ["XDG_CONFIG_HOME"] = temp_xdg 25 | 26 | try: 27 | with create_tempdir(): 28 | import shutil 29 | import builtins 30 | 31 | rmtree = shutil.rmtree 32 | rmdir = os.rmdir 33 | chdir = os.chdir 34 | module_name = "__test__" 35 | new_module = types.ModuleType(module_name) 36 | 37 | reliability_guard(max_as_limit, max_data_limit, max_stack_limit) 38 | 39 | # Set necessary attributes for the module 40 | new_module.__dict__.update({ 41 | '__builtins__': builtins, 42 | '__file__': f"{module_name}.py", 43 | '__package__': None, 44 | '__doc__': None, 45 | 'sys': sys, 46 | 'os': os, 47 | 'environ': os.environ, 48 | }) 49 | 50 | # Combine the user code and the test code 51 | full_code = code + 
"\n" + test_code 52 | 53 | # Compile and execute the combined code within the new module 54 | exec(compile(full_code, f"{module_name}.py", 'exec'), 55 | new_module.__dict__) 56 | sys.modules[module_name] = new_module 57 | TestCases = getattr(new_module, 'TestCases') 58 | loader = unittest.TestLoader() 59 | suite = loader.loadTestsFromTestCase(TestCases) 60 | test_result = unittest.TestResult() 61 | start = time.time() 62 | with safe_environment(), swallow_io(), time_limit(seconds=TIMEOUT_LIMIT): 63 | suite.run(test_result) 64 | 65 | errors = test_result.failures + test_result.errors 66 | if len(errors) > 0: 67 | print(errors) 68 | times.value = -1 69 | else: 70 | times.value = time.time() - start 71 | 72 | # Needed for cleaning up. 73 | shutil.rmtree = rmtree 74 | os.rmdir = rmdir 75 | os.chdir = chdir 76 | 77 | finally: 78 | # Restore the original environment variable 79 | if old_xdg is None: 80 | os.environ.pop("XDG_CONFIG_HOME", None) 81 | else: 82 | os.environ["XDG_CONFIG_HOME"] = old_xdg 83 | shutil.rmtree(temp_xdg, ignore_errors=True) 84 | 85 | 86 | def trusted_check_exec(code, inputs): 87 | """Check trusted_exec success.""" 88 | try: 89 | with time_limit(seconds=TIMEOUT_LIMIT): 90 | trusted_exec(code, inputs) 91 | except Exception: 92 | return False 93 | return True 94 | 95 | 96 | def trusted_check( 97 | code: str, 98 | test_code: str, 99 | task_id: str, 100 | max_as_limit: float, 101 | max_data_limit: float, 102 | max_stack_limit: float, 103 | min_time_limit: float = 10, 104 | ): 105 | timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", TIMEOUT_LIMIT), min_time_limit) + 1 106 | # shared memory objects 107 | times = Value("d", -1) 108 | manager = Manager() 109 | 110 | p = multiprocessing.Process( 111 | target=trusted_exec, 112 | args=( 113 | code, 114 | test_code, 115 | task_id, 116 | max_as_limit, 117 | max_data_limit, 118 | max_stack_limit, 119 | times, 120 | ), 121 | ) 122 | p.start() 123 | p.join(timeout=timeout+1) 124 | if p.is_alive(): 125 | p.terminate() 126 | time.sleep(0.1) 127 | if p.is_alive(): 128 | p.kill() 129 | time.sleep(0.1) 130 | 131 | if times.value == -1: 132 | times = None 133 | else: 134 | times = times.value 135 | 136 | return {"task_id": task_id, "time": times} -------------------------------------------------------------------------------- /bigcodebench/gen/util/anthropic_request.py: -------------------------------------------------------------------------------- 1 | import signal 2 | import time 3 | 4 | import anthropic 5 | from anthropic.types import Message 6 | 7 | 8 | def handler(signum, frame): 9 | # swallow signum and frame 10 | raise Exception("end of time") 11 | 12 | 13 | def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message: 14 | ret = None 15 | while ret is None: 16 | try: 17 | signal.signal(signal.SIGALRM, handler) 18 | signal.alarm(100) 19 | if "reasoning_budget" in kwargs and "reasoning_beta" in kwargs: 20 | kwargs["thinking"] = { 21 | "type": "enabled", 22 | "budget_tokens": kwargs["reasoning_budget"], 23 | } 24 | kwargs["betas"] = [kwargs["reasoning_beta"]] 25 | kwargs.pop("reasoning_budget") 26 | kwargs.pop("reasoning_beta") 27 | kwargs.pop("temperature") 28 | if "thinking" in kwargs: 29 | ret = client.beta.messages.create(*args, **kwargs, stream=True) 30 | else: 31 | ret = client.messages.create(*args, **kwargs) 32 | signal.alarm(0) 33 | except anthropic.RateLimitError: 34 | print("Rate limit exceeded. 
Waiting...") 35 | signal.alarm(0) 36 | time.sleep(5) 37 | except anthropic.APIConnectionError: 38 | print("API connection error. Waiting...") 39 | signal.alarm(0) 40 | time.sleep(5) 41 | except anthropic.InternalServerError: 42 | print("Internal server error. Waiting...") 43 | signal.alarm(0) 44 | time.sleep(5) 45 | except anthropic.APIError as e: 46 | print("Unknown API error") 47 | print(e) 48 | if ( 49 | e.body["error"]["message"] 50 | == "Output blocked by content filtering policy" 51 | ): 52 | raise Exception("Content filtering policy blocked output") 53 | signal.alarm(0) 54 | except Exception as e: 55 | print("Unknown error. Waiting...") 56 | print(e) 57 | signal.alarm(0) 58 | time.sleep(1) 59 | return ret -------------------------------------------------------------------------------- /bigcodebench/gen/util/google_request.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from google import genai 4 | from google.api_core.exceptions import GoogleAPICallError, ResourceExhausted 5 | 6 | 7 | def make_request( 8 | model: str, 9 | client: genai.Client, 10 | message: str, 11 | temperature: float, 12 | n: int, 13 | max_new_tokens: int = 2048, 14 | ) -> genai.types.GenerateContentResponse: 15 | kwargs = {"temperature": temperature, "max_output_tokens": max_new_tokens} 16 | 17 | if "-thinking-" in model: 18 | kwargs.pop("max_output_tokens") 19 | 20 | response = client.models.generate_content( 21 | model=model, 22 | contents=message, 23 | config=genai.types.GenerateContentConfig( 24 | candidate_count=n, 25 | safety_settings=[ 26 | genai.types.SafetySetting( 27 | category='HARM_CATEGORY_DANGEROUS_CONTENT', 28 | threshold='BLOCK_NONE' 29 | ), 30 | genai.types.SafetySetting( 31 | category='HARM_CATEGORY_SEXUALLY_EXPLICIT', 32 | threshold='BLOCK_NONE' 33 | ), 34 | genai.types.SafetySetting( 35 | category='HARM_CATEGORY_HATE_SPEECH', 36 | threshold='BLOCK_NONE' 37 | ), 38 | genai.types.SafetySetting( 39 | category='HARM_CATEGORY_HARASSMENT', 40 | threshold='BLOCK_NONE' 41 | ), 42 | ], 43 | **kwargs 44 | ), 45 | ) 46 | 47 | return response 48 | 49 | 50 | def make_auto_request(*args, **kwargs) -> genai.types.GenerateContentResponse: 51 | ret = None 52 | while ret is None: 53 | try: 54 | ret = make_request(*args, **kwargs) 55 | except ResourceExhausted as e: 56 | print("Rate limit exceeded. Waiting...", e.message) 57 | time.sleep(10) 58 | except GoogleAPICallError as e: 59 | print(e.message) 60 | time.sleep(1) 61 | except Exception as e: 62 | print("Unknown error. 
Waiting...") 63 | print(e) 64 | time.sleep(1) 65 | return ret -------------------------------------------------------------------------------- /bigcodebench/gen/util/hf_inference_request.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from huggingface_hub import InferenceClient 4 | from huggingface_hub.inference._generated.types import TextGenerationOutput 5 | 6 | 7 | def make_request( 8 | client: InferenceClient, 9 | message: str, 10 | model: str, 11 | temperature: float, 12 | n: int, 13 | max_new_tokens: int = 2048, 14 | ) -> TextGenerationOutput: 15 | response = client.text_generation( 16 | model=model, 17 | prompt=message, 18 | do_sample=False, 19 | max_new_tokens=max_new_tokens, 20 | ) 21 | 22 | return response 23 | 24 | 25 | def make_auto_request(*args, **kwargs) -> TextGenerationOutput: 26 | ret = None 27 | while ret is None: 28 | try: 29 | ret = make_request(*args, **kwargs) 30 | except Exception as e: 31 | print("Unknown error. Waiting...") 32 | print(e) 33 | time.sleep(1) 34 | return ret 35 | -------------------------------------------------------------------------------- /bigcodebench/gen/util/mistral_request.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from mistralai.client import MistralClient 4 | from mistralai.models.chat_completion import ChatMessage 5 | 6 | def make_auto_request(client: MistralClient, *args, **kwargs) -> ChatMessage: 7 | ret = None 8 | while ret is None: 9 | try: 10 | ret = client.chat(*args, **kwargs) 11 | except Exception as e: 12 | print("Unknown error. Waiting...") 13 | print(e) 14 | time.sleep(1) 15 | return ret -------------------------------------------------------------------------------- /bigcodebench/gen/util/openai_request.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import openai 4 | from openai.types.chat import ChatCompletion 5 | 6 | 7 | def make_request( 8 | client: openai.Client, 9 | message: str, 10 | model: str, 11 | max_tokens: int = 512, 12 | temperature: float = 1, 13 | reasoning_effort: str = "medium", 14 | n: int = 1, 15 | **kwargs 16 | ) -> ChatCompletion: 17 | kwargs["top_p"] = 0.95 18 | kwargs["max_completion_tokens"] = max_tokens 19 | kwargs["temperature"] = temperature 20 | if any(model.startswith(m) or model.endswith(m) for m in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]): # pop top-p and max_completion_tokens 21 | kwargs.pop("top_p") 22 | kwargs.pop("max_completion_tokens") 23 | kwargs.pop("temperature") 24 | kwargs["reasoning_effort"] = reasoning_effort 25 | 26 | return client.chat.completions.create( 27 | model=model, 28 | messages=[ 29 | {"role": "user", "content": message}, 30 | ], 31 | n=n, 32 | **kwargs 33 | ) 34 | 35 | 36 | def make_auto_request(*args, **kwargs) -> ChatCompletion: 37 | ret = None 38 | while ret is None: 39 | try: 40 | ret = make_request(*args, **kwargs) 41 | except openai.RateLimitError: 42 | print("Rate limit exceeded. Waiting...") 43 | time.sleep(5) 44 | except openai.APIConnectionError: 45 | print("API connection error. Waiting...") 46 | time.sleep(5) 47 | except openai.APIError as e: 48 | print(e) 49 | except Exception as e: 50 | print("Unknown error. 
Waiting...") 51 | print(e) 52 | time.sleep(1) 53 | return ret -------------------------------------------------------------------------------- /bigcodebench/generate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | from typing import Optional, Tuple 5 | 6 | from bigcodebench.provider import DecoderBase, make_model 7 | from bigcodebench.data import get_bigcodebench, write_jsonl 8 | from bigcodebench.sanitize import sanitize 9 | from rich.progress import ( 10 | BarColumn, 11 | MofNCompleteColumn, 12 | Progress, 13 | TextColumn, 14 | TimeElapsedColumn, 15 | ) 16 | 17 | 18 | def codegen( 19 | model: DecoderBase, 20 | target_path: str, 21 | split: str, 22 | subset: str, 23 | greedy: bool = False, 24 | strip_newlines: bool = False, 25 | n_samples: int = 1, 26 | id_range: Tuple[int, int] = None, 27 | resume: bool = True, 28 | batch_size: int = -1, 29 | ): 30 | with Progress( 31 | TextColumn(f"BigCodeBench--{split.capitalize()} ({subset.capitalize()}) •" + "[progress.percentage]{task.percentage:>3.0f}%"), 32 | BarColumn(), 33 | MofNCompleteColumn(), 34 | TextColumn("•"), 35 | TimeElapsedColumn(), 36 | ) as p: 37 | 38 | dataset = get_bigcodebench(subset=subset) 39 | 40 | if model.is_direct_completion() and split == "instruct": 41 | raise Exception("Base model does not support direct completion for instruct tasks") 42 | 43 | # create target_path if it doesn't exist, e.g., a/b.jsonl 44 | dirname = os.path.dirname(target_path) 45 | if not os.path.exists(dirname) and dirname != "": 46 | os.makedirs(dirname) 47 | 48 | batch_prompts = [] 49 | batch_task_ids = [] 50 | batch_nsamples = [] 51 | batch_entry_points = [] 52 | 53 | # Read existing data once if resuming 54 | task2nexist = {} 55 | if resume and os.path.exists(target_path): 56 | with open(target_path, "r") as f: 57 | for line in f: 58 | item = json.loads(line) 59 | task2nexist[item["task_id"]] = task2nexist.get(item["task_id"], 0) + 1 60 | 61 | for id_num, (task_id, task) in enumerate(p.track(dataset.items())): 62 | if id_range is not None: 63 | low, high = id_range 64 | if id_num < low: 65 | p.console.print(f"Skipping {task_id} as it is not in {id_range}") 66 | continue 67 | if id_num >= id_range[1]: 68 | break 69 | 70 | p_name = task_id.replace("/", "_") 71 | 72 | n_existing = task2nexist.get(task_id, 0) 73 | nsamples = n_samples - n_existing 74 | 75 | try: 76 | prompt = task[f"{split}_prompt"] 77 | except: 78 | raise Exception(f"Invalid split {split} for bigcodebench-{subset}") 79 | if strip_newlines: 80 | prompt = prompt.strip("\n") 81 | 82 | if nsamples > 0: 83 | batch_prompts.append(prompt) 84 | batch_task_ids.append(task_id) 85 | batch_nsamples.append(nsamples) 86 | batch_entry_points.append(task["entry_point"]) 87 | 88 | log = f"Codegen: {p_name} @ {model}" 89 | if n_existing > 0: 90 | log += f" (resuming from {n_existing})" 91 | p.console.print(log) 92 | 93 | if (batch_size and len(batch_prompts) == batch_size) or id_num == len(dataset) - 1 or (id_range and id_num == id_range[1] - 1): 94 | if not batch_prompts and (id_num == len(dataset) - 1 or (id_range and id_num == id_range[1] - 1)): 95 | break 96 | outputs = model.codegen( 97 | batch_prompts, 98 | do_sample=not greedy, 99 | num_samples=max(batch_nsamples), 100 | ) 101 | assert outputs, "No outputs from model!" 
102 | 103 | samples = [] 104 | for task_id, content, entry_point, nsamples, task_outputs in zip(batch_task_ids, batch_prompts, batch_entry_points, batch_nsamples, outputs): 105 | if model.is_direct_completion(): 106 | samples.extend([ 107 | dict(task_id=task_id, solution=sanitize(content+completion, entry_point), raw_solution=content+completion) 108 | for completion in task_outputs[:nsamples] 109 | ]) 110 | else: 111 | samples.extend([ 112 | dict(task_id=task_id, solution=sanitize(completion, entry_point), raw_solution=completion) 113 | for completion in task_outputs[:nsamples] 114 | ]) 115 | 116 | print(f"Generated {len(samples)} samples") 117 | write_jsonl(target_path, samples, append=True) 118 | 119 | # Clear batches 120 | batch_prompts = [] 121 | batch_task_ids = [] 122 | batch_nsamples = [] 123 | 124 | 125 | def run_codegen( 126 | model: str, 127 | split: str, 128 | subset: str, 129 | root: str = "bcb_results", 130 | lora_path: str = None, 131 | bs: Optional[int] = None, 132 | n_samples: int = 1, 133 | temperature: float = 0.0, 134 | max_new_tokens: int = 1280, 135 | # vllm 136 | max_model_len: int = 12800, 137 | greedy: bool = False, 138 | # openai 139 | reasoning_effort: str = "medium", 140 | # anthropic 141 | reasoning_budget: int = 0, 142 | reasoning_beta: str = "output-128k-2025-02-19", 143 | strip_newlines: bool = False, 144 | direct_completion: bool = False, 145 | resume: bool = True, 146 | id_range: str = None, 147 | backend: str = "vllm", 148 | base_url: str = None, 149 | tp: int = 1, 150 | instruction_prefix: str = "Please provide a self-contained Python script that solves the following problem in a markdown code block:", 151 | response_prefix: str ="Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:", 152 | skip_prefill: bool = False, 153 | revision: str = "main", 154 | trust_remote_code: bool = False, 155 | tokenizer_name: str = None, 156 | tokenizer_legacy: bool = False, 157 | ): 158 | 159 | if greedy or (temperature == 0 and n_samples == 1): 160 | temperature = 0 161 | n_samples = 1 162 | greedy = True 163 | print("Greedy decoding ON (--greedy): setting n_samples=1, temperature=0") 164 | 165 | if id_range is not None: 166 | id_range = [int(i) for i in id_range.split("-")] 167 | assert len(id_range) == 2, "id_range must be a list of length 2" 168 | assert id_range[0] < id_range[1], "id_range must be increasing" 169 | id_range = tuple(id_range) 170 | 171 | # Make project dir 172 | os.makedirs(root, exist_ok=True) 173 | 174 | # Make dir for codes generated by each model 175 | model_runner = make_model( 176 | model=model, 177 | backend=backend, 178 | subset=subset, 179 | split=split, 180 | lora_path=lora_path, 181 | temperature=temperature, 182 | max_new_tokens=max_new_tokens, 183 | max_model_len=max_model_len, 184 | reasoning_effort=reasoning_effort, 185 | reasoning_budget=reasoning_budget, 186 | reasoning_beta=reasoning_beta, 187 | instruction_prefix=instruction_prefix, 188 | response_prefix=response_prefix, 189 | prefill=not skip_prefill, 190 | base_url=base_url, 191 | tp=tp, 192 | revision=revision, 193 | trust_remote_code=trust_remote_code, 194 | direct_completion=direct_completion, 195 | tokenizer_name=tokenizer_name, 196 | tokenizer_legacy=tokenizer_legacy 197 | ) 198 | 199 | extra = "-" + subset if subset != "full" else "" 200 | if backend == "openai" and reasoning_effort and any(model.startswith(m) or model.endswith(m) for m in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]): 201 | model = model + 
f"--{reasoning_effort}" 202 | 203 | if lora_path: 204 | model = model + f"--lora-{lora_path}" 205 | 206 | if backend == "anthropic" and reasoning_budget and reasoning_beta: 207 | model = model + f"--{reasoning_budget}-{reasoning_beta}" 208 | 209 | if skip_prefill: 210 | identifier = model.replace("/", "--") + "--skip_prefill" + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl" 211 | else: 212 | identifier = model.replace("/", "--") + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl" 213 | 214 | target_path = os.path.join(root, identifier) 215 | 216 | if not resume: 217 | os.remove(target_path) 218 | 219 | codegen( 220 | model=model_runner, 221 | target_path=target_path, 222 | split=split, 223 | subset=subset, 224 | greedy=greedy, 225 | strip_newlines=strip_newlines, 226 | n_samples=n_samples, 227 | resume=resume, 228 | id_range=id_range, 229 | batch_size=bs 230 | ) 231 | 232 | return target_path 233 | 234 | 235 | def main(): 236 | from fire import Fire 237 | Fire(run_codegen) 238 | 239 | 240 | if __name__ == "__main__": 241 | main() 242 | -------------------------------------------------------------------------------- /bigcodebench/inspect.py: -------------------------------------------------------------------------------- 1 | from bigcodebench.data import get_bigcodebench 2 | import os 3 | import shutil 4 | import json 5 | import argparse 6 | 7 | def inspection(args): 8 | """ 9 | Write a series of files for each task into a directory. 10 | 11 | Each Directory Structure: 12 | -- task_id 13 | -- ground_truth.py: prompt + canonical_solution 14 | -- completion.py: prompt + completion 15 | -- execution_trace.txt: execution trace 16 | """ 17 | path = os.path.join(args.save_path, args.eval_results.split("/")[-1].replace(".json", "")) 18 | if args.in_place: 19 | shutil.rmtree(path, ignore_errors=True) 20 | if not os.path.exists(path): 21 | os.makedirs(path) 22 | problems = get_bigcodebench(subset=args.subset) 23 | 24 | eval_results = json.load(open(args.eval_results, "r")) 25 | for task_id, results in eval_results["eval"].items(): 26 | if task_id not in problems: 27 | continue 28 | if all(result["status"] == "pass" for result in results): 29 | continue 30 | task_path = os.path.join(path, task_id) 31 | if not os.path.exists(task_path): 32 | os.makedirs(task_path) 33 | task_id_data = problems[task_id] 34 | with open(os.path.join(task_path, "ground_truth.py"), "w") as f: 35 | f.write(task_id_data[f"{args.split}_prompt"] + "\n\n" + task_id_data["canonical_solution"]) 36 | 37 | # write test 38 | with open(os.path.join(task_path, "test_case.py"), "w") as f: 39 | f.write(task_id_data["test"]) 40 | 41 | for i, result in enumerate(results): 42 | with open(os.path.join(task_path, f"completion_{i}.py"), "w") as f: 43 | f.write(result["solution"]) 44 | 45 | for i, result in enumerate(results): 46 | with open(os.path.join(task_path, f"complete_{i}_execution_trace.txt"), "w") as f: 47 | for test_case, execution_trace in result["details"].items(): 48 | f.write(f"Test Case: {test_case}\n\n") 49 | f.write(execution_trace) 50 | f.write("="*50 + "\n") 51 | def main(): 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--eval_results", required=True, type=str) 54 | parser.add_argument( 55 | "--split", required=True, type=str, choices=["complete", "instruct"] 56 | ) 57 | parser.add_argument("--subset", default="hard", type=str, choices=["full", "hard"]) 58 | 
parser.add_argument("--save_path", default="inspect", type=str) 59 | parser.add_argument("--in_place", action="store_true") 60 | args = parser.parse_args() 61 | 62 | inspection(args) 63 | 64 | if __name__ == "__main__": 65 | main() -------------------------------------------------------------------------------- /bigcodebench/provider/__init__.py: -------------------------------------------------------------------------------- 1 | from bigcodebench.provider.base import DecoderBase 2 | 3 | 4 | def make_model( 5 | model: str, 6 | backend: str, 7 | subset: str, 8 | split: str, 9 | lora_path: str = None, 10 | dataset: str = "bigcodebench", 11 | temperature: float = 0.0, 12 | max_new_tokens: int = 1280, 13 | max_model_len: int = 12800, 14 | # openai only 15 | reasoning_effort: str = "medium", 16 | # anthropic only 17 | reasoning_budget: int = 0, 18 | reasoning_beta: str = "output-128k-2025-02-19", 19 | # instruction model only 20 | instruction_prefix: str = None, 21 | response_prefix: str = None, 22 | prefill: bool = True, 23 | # vllm and hf only 24 | revision: str = "main", 25 | # vllm only 26 | tp: int = 1, 27 | direct_completion: bool = False, 28 | base_url: str = None, 29 | trust_remote_code: bool = False, 30 | # hf only 31 | attn_implementation: str = "eager", 32 | # tokenizer 33 | tokenizer_name: str = None, 34 | tokenizer_legacy: bool = True, 35 | ) -> DecoderBase: 36 | if backend == "vllm": 37 | from bigcodebench.provider.vllm import VllmDecoder 38 | 39 | return VllmDecoder( 40 | name=model, 41 | subset=subset, 42 | split=split, 43 | lora_path=lora_path, 44 | temperature=temperature, 45 | max_new_tokens=max_new_tokens, 46 | max_model_len=max_model_len, 47 | revision=revision, 48 | dataset=dataset, 49 | direct_completion=direct_completion, 50 | tp=tp, 51 | instruction_prefix=instruction_prefix, 52 | response_prefix=response_prefix, 53 | prefill=prefill, 54 | trust_remote_code=trust_remote_code, 55 | tokenizer_name=tokenizer_name, 56 | tokenizer_legacy=tokenizer_legacy, 57 | ) 58 | elif backend == "hf": 59 | from bigcodebench.provider.hf import HuggingFaceDecoder 60 | 61 | return HuggingFaceDecoder( 62 | name=model, 63 | subset=subset, 64 | split=split, 65 | lora_path=lora_path, 66 | temperature=temperature, 67 | max_new_tokens=max_new_tokens, 68 | revision=revision, 69 | dataset=dataset, 70 | direct_completion=direct_completion, 71 | instruction_prefix=instruction_prefix, 72 | response_prefix=response_prefix, 73 | prefill=prefill, 74 | attn_implementation=attn_implementation, 75 | trust_remote_code=trust_remote_code, 76 | tokenizer_name=tokenizer_name, 77 | tokenizer_legacy=tokenizer_legacy, 78 | ) 79 | elif backend == "hf-inference": 80 | from bigcodebench.provider.hf_inference import HuggingFaceInferenceDecoder 81 | 82 | return HuggingFaceInferenceDecoder( 83 | name=model, 84 | subset=subset, 85 | split=split, 86 | temperature=temperature, 87 | max_new_tokens=max_new_tokens, 88 | direct_completion=direct_completion, 89 | instruction_prefix=instruction_prefix, 90 | response_prefix=response_prefix, 91 | ) 92 | elif backend == "openai": 93 | from bigcodebench.provider.openai import OpenAIChatDecoder 94 | 95 | assert not direct_completion, f"{backend} backend does not serve base model" 96 | return OpenAIChatDecoder( 97 | name=model, 98 | subset=subset, 99 | split=split, 100 | temperature=temperature, 101 | max_new_tokens=max_new_tokens, 102 | reasoning_effort=reasoning_effort, 103 | base_url=base_url, 104 | instruction_prefix=instruction_prefix, 105 | response_prefix=response_prefix, 106 | 
) 107 | elif backend == "mistral": 108 | from bigcodebench.provider.mistral import MistralChatDecoder 109 | 110 | return MistralChatDecoder( 111 | name=model, 112 | subset=subset, 113 | split=split, 114 | temperature=temperature, 115 | max_new_tokens=max_new_tokens, 116 | instruction_prefix=instruction_prefix, 117 | response_prefix=response_prefix, 118 | ) 119 | elif backend == "anthropic": 120 | from bigcodebench.provider.anthropic import AnthropicDecoder 121 | 122 | assert not direct_completion, f"{backend} backend does not serve base model" 123 | return AnthropicDecoder( 124 | name=model, 125 | subset=subset, 126 | split=split, 127 | temperature=temperature, 128 | max_new_tokens=max_new_tokens, 129 | reasoning_budget=reasoning_budget, 130 | reasoning_beta=reasoning_beta, 131 | instruction_prefix=instruction_prefix, 132 | response_prefix=response_prefix, 133 | ) 134 | elif backend == "google": 135 | from bigcodebench.provider.google import GoogleDecoder 136 | 137 | assert not direct_completion, f"{backend} backend does not serve base model" 138 | return GoogleDecoder( 139 | name=model, 140 | subset=subset, 141 | split=split, 142 | temperature=temperature, 143 | max_new_tokens=max_new_tokens, 144 | instruction_prefix=instruction_prefix, 145 | response_prefix=response_prefix, 146 | ) -------------------------------------------------------------------------------- /bigcodebench/provider/anthropic.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | from tqdm import tqdm 4 | 5 | import anthropic 6 | 7 | from bigcodebench.gen.util.anthropic_request import make_auto_request 8 | from bigcodebench.provider.base import DecoderBase 9 | from bigcodebench.provider.utility import make_raw_chat_prompt 10 | 11 | class AnthropicDecoder(DecoderBase): 12 | def __init__(self, name: str, reasoning_budget: int = 0, reasoning_beta: str = "output-128k-2025-02-19", **kwargs) -> None: 13 | super().__init__(name, **kwargs) 14 | self.client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_KEY")) 15 | self.reasoning_budget = reasoning_budget 16 | self.reasoning_beta = reasoning_beta 17 | 18 | def codegen( 19 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 20 | ) -> List[str]: 21 | if do_sample: 22 | assert self.temperature > 0, "Temperature must be positive for sampling" 23 | 24 | all_outputs = [] 25 | for prompt in tqdm(prompts): 26 | outputs = [] 27 | 28 | for _ in range(num_samples): 29 | ret = make_auto_request( 30 | client=self.client, 31 | model=self.name, 32 | messages=[ 33 | { 34 | "role": "user", 35 | "content": make_raw_chat_prompt( 36 | task_prompt=prompt, 37 | subset=self.subset, 38 | split=self.split, 39 | instruction_prefix=self.instruction_prefix, 40 | response_prefix=self.response_prefix, 41 | tokenizer=None, 42 | ) 43 | } 44 | ], 45 | max_tokens=self.max_new_tokens, 46 | temperature=self.temperature, 47 | stop_sequences=self.eos, 48 | reasoning_budget=self.reasoning_budget, 49 | reasoning_beta=self.reasoning_beta, 50 | ) 51 | if isinstance(ret, anthropic.Stream): 52 | output = "" 53 | for chunk in ret: 54 | if chunk.type == "content_block_delta": 55 | # if chunk.delta.type == "thinking_delta": 56 | # output += chunk.delta.thinking 57 | if chunk.delta.type == "text_delta": 58 | output += chunk.delta.text 59 | outputs.append(output) 60 | else: 61 | outputs.append(ret.content[0].text) 62 | all_outputs.append(outputs) 63 | return all_outputs 64 | 65 | def is_direct_completion(self) -> bool: 66 | 
return False -------------------------------------------------------------------------------- /bigcodebench/provider/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | 4 | from bigcodebench.provider.utility import EOS 5 | 6 | 7 | class DecoderBase(ABC): 8 | def __init__( 9 | self, 10 | name: str, 11 | subset: str, 12 | split: str, 13 | temperature: float = 0.8, 14 | max_new_tokens: int = 1280, 15 | revision: str = "main", 16 | dtype: str = "bfloat16", # default 17 | direct_completion: bool = False, 18 | trust_remote_code: bool = False, 19 | tokenizer_name: str = None, 20 | tokenizer_legacy: bool = False, 21 | instruction_prefix: str = None, 22 | response_prefix: str = None, 23 | prefill: bool = True, 24 | ) -> None: 25 | print("Initializing a decoder model: {} ...".format(name)) 26 | self.name = name 27 | self.subset = subset 28 | self.split = split 29 | self.temperature = temperature 30 | self.eos = EOS 31 | self.skip_special_tokens = False 32 | self.max_new_tokens = max_new_tokens 33 | self.dtype = dtype 34 | self.revision = revision 35 | self.direct_completion = direct_completion 36 | self.trust_remote_code = trust_remote_code 37 | self.tokenizer_name = tokenizer_name 38 | self.tokenizer_legacy = tokenizer_legacy 39 | self.instruction_prefix = instruction_prefix 40 | self.response_prefix = response_prefix 41 | self.prefill = prefill 42 | 43 | @abstractmethod 44 | def codegen( 45 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 46 | ) -> List[str]: 47 | pass 48 | 49 | @abstractmethod 50 | def is_direct_completion(self) -> bool: 51 | pass 52 | 53 | def __repr__(self) -> str: 54 | return self.name 55 | 56 | def __str__(self) -> str: 57 | return self.name -------------------------------------------------------------------------------- /bigcodebench/provider/google.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | from tqdm import tqdm 4 | 5 | from google import genai 6 | 7 | from bigcodebench.provider.base import DecoderBase 8 | from bigcodebench.gen.util.google_request import make_auto_request 9 | from bigcodebench.provider.utility import make_raw_chat_prompt 10 | 11 | 12 | class GoogleDecoder(DecoderBase): 13 | def __init__(self, name: str, **kwargs): 14 | super().__init__(name, **kwargs) 15 | self.model = name 16 | self.client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY")) 17 | 18 | def codegen( 19 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 20 | ) -> List[str]: 21 | if do_sample: 22 | assert self.temperature > 0, "Temperature must be positive for sampling" 23 | 24 | all_outputs = [] 25 | 26 | for prompt in tqdm(prompts): 27 | outputs = [] 28 | message = make_raw_chat_prompt( 29 | task_prompt=prompt, 30 | subset=self.subset, 31 | split=self.split, 32 | instruction_prefix=self.instruction_prefix, 33 | response_prefix=self.response_prefix, 34 | tokenizer=None, 35 | ) 36 | ret = make_auto_request( 37 | model=self.model, 38 | client=self.client, 39 | message=message, 40 | n=num_samples, 41 | temperature=self.temperature, 42 | max_new_tokens=self.max_new_tokens, 43 | ) 44 | for candidate in ret.candidates: 45 | parts = candidate.content.parts 46 | if parts: 47 | outputs.append(parts[0].text) 48 | else: 49 | print("Empty response!") 50 | outputs.append("") 51 | print(f"{candidate.safety_ratings = }") 52 | all_outputs.append(outputs) 53 | return 
all_outputs 54 | 55 | def is_direct_completion(self) -> bool: 56 | return False -------------------------------------------------------------------------------- /bigcodebench/provider/hf.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | from stop_sequencer import StopSequencer 5 | from transformers import AutoModelForCausalLM, AutoTokenizer 6 | 7 | from bigcodebench.provider.base import DecoderBase 8 | from bigcodebench.provider.utility import ( 9 | extra_eos_for_direct_completion, 10 | make_raw_chat_prompt, 11 | ) 12 | 13 | 14 | class HuggingFaceDecoder(DecoderBase): 15 | def __init__( 16 | self, 17 | name: str, 18 | dataset: str, 19 | attn_implementation: str = "eager", 20 | **kwargs, 21 | ): 22 | super().__init__(name=name, **kwargs) 23 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 24 | 25 | kwargs = { 26 | "device_map": "auto", 27 | "trust_remote_code": self.trust_remote_code, 28 | "torch_dtype": getattr(torch, self.dtype), 29 | "attn_implementation": attn_implementation, # "eager", "flash_attention_2", "sdpa" 30 | "revision": self.revision, 31 | } 32 | self.skip_special_tokens = True 33 | 34 | print(f"{kwargs = }") 35 | 36 | self.tokenizer = AutoTokenizer.from_pretrained(name, use_fast=False, legacy=self.tokenizer_legacy) 37 | self.tokenizer.pad_token = self.tokenizer.eos_token 38 | # assume the model is decoder-only 39 | self.tokenizer.padding_side = 'left' 40 | 41 | if self.is_direct_completion(): # no chat template 42 | self.eos += extra_eos_for_direct_completion(dataset) 43 | else: # with chat template 44 | if self.prefill and "```" in self.response_prefix: 45 | self.eos += ["\n```\n"] 46 | 47 | print(f"{self.eos = }") 48 | self.model = AutoModelForCausalLM.from_pretrained(name, **kwargs) 49 | 50 | def is_direct_completion(self) -> bool: 51 | return self.direct_completion or self.tokenizer.chat_template is None 52 | 53 | @torch.inference_mode() 54 | def codegen( 55 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 56 | ) -> List[str]: 57 | if self.temperature == 0: 58 | assert not do_sample 59 | assert num_samples == 1 60 | 61 | prompts = [ 62 | prompt 63 | if self.is_direct_completion() 64 | else make_raw_chat_prompt( 65 | prompt, self.subset, self.split, self.instruction_prefix, self.response_prefix, self.tokenizer, self.direct_completion 66 | ) 67 | for prompt in prompts 68 | ] 69 | 70 | input_tokens = self.tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to( 71 | self.device 72 | )["input_ids"] 73 | 74 | kwargs = {} 75 | if do_sample: 76 | kwargs["top_p"] = 0.95 77 | kwargs["temperature"] = self.temperature 78 | ret = self.model.generate( 79 | input_tokens, 80 | max_new_tokens=self.max_new_tokens, 81 | do_sample=do_sample, 82 | num_return_sequences=num_samples, 83 | pad_token_id=self.tokenizer.eos_token_id, 84 | stop_strings=self.eos, 85 | tokenizer=self.tokenizer, 86 | **kwargs, 87 | ) 88 | 89 | # Reshape ret into a list of lists, each sublist containing num_samples elements 90 | ret_chunks = [ret[i:i + num_samples] for i in range(0, len(ret), num_samples)] 91 | 92 | all_outputs = [] 93 | # Process each chunk in ret_chunks 94 | for i, ret_chunk in enumerate(ret_chunks): 95 | gen_strs = self.tokenizer.batch_decode( 96 | ret_chunk[:, input_tokens[i].size(-1):], 97 | skip_special_tokens=self.skip_special_tokens, 98 | ) 99 | outputs = [] 100 | for output in gen_strs: 101 | min_index = 10000 102 | for eos in self.eos: 
103 | if eos in output: 104 | min_index = min(min_index, output.index(eos)) 105 | outputs.append(output[:min_index].replace("\t", " ")) 106 | all_outputs.append(outputs) 107 | return all_outputs -------------------------------------------------------------------------------- /bigcodebench/provider/hf_inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | from tqdm import tqdm 4 | 5 | from huggingface_hub import InferenceClient 6 | 7 | from bigcodebench.provider.base import DecoderBase 8 | from bigcodebench.gen.util.hf_inference_request import make_auto_request 9 | from bigcodebench.provider.utility import make_raw_chat_prompt 10 | 11 | 12 | class HuggingFaceInferenceDecoder(DecoderBase): 13 | def __init__(self, name: str, **kwargs): 14 | super().__init__(name, **kwargs) 15 | self.client = InferenceClient( 16 | provider="hf-inference", api_key=os.getenv("HF_INFERENCE_API_KEY") 17 | ) 18 | 19 | def codegen( 20 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 21 | ) -> List[str]: 22 | if do_sample: 23 | assert self.temperature > 0, "Temperature must be positive for sampling" 24 | 25 | all_outputs = [] 26 | 27 | for prompt in tqdm(prompts): 28 | outputs = [] 29 | message = ( 30 | prompt 31 | if self.is_direct_completion() 32 | else make_raw_chat_prompt( 33 | task_prompt=prompt, 34 | subset=self.subset, 35 | split=self.split, 36 | instruction_prefix=self.instruction_prefix, 37 | response_prefix=self.response_prefix, 38 | tokenizer=None, 39 | ) 40 | ) 41 | ret = make_auto_request( 42 | self.client, 43 | message=message, 44 | model=self.name, 45 | n=num_samples, 46 | temperature=self.temperature, 47 | max_new_tokens=self.max_new_tokens, 48 | ) 49 | outputs.append(ret) 50 | all_outputs.append(outputs) 51 | return all_outputs 52 | 53 | def is_direct_completion(self) -> bool: 54 | return self.direct_completion 55 | -------------------------------------------------------------------------------- /bigcodebench/provider/mistral.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | from tqdm import tqdm 4 | 5 | from mistralai.client import MistralClient 6 | from mistralai.models.chat_completion import ChatMessage 7 | 8 | from bigcodebench.provider.base import DecoderBase 9 | from bigcodebench.gen.util.mistral_request import make_auto_request 10 | from bigcodebench.provider.utility import make_raw_chat_prompt 11 | 12 | class MistralChatDecoder(DecoderBase): 13 | def __init__(self, name: str, **kwargs) -> None: 14 | super().__init__(name, **kwargs) 15 | self.client = MistralClient(api_key=os.getenv("MISTRAL_API_KEY")) 16 | 17 | def codegen( 18 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 19 | ) -> List[str]: 20 | if do_sample: 21 | assert self.temperature > 0, "Temperature must be positive for sampling" 22 | 23 | all_outputs = [] 24 | for prompt in tqdm(prompts): 25 | outputs = [] 26 | 27 | for _ in range(num_samples): 28 | ret = make_auto_request( 29 | client=self.client, 30 | model=self.name, 31 | messages=[ 32 | ChatMessage( 33 | role="user", 34 | content=make_raw_chat_prompt( 35 | task_prompt=prompt, 36 | subset=self.subset, 37 | split=self.split, 38 | instruction_prefix=self.instruction_prefix, 39 | response_prefix=self.response_prefix, 40 | tokenizer=None, 41 | direct_completion=None, 42 | ) 43 | ) 44 | ], 45 | max_tokens=self.max_new_tokens, 46 | ) 47 | 
outputs.append(ret.choices[0].message.content) 48 | all_outputs.append(outputs) 49 | return all_outputs 50 | 51 | def is_direct_completion(self) -> bool: 52 | return False -------------------------------------------------------------------------------- /bigcodebench/provider/openai.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | from tqdm import tqdm 4 | import openai 5 | 6 | from bigcodebench.gen.util.openai_request import make_auto_request 7 | from bigcodebench.provider.utility import make_raw_chat_prompt 8 | from bigcodebench.provider.base import DecoderBase 9 | from bigcodebench.provider.utility import concurrent_call 10 | 11 | class OpenAIChatDecoder(DecoderBase): 12 | def __init__(self, name: str, base_url=None, reasoning_effort="medium", **kwargs) -> None: 13 | super().__init__(name, **kwargs) 14 | self.base_url = base_url 15 | self.reasoning_effort = reasoning_effort 16 | 17 | def codegen( 18 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 19 | ) -> List[str]: 20 | if do_sample: 21 | assert self.temperature > 0, "Temperature must be positive for sampling" 22 | messages = [make_raw_chat_prompt( 23 | task_prompt=prompt, 24 | subset=self.subset, 25 | split=self.split, 26 | instruction_prefix=self.instruction_prefix, 27 | response_prefix=self.response_prefix, 28 | tokenizer=None, 29 | ) for prompt in prompts] 30 | # use concurrency based batching for o1 and deepseek models 31 | if any(self.name.startswith(model) or self.name.endswith(model) for model in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]): 32 | return self._codegen_batch_via_concurrency(messages, num_samples) 33 | 34 | return self._codegen_api_batch(messages, num_samples) 35 | 36 | def _codegen_api_batch(self, messages: List[str], num_samples: int) -> List[str]: 37 | client = openai.OpenAI( 38 | api_key=os.getenv("OPENAI_API_KEY", "none"), base_url=self.base_url 39 | ) 40 | 41 | all_outputs = [] 42 | for message in tqdm(messages): 43 | ret = make_auto_request( 44 | client, 45 | message=message, 46 | model=self.name, 47 | max_tokens=self.max_new_tokens, 48 | temperature=self.temperature, 49 | reasoning_effort=self.reasoning_effort, 50 | n=num_samples, 51 | ) 52 | outputs = [] 53 | for item in ret.choices: 54 | outputs.append(item.message.content) 55 | all_outputs.append(outputs) 56 | return all_outputs 57 | 58 | def _codegen_batch_via_concurrency(self, messages: List[str], num_samples: int) -> List[str]: 59 | batches = concurrent_call( 60 | num_samples, self._codegen_api_batch, messages, num_samples=1 61 | ) 62 | return [[element for sublist in item for element in sublist] for item in zip(*batches)] 63 | 64 | def is_direct_completion(self) -> bool: 65 | return False -------------------------------------------------------------------------------- /bigcodebench/provider/utility.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from transformers import AutoTokenizer 3 | from concurrent.futures import ThreadPoolExecutor 4 | 5 | EOS = [ 6 | "<|endoftext|>", 7 | "<|endofmask|>", 8 | "", 9 | "\nif __name__", 10 | "\ndef main(", 11 | "\nprint(", 12 | ] 13 | 14 | 15 | def extra_eos_for_direct_completion(dataset) -> List[str]: 16 | if dataset.lower() == "bigcodebench": 17 | return ["\ndef ", "\nclass ", "\nimport ", "\nfrom ", "\nassert "] 18 | raise ValueError(f"Unknown dataset: {dataset}") 19 | 20 | 21 | # some random words which serves as the splitter 22 | 
_MAGIC_SPLITTER_ = "-[[]]-this-is-really-our-highest-priority-[[]]-" 23 | 24 | 25 | def make_raw_chat_prompt( 26 | task_prompt: str, 27 | subset: str, 28 | split: str, 29 | instruction_prefix: str, 30 | response_prefix: str, 31 | tokenizer: AutoTokenizer, 32 | prefill: bool = True, 33 | direct_completion: bool = False, 34 | ) -> str: 35 | # directly return prompt if it does not have a tokenizer.chat_template 36 | if tokenizer: 37 | if tokenizer.chat_template is None or direct_completion: 38 | return task_prompt 39 | 40 | assert instruction_prefix is not None, "Instruction prefix is required!" 41 | assert response_prefix is not None, "Response prefix is required!" 42 | 43 | if split == "complete": 44 | task_prompt = f"""\ 45 | {instruction_prefix} 46 | ``` 47 | {task_prompt.strip()} 48 | ``` 49 | """ 50 | else: 51 | task_prompt = f"""\ 52 | {instruction_prefix} 53 | {task_prompt.strip()} 54 | """ 55 | response = f"""\ 56 | {response_prefix} 57 | ```python 58 | {_MAGIC_SPLITTER_} 59 | ``` 60 | """ 61 | if tokenizer: 62 | if prefill: 63 | task_prompt = tokenizer.apply_chat_template( 64 | [ 65 | {"role": "user", "content": task_prompt}, 66 | {"role": "assistant", "content": response}, 67 | ], 68 | tokenize=False, 69 | ).split(_MAGIC_SPLITTER_)[0] 70 | else: 71 | task_prompt = tokenizer.apply_chat_template( 72 | [ 73 | {"role": "user", "content": task_prompt}, 74 | ], 75 | tokenize=False, add_generation_prompt=True 76 | ).split(_MAGIC_SPLITTER_)[0] 77 | return task_prompt 78 | 79 | 80 | def concurrent_call(n, callback, /, *args, **kwargs): 81 | with ThreadPoolExecutor(max_workers=n) as executor: 82 | futures = [executor.submit(callback, *args, **kwargs) for _ in range(n)] 83 | return [future.result() for future in futures] -------------------------------------------------------------------------------- /bigcodebench/provider/vllm.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | 4 | from transformers import AutoTokenizer 5 | from vllm import LLM, SamplingParams 6 | from vllm.lora.request import LoRARequest 7 | from huggingface_hub import snapshot_download 8 | 9 | from bigcodebench.provider.base import DecoderBase 10 | from bigcodebench.provider.utility import ( 11 | extra_eos_for_direct_completion, 12 | make_raw_chat_prompt, 13 | ) 14 | 15 | class VllmDecoder(DecoderBase): 16 | def __init__(self, name: str, lora_path: str, dataset: str, tp: int, max_model_len: int, **kwargs) -> None: 17 | super().__init__(name, **kwargs) 18 | 19 | kwargs = { 20 | "tensor_parallel_size": int(os.getenv("VLLM_N_GPUS", tp)), 21 | "dtype": self.dtype, 22 | "trust_remote_code": self.trust_remote_code, 23 | "revision": self.revision, 24 | } 25 | if self.tokenizer_name is None: 26 | self.tokenizer_name = self.name 27 | 28 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, **kwargs, legacy=self.tokenizer_legacy) 29 | if self.is_direct_completion(): 30 | self.eos += extra_eos_for_direct_completion(dataset) 31 | else: 32 | if self.prefill and "```" in self.response_prefix: 33 | self.eos += ["\n```\n"] 34 | 35 | self.lora_request = None 36 | if lora_path: 37 | local_lora_path = snapshot_download(lora_path) 38 | self.lora_request = LoRARequest( 39 | "lora", 40 | 1, 41 | local_lora_path, 42 | ) 43 | 44 | self.llm = LLM(model=name, max_model_len=max_model_len, enable_lora=True if self.lora_request else False, **kwargs) 45 | self.llm.set_tokenizer(tokenizer=self.tokenizer) 46 | 47 | def is_direct_completion(self) -> bool: 48 | 
return self.tokenizer.chat_template is None or self.direct_completion 49 | 50 | def codegen( 51 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 52 | ) -> List[str]: 53 | if do_sample: 54 | assert self.temperature > 0, "Temperature must be greater than 0!" 55 | 56 | prompts = [ 57 | make_raw_chat_prompt( 58 | task_prompt=prompt, 59 | subset=self.subset, 60 | split=self.split, 61 | instruction_prefix=self.instruction_prefix, 62 | response_prefix=self.response_prefix, 63 | prefill=self.prefill, 64 | tokenizer=self.tokenizer, 65 | direct_completion=self.direct_completion, 66 | ) 67 | for prompt in prompts 68 | ] 69 | vllm_outputs = self.llm.generate( 70 | prompts, 71 | SamplingParams( 72 | n=num_samples, 73 | temperature=self.temperature, 74 | max_tokens=self.max_new_tokens, 75 | top_p=0.95 if do_sample else 1.0, 76 | stop=self.eos, 77 | skip_special_tokens=self.skip_special_tokens, 78 | ), 79 | lora_request=self.lora_request, 80 | use_tqdm=True, 81 | ) 82 | 83 | gen_strs = [[x.text.replace("\t", " ") for x in output.outputs] for output in vllm_outputs] 84 | return gen_strs -------------------------------------------------------------------------------- /bigcodebench/sanitize.py: -------------------------------------------------------------------------------- 1 | """Post-processing LLM-generated Python code implemented using tree-sitter.""" 2 | 3 | import os 4 | import pathlib 5 | from typing import Dict, Generator, List, Optional, Set, Tuple 6 | from pqdm.processes import pqdm 7 | 8 | from tqdm import tqdm 9 | import tree_sitter_python 10 | from tree_sitter import Language, Node, Parser 11 | 12 | from bigcodebench.data import ( 13 | get_bigcodebench, 14 | load_solutions, 15 | write_directory, 16 | write_jsonl, 17 | ) 18 | from bigcodebench.syncheck import syntax_check 19 | 20 | CLASS_TYPE = "class_definition" 21 | FUNCTION_TYPE = "function_definition" 22 | IMPORT_TYPE = ["import_statement", "import_from_statement"] 23 | IDENTIFIER_TYPE = "identifier" 24 | ATTRIBUTE_TYPE = "attribute" 25 | RETURN_TYPE = "return_statement" 26 | EXPRESSION_TYPE = "expression_statement" 27 | ASSIGNMENT_TYPE = "assignment" 28 | 29 | 30 | def code_extract(text: str) -> str: 31 | lines = text.split("\n") 32 | longest_line_pair = (0, 0) 33 | longest_so_far = 0 34 | 35 | for i in range(len(lines)): 36 | for j in range(i + 1, len(lines)): 37 | current_lines = "\n".join(lines[i : j + 1]) 38 | if syntax_check(current_lines): 39 | current_length = sum(1 for line in lines[i : j + 1] if line.strip()) 40 | if current_length > longest_so_far: 41 | longest_so_far = current_length 42 | longest_line_pair = (i, j) 43 | 44 | return "\n".join(lines[longest_line_pair[0] : longest_line_pair[1] + 1]) 45 | 46 | 47 | def get_deps(nodes: List[Tuple[str, Node]]) -> Dict[str, Set[str]]: 48 | 49 | def dfs_get_deps(node: Node, deps: Set[str]) -> None: 50 | for child in node.children: 51 | if child.type == IDENTIFIER_TYPE: 52 | deps.add(child.text.decode("utf8")) 53 | else: 54 | dfs_get_deps(child, deps) 55 | 56 | name2deps = {} 57 | for name, node in nodes: 58 | deps = set() 59 | dfs_get_deps(node, deps) 60 | name2deps[name] = deps 61 | return name2deps 62 | 63 | 64 | def get_function_dependency(entrypoint: str, call_graph: Dict[str, str]) -> Set[str]: 65 | queue = [entrypoint] 66 | visited = {entrypoint} 67 | while queue: 68 | current = queue.pop(0) 69 | if current not in call_graph: 70 | continue 71 | for neighbour in call_graph[current]: 72 | if not (neighbour in visited): 73 | visited.add(neighbour) 74 | 
queue.append(neighbour) 75 | return visited 76 | 77 | 78 | def get_definition_name(node: Node) -> str: 79 | for child in node.children: 80 | if child.type == IDENTIFIER_TYPE: 81 | return child.text.decode("utf8") 82 | 83 | 84 | def traverse_tree(node: Node) -> Generator[Node, None, None]: 85 | cursor = node.walk() 86 | depth = 0 87 | 88 | visited_children = False 89 | while True: 90 | if not visited_children: 91 | yield cursor.node 92 | if not cursor.goto_first_child(): 93 | depth += 1 94 | visited_children = True 95 | elif cursor.goto_next_sibling(): 96 | visited_children = False 97 | elif not cursor.goto_parent() or depth == 0: 98 | break 99 | else: 100 | depth -= 1 101 | 102 | 103 | def has_return_statement(node: Node) -> bool: 104 | traverse_nodes = traverse_tree(node) 105 | for node in traverse_nodes: 106 | if node.type == RETURN_TYPE: 107 | return True 108 | return False 109 | 110 | 111 | def extract_target_code_or_empty(code: str, entrypoint: Optional[str] = None) -> str: 112 | code = code_extract(code.strip()) 113 | code_bytes = bytes(code, "utf8") 114 | parser = Parser(Language(tree_sitter_python.language())) 115 | tree = parser.parse(code_bytes) 116 | class_names = set() 117 | function_names = set() 118 | variable_names = set() 119 | 120 | root_node = tree.root_node 121 | import_nodes = [] 122 | definition_nodes = [] 123 | 124 | for child in root_node.children: 125 | if child.type in IMPORT_TYPE: 126 | import_nodes.append(child) 127 | elif child.type == CLASS_TYPE: 128 | name = get_definition_name(child) 129 | if not ( 130 | name in class_names or name in variable_names or name in function_names 131 | ): 132 | definition_nodes.append((name, child)) 133 | class_names.add(name) 134 | elif child.type == FUNCTION_TYPE: 135 | name = get_definition_name(child) 136 | if not ( 137 | name in function_names or name in variable_names or name in class_names 138 | ): 139 | definition_nodes.append((name, child)) 140 | function_names.add(get_definition_name(child)) 141 | elif ( 142 | child.type == EXPRESSION_TYPE and child.children[0].type == ASSIGNMENT_TYPE 143 | ): 144 | subchild = child.children[0] 145 | name = get_definition_name(subchild) 146 | if not ( 147 | name in variable_names or name in function_names or name in class_names 148 | ): 149 | definition_nodes.append((name, subchild)) 150 | variable_names.add(name) 151 | 152 | if entrypoint: 153 | name2deps = get_deps(definition_nodes) 154 | reacheable = get_function_dependency(entrypoint, name2deps) 155 | 156 | sanitized_output = b"" 157 | 158 | for node in import_nodes: 159 | sanitized_output += code_bytes[node.start_byte : node.end_byte] + b"\n" 160 | 161 | for pair in definition_nodes: 162 | name, node = pair 163 | if entrypoint and not (name in reacheable): 164 | continue 165 | sanitized_output += code_bytes[node.start_byte : node.end_byte] + b"\n" 166 | 167 | sanitized_output = sanitized_output[:-1].decode("utf8") 168 | 169 | # ad-hoc approach to remove unnecessary lines, but it works 170 | lines = sanitized_output.splitlines() 171 | outer_lines = [] 172 | for i in range(len(lines) - 1, -1, -1): 173 | if lines[i].startswith(" "): 174 | break 175 | if not lines[i].startswith(" ") and entrypoint in lines[i]: 176 | outer_lines.append(i) 177 | if outer_lines: 178 | sanitized_output = "\n".join(lines[: outer_lines[-1]]) 179 | return sanitized_output 180 | 181 | 182 | def sanitize(code: str, entrypoint: Optional[str] = None) -> str: 183 | sanitized_code = extract_target_code_or_empty(code, entrypoint).strip() 184 | if not sanitized_code: 
185 | return code_extract(code) 186 | return sanitized_code 187 | 188 | 189 | def process_solution( 190 | sample_solution: Dict, 191 | dataset: Dict, 192 | entry_point: Dict, 193 | debug_task: str = None, 194 | calibrate: bool = False, 195 | is_folder: bool = False, 196 | target_path: str = None, 197 | ): 198 | 199 | task_id = sample_solution.get("task_id") 200 | if not task_id or task_id not in dataset: 201 | return None 202 | 203 | dbg_identifier = sample_solution["_identifier"] 204 | if debug_task is not None and task_id != debug_task: 205 | return None 206 | 207 | function_name = entry_point.get(task_id) 208 | old_code = sample_solution.get("solution") 209 | 210 | if old_code is None: 211 | assert "completion" in sample_solution, sample_solution 212 | old_code = dataset[task_id]["complete_prompt"] + "\n" + sample_solution.get("completion") 213 | else: 214 | if calibrate: 215 | old_code = old_code.replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ") 216 | 217 | new_code = sanitize(code=old_code, entrypoint=function_name) 218 | 219 | # if old code and new code are different, print msg 220 | if new_code != old_code: 221 | msg = "Sanitized: " + dbg_identifier 222 | if is_folder: 223 | msg += " -> " + dbg_identifier.replace(samples, target_path) 224 | print(msg) 225 | 226 | return {"task_id": task_id, "solution": new_code} 227 | 228 | 229 | def script( 230 | samples: str, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 231 | ): 232 | # task_id -> entry_point 233 | entry_point = {} 234 | # merge two datasets 235 | dataset = {**get_bigcodebench()} 236 | 237 | for task_id, problem in dataset.items(): 238 | entry_point[task_id] = problem["entry_point"] 239 | 240 | # make a new folder with "-sanitized" suffix 241 | is_folder = os.path.isdir(samples) 242 | target_path = pathlib.Path(samples) 243 | if not inplace: 244 | if is_folder: 245 | if calibrate: 246 | new_name = target_path.name + "-sanitized-calibrated" 247 | else: 248 | new_name = target_path.name + "-sanitized" 249 | else: 250 | if calibrate: 251 | new_name = target_path.name.replace(".jsonl", "-sanitized-calibrated.jsonl") 252 | else: 253 | new_name = target_path.name.replace(".jsonl", "-sanitized.jsonl") 254 | target_path = target_path.parent / new_name 255 | target_path = str(target_path) 256 | 257 | nsan = 0 258 | ntotal = 0 259 | 260 | new_solutions = [] 261 | 262 | parallel_arg_list = [ 263 | { 264 | "sample_solution": sample_solution, 265 | "dataset": dataset, 266 | "entry_point": entry_point, 267 | "debug_task": debug_task, 268 | "calibrate": calibrate, 269 | "is_folder": is_folder, 270 | "target_path": target_path 271 | } 272 | for sample_solution in load_solutions(samples) 273 | ] 274 | 275 | results = pqdm(parallel_arg_list, process_solution, n_jobs=min(parallel, os.cpu_count()), argument_type="kwargs") 276 | 277 | for result in results: 278 | if result is not None: 279 | new_solutions.append(result) 280 | nsan += 1 281 | ntotal += 1 282 | 283 | if is_folder: 284 | write_directory(target_path, new_solutions) 285 | else: 286 | write_jsonl(target_path, new_solutions) 287 | 288 | if nsan > 0: 289 | print(f"Sanitized {nsan} out of {ntotal} files.") 290 | else: 291 | print(f"All files seems valid -- no files are sanitized.") 292 | print(f"Check the sanitized files at {target_path}") 293 | 294 | 295 | def main(): 296 | from fire import Fire 297 | 298 | Fire(script) 299 | 300 | 301 | if __name__ == "__main__": 302 | main() 303 | 
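A minimal usage sketch of the sanitizer defined above, driving it programmatically instead of through the Fire CLI. The raw model output and the `task_func` entry point below are made-up illustrations, not taken from the repository; the import path matches the package layout shown in the tests.

from bigcodebench.sanitize import sanitize

# Hypothetical raw LLM output: prose wrapped around a code block.
raw_output = '''Here is the solution:
```python
import math

def task_func(x):
    return math.sqrt(x)

print(task_func(4))
```'''

# code_extract() keeps the longest syntactically valid span, then the tree-sitter
# pass keeps only imports plus definitions reachable from the entry point, so the
# surrounding prose and the top-level print() call are dropped.
clean = sanitize(raw_output, entrypoint="task_func")
print(clean)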
-------------------------------------------------------------------------------- /bigcodebench/syncheck.py: -------------------------------------------------------------------------------- 1 | """This file checks two things: 2 | 1. Is the LLMs codegen completed for each benchmark? 3 | 2. Warn the code that are not compilable (it could be some impl issues). 4 | """ 5 | 6 | import ast 7 | import traceback 8 | 9 | from termcolor import colored 10 | 11 | from bigcodebench.data import load_solutions 12 | 13 | 14 | def syntax_check(code, verbose=False): 15 | try: 16 | ast.parse(code) 17 | return True 18 | except (SyntaxError, MemoryError): 19 | if verbose: 20 | traceback.print_exc() 21 | return False 22 | 23 | 24 | def script( 25 | samples: str, nsample_check: int = None, verbose: bool = False 26 | ): 27 | # List[Dict{"task_id", "solution"}] 28 | solutions = load_solutions(samples) 29 | 30 | from bigcodebench.data import get_bigcodebench 31 | 32 | dataset = get_bigcodebench() 33 | dataset_name = "BigCodeBench" 34 | 35 | print(colored(f"Dataset: {dataset_name}", "blue")) 36 | 37 | id2solutions = {} 38 | for solution in solutions: 39 | task_id = solution["task_id"] 40 | if task_id not in id2solutions: 41 | id2solutions[task_id] = [] 42 | if "solution" not in solution: 43 | assert "completion" in solution, "solution or completion must exist!" 44 | solution["solution"] = dataset[task_id]["complete_prompt"] + solution["completion"] 45 | id2solutions[task_id].append(solution) 46 | 47 | print(colored("==============================", "blue")) 48 | print(colored(" ::: Checking completeness... ", "blue")) 49 | print(colored(" ::::: All tasks complete? ", "blue")) 50 | ndone = 0 51 | 52 | task_ids = dataset.keys() 53 | ntask = len(task_ids) 54 | for task_id in task_ids: 55 | if task_id not in id2solutions: 56 | print(colored(f" ⚠️ {task_id} is missing!", "red")) 57 | continue 58 | nfiles = len(id2solutions[task_id]) 59 | 60 | if nsample_check is None or nfiles <= nsample_check: 61 | ndone += 1 62 | continue 63 | 64 | print( 65 | colored( 66 | f" ⚠️ {task_id} only has {nfiles} samples! But {nsample_check} are expected.", 67 | "red", 68 | ) 69 | ) 70 | 71 | # check if there is enough number of samples here. 72 | if nsample_check is not None: 73 | if ntask != ndone: 74 | ntbd = ntask - ndone 75 | print(colored(f" ::::: ⚠️ {ntbd}/{ntask} tasks incomplete!", "red")) 76 | else: 77 | print(colored(f" ::::: All {ntask} tasks complete!", "green")) 78 | 79 | print(colored("==============================", "blue")) 80 | print(colored(" ::: Checking compilation... ", "blue")) 81 | print(colored(" ::::: All code compilable? 
", "blue")) 82 | ncode = 0 83 | nwrong = 0 84 | for task_id in task_ids: 85 | # task_id must exist 86 | if task_id not in id2solutions: 87 | continue 88 | 89 | for solution in id2solutions[task_id]: 90 | ncode += 1 91 | code = solution["solution"] 92 | dbg_identifier = solution["_identifier"] 93 | if code.strip() == "": 94 | print(colored(f" ⚠️ {dbg_identifier} is empty!", "red")) 95 | nwrong += 1 96 | elif not syntax_check(code, verbose): 97 | print(colored(f" ⚠️ {dbg_identifier} is not compilable!", "red")) 98 | nwrong += 1 99 | if 0 != nwrong: 100 | print(colored(f" ::::: ⚠️ {nwrong}/{ncode} code are not compilable!", "red")) 101 | else: 102 | print(colored(f" ::::: All {ncode} code are compilable!", "green")) 103 | 104 | 105 | def main(): 106 | from fire import Fire 107 | 108 | Fire(script) 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /decontamination/n_gram_check.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, load_from_disk 2 | from collections import Counter 3 | import tiktoken 4 | from nltk import ngrams 5 | from tqdm import tqdm 6 | import datasets 7 | 8 | def has_overlap(sample_1, sample_2): 9 | """Check if there is any N-gram overlap between the long string and a given string.""" 10 | return not set(sample_1).isdisjoint(set(sample_2)) 11 | 12 | from concurrent.futures import ThreadPoolExecutor, as_completed 13 | 14 | def calculate_overlap_percentage(samples_1, samples_2): 15 | def check_sample(sample): 16 | for long_sample in samples_2: 17 | if has_overlap(sample, long_sample["ngram"]): 18 | return 1 19 | return 0 20 | 21 | count = 0 22 | with ThreadPoolExecutor() as executor: 23 | futures = [executor.submit(check_sample, sample) for sample in samples_1] 24 | for future in tqdm(as_completed(futures), total=len(futures)): 25 | count += future.result() 26 | 27 | return count / len(samples_1) * 100 28 | 29 | def load_odex_data(n=10): 30 | def map_ngram(sample): 31 | return {"ngram": set([" ".join(ngram) for ngram in ngrams(sample["intent"].split(), n)])} 32 | dataset = load_dataset("neulab/odex", "en", split="test") 33 | dataset = dataset.map(map_ngram, num_proc=16, batch_size=16, remove_columns=dataset.column_names) 34 | return dataset 35 | 36 | def load_stackoverflow(n=10): 37 | def map_ngram(sample): 38 | return {"ngram": set([" ".join(ngram) for ngram in ngrams(sample["question"].split(), n)])} 39 | dataset = load_dataset("bigcode/stack-exchange-preferences-20230914-clean-anonymization", split="train") 40 | dataset = dataset.map(map_ngram, num_proc=16, batch_size=16, remove_columns=dataset.column_names) 41 | dataset.push_to_hub(f"stackoverflow_ngram_{n}") 42 | return dataset 43 | 44 | 45 | def load_starcoderdata(n=10): 46 | def map_ngram(sample): 47 | return {"ngram": set([" ".join(ngram) for ngram in ngrams(sample["content"].split(), n)])} 48 | dataset = load_dataset("bigcode/starcoderdata", data_dir="python", split="train") 49 | dataset = dataset.map(map_ngram, num_proc=16, batch_size=16, remove_columns=dataset.column_names) 50 | dataset.push_to_hub(f"starcoderdata_ngram_{n}") 51 | return dataset 52 | 53 | def load_bigcodebench(n=10): 54 | def map_ngram(sample): 55 | return {"ngram": set([" ".join(ngram) for ngram in ngrams(sample["instruct_prompt"].split("```")[0].split(), n)])} 56 | dataset = load_dataset("bigcode/bigcodebench", split="v0.1.0_hf") 57 | dataset = dataset.map(map_ngram, num_proc=16, 
batch_size=16, remove_columns=dataset.column_names) 58 | dataset.push_to_hub(f"bigcodebench_ngram_{n}") 59 | return dataset 60 | 61 | 62 | if __name__ == "__main__": 63 | n_gram_size = 10 64 | N_SHARDS = 50 65 | user_name = "terryyz" 66 | bigcodebench = load_dataset(f"{user_name}/bigcodebench_ngram_{n_gram_size}", split="train") 67 | 68 | dataset_name = "starcoderdata" 69 | print(dataset_name, n_gram_size) 70 | indices = [] 71 | for i in tqdm(range(N_SHARDS)): 72 | ds = load_dataset(f"{user_name}/{dataset_name}_ngram_{n_gram_size}_overlap_{i}", split="train") 73 | overlap_indices = [idx for idx, example in enumerate(ds) if example["overlap"]] 74 | indices.extend(overlap_indices) 75 | with open(f"{dataset_name}_ngram_{n_gram_size}_overlap.txt", "w") as f: 76 | f.write(f"{len(set(indices))/1140*100:.2f}%") -------------------------------------------------------------------------------- /decontamination/odex_10_overlap.txt: -------------------------------------------------------------------------------- 1 | 0.09% -------------------------------------------------------------------------------- /decontamination/odex_13_overlap.txt: -------------------------------------------------------------------------------- 1 | odex: 0.00% -------------------------------------------------------------------------------- /decontamination/stackoverflow_10_overlap.txt: -------------------------------------------------------------------------------- 1 | 1.49% -------------------------------------------------------------------------------- /decontamination/stackoverflow_13_overlap.txt: -------------------------------------------------------------------------------- 1 | 0.18% -------------------------------------------------------------------------------- /decontamination/starcoderdata_10_overlap.txt: -------------------------------------------------------------------------------- 1 | 2.54% -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools_scm] 6 | write_to = "bigcodebench/_version.py" 7 | version_scheme = "release-branch-semver" 8 | local_scheme = "no-local-version" 9 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | # argument version 2 | 3 | set -eux 4 | 5 | while getopts "v:" opt; do 6 | case $opt in 7 | v) 8 | version=$OPTARG 9 | ;; 10 | \?) 11 | echo "Invalid option: -$OPTARG" >&2 12 | ;; 13 | esac 14 | done 15 | 16 | if [ -z "$version" ]; then 17 | echo "version is required" 18 | exit 1 19 | fi 20 | 21 | export PYTHONPATH=$PWD pytest tests 22 | 23 | git tag $version 24 | 25 | rm -rf dist 26 | python3 -m build 27 | python3 -m twine upload dist/* 28 | 29 | git push 30 | git push --tags -------------------------------------------------------------------------------- /release_docker.sh: -------------------------------------------------------------------------------- 1 | # argument version 2 | 3 | set -eux 4 | 5 | while getopts "v:" opt; do 6 | case $opt in 7 | v) 8 | version=$OPTARG 9 | ;; 10 | \?) 
11 | echo "Invalid option: -$OPTARG" >&2 12 | ;; 13 | esac 14 | done 15 | 16 | if [ -z "$version" ]; then 17 | echo "version is required" 18 | exit 1 19 | fi 20 | 21 | export PYTHONPATH=$PWD pytest tests 22 | 23 | docker buildx create --name multiplatform-builder --use || true 24 | docker buildx use multiplatform-builder 25 | 26 | # Build and push evaluate image 27 | docker buildx build --platform linux/amd64 \ 28 | -f Docker/Evaluate.Dockerfile . \ 29 | -t bigcodebench/bigcodebench-evaluate:$version \ 30 | -t bigcodebench/bigcodebench-evaluate:latest \ 31 | --push 32 | 33 | # Build and push gradio image 34 | docker buildx build --platform linux/amd64 \ 35 | -f Docker/Gradio.Dockerfile . \ 36 | -t bigcodebench/bigcodebench-gradio:$version \ 37 | -t bigcodebench/bigcodebench-gradio:latest \ 38 | --push -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | DATASET=bigcodebench 2 | MODEL=meta-llama/Meta-Llama-3.1-8B-Instruct 3 | BACKEND=vllm 4 | NUM_GPU=2 5 | SPLIT=complete 6 | SUBSET=full 7 | export E2B_API_KEY="e2b_0a231fa3b0a2b01690ab6c66a23b55c0979ce4ee" 8 | 9 | bigcodebench.evaluate \ 10 | --model $MODEL \ 11 | --split $SPLIT \ 12 | --subset $SUBSET \ 13 | --backend $BACKEND -------------------------------------------------------------------------------- /sandbox-templates/e2b.Dockerfile: -------------------------------------------------------------------------------- 1 | # Better use newer Python as generated code can use new features 2 | FROM python:3.10-slim 3 | 4 | # install git, g++ and python3-tk 5 | RUN apt-get update && apt-get install -y \ 6 | git \ 7 | g++ \ 8 | python3-tk \ 9 | zip \ 10 | unzip \ 11 | procps \ 12 | r-base \ 13 | libgdal-dev \ 14 | # Add these new dependencies for matplotlib 15 | libfreetype6-dev \ 16 | libpng-dev \ 17 | pkg-config \ 18 | python3-dev \ 19 | python3-matplotlib \ 20 | && rm -rf /var/lib/apt/lists/* 21 | 22 | # upgrade to latest pip 23 | RUN pip install --upgrade pip 24 | 25 | # Add a new user "bigcodebenchuser" 26 | RUN adduser --disabled-password --gecos "" bigcodebenchuser 27 | 28 | RUN rm -rf /bigcodebench 29 | 30 | RUN echo 1 31 | # Acquire benchmark code to local 32 | ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit 33 | RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench 34 | 35 | RUN pip install numpy==1.24.3 pyarrow==14.0.1 36 | 37 | RUN cd /bigcodebench && \ 38 | pip install . 
--no-deps 39 | 40 | RUN pip install --timeout 2000 \ 41 | appdirs \ 42 | fire \ 43 | multipledispatch \ 44 | pqdm \ 45 | tempdir \ 46 | termcolor \ 47 | tqdm \ 48 | transformers \ 49 | tree_sitter \ 50 | tree-sitter-python \ 51 | wget \ 52 | datasets \ 53 | gradio-client \ 54 | numpy \ 55 | rich \ 56 | e2b 57 | 58 | RUN pip install -I --timeout 2000 -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/refs/heads/main/Requirements/requirements-eval.txt 59 | 60 | # Ensure the numpy version is compatible with the datasets version 61 | RUN pip install datasets==2.17.0 62 | 63 | WORKDIR /app 64 | 65 | RUN chown -R bigcodebenchuser:bigcodebenchuser /app 66 | 67 | RUN chmod -R 777 /app && rm -rf /root/.cache/pip 68 | 69 | USER bigcodebenchuser -------------------------------------------------------------------------------- /sandbox-templates/e2b.toml: -------------------------------------------------------------------------------- 1 | # This is a config for E2B sandbox template. 2 | # You can use template ID (xs3c9i0hy53751xam77h) or template name (bigcodebench_evaluator) to create a sandbox: 3 | 4 | # Python SDK 5 | # from e2b import Sandbox, AsyncSandbox 6 | # sandbox = Sandbox("bigcodebench_evaluator") # Sync sandbox 7 | # sandbox = await AsyncSandbox.create("bigcodebench_evaluator") # Async sandbox 8 | 9 | # JS SDK 10 | # import { Sandbox } from 'e2b' 11 | # const sandbox = await Sandbox.create('bigcodebench_evaluator') 12 | 13 | team_id = "f317d0d2-ba02-44c5-8b77-e4a2d7830c7c" 14 | dockerfile = "e2b.Dockerfile" 15 | template_name = "bigcodebench_evaluator" 16 | template_id = "xs3c9i0hy53751xam77h" 17 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = bigcodebench 3 | description = "Evaluation package for BigCodeBench" 4 | long_description = file: README.md 5 | long_description_content_type = text/markdown 6 | url = https://github.com/bigcode-project/bigcodebench 7 | license = Apache-2.0 8 | license_files = LICENSE 9 | platform = any 10 | classifiers = 11 | Operating System :: OS Independent 12 | Programming Language :: Python :: 3 13 | License :: OSI Approved :: Apache Software License 14 | 15 | [options] 16 | packages = find: 17 | python_requires = >=3.8 18 | dependency_links = 19 | install_requires = 20 | appdirs>=1.4.4 21 | fire>=0.6.0 22 | multipledispatch>=0.6.0 23 | pqdm>=0.2.0 24 | tempdir>=0.7.1 25 | termcolor>=2.0.0 26 | tqdm>=4.56.0 27 | tree_sitter>=0.22.0 28 | tree-sitter-python>=0.21.0 29 | wget>=3.2 30 | transformers 31 | datasets 32 | gradio-client 33 | vllm 34 | numpy 35 | rich 36 | accelerate>=0.30.1 37 | anthropic>=0.26.1 38 | google-genai 39 | mistralai>=0.2.0,<1.0.0 40 | openai>=1.11.1 41 | e2b 42 | 43 | [options.entry_points] 44 | console_scripts = 45 | bigcodebench.evaluate = bigcodebench.evaluate:main 46 | bigcodebench.sanitize = bigcodebench.sanitize:main 47 | bigcodebench.syncheck = bigcodebench.syncheck:main 48 | bigcodebench.legacy_sanitize = bigcodebench.legacy_sanitize:main 49 | bigcodebench.generate = bigcodebench.generate:main 50 | bigcodebench.inspect = bigcodebench.inspect:main 51 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | -------------------------------------------------------------------------------- /tests/test_legacy_sanitizer.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from bigcodebench.lecacy_sanitize import sanitize 4 | 5 | 6 | def test_inline_fn(): 7 | assert ( 8 | sanitize( 9 | """\ 10 | def f(n): 11 | def factorial(i): 12 | if i == 0: 13 | return 1 14 | else: 15 | return i * factorial(i-1) 16 | 17 | result = [] 18 | for i in range(1, n+1): 19 | if i % 2 == 0: 20 | result.append(factorial(i)) 21 | else: 22 | result.append(sum(range(1, i+1))) 23 | return result 24 | 25 | # Test the function 26 | print(f(5))""", 27 | entry_point="f", 28 | ) 29 | == """\ 30 | def f(n): 31 | def factorial(i): 32 | if i == 0: 33 | return 1 34 | else: 35 | return i * factorial(i-1) 36 | 37 | result = [] 38 | for i in range(1, n+1): 39 | if i % 2 == 0: 40 | result.append(factorial(i)) 41 | else: 42 | result.append(sum(range(1, i+1))) 43 | return result""" 44 | ) 45 | -------------------------------------------------------------------------------- /tests/test_treesitter_sanitizer.py: -------------------------------------------------------------------------------- 1 | from bigcodebench.sanitize import code_extract, sanitize 2 | 3 | 4 | def test_code_extract(): 5 | test_simple = r"""Here is some python code generated 6 | import numpy as np 7 | Sorry, I made a mistake, let me try again 8 | from numpy import sin, cos, tan 9 | 10 | def f(x): 11 | return tan(x) 12 | As you can observe from above 13 | """ 14 | assert ( 15 | code_extract(test_simple) 16 | == r"""from numpy import sin, cos, tan 17 | 18 | def f(x): 19 | return tan(x)""" 20 | ) 21 | 22 | test_empty_lines = r"""import numpy as np 23 | 24 | 25 | import pandas 26 | Sorry, let me try again 27 | from numpy import sin, cos, tan 28 | def f(x): 29 | return tan(x) 30 | """ 31 | assert ( 32 | code_extract(test_empty_lines) 33 | == r"""from numpy import sin, cos, tan 34 | def f(x): 35 | return tan(x)""" 36 | ) 37 | 38 | 39 | def test_sanitize_simple(): 40 | icode = r"""Following is the code snippet: 41 | ```python 42 | import numpy as np 43 | from numpy import sin, cos 44 | 45 | def f(x): 46 | return np.tan(x) 47 | 48 | def g(x): 49 | return cos(f(x)) 50 | 51 | def g(x): 52 | return sin(f(x)) 53 | 54 | def c(x): 55 | assert 1==1 56 | 57 | assert g(0) == 1 58 | ``` 59 | """ 60 | assert ( 61 | sanitize(icode) 62 | == r"""import numpy as np 63 | from numpy import sin, cos 64 | def f(x): 65 | return np.tan(x) 66 | def g(x): 67 | return cos(f(x))""" 68 | ) 69 | 70 | 71 | def test_sanitize_class(): 72 | icode = r"""Following is the code snippet: 73 | ```python 74 | import numpy as np 75 | from numpy import sin, cos 76 | class g(): 77 | def hello_world(): 78 | return 0 79 | def f(x): 80 | print(g.hello_world()) 81 | return np.tan(x) 82 | ``` 83 | """ 84 | 85 | assert ( 86 | sanitize(icode) 87 | == r"""import numpy as np 88 | from numpy import sin, cos 89 | class g(): 90 | def hello_world(): 91 | return 0 92 | def f(x): 93 | print(g.hello_world()) 94 | return np.tan(x)""" 95 | ) 96 | 97 | 98 | def test_entrypoint_basic(): 99 | icode = r"""Following is the code snippet: 100 | ```python 101 | import numpy as np 102 | from numpy import sin, cos 103 | 104 | def f(x): 105 | return np.tan(x) 106 | 107 | def g(x): 108 | return cos(f(x)) 109 | 110 | def g(x): 111 | return sin(f(x)) 112 | 113 | def c(x): 114 | return 0 115 | 116 | assert g(0) == 1 117 | ``` 118 | """ 119 | assert ( 120 | sanitize(icode, "g") 121 | == r"""import numpy as np 122 | from numpy import sin, cos 123 | def f(x): 124 | return np.tan(x) 125 | def g(x): 126 | return 
cos(f(x))""" 127 | ) 128 | 129 | 130 | def test_entrypoint_chain(): 131 | icode = r"""Following is the code snippet: 132 | ```python 133 | import numpy as np 134 | from numpy import sin, cos 135 | 136 | def f(x): 137 | return c(x) 138 | assert f(1) == 5 139 | def g(x): 140 | return cos(f(x)) 141 | 142 | def c(x): 143 | newObj = h() 144 | return x 145 | 146 | class h(): 147 | def hello_world(): 148 | return 0 149 | 150 | class h(): 151 | def goodbye_world(): 152 | return 0 153 | 154 | 155 | assert g(0) == 1 156 | ``` 157 | """ 158 | print(sanitize(icode, "g")) 159 | assert ( 160 | sanitize(icode, "g") 161 | == r"""import numpy as np 162 | from numpy import sin, cos 163 | def f(x): 164 | return c(x) 165 | def g(x): 166 | return cos(f(x)) 167 | def c(x): 168 | newObj = h() 169 | return x 170 | class h(): 171 | def hello_world(): 172 | return 0""" 173 | ) 174 | 175 | 176 | def test_entrypoint_no_chain(): 177 | icode = r"""Following is the code snippet: 178 | ```python 179 | import numpy as np 180 | from numpy import sin, cos, sum 181 | 182 | def f(x): 183 | return np.sum(x) 184 | assert f(1) == 5 185 | def g(x): 186 | return cos(f(x)) 187 | 188 | def c(x): 189 | newObj = h() 190 | return x 191 | 192 | class h(): 193 | def hello_world(): 194 | return 0 195 | 196 | 197 | assert g(0) == 1 198 | ``` 199 | """ 200 | assert ( 201 | sanitize(icode, "g") 202 | == r"""import numpy as np 203 | from numpy import sin, cos, sum 204 | def f(x): 205 | return np.sum(x) 206 | def g(x): 207 | return cos(f(x))""" 208 | ) 209 | 210 | 211 | def test_entrypoint_variable(): 212 | icode = r"""Following is the code snippet: 213 | ```python 214 | import numpy as np 215 | from numpy import sin, cos 216 | 217 | SOME_CONSTANT = 5 218 | 219 | def f(x): 220 | return c(x) 221 | assert f(1) == 5 222 | def g(x): 223 | return cos(f(x)) 224 | 225 | def c(x): 226 | newObj = h() 227 | return x 228 | 229 | class h(): 230 | def hello_world(): 231 | return SOME_CONSTANT 232 | 233 | def d(x): 234 | return g(x) 235 | 236 | 237 | assert g(0) == 1 238 | ``` 239 | """ 240 | 241 | assert ( 242 | sanitize(icode, "g") 243 | == r"""import numpy as np 244 | from numpy import sin, cos 245 | SOME_CONSTANT = 5 246 | def f(x): 247 | return c(x) 248 | def g(x): 249 | return cos(f(x)) 250 | def c(x): 251 | newObj = h() 252 | return x 253 | class h(): 254 | def hello_world(): 255 | return SOME_CONSTANT""" 256 | ) 257 | -------------------------------------------------------------------------------- /tools/fix_v019.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, DatasetDict 2 | from huggingface_hub import HfApi 3 | 4 | import json 5 | import copy 6 | 7 | BIGCODEBENCH_HF = "bigcode/bigcodebench" 8 | BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard" 9 | BIGCODEBENCH_VERSION = "v0.1.0_hf" 10 | BIGCODEBENCH_UPDATE = "bigcode/bcb_update" 11 | BIGCODEBENCH_NEW_VERSION = "v0.1.1" 12 | 13 | def map_ds(sample): 14 | 15 | if sample["task_id"] in ["BigCodeBench/1006"]: 16 | sample["test"] = sample["test"].replace( 17 | '''\ 18 | def test_valid_zip_url(self): 19 | """Test a valid ZIP URL.""" 20 | url = "https://getsamplefiles.com/download/zip/sample-1.zip" 21 | result = task_func(url) 22 | self.assertTrue(result.startswith("mnt/data/downloads/")) 23 | self.assertTrue(result.endswith("sample-1")) 24 | shutil.rmtree("mnt/data/downloads") 25 | ''', 26 | '''\ 27 | @patch("requests.get") 28 | def test_non_zip_content(self, mock_get): 29 | """Test a valid ZIP URL.""" 30 | 
mock_get.return_value.status_code = 200 31 | mock_get.return_value.headers = {"Content-Type": "application/zip"} 32 | mock_get.return_value.content = b"1" 33 | url = "https://valid-url.com/sample.zip" 34 | result = task_func(url) 35 | ''', 36 | ) 37 | 38 | if sample["task_id"] in ["BigCodeBench/760"]: 39 | for k in sample.keys(): 40 | if "prompt" in k: 41 | sample[k] = sample[k].replace( 42 | "from datetime import datetime", 43 | "import datetime" 44 | ) 45 | 46 | if sample["task_id"] in ["BigCodeBench/178"]: 47 | for k in sample.keys(): 48 | sample[k] = sample[k].replace( 49 | "from urllib import request\n", 50 | "" 51 | ) 52 | sample[k] = sample[k].replace( 53 | " - urllib.request\n", 54 | "" 55 | ) 56 | 57 | return sample 58 | 59 | if __name__ == "__main__": 60 | api = HfApi() 61 | ds_dict = load_dataset(BIGCODEBENCH_HF) 62 | hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF) 63 | ds = ds_dict[BIGCODEBENCH_VERSION] 64 | hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION] 65 | function_id = [178, 760, 1006] 66 | 67 | new_ds = ds.map(map_ds) 68 | new_ds.to_json("BigCodeBench.jsonl") 69 | ds_dict[BIGCODEBENCH_NEW_VERSION] = new_ds 70 | ds_dict.push_to_hub(BIGCODEBENCH_HF) 71 | 72 | new_hard_ds = hard_ds.map(map_ds) 73 | new_hard_ds.to_json("BigCodeBench-Hard.jsonl") 74 | hard_ds_dict[BIGCODEBENCH_NEW_VERSION] = new_hard_ds 75 | hard_ds_dict.push_to_hub(BIGCODEBENCH_HARD_HF) 76 | 77 | for i in function_id: 78 | old_sample = ds.select([i]) 79 | new_sample = new_ds.select([i]) 80 | old_sample.to_json("old.jsonl") 81 | new_sample.to_json("new.jsonl") 82 | api.upload_file( 83 | path_or_fileobj="old.jsonl", 84 | path_in_repo=f"{i}/old.jsonl", 85 | repo_id=BIGCODEBENCH_UPDATE, 86 | # repo_type="dataset" 87 | ) 88 | api.upload_file( 89 | path_or_fileobj="new.jsonl", 90 | path_in_repo=f"{i}/new.jsonl", 91 | repo_id=BIGCODEBENCH_UPDATE, 92 | # repo_type="dataset" 93 | ) 94 | 95 | -------------------------------------------------------------------------------- /tools/fix_v020.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, DatasetDict 2 | from huggingface_hub import HfApi 3 | 4 | import json 5 | import copy 6 | 7 | BIGCODEBENCH_HF = "bigcode/bigcodebench" 8 | BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard" 9 | BIGCODEBENCH_VERSION = "v0.1.1" 10 | BIGCODEBENCH_UPDATE = "bigcode/bcb_update" 11 | BIGCODEBENCH_NEW_VERSION = "v0.1.2" 12 | 13 | def map_ds(sample): 14 | if sample["task_id"] in ["BigCodeBench/16"]: 15 | for k in sample.keys(): 16 | sample[k] = sample[k].replace( 17 | "No logs found to backup.", "No logs found to backup" 18 | ) 19 | 20 | if sample["task_id"] in ["BigCodeBench/37"]: 21 | for k in sample.keys(): 22 | if "prompt" in k: 23 | sample[k] = "import pandas as pd\n" + sample[k] 24 | sample[k] = sample[k].replace( 25 | "Requirements:\n - sklearn.ensemble\n", 26 | "Requirements:\n - pandas\n - sklearn.ensemble\n" 27 | ) 28 | 29 | if sample["task_id"] in ["BigCodeBench/241"]: 30 | for k in sample.keys(): 31 | if "prompt" in k: 32 | sample[k] = sample[k].replace( 33 | "The function will plot the original and normalized arrays using matplotlib.", 34 | "The function will plot the original and normalized arrays with a title of 'Original vs. Normalized Data'." 
35 | ) 36 | 37 | if sample["task_id"] in ["BigCodeBench/267"]: 38 | for k in sample.keys(): 39 | if "prompt" in k: 40 | sample[k] = sample[k].replace( 41 | "Plots and returns the FFT of the signal.", 42 | "Plots and returns the FFT of the signal with a title of 'FFT of the signal'." 43 | ) 44 | 45 | return sample 46 | 47 | if __name__ == "__main__": 48 | api = HfApi() 49 | ds_dict = load_dataset(BIGCODEBENCH_HF) 50 | hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF) 51 | ds = ds_dict[BIGCODEBENCH_VERSION] 52 | hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION] 53 | function_id = [16, 37, 241, 267] 54 | 55 | new_ds = ds.map(map_ds) 56 | new_ds.to_json("BigCodeBench.jsonl") 57 | ds_dict[BIGCODEBENCH_NEW_VERSION] = new_ds 58 | ds_dict.push_to_hub(BIGCODEBENCH_HF) 59 | 60 | new_hard_ds = hard_ds.map(map_ds) 61 | new_hard_ds.to_json("BigCodeBench-Hard.jsonl") 62 | hard_ds_dict[BIGCODEBENCH_NEW_VERSION] = new_hard_ds 63 | hard_ds_dict.push_to_hub(BIGCODEBENCH_HARD_HF) 64 | 65 | for i in function_id: 66 | old_sample = ds.select([i]) 67 | new_sample = new_ds.select([i]) 68 | old_sample.to_json("old.jsonl") 69 | new_sample.to_json("new.jsonl") 70 | api.upload_file( 71 | path_or_fileobj="old.jsonl", 72 | path_in_repo=f"{i}/old.jsonl", 73 | repo_id=BIGCODEBENCH_UPDATE, 74 | # repo_type="dataset" 75 | ) 76 | api.upload_file( 77 | path_or_fileobj="new.jsonl", 78 | path_in_repo=f"{i}/new.jsonl", 79 | repo_id=BIGCODEBENCH_UPDATE, 80 | # repo_type="dataset" 81 | ) 82 | -------------------------------------------------------------------------------- /tools/fix_v022.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, DatasetDict 2 | from huggingface_hub import HfApi 3 | 4 | import json 5 | import copy 6 | 7 | BIGCODEBENCH_HF = "bigcode/bigcodebench" 8 | BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard" 9 | BIGCODEBENCH_VERSION = "v0.1.2" 10 | BIGCODEBENCH_UPDATE = "bigcode/bcb_update" 11 | BIGCODEBENCH_NEW_VERSION = "v0.1.3" 12 | 13 | def map_ds(sample): 14 | if sample["task_id"] in ["BigCodeBench/1005"]: 15 | for k in sample.keys(): 16 | sample[k] = sample[k].replace( 17 | "https://getsamplefiles.com/download/zip/sample-2.zip", "https://getsamplefiles.com/download/zip/sample-5.zip" 18 | ).replace( 19 | "sample_2", "sample_5" 20 | ).replace( 21 | "Sample 2", "Sample 5" 22 | ) 23 | return sample 24 | 25 | if __name__ == "__main__": 26 | api = HfApi() 27 | ds_dict = load_dataset(BIGCODEBENCH_HF) 28 | hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF) 29 | ds = ds_dict[BIGCODEBENCH_VERSION] 30 | hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION] 31 | function_id = [1005] 32 | 33 | new_ds = ds.map(map_ds) 34 | new_ds.to_json("BigCodeBench.jsonl") 35 | ds_dict[BIGCODEBENCH_NEW_VERSION] = new_ds 36 | ds_dict.push_to_hub(BIGCODEBENCH_HF) 37 | 38 | new_hard_ds = hard_ds.map(map_ds) 39 | new_hard_ds.to_json("BigCodeBench-Hard.jsonl") 40 | hard_ds_dict[BIGCODEBENCH_NEW_VERSION] = new_hard_ds 41 | hard_ds_dict.push_to_hub(BIGCODEBENCH_HARD_HF) 42 | 43 | for i in function_id: 44 | old_sample = ds.select([i]) 45 | new_sample = new_ds.select([i]) 46 | old_sample.to_json("old.jsonl") 47 | new_sample.to_json("new.jsonl") 48 | api.upload_file( 49 | path_or_fileobj="old.jsonl", 50 | path_in_repo=f"{i}/old.jsonl", 51 | repo_id=BIGCODEBENCH_UPDATE, 52 | # repo_type="dataset" 53 | ) 54 | api.upload_file( 55 | path_or_fileobj="new.jsonl", 56 | path_in_repo=f"{i}/new.jsonl", 57 | repo_id=BIGCODEBENCH_UPDATE, 58 | # repo_type="dataset" 59 | ) 60 | 
-------------------------------------------------------------------------------- /tools/fix_v023.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, DatasetDict 2 | from huggingface_hub import HfApi 3 | 4 | import json 5 | import copy 6 | 7 | BIGCODEBENCH_HF = "bigcode/bigcodebench" 8 | BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard" 9 | BIGCODEBENCH_VERSION = "v0.1.3" 10 | BIGCODEBENCH_UPDATE = "bigcode/bcb_update" 11 | BIGCODEBENCH_NEW_VERSION = "v0.1.4" 12 | 13 | def map_ds(sample): 14 | if sample["task_id"] in ["BigCodeBench/211"]: 15 | sample['test'] = sample['test'].replace( 16 | """ 17 | mock_response = MagicMock() 18 | mock_response.content = MOCK_CONTENT 19 | """, 20 | """ 21 | mock_response = MagicMock() 22 | mock_response.content = MOCK_CONTENT 23 | mock_response.status_code = 200 24 | """ 25 | ) 26 | if sample["task_id"] in ["BigCodeBench/215"]: 27 | sample['test'] = sample['test'].replace( 28 | """ 29 | mock_response = Mock() 30 | """, 31 | """ 32 | mock_response = Mock() 33 | mock_response.status_code = 200 34 | """ 35 | ) 36 | sample['test'] = sample['test'].replace( 37 | """ 38 | mock_response.text =""", 39 | """ 40 | MOCK_TEXT =""" 41 | ) 42 | sample['test'] = sample['test'].replace( 43 | """ 44 | mock_get.return_value = mock_response 45 | """, 46 | """ 47 | mock_response.text = MOCK_TEXT 48 | mock_response.json = lambda: json.loads(MOCK_TEXT) 49 | mock_get.return_value = mock_response 50 | """ 51 | ) 52 | sample['complete_prompt'] = sample['complete_prompt'].replace("Thif function will raise", "This function will raise") 53 | sample['instruct_prompt'] = sample['instruct_prompt'].replace("Thif function will raise", "This function will raise") 54 | sample['doc_struct'] = sample['doc_struct'].replace("Thif function will raise", "This function will raise") 55 | return sample 56 | 57 | if __name__ == "__main__": 58 | api = HfApi() 59 | ds_dict = load_dataset(BIGCODEBENCH_HF) 60 | hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF) 61 | ds = ds_dict[BIGCODEBENCH_VERSION] 62 | hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION] 63 | function_id = [211, 215] 64 | 65 | new_ds = ds.map(map_ds) 66 | new_ds.to_json("BigCodeBench.jsonl") 67 | ds_dict[BIGCODEBENCH_NEW_VERSION] = new_ds 68 | ds_dict.push_to_hub(BIGCODEBENCH_HF) 69 | 70 | new_hard_ds = hard_ds.map(map_ds) 71 | new_hard_ds.to_json("BigCodeBench-Hard.jsonl") 72 | hard_ds_dict[BIGCODEBENCH_NEW_VERSION] = new_hard_ds 73 | hard_ds_dict.push_to_hub(BIGCODEBENCH_HARD_HF) 74 | 75 | for i in function_id: 76 | old_sample = ds.select([i]) 77 | new_sample = new_ds.select([i]) 78 | old_sample.to_json("old.jsonl") 79 | new_sample.to_json("new.jsonl") 80 | api.upload_file( 81 | path_or_fileobj="old.jsonl", 82 | path_in_repo=f"{i}/old.jsonl", 83 | repo_id=BIGCODEBENCH_UPDATE, 84 | # repo_type="dataset" 85 | ) 86 | api.upload_file( 87 | path_or_fileobj="new.jsonl", 88 | path_in_repo=f"{i}/new.jsonl", 89 | repo_id=BIGCODEBENCH_UPDATE, 90 | # repo_type="dataset" 91 | ) 92 | -------------------------------------------------------------------------------- /tools/fix_v025.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from huggingface_hub import HfApi 3 | 4 | BIGCODEBENCH_HF = "bigcode/bigcodebench" 5 | BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard" 6 | BIGCODEBENCH_VERSION = "v0.1.4" 7 | BIGCODEBENCH_UPDATE = "bigcode/bcb_update" 8 | BIGCODEBENCH_NEW_VERSION = "v0.1.5" 9 | 10 | def 
map_ds(sample): 11 | if sample["task_id"] in ["BigCodeBench/332"]: 12 | sample['code_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['code_prompt'] 13 | sample['complete_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt'] 14 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 15 | "\nYou should write self-contained code starting with:\n```\n", 16 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n" 17 | ) 18 | 19 | if sample["task_id"] in ["BigCodeBench/334"]: 20 | sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt'] 21 | sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt'] 22 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 23 | "\nYou should write self-contained code starting with:\n```\n", 24 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n" 25 | ) 26 | 27 | if sample["task_id"] in ["BigCodeBench/376"]: 28 | sample['code_prompt'] = sample['code_prompt'].replace( 29 | "import nltk\n", 30 | "import nltk\nnltk.download('stopwords')\n", 31 | 1 32 | ) 33 | sample['complete_prompt'] = sample['complete_prompt'].replace( 34 | "import nltk\n", 35 | "import nltk\nnltk.download('stopwords')\n", 36 | 1 37 | ) 38 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 39 | "\nYou should write self-contained code starting with:\n```\nimport nltk\n", 40 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n" 41 | ) 42 | 43 | if sample["task_id"] in ["BigCodeBench/383"]: 44 | sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt'] 45 | sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt'] 46 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 47 | "\nYou should write self-contained code starting with:\n```\n", 48 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n" 49 | ) 50 | 51 | if sample["task_id"] in ["BigCodeBench/633"]: 52 | sample['code_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['code_prompt'] 53 | sample['complete_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt'] 54 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 55 | "\nYou should write self-contained code starting with:\n```\n", 56 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n" 57 | ) 58 | 59 | if sample["task_id"] in ["BigCodeBench/635"]: 60 | sample['code_prompt'] = sample['code_prompt'].replace( 61 | "# Importing the required libraries", 62 | "# Importing the required libraries\nimport nltk\nnltk.download('stopwords')\n" 63 | ) 64 | 65 | sample['complete_prompt'] = sample['complete_prompt'].replace( 66 | "# Importing the required libraries", 67 | "# Importing the required libraries\nimport nltk\nnltk.download('stopwords')\n" 68 | ) 69 | 70 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 71 | "\nYou should write self-contained code starting with:\n```\n", 72 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n" 73 | ) 74 | 75 | if sample["task_id"] in ["BigCodeBench/849"]: 76 | sample['code_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['code_prompt'] 77 | sample['complete_prompt'] 
= "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt'] 78 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 79 | "\nYou should write self-contained code starting with:\n```\n", 80 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n" 81 | ) 82 | 83 | if sample["task_id"] in ["BigCodeBench/940"]: 84 | sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt'] 85 | sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt'] 86 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 87 | "\nYou should write self-contained code starting with:\n```\n", 88 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n" 89 | ) 90 | 91 | if sample["task_id"] in ["BigCodeBench/1109"]: 92 | sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt'] 93 | sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt'] 94 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 95 | "\nYou should write self-contained code starting with:\n```\n", 96 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n" 97 | ) 98 | 99 | return sample 100 | 101 | if __name__ == "__main__": 102 | api = HfApi() 103 | ds_dict = load_dataset(BIGCODEBENCH_HF) 104 | hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF) 105 | ds = ds_dict[BIGCODEBENCH_VERSION] 106 | hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION] 107 | function_id = [332, 334, 376, 383, 633, 635, 849, 940, 1109] 108 | 109 | new_ds = ds.map(map_ds) 110 | new_ds.to_json("BigCodeBench.jsonl") 111 | ds_dict[BIGCODEBENCH_NEW_VERSION] = new_ds 112 | ds_dict.push_to_hub(BIGCODEBENCH_HF) 113 | 114 | new_hard_ds = hard_ds.map(map_ds) 115 | new_hard_ds.to_json("BigCodeBench-Hard.jsonl") 116 | hard_ds_dict[BIGCODEBENCH_NEW_VERSION] = new_hard_ds 117 | hard_ds_dict.push_to_hub(BIGCODEBENCH_HARD_HF) 118 | 119 | for i in function_id: 120 | old_sample = ds.select([i]) 121 | new_sample = new_ds.select([i]) 122 | old_sample.to_json("old.jsonl") 123 | new_sample.to_json("new.jsonl") 124 | api.upload_file( 125 | path_or_fileobj="old.jsonl", 126 | path_in_repo=f"{i}/old.jsonl", 127 | repo_id=BIGCODEBENCH_UPDATE, 128 | # repo_type="dataset" 129 | ) 130 | api.upload_file( 131 | path_or_fileobj="new.jsonl", 132 | path_in_repo=f"{i}/new.jsonl", 133 | repo_id=BIGCODEBENCH_UPDATE, 134 | # repo_type="dataset" 135 | ) --------------------------------------------------------------------------------