├── .dockerignore ├── .github └── ISSUE_TEMPLATE │ ├── buggy_contract.yml │ ├── buggy_test.yml │ ├── config.yml │ └── model_eval_request.yml ├── .gitignore ├── .pre-commit-config.yaml ├── ADVANCED_USAGE.md ├── CITATION.cff ├── Docker ├── Evaluate.Dockerfile ├── Generate.Dockerfile └── Gradio.Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── Requirements ├── requirements-eval.txt └── requirements.txt ├── analysis ├── bcb_subset.py ├── get_results.py ├── lib2domain.json ├── task2domain.json └── utils.py ├── bigcodebench ├── __init__.py ├── data │ ├── __init__.py │ ├── bigcodebench.py │ └── utils.py ├── eval │ ├── __init__.py │ ├── _special_oracle.py │ └── utils.py ├── evaluate.py ├── gen │ ├── __init__.py │ └── util │ │ ├── __init__.py │ │ ├── anthropic_request.py │ │ ├── google_request.py │ │ ├── hf_inference_request.py │ │ ├── mistral_request.py │ │ └── openai_request.py ├── generate.py ├── inspect.py ├── provider │ ├── __init__.py │ ├── anthropic.py │ ├── base.py │ ├── google.py │ ├── hf.py │ ├── hf_inference.py │ ├── mistral.py │ ├── openai.py │ ├── utility.py │ └── vllm.py ├── sanitize.py └── syncheck.py ├── decontamination ├── n_gram_check.py ├── odex_10_overlap.txt ├── odex_13_overlap.txt ├── stackoverflow_10_overlap.txt ├── stackoverflow_13_overlap.txt └── starcoderdata_10_overlap.txt ├── pyproject.toml ├── release.sh ├── release_docker.sh ├── run.sh ├── sandbox-templates ├── e2b.Dockerfile └── e2b.toml ├── setup.cfg ├── tests ├── requirements.txt ├── test_legacy_sanitizer.py └── test_treesitter_sanitizer.py └── tools ├── fix_v019.py ├── fix_v020.py ├── fix_v022.py ├── fix_v023.py └── fix_v025.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | # nuclear option because steven uses PyCharm. 161 | .idea/ 162 | 163 | # VSCode 164 | .vscode/ 165 | backup/ 166 | passrate.p* 167 | min_cov_dir/ 168 | bigcodebench/_version.py 169 | inspect/ 170 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/buggy_contract.yml: -------------------------------------------------------------------------------- 1 | name: "🐛 Report Bad Task" 2 | description: Report to us that certain programming task should be repaired. 3 | title: "🐛 [TaskRemoval/TaskRepair] - " 4 | labels: ["programming task"] 5 | body: 6 | - type: input 7 | id: version 8 | attributes: 9 | label: "BigCodeBench version" 10 | description: What is the version of BigCodeBench? You can find it by running `pip show bigcodebench`. 11 | placeholder: For example, 0.1.5 12 | validations: 13 | required: true 14 | - type: input 15 | id: cache 16 | attributes: 17 | label: "Output of running `ls ~/.cache/bigcodebench`" 18 | validations: 19 | required: true 20 | - type: input 21 | id: task_id 22 | attributes: 23 | label: "Task ID of the programming task" 24 | placeholder: BigCodeBench/[??] 
25 | validations: 26 | required: true 27 | - type: textarea 28 | id: original 29 | attributes: 30 | label: "The original complete prompt" 31 | description: You can run `python -c "from bigcodebench.data import get_bigcodebench; print(get_bigcodebench()['BigCodeBench/❓']['complete_prompt'])"` 32 | render: python 33 | validations: 34 | required: true 35 | - type: textarea 36 | id: new 37 | attributes: 38 | label: "Your proposed new complete prompt" 39 | render: python 40 | validations: 41 | required: true 42 | - type: textarea 43 | id: other 44 | attributes: 45 | label: "Other context" 46 | description: (Optional) Anything else the maintainer should notice? 47 | validations: 48 | required: false 49 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/buggy_test.yml: -------------------------------------------------------------------------------- 1 | name: "🐛 Report Bad Test Inputs" 2 | description: Report to us that certain test inputs should be removed. 3 | title: "🐛 [TestRemoval/TestRepair] - " 4 | labels: ["bug"] 5 | body: 6 | - type: input 7 | id: version 8 | attributes: 9 | label: "BigCodeBench version" 10 | description: What is the version of BigCodeBench? You can find it by running `pip show bigcodebench`. 11 | placeholder: For example, 0.1.0 12 | validations: 13 | required: true 14 | - type: input 15 | id: cache 16 | attributes: 17 | label: "Output of running `ls ~/.cache/bigcodebench`" 18 | validations: 19 | required: true 20 | - type: input 21 | id: task_id 22 | attributes: 23 | label: "Task ID of the programming task" 24 | placeholder: BigCodeBench/[??] 25 | validations: 26 | required: true 27 | - type: textarea 28 | id: original 29 | attributes: 30 | label: "The original test" 31 | description: You can run `python -c "from bigcodebench.data import get_bigcodebench; print(get_bigcodebench()['BigCodeBench/❓']['test'])"` 32 | render: python 33 | validations: 34 | required: true 35 | - type: textarea 36 | id: new 37 | attributes: 38 | label: "Your proposed new test" 39 | render: python 40 | validations: 41 | required: true 42 | - type: textarea 43 | id: description 44 | attributes: 45 | label: "Description" 46 | description: An explicit description of why you think this test should be removed 47 | placeholder: Here is a correct solution but it is incorrectly falsified by the test because ... 48 | validations: 49 | required: true 50 | - type: textarea 51 | id: other 52 | attributes: 53 | label: "Other context" 54 | description: (Optional) Anything else the maintainer should notice? 55 | validations: 56 | required: false 57 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/model_eval_request.yml: -------------------------------------------------------------------------------- 1 | name: "🤗 Model Evaluation Request" 2 | description: Request BigCodeBench maintainers to evaluate your model independently and update it on our leaderboard. 3 | title: "🤗 [REQUEST] - " 4 | labels: ["model eval"] 5 | body: 6 | - type: textarea 7 | id: about 8 | attributes: 9 | label: "Model introduction" 10 | description: Provide a brief introduction to the model. 11 | placeholder: The model is created by ... and is used for ...
12 | validations: 13 | required: true 14 | - type: input 15 | id: url 16 | attributes: 17 | label: "Model URL" 18 | description: Indicate the URL (e.g., huggingface or other release pages) of the model 19 | placeholder: https://huggingface.co/[???]/[???] 20 | validations: 21 | required: true 22 | - type: textarea 23 | id: other 24 | attributes: 25 | label: "Additional instructions (Optional)" 26 | description: Special steps indicating how to run the model with preferably scripts/codes. 27 | placeholder: What data type precision should be used? What is the minimal hardware requirement? Can it be accelerated by tools such as vLLM? 28 | validations: 29 | required: false 30 | - type: dropdown 31 | id: author 32 | attributes: 33 | label: "Author" 34 | description: "Are you (one of) the author(s) of the model?" 35 | multiple: false 36 | options: 37 | - "Yes" 38 | - "No" 39 | validations: 40 | required: true 41 | - type: checkboxes 42 | id: security 43 | attributes: 44 | label: "Security" 45 | options: 46 | - label: "I confirm that the model is safe to run which does not contain any malicious code or content." 47 | required: true 48 | - type: checkboxes 49 | id: integrity 50 | attributes: 51 | label: "Integrity" 52 | options: 53 | - label: "I confirm that the model comes from unique and original work and does not contain any plagiarism." 54 | required: true 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 
95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | # nuclear option because steven uses PyCharm. 
161 | .idea/ 162 | 163 | # VSCode 164 | .vscode/ 165 | OpenPlus/ 166 | backup/ 167 | passrate.p* 168 | min_cov_dir/ 169 | bigcodebench/_version.py 170 | *.jsonl 171 | inspect/ 172 | *.zip 173 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pycqa/isort 3 | rev: 5.12.0 4 | hooks: 5 | - id: isort 6 | name: isort (python) 7 | args: ["--profile", "black"] 8 | - repo: https://github.com/psf/black 9 | rev: 22.6.0 10 | hooks: 11 | - id: black 12 | - repo: https://github.com/pre-commit/pre-commit-hooks 13 | rev: v4.3.0 14 | hooks: 15 | - id: check-yaml 16 | - id: end-of-file-fixer 17 | - id: trailing-whitespace 18 | exclude: (?x)^( 19 | groundtruth/.* 20 | )$ 21 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this work and love it, consider citing it as below \U0001F917" 3 | title: BigCodeBench 4 | authors: 5 | - family-names: BigCodeBench Team 6 | url: https://github.com/bigcode-project/bigcodebench 7 | doi: 8 | date-released: 2024-06-18 9 | license: Apache-2.0 10 | preferred-citation: 11 | type: article 12 | title: "Benchmarking Code Generation with Diverse Function Calls and Complex Instructions" 13 | authors: 14 | - family-names: BigCodeBench Team 15 | year: 2024 16 | journal: 17 | doi: 18 | url: -------------------------------------------------------------------------------- /Docker/Evaluate.Dockerfile: -------------------------------------------------------------------------------- 1 | # Better use newer Python as generated code can use new features 2 | FROM python:3.10-slim 3 | 4 | # install git, g++ and python3-tk 5 | RUN apt-get update && apt-get install -y \ 6 | git \ 7 | g++ \ 8 | python3-tk \ 9 | zip \ 10 | unzip \ 11 | procps \ 12 | r-base \ 13 | libgdal-dev \ 14 | # Add these new dependencies for matplotlib 15 | libfreetype6-dev \ 16 | libpng-dev \ 17 | pkg-config \ 18 | python3-dev \ 19 | python3-matplotlib \ 20 | && rm -rf /var/lib/apt/lists/* 21 | 22 | # upgrade to latest pip 23 | RUN pip install --upgrade pip 24 | 25 | # Add a new user "bigcodebenchuser" 26 | RUN adduser --disabled-password --gecos "" bigcodebenchuser 27 | 28 | RUN rm -rf /bigcodebench 29 | 30 | # Acquire benchmark code to local 31 | ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit 32 | RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench 33 | 34 | RUN pip install numpy==1.24.3 pyarrow==14.0.1 35 | 36 | RUN cd /bigcodebench && \ 37 | pip install . 
--no-deps 38 | 39 | RUN pip install \ 40 | appdirs \ 41 | fire \ 42 | multipledispatch \ 43 | pqdm \ 44 | tempdir \ 45 | termcolor \ 46 | tqdm \ 47 | tree_sitter \ 48 | tree-sitter-python \ 49 | wget \ 50 | transformers \ 51 | datasets \ 52 | gradio-client \ 53 | numpy \ 54 | rich \ 55 | accelerate \ 56 | anthropic \ 57 | google-genai \ 58 | mistralai \ 59 | openai \ 60 | e2b 61 | 62 | RUN pip install -I --timeout 2000 -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/refs/heads/main/Requirements/requirements-eval.txt 63 | 64 | # Ensure the numpy version is compatible with the datasets version 65 | RUN pip install datasets==2.17.0 66 | 67 | # Pre-install the dataset 68 | RUN python3 -c "from bigcodebench.data import get_bigcodebench; get_bigcodebench(subset='full'); get_bigcodebench(subset='hard')" 69 | 70 | WORKDIR /app 71 | 72 | RUN chown -R bigcodebenchuser:bigcodebenchuser /app 73 | 74 | RUN chmod -R 777 /app 75 | 76 | USER bigcodebenchuser 77 | 78 | ENTRYPOINT ["python3", "-m", "bigcodebench.evaluate"] 79 | 80 | CMD ["sh", "-c", "pids=$(ps -u $(id -u) -o pid,comm | grep 'bigcodebench' | awk '{print $1}'); if [ -n \"$pids\" ]; then echo $pids | xargs -r kill; fi; rm -rf /tmp/*"] -------------------------------------------------------------------------------- /Docker/Generate.Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 2 | 3 | SHELL ["/bin/bash", "-c"] 4 | 5 | # Setup Environment Variables 6 | ENV CUDA_HOME=/usr/local/cuda \ 7 | PYTHONUNBUFFERED=1 \ 8 | TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX" 9 | 10 | # Setup System Utilities 11 | RUN apt-get update --yes --quiet \ 12 | && apt-get upgrade --yes --quiet \ 13 | && DEBIAN_FRONTEND=noninteractive apt-get install --yes --quiet --no-install-recommends \ 14 | apt-utils \ 15 | autoconf \ 16 | automake \ 17 | bc \ 18 | build-essential \ 19 | ca-certificates \ 20 | check \ 21 | cmake \ 22 | curl \ 23 | dmidecode \ 24 | emacs \ 25 | g++\ 26 | gcc \ 27 | git \ 28 | iproute2 \ 29 | jq \ 30 | kmod \ 31 | libaio-dev \ 32 | libcurl4-openssl-dev \ 33 | libgl1-mesa-glx \ 34 | libglib2.0-0 \ 35 | libgomp1 \ 36 | libibverbs-dev \ 37 | libnuma-dev \ 38 | libnuma1 \ 39 | libomp-dev \ 40 | libsm6 \ 41 | libssl-dev \ 42 | libsubunit-dev \ 43 | libsubunit0 \ 44 | libtool \ 45 | libxext6 \ 46 | libxrender-dev \ 47 | make \ 48 | moreutils \ 49 | net-tools \ 50 | ninja-build \ 51 | openssh-client \ 52 | openssh-server \ 53 | openssl \ 54 | pkg-config \ 55 | python3-dev \ 56 | software-properties-common \ 57 | sudo \ 58 | unzip \ 59 | util-linux \ 60 | vim \ 61 | wget \ 62 | zlib1g-dev \ 63 | && apt-get autoremove \ 64 | && apt-get clean \ 65 | && rm -rf /var/lib/apt/lists/ 66 | 67 | # Setup base Python to bootstrap Mamba 68 | RUN add-apt-repository --yes ppa:deadsnakes/ppa \ 69 | && apt-get update --yes --quiet 70 | RUN DEBIAN_FRONTEND=noninteractive apt-get install --yes --quiet --no-install-recommends \ 71 | python3.11 \ 72 | python3.11-dev \ 73 | python3.11-distutils \ 74 | python3.11-lib2to3 \ 75 | python3.11-gdbm \ 76 | python3.11-tk \ 77 | pip 78 | RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 999 \ 79 | && update-alternatives --config python3 \ 80 | && ln -s /usr/bin/python3 /usr/bin/python 81 | RUN pip install --upgrade pip 82 | 83 | # Setup optimized Mamba environment with required PyTorch dependencies 84 | RUN wget -O /tmp/Miniforge.sh 
https://github.com/conda-forge/miniforge/releases/download/24.3.0-0/Mambaforge-24.3.0-0-Linux-x86_64.sh \ 85 | && bash /tmp/Miniforge.sh -b -p /Miniforge \ 86 | && echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12.1/compat/" >> /Miniforge/etc/profile.d/mamba.sh \ 87 | && source /Miniforge/etc/profile.d/conda.sh \ 88 | && source /Miniforge/etc/profile.d/mamba.sh \ 89 | && mamba update -y -q -n base -c defaults mamba \ 90 | && mamba create -y -q -n BigCodeBench python=3.11 setuptools=69.5.1 \ 91 | && mamba activate BigCodeBench \ 92 | && mamba install -y -q -c conda-forge \ 93 | charset-normalizer \ 94 | gputil \ 95 | ipython \ 96 | numpy \ 97 | pandas \ 98 | scikit-learn \ 99 | wandb \ 100 | && mamba install -y -q -c intel \ 101 | "mkl==2023" \ 102 | "mkl-static==2023" \ 103 | "mkl-include==2023" \ 104 | && mamba install -y -q -c pytorch magma-cuda121 \ 105 | && mamba clean -a -f -y 106 | 107 | # Install VLLM precompiled with appropriate CUDA and ensure PyTorch is installed form the same version channel 108 | RUN source /Miniforge/etc/profile.d/conda.sh \ 109 | && source /Miniforge/etc/profile.d/mamba.sh \ 110 | && mamba activate BigCodeBench 111 | 112 | RUN rm -rf /bigcodebench 113 | 114 | # Acquire benchmark code to local 115 | ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit 116 | RUN git clone https://github.com/bigcode-project/BigCodeBench.git /bigcodebench 117 | 118 | # Install BigCodeBench and pre-load the dataset 119 | RUN source /Miniforge/etc/profile.d/conda.sh \ 120 | && source /Miniforge/etc/profile.d/mamba.sh \ 121 | && mamba activate BigCodeBench \ 122 | && cd /bigcodebench && pip install .[generate] \ 123 | && python -c "from bigcodebench.data import get_bigcodebench; get_bigcodebench()" \ 124 | && export MAX_JOBS=$(($(nproc) - 2)) \ 125 | && pip install --no-cache-dir ninja packaging psutil \ 126 | && pip install flash-attn==2.5.8 --no-build-isolation 127 | 128 | WORKDIR /app 129 | 130 | ENTRYPOINT ["/Miniforge/envs/BigCodeBench/bin/python", "-m", "bigcodebench.generate"] -------------------------------------------------------------------------------- /Docker/Gradio.Dockerfile: -------------------------------------------------------------------------------- 1 | # Better use newer Python as generated code can use new features 2 | FROM python:3.10-slim 3 | 4 | # install git, g++ and python3-tk 5 | RUN apt-get update && apt-get install -y \ 6 | git \ 7 | g++ \ 8 | python3-tk \ 9 | zip \ 10 | unzip \ 11 | procps \ 12 | r-base \ 13 | libgdal-dev \ 14 | # Add these new dependencies for matplotlib 15 | libfreetype6-dev \ 16 | libpng-dev \ 17 | pkg-config \ 18 | python3-dev \ 19 | python3-matplotlib \ 20 | && rm -rf /var/lib/apt/lists/* 21 | # upgrade to latest pip 22 | RUN pip install --upgrade pip 23 | 24 | RUN pip install APScheduler==3.10.1 black==23.11.0 click==8.1.3 huggingface-hub>=0.18.0 plotly python-dateutil==2.8.2 gradio-space-ci@git+https://huggingface.co/spaces/Wauplin/gradio-space-ci@0.2.3 isort ruff gradio[oauth] schedule==1.2.2 25 | 26 | # Add a new user "bigcodebenchuser" 27 | RUN adduser --disabled-password --gecos "" bigcodebenchuser 28 | 29 | RUN rm -rf /bigcodebench 30 | 31 | # Acquire benchmark code to local 32 | ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit 33 | RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench 34 | 35 | 36 | RUN pip install numpy==1.24.3 pyarrow==14.0.1 37 | 38 | RUN cd /bigcodebench && \ 39 | pip 
install . --no-deps && \ 40 | pip install \ 41 | appdirs>=1.4.4 \ 42 | fire>=0.6.0 \ 43 | multipledispatch>=0.6.0 \ 44 | pqdm>=0.2.0 \ 45 | tempdir>=0.7.1 \ 46 | termcolor>=2.0.0 \ 47 | tqdm>=4.56.0 \ 48 | tree_sitter_languages>=1.10.2 \ 49 | tree-sitter==0.21.3 \ 50 | wget>=3.2 \ 51 | gradio-client \ 52 | rich 53 | 54 | RUN pip install -I --timeout 2000 -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/refs/heads/main/Requirements/requirements-eval.txt 55 | 56 | # Ensure the numpy version is compatible with the datasets version 57 | RUN pip install datasets==2.17.0 58 | 59 | # Pre-install the dataset 60 | RUN python3 -c "from bigcodebench.data import get_bigcodebench; get_bigcodebench(subset='full'); get_bigcodebench(subset='hard')" 61 | 62 | RUN apt-get update && \ 63 | apt-get install -y \ 64 | bash \ 65 | git git-lfs \ 66 | wget curl procps \ 67 | htop vim nano && \ 68 | rm -rf /var/lib/apt/lists/* 69 | 70 | 71 | WORKDIR /app 72 | 73 | RUN chown -R bigcodebenchuser:bigcodebenchuser /app 74 | 75 | RUN chmod -R 777 /app 76 | 77 | USER bigcodebenchuser 78 | 79 | # ENTRYPOINT ["python", "app.py"] 80 | 81 | # CMD ["sh", "-c", "pids=$(ps -u $(id -u) -o pid,comm | grep 'bigcodebench' | awk '{print $1}'); if [ -n \"$pids\" ]; then echo $pids | xargs -r kill; fi; rm -rf /tmp/*"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | ------------------------------------------------------------------------------- 204 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | exclude bigcodebench/_experimental/**/*.py 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BigCodeBench 2 |
3 | BigCodeBench 4 |
18 | 💥 Impact • 19 | 📰 News • 20 | 🔥 Quick Start • 21 | 🚀 Remote Evaluation • 22 | 💻 LLM-generated Code • 23 | 🧑 Advanced Usage • 24 | 📰 Result Submission • 25 | 📜 Citation 26 |

27 | 28 |
29 |

🎉 Check out our latest work!
30 | 🌟 SWE Arena 🌟
31 | 🚀 Open Evaluation Platform on AI for Software Engineering 🚀
32 | ✨ 100% free to use the latest frontier models! ✨

33 |
34 | 35 | ## 💥 Impact 36 | BigCodeBench has been trusted by many LLM teams including: 37 | - Zhipu AI 38 | - Alibaba Qwen 39 | - DeepSeek 40 | - Amazon AWS AI 41 | - Snowflake AI Research 42 | - ServiceNow Research 43 | - Meta AI 44 | - Cohere AI 45 | - Sakana AI 46 | - Allen Institute for Artificial Intelligence (AI2) 47 | 48 | ## 📰 News 49 | - **[2025-01-22]** We are releasing `bigcodebench==v0.2.2.dev2`, with 163 models evaluated! 50 | - **[2024-10-06]** We are releasing `bigcodebench==v0.2.0`! 51 | - **[2024-10-05]** We create a public code execution API on the [Hugging Face space](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator). 52 | - **[2024-10-01]** We have evaluated 139 models on BigCodeBench-Hard so far. Take a look at the [leaderboard](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard)! 53 | - **[2024-08-19]** To make the evaluation fully reproducible, we add a real-time code execution session to the leaderboard. It can be viewed [here](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard). 54 | - **[2024-08-02]** We release `bigcodebench==v0.1.9`. 55 | 56 |
More News :: click to expand :: 57 |
58 | 59 | - **[2024-07-18]** We announce a subset of BigCodeBench, BigCodeBench-Hard, which includes 148 tasks that are more aligned with the real-world programming tasks. The details are available [in this blog post](https://huggingface.co/blog/terryyz/bigcodebench-hard). The dataset is available [here](https://huggingface.co/datasets/bigcode/bigcodebench-hard). The new release is `bigcodebench==v0.1.8`. 60 | - **[2024-06-28]** We release `bigcodebench==v0.1.7`. 61 | - **[2024-06-27]** We release `bigcodebench==v0.1.6`. 62 | - **[2024-06-19]** We start the Hugging Face BigCodeBench Leaderboard! The leaderboard is available [here](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard). 63 | - **[2024-06-18]** We release BigCodeBench, a new benchmark for code generation with 1140 software-engineering-oriented programming tasks. Preprint is available [here](https://arxiv.org/abs/2406.15877). PyPI package is available [here](https://pypi.org/project/bigcodebench/) with the version `0.1.5`. 64 | 65 |
66 |
67 | 68 | ## 🌸 About 69 | 70 | ### BigCodeBench 71 | 72 | BigCodeBench is an **_easy-to-use_** benchmark for solving **_practical_** and **_challenging_** tasks via code. It aims to evaluate the true programming capabilities of large language models (LLMs) in a more realistic setting. The benchmark is designed for HumanEval-like function-level code generation tasks, but with much more complex instructions and diverse function calls. 73 | 74 | There are two splits in BigCodeBench: 75 | - `Complete`: This split is designed for code completion based on comprehensive docstrings. 76 | - `Instruct`: This split works for instruction-tuned and chat models only, where the models are asked to generate a code snippet based on natural language instructions. The instructions only contain the necessary information and require more complex reasoning. 77 | 78 | ### Why BigCodeBench? 79 | 80 | BigCodeBench focuses on task automation via code generation with *diverse function calls* and *complex instructions*, with: 81 | 82 | * ✨ **Precise evaluation & ranking**: See [our leaderboard](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard) for the latest LLM rankings before & after rigorous evaluation. 83 | * ✨ **Pre-generated samples**: BigCodeBench accelerates code intelligence research by open-sourcing [LLM-generated samples](#-LLM-generated-code) for various models -- no need to re-run the expensive benchmarks! 84 | 85 | ## 🔥 Quick Start 86 | 87 | To get started, please first set up the environment: 88 | 89 | ```bash 90 | # By default, you will use the remote evaluation API to execute the output samples. 91 | pip install bigcodebench --upgrade 92 | 93 | # We suggest using `flash-attn` for generating code samples. 94 | pip install packaging ninja 95 | pip install flash-attn --no-build-isolation 96 | # Note: if you have installation problems, consider using pre-built 97 | # wheels from https://github.com/Dao-AILab/flash-attention/releases 98 | ``` 99 | 100 |
⏬ Install nightly version :: click to expand :: 101 |
102 | 103 | ```bash 104 | # Install to use bigcodebench.generate 105 | pip install "git+https://github.com/bigcode-project/bigcodebench.git" --upgrade 106 | ``` 107 | 108 |
109 |
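If you only want to browse the benchmark tasks locally, you can load them with the same helper that the Docker images use to pre-download the dataset. The snippet below is a minimal sketch, not part of the official CLI: it assumes `get_bigcodebench(subset=...)` returns a mapping from task IDs (e.g. `BigCodeBench/0`) to task records with fields such as `complete_prompt`, `instruct_prompt`, `test`, `entry_point`, and `libs` (the field names used elsewhere in this repository), with the data cached under `~/.cache/bigcodebench`.

```python
# Minimal sketch: load BigCodeBench locally and inspect a single task.
# Assumes the data helpers from the installed `bigcodebench` package.
from bigcodebench.data import get_bigcodebench

# "full" has 1140 tasks, "hard" has 148; the first call downloads and caches the data.
tasks = get_bigcodebench(subset="full")

task = tasks["BigCodeBench/0"]   # task IDs follow the BigCodeBench/<n> pattern
print(task["complete_prompt"])   # docstring-style prompt used by the Complete split
print(task["instruct_prompt"])   # natural-language prompt used by the Instruct split
print(task["entry_point"])       # name of the function the unit tests call
print(task["libs"])              # libraries exercised by the canonical solution
```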
110 | 111 | 112 | ## 🚀 Remote Evaluation 113 | 114 | We use greedy decoding as an example to show how to evaluate the generated code samples via the remote API. 115 | > [!Warning] 116 | > 117 | > To speed up generation, we use batch inference by default. However, batch inference results can vary across *batch sizes* and *versions*, at least for the vLLM backend. If you want more deterministic results for greedy decoding, please set `--bs` to `1`. 118 | 119 | > [!Note] 120 | > 121 | > The `gradio` backend typically takes 6-7 minutes on `BigCodeBench-Full` and 4-5 minutes on `BigCodeBench-Hard`. 122 | > The `e2b` backend with the default machine typically takes 25-30 minutes on `BigCodeBench-Full` and 15-20 minutes on `BigCodeBench-Hard`. 123 | 124 | ```bash 125 | bigcodebench.evaluate \ 126 | --model meta-llama/Meta-Llama-3.1-8B-Instruct \ 127 | --execution [e2b|gradio|local] \ 128 | --split [complete|instruct] \ 129 | --subset [full|hard] \ 130 | --backend [vllm|openai|anthropic|google|mistral|hf|hf-inference] 131 | ``` 132 | 133 | - All the resulting files will be stored in a folder named `bcb_results`. 134 | - The generated code samples will be stored in a file named `[model_name]--bigcodebench-[instruct|complete]--[backend]-[temp]-[n_samples]-sanitized_calibrated.jsonl`. 135 | - The evaluation results will be stored in a file named `[model_name]--bigcodebench-[instruct|complete]--[backend]-[temp]-[n_samples]-sanitized_calibrated_eval_results.json`. 136 | - The pass@k results will be stored in a file named `[model_name]--bigcodebench-[instruct|complete]--[backend]-[temp]-[n_samples]-sanitized_calibrated_pass_at_k.json`. 137 | 138 | > [!Note] 139 | > 140 | > The `gradio` backend is hosted on the [Hugging Face space](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) by default. 141 | > The default space can sometimes be slow, so we recommend using the `gradio` backend with a cloned [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) endpoint for faster evaluation. 142 | > Alternatively, you can use the `e2b` sandbox for evaluation, which is also fairly slow on the default machine. 143 | 144 | > [!Note] 145 | > 146 | > BigCodeBench uses different prompts for base and chat models. 147 | > By default, this is detected via `tokenizer.chat_template` when using `hf`/`vllm` as the backend. 148 | > For other backends, only chat mode is allowed. 149 | > 150 | > Therefore, if your base model comes with a `tokenizer.chat_template`, 151 | > please add `--direct_completion` to avoid being evaluated 152 | > in chat mode. 153 | 154 | To use E2B, you need to set up an account and get an API key from [E2B](https://e2b.dev/). 
155 | 156 | ```bash 157 | export E2B_API_KEY= 158 | ``` 159 | 160 | Access OpenAI APIs from [OpenAI Console](https://platform.openai.com/) 161 | ```bash 162 | export OPENAI_API_KEY= 163 | ``` 164 | 165 | Access Anthropic APIs from [Anthropic Console](https://console.anthropic.com/) 166 | ```bash 167 | export ANTHROPIC_API_KEY= 168 | ``` 169 | 170 | Access Mistral APIs from [Mistral Console](https://console.mistral.ai/) 171 | ```bash 172 | export MISTRAL_API_KEY= 173 | ``` 174 | 175 | Access Gemini APIs from [Google AI Studio](https://aistudio.google.com/) 176 | ```bash 177 | export GOOGLE_API_KEY= 178 | ``` 179 | 180 | Access the [Hugging Face Serverless Inference API](https://huggingface.co/docs/api-inference/en/index) 181 | ```bash 182 | export HF_INFERENCE_API_KEY= 183 | ``` 184 | 185 | Please make sure your HF access token has the `Make calls to inference providers` permission. 186 | 187 | ## 💻 LLM-generated Code 188 | 189 | We share pre-generated code samples from LLMs we have [evaluated](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard) on the full set: 190 | * See the attachment of our [v0.2.4](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.4). We include `sanitized_samples_calibrated.zip` for your convenience. 191 | 192 | ## 🧑 Advanced Usage 193 | 194 | Please refer to the [ADVANCED USAGE](https://github.com/bigcode-project/bigcodebench/blob/main/ADVANCED_USAGE.md) for more details. 195 | 196 | ## 📰 Result Submission 197 | 198 | Please email both the generated code samples and the execution results to [terry.zhuo@monash.edu](mailto:terry.zhuo@monash.edu) if you would like to contribute your model to the leaderboard. Note that the file names should be in the format of `[model_name]--[revision]--[bigcodebench|bigcodebench-hard]-[instruct|complete]--[backend]-[temp]-[n_samples]-sanitized_calibrated.jsonl` and `[model_name]--[revision]--[bigcodebench|bigcodebench-hard]-[instruct|complete]--[backend]-[temp]-[n_samples]-sanitized_calibrated_eval_results.json`. You can [file an issue](https://github.com/bigcode-project/bigcodebench/issues/new/choose) to remind us if we do not respond to your email within 3 days. 
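For instance, a greedy-decoding run (temperature `0`, a single sample per task) of the model from the example above with the `vllm` backend on the hard subset would be submitted with file names along the following lines; slashes in the model name are replaced by `--`, and the `main` revision shown here is only a hypothetical placeholder:

```
meta-llama--Meta-Llama-3.1-8B-Instruct--main--bigcodebench-hard-instruct--vllm-0-1-sanitized_calibrated.jsonl
meta-llama--Meta-Llama-3.1-8B-Instruct--main--bigcodebench-hard-instruct--vllm-0-1-sanitized_calibrated_eval_results.json
```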
199 | 200 | ## 📜 Citation 201 | 202 | ```bibtex 203 | @article{zhuo2024bigcodebench, 204 | title={BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions}, 205 | author={Zhuo, Terry Yue and Vu, Minh Chien and Chim, Jenny and Hu, Han and Yu, Wenhao and Widyasari, Ratnadira and Yusuf, Imam Nur Bani and Zhan, Haolan and He, Junda and Paul, Indraneil and others}, 206 | journal={arXiv preprint arXiv:2406.15877}, 207 | year={2024} 208 | } 209 | ``` 210 | 211 | ## 🙏 Acknowledgement 212 | 213 | - [EvalPlus](https://github.com/evalplus/evalplus) 214 | -------------------------------------------------------------------------------- /Requirements/requirements-eval.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.8.2 2 | blake3==0.4.1 3 | chardet==5.2.0 4 | cryptography==38.0.0 5 | datetime==5.5 6 | Django==4.2.7 7 | dnspython==2.6.1 8 | docxtpl==0.11.5 9 | Faker==20.1.0 10 | flask_login==0.6.3 11 | flask_restful==0.3.10 12 | flask_wtf==1.2.1 13 | Flask-Mail==0.9.1 14 | flask==3.0.3 15 | folium==0.16.0 16 | gensim==4.3.2 17 | geopandas==0.13.2 18 | geopy==2.4.1 19 | holidays==0.29 20 | keras==2.11.0 21 | Levenshtein==0.25.0 22 | librosa==0.10.1 23 | lxml==4.9.3 24 | matplotlib==3.7.0 25 | mechanize==0.4.9 26 | natsort==7.1.1 27 | networkx==2.6.3 28 | nltk==3.8 29 | numba==0.55.0 30 | numpy==1.21.2 31 | opencv-python-headless==4.9.0.80 32 | openpyxl==3.1.2 33 | pandas==2.0.3 34 | Pillow==10.3.0 35 | prettytable==3.10.0 36 | psutil==5.9.5 37 | pycryptodome==3.14.1 38 | pyfakefs==5.4.1 39 | pyquery==1.4.3 40 | pytesseract==0.3.10 41 | pytest==8.2.0 42 | python_http_client==3.3.7 43 | python-dateutil==2.9.0 44 | python-docx==1.1.0 45 | python-Levenshtein-wheels 46 | pytz==2023.3.post1 47 | PyYAML==6.0.1 48 | requests_mock==1.11.0 49 | requests==2.31.0 50 | Requests==2.31.0 51 | rsa==4.9 52 | scikit-image==0.18.0 53 | scikit-learn==1.3.1 54 | scipy==1.7.2 55 | seaborn==0.13.2 56 | selenium==4.15 57 | sendgrid==6.11.0 58 | shapely==2.0.4 59 | soundfile==0.12.1 60 | statsmodels==0.14.0 61 | statsmodels==0.14.0 62 | sympy==1.12 63 | tensorflow==2.11.0 64 | textblob==0.18.0 65 | texttable==1.7.0 66 | Werkzeug==3.0.1 67 | wikipedia==1.4.0 68 | wordcloud==1.9.3 69 | wordninja==2.0.0 70 | WTForms==3.1.2 71 | xlrd==2.0.1 72 | xlrd==2.0.1 73 | xlwt==1.3.0 74 | xmltodict==0.13.0 -------------------------------------------------------------------------------- /Requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs>=1.4.4 2 | fire>=0.6.0 3 | multipledispatch>=0.6.0 4 | pqdm>=0.2.0 5 | tempdir>=0.7.1 6 | termcolor>=2.0.0 7 | tqdm>=4.56.0 8 | tree_sitter_languages>=1.10.2 9 | tree-sitter==0.21.3 10 | wget>=3.2 11 | -------------------------------------------------------------------------------- /analysis/bcb_subset.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import json 3 | import numpy as np 4 | from tqdm import tqdm 5 | from ast import literal_eval 6 | from glob import glob 7 | from sentence_transformers import SentenceTransformer, util 8 | import matplotlib.pyplot as plt 9 | from transformers import AutoTokenizer 10 | from datasets import load_dataset, Dataset, Features, Value, Sequence, DatasetDict 11 | 12 | from utils import * 13 | 14 | VERSION = "v0.1.0_hf" 15 | 16 | def update_model_info(model_info): 17 | for model, info in model_info.items(): 18 | if "https://huggingface.co/" in 
info["link"]: 19 | hf_model = info["link"].split("https://huggingface.co/")[-1] 20 | print(hf_model) 21 | tokenizer = AutoTokenizer.from_pretrained(hf_model, trust_remote_code=True) 22 | if tokenizer.chat_template is None: 23 | model_info[model]["direct_complete"] = True 24 | else: 25 | model_info[model]["direct_complete"] = False 26 | else: 27 | model_info[model]["direct_complete"] = False 28 | 29 | return model_info 30 | 31 | 32 | def embed_sentences(data, col_name, id_name, model, save_path, push_to_hub=False): 33 | pool = model.start_multi_process_pool() 34 | embeddings = model.encode_multi_process(data[col_name], pool=pool) 35 | qids = data[id_name] 36 | features = Features({id_name: Value(dtype='string'), 'embeddings': Sequence(Value('float32'))}) 37 | embed_dict = { 38 | id_name: qids, 39 | "embeddings": embeddings 40 | } 41 | embed_ds = Dataset.from_dict(embed_dict, features=features) 42 | if push_to_hub: 43 | embed_ds.push_to_hub(f"bigcode/{save_path}") 44 | else: 45 | embed_ds.save_to_disk(save_path) 46 | return embed_ds 47 | 48 | 49 | def get_top_docs(query_embs, doc_emb, docs): 50 | scores = np.dot(query_embs, doc_emb.T) 51 | top_doc_indices = np.argmax(scores, axis=1) 52 | top_scores = scores[np.arange(len(scores)), top_doc_indices] 53 | results = [(i, docs[doc_idx], score) for i, (doc_idx, score) in tqdm(enumerate(zip(top_doc_indices, top_scores)))] 54 | 55 | return results 56 | 57 | 58 | def filter_top_k_percent(results, k_percent): 59 | all_scores = [score for _, score in results] 60 | threshold = np.percentile(all_scores, 100 - k_percent) 61 | filtered_results = [(i, doc, score) for i, doc, score in results if score > threshold] 62 | return filtered_results 63 | 64 | 65 | def filter_top_threshold(results, threshold): 66 | filtered_results = [(i, doc, score) for i, doc, score in results if score > threshold] 67 | return filtered_results 68 | 69 | 70 | def read_task_perf(tids, task="complete"): 71 | model_results = dict() 72 | result_files = [] 73 | for model, info in model_info.items(): 74 | if task == "instruct" and (not info["prompted"] or info["name"] in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]): 75 | continue 76 | task_perf = {f"BigCodeBench/{task_id}": 0 for task_id in range(1140)} 77 | model = model.replace("/", "--") 78 | try: 79 | if info["prompted"] and not info["direct_complete"]: 80 | files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized-calibrated_eval_results.json") 81 | if files: 82 | file = files[0] 83 | else: 84 | file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0] 85 | else: 86 | file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0] 87 | except: 88 | continue 89 | with open(file, "r") as f: 90 | data = json.load(f) 91 | for task_id, perfs in data["eval"].items(): 92 | status = 1 if perfs[0]["status"] == "pass" else 0 93 | task_perf[task_id] = status 94 | model_results[info["name"]] = np.mean([status for tid, status in task_perf.items() if tid in tids]) 95 | return sorted(model_results.items(), key=lambda x: x[1], reverse=True) 96 | 97 | 98 | if __name__ == "__main__": 99 | bcb = load_dataset("bigcode/bigcodebench", trust_remote_code=True, split=VERSION) 100 | se = load_dataset("bigcode/stack-exchange-preferences-20230914-clean-anonymization", trust_remote_code=True, split="train") 101 | model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2") 102 | 103 | model_info = update_model_info(model_info) 104 | 105 | se_embed = embed_sentences(se, 
"question", "qid", model, "stack-exchange-embeddings-20230914", push_to_hub=True) 106 | bcb_embed = embed_sentences(bcb, "complete_prompt", "task_id", model, "bigcodebench-doc-embeddings", push_to_hub=True) 107 | 108 | solve_rate = load_dataset("bigcode/bigcodebench-solve-rate", trust_remote_code=True, split="complete") 109 | 110 | query_embs = np.array(se_embed["embeddings"]) 111 | doc_emb = np.array(bcb_embed["embeddings"]) 112 | docs = bcb_embed["task_id"] 113 | retrieval_results = get_top_docs(query_embs, doc_emb, docs) 114 | 115 | Dataset.from_dict({"qid": [i for i, _, _ in retrieval_results], "tid": [doc for _, doc, _ in retrieval_results], "score": [score for _, _, score in retrieval_results]}).push_to_hub("bigcode/se_bcb_results") 116 | 117 | retrieval_ds = load_dataset("bigcode/se_bcb_results", trust_remote_code=True, split="train") 118 | 119 | top_results = dict() 120 | for sample in tqdm(retrieval_ds): 121 | i, doc, score = sample["qid"], sample["tid"], sample["score"] 122 | if score > 0.7: 123 | if doc not in top_results: 124 | top_results[doc] = (i, doc, score) 125 | else: 126 | if score > top_results[doc][2]: 127 | top_results[doc] = (i, doc, score) 128 | 129 | top_id = {task_id: (qid, score) for qid, task_id, score in top_results.values()} 130 | 131 | hard_lib_filter = {sample["task_id"] for sample in bcb if len(literal_eval(sample["libs"])) > 2} 132 | hard_length_filter = {sample["task_id"] for sample in bcb if len(sample["canonical_solution"]) > 426} 133 | hard_rate_filter = {task["task_id"]: task["solve_rate"] for task in solve_rate if task["solve_rate"] < 50} 134 | 135 | hard_tid = top_id.keys() & hard_length_filter & hard_rate_filter.keys() & hard_lib_filter 136 | 137 | hard_bcb = bcb.filter(lambda x: x["task_id"] in hard_tid) 138 | hard_bcb_tid = bcb.filter(lambda x: x["task_id"] in hard_tid)["task_id"] 139 | hard_se_qid = [top_id[_id][0] for _id in hard_bcb_tid] 140 | hard_se_q = se.select(hard_se_qid) 141 | hard_se_scores = [top_id[_id][1] for _id in hard_bcb_tid] 142 | hard_bcb_dict = { 143 | "task_id": hard_bcb_tid, 144 | "complete_prompt": hard_bcb["complete_prompt"], 145 | "instruct_prompt": hard_bcb["instruct_prompt"], 146 | "canonical_solution": hard_bcb["canonical_solution"], 147 | "code_prompt": hard_bcb["code_prompt"], 148 | "test": hard_bcb["test"], 149 | "entry_point": hard_bcb["entry_point"], 150 | "doc_struct": hard_bcb["doc_struct"], 151 | "libs": hard_bcb["libs"], 152 | "q_idx": hard_se_qid, 153 | "question": hard_se_q["question"], 154 | "score": hard_se_scores, 155 | "_id": hard_bcb_tid 156 | } 157 | hard_bcb = Dataset.from_dict(hard_bcb_dict) 158 | DatasetDict({VERSION: hard_bcb}).push_to_hub("bigcode/bigcodebench-hard") 159 | 160 | hard_complete_results = read_task_perf(hard_tid) 161 | hard_instruct_results = read_task_perf(hard_tid, task="instruct") 162 | 163 | complete_res_dict = {model: score for model, score in hard_complete_results} 164 | instruct_res_dict = {model: score for model, score in hard_instruct_results} 165 | avg_res_dict = {model: (complete_res_dict[model] + instruct_res_dict[model]) / 2 for model in complete_res_dict if model in instruct_res_dict} 166 | 167 | for model, score in sorted(avg_res_dict.items(), key=lambda x: x[1], reverse=True): 168 | print(model, round(score*100, 1)) -------------------------------------------------------------------------------- /analysis/get_results.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | import numpy as 
np 5 | from numpy import mean 6 | from glob import glob 7 | from utils import model_info 8 | from tqdm import tqdm 9 | import pandas as pd 10 | import itertools 11 | import math 12 | from datasets import Dataset, DatasetDict, load_dataset 13 | from transformers import AutoTokenizer 14 | 15 | def update_model_info(model_info): 16 | for model, info in model_info.items(): 17 | if "https://huggingface.co/" in info["link"]: 18 | hf_model = info["link"].split("https://huggingface.co/")[-1] 19 | print(hf_model) 20 | try: 21 | tokenizer = AutoTokenizer.from_pretrained(hf_model, trust_remote_code=True) 22 | 23 | if tokenizer.chat_template is None: 24 | model_info[model]["direct_complete"] = True 25 | else: 26 | model_info[model]["direct_complete"] = False 27 | except: 28 | model_info[model]["direct_complete"] = True 29 | else: 30 | model_info[model]["direct_complete"] = False 31 | 32 | return model_info 33 | 34 | 35 | def get_results(tids): 36 | results = {} 37 | for model, info in model_info.items(): 38 | results[info["name"]] = { 39 | "link": info["link"], 40 | "open-data": info["open-data"], 41 | "pass@1": { 42 | "complete": None, 43 | "instruct": None, 44 | "complete-cal": None, 45 | "instruct-cal": None, 46 | }, 47 | "prompted": info["prompted"], 48 | "moe": info["moe"], 49 | "size": info["size"], 50 | "act_param": info["act_param"], 51 | "date": info.get("date", None), 52 | "prefill": info.get("prefill", False), 53 | # "direct_complete": info["direct_complete"], 54 | } 55 | 56 | for model, info in model_info.items(): 57 | model = model.replace("/", "--") 58 | hf_model = "" 59 | files = glob(f"results/{model}--bigcodebench-*_eval_results.json") 60 | assert files, f"No files found for results/{model}--bigcodebench-*_eval_results.json" 61 | for file in files: 62 | try: 63 | _, suffix = os.path.basename(file).split("--bigcodebench-hard-") 64 | with open("results/"+model+"--bigcodebench-hard-"+suffix, "r") as f: 65 | data = json.load(f) 66 | except: 67 | _, suffix = os.path.basename(file).split("--bigcodebench-") 68 | with open("results/"+model+"--bigcodebench-"+suffix, "r") as f: 69 | data = json.load(f) 70 | status = [] 71 | 72 | if len(data["eval"]) < len(tids): 73 | continue 74 | for key, value in data["eval"].items(): 75 | if key not in tids: 76 | continue 77 | if value[0]["status"] == "pass": 78 | status.append(1) 79 | else: 80 | status.append(0) 81 | if suffix.startswith("complete"): 82 | task = "complete" 83 | elif suffix.startswith("instruct"): 84 | task = "instruct" 85 | else: 86 | raise ValueError("Unknown task") 87 | 88 | mode = "" 89 | if "calibrated" in file: 90 | mode = "-cal" 91 | 92 | results[info["name"]][f"pass@1"][f"{task}{mode}"] = round(mean(status)*100,1) 93 | if not info["prompted"]:# or info["direct_complete"]: 94 | results[info["name"]][f"pass@1"][f"{task}-cal"] = round(mean(status)*100,1) 95 | 96 | for model, result in results.items(): 97 | for task in ["complete"]: 98 | origin = result["pass@1"].pop(task) 99 | # assert origin, f"Missing original complete results for {model}" 100 | calibrate = result["pass@1"].pop(f"{task}-cal") 101 | if calibrate: 102 | # if calibrate - origin > 1: 103 | # results[model]["lazy"] = True 104 | # else: 105 | # results[model]["lazy"] = False 106 | results[model]["pass@1"][task] = calibrate 107 | else: 108 | # results[model]["lazy"] = False 109 | results[model]["pass@1"][task] = origin 110 | calibrate_instruct = result["pass@1"].pop(f"instruct-cal") 111 | result["pass@1"]["instruct"] = calibrate_instruct 112 | return results 113 | 114 | 115 
| def check_valid(results): 116 | for model, result in results.items(): 117 | if result["prompted"] and model not in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]: 118 | assert result["pass@1"]["instruct"], model 119 | assert result["pass@1"]["complete"] 120 | 121 | 122 | def split_gen(): 123 | shutil.rmtree("sanitized_calibrated_samples", ignore_errors=True) 124 | os.makedirs("sanitized_calibrated_samples/hard/complete", exist_ok=True) 125 | os.makedirs("sanitized_calibrated_samples/hard/instruct", exist_ok=True) 126 | os.makedirs("sanitized_calibrated_samples/full/complete", exist_ok=True) 127 | os.makedirs("sanitized_calibrated_samples/full/instruct", exist_ok=True) 128 | 129 | for model, info in model_info.items(): 130 | model = model.replace("/", "--") 131 | files = glob(f"results/{model}--bigcodebench-*.jsonl") 132 | if info["link"].startswith("https://huggingface.co/"): 133 | model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--") 134 | 135 | for file in files: 136 | if "-sanitized" not in file or "calibrated" not in file: 137 | continue 138 | 139 | _, suffix = os.path.basename(file).split("--bigcodebench-") 140 | with open(file, "r") as f: 141 | data = f.readlines() 142 | 143 | split_type = "hard" if "-hard-" in file else "full" 144 | if info["prompted"]: 145 | if suffix.startswith("complete") or suffix.startswith("hard-complete"): 146 | with open(f"sanitized_calibrated_samples/{split_type}/complete/{model}--bigcodebench-{suffix}", "w") as f: 147 | f.writelines(data) 148 | else: 149 | with open(f"sanitized_calibrated_samples/{split_type}/instruct/{model}--bigcodebench-{suffix}", "w") as f: 150 | f.writelines(data) 151 | 152 | def read_task_perf(tids, task="complete"): 153 | model_results = dict() 154 | result_files = [] 155 | for model, info in model_info.items(): 156 | if task == "instruct" and (not info["prompted"] or info["name"] in ["Granite-Code-3B-Instruct", "Granite-Code-8B-Instruct"]): 157 | continue 158 | 159 | task_perf = dict() 160 | model = model.replace("/", "--") 161 | try: 162 | try: 163 | try: 164 | if info["prompted"]: 165 | files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized*calibrated_eval_results.json") 166 | if files: 167 | file = files[0] 168 | else: 169 | file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0] 170 | else: 171 | file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_eval_results.json")[0] 172 | except: 173 | if info["prompted"]:# and not info["direct_complete"]: 174 | files = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized*calibrated_hard_eval_results.json") 175 | if files: 176 | file = files[0] 177 | else: 178 | file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0] 179 | else: 180 | file = glob(f"results/{model}--bigcodebench-{task}*-0-1-sanitized_hard_eval_results.json")[0] 181 | except: 182 | try: 183 | if info["prompted"]:# and not info["direct_complete"]: 184 | files = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized*calibrated_hard_eval_results.json") 185 | if files: 186 | file = files[0] 187 | else: 188 | file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_hard_eval_results.json")[0] 189 | else: 190 | file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_hard_eval_results.json")[0] 191 | except: 192 | if info["prompted"]: 193 | files = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized*calibrated_eval_results.json") 194 | if files: 
195 | file = files[0] 196 | else: 197 | file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_eval_results.json")[0] 198 | else: 199 | file = glob(f"results/{model}--bigcodebench-hard-{task}*-0-1-sanitized_eval_results.json")[0] 200 | except: 201 | continue 202 | 203 | result_files.append(file) 204 | with open(file, "r") as f: 205 | data = json.load(f) 206 | 207 | if len(data["eval"]) < len(tids): 208 | continue 209 | for task_id, perfs in data["eval"].items(): 210 | if task_id in tids: 211 | status = 1 if perfs[0]["status"] == "pass" else 0 212 | task_perf[task_id] = status 213 | model_results[info["name"]] = task_perf 214 | return model_results, result_files 215 | 216 | 217 | def get_domain_perf(data_dict, task2domain): 218 | domain_perfs = { 219 | "Model": [], 220 | "Computation": [], 221 | "General": [], 222 | "Visualization": [], 223 | "System": [], 224 | "Time": [], 225 | "Network": [], 226 | "Cryptography": [] 227 | } 228 | for model, task_perf in data_dict.items(): 229 | model_domain = {"Computation": [], "General": [], "Visualization": [], "System": [], "Time": [], "Network": [], "Cryptography": []} 230 | for task_id, status in task_perf.items(): 231 | domains = task2domain[task_id] 232 | for domain in domains: 233 | model_domain[domain].append(status) 234 | domain_perf = {domain: round(np.mean(perfs)*100, 1) for domain, perfs in model_domain.items()} 235 | domain_perfs["Model"].append(model) 236 | for domain in model_domain.keys(): 237 | domain_perfs[domain].append(domain_perf[domain]) 238 | return Dataset.from_dict(domain_perfs) 239 | 240 | 241 | def get_solve_rate(data_dict, task="complete"): 242 | task_solve_count = dict() 243 | for model, task_perf in data_dict.items(): 244 | for task_id, score in task_perf.items(): 245 | if task_id not in task_solve_count: 246 | task_solve_count[task_id] = [] 247 | task_solve_count[task_id].append(score) 248 | solve_rate = {task_id: round(np.mean(perfs) * 100, 1) for task_id, perfs in task_solve_count.items()} 249 | return Dataset.from_dict({"task_id": list(solve_rate.keys()), "solve_rate": list(solve_rate.values())}) 250 | 251 | 252 | def get_hf_ds(results): 253 | hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [], #"lazy": [],# "direct_complete": [], 254 | "complete": [], "instruct": [], "date": [], "prefill": []} 255 | 256 | for model, result in results.items(): 257 | hf_dataset["model"].append(model) 258 | hf_dataset["link"].append(result["link"]) 259 | hf_dataset["moe"].append(result["moe"]) 260 | hf_dataset["size"].append(result["size"]) 261 | hf_dataset["act_param"].append(result["act_param"]) 262 | hf_dataset["type"].append("🔶" if result["prompted"] else "🟢") 263 | # hf_dataset["lazy"].append(result["lazy"]) 264 | hf_dataset["complete"].append(result["pass@1"]["complete"]) 265 | hf_dataset["instruct"].append(result["pass@1"]["instruct"]) 266 | hf_dataset["date"].append(result["date"]) 267 | hf_dataset["prefill"].append(result["prefill"]) 268 | # hf_dataset["direct_complete"].append(result["direct_complete"]) 269 | 270 | return Dataset.from_dict(hf_dataset) 271 | 272 | def get_bootstrap_scores(df): 273 | bars = pd.DataFrame(dict( 274 | lower = df.quantile(.025), 275 | rating = df.quantile(.5), 276 | upper = df.quantile(.975))).reset_index(names="model").sort_values("rating", ascending=False) 277 | 278 | bars['error_y'] = bars['upper'] - bars["rating"] 279 | bars['error_y_minus'] = bars['rating'] - bars["lower"] 280 | bars['rating_rounded'] = np.round(bars['rating'], 2) 281 | 
return Dataset.from_pandas(bars) 282 | 283 | 284 | def push_ds(ds, path, local=False): 285 | if local: 286 | ds.save_to_disk(path) 287 | else: 288 | ds.push_to_hub(path) 289 | 290 | 291 | def get_perf_df(data_dict): 292 | perfs = {"Model": []} 293 | for task_id in data_dict[list(data_dict.keys())[0]]: 294 | perfs[task_id] = [] 295 | for model, task_perf in data_dict.items(): 296 | perfs["Model"].append(model) 297 | for task_id, status in task_perf.items(): 298 | perfs[task_id].append(status) 299 | return pd.DataFrame(perfs) 300 | 301 | 302 | if __name__ == "__main__": 303 | split_gen() 304 | bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.1") 305 | bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.1") 306 | bcb_config = { 307 | "": bcb_orig, 308 | "-hard": bcb_hard, 309 | } 310 | for suffix, bcb in bcb_config.items(): 311 | results = get_results(bcb["task_id"]) 312 | files = [] 313 | complete_data, complete_files = read_task_perf(bcb["task_id"], "complete") 314 | instruct_data, instruct_files = read_task_perf(bcb["task_id"], "instruct") 315 | complete_df = get_perf_df(complete_data) 316 | instruct_df = get_perf_df(instruct_data) 317 | 318 | push_ds(DatasetDict({"complete": Dataset.from_pandas(complete_df), "instruct": Dataset.from_pandas(instruct_df)}), f"bigcode/bigcodebench{suffix}-perf") 319 | 320 | with open("task2domain.json", "r") as f: 321 | task2domain = json.load(f) 322 | domain_complete = get_domain_perf(complete_data, task2domain) 323 | domain_instruct = get_domain_perf(instruct_data, task2domain) 324 | DatasetDict({"complete": domain_complete, "instruct": domain_instruct}).push_to_hub(f"bigcode/bigcodebench{suffix}-domain") 325 | 326 | files.extend(complete_files) 327 | files.extend(instruct_files) 328 | shutil.rmtree("eval_results", ignore_errors=True) 329 | os.makedirs("eval_results", exist_ok=True) 330 | for file in files: 331 | shutil.copy(file, "eval_results") 332 | 333 | complete_solve_rate = get_solve_rate(complete_data, task="complete") 334 | instruct_solve_rate = get_solve_rate(instruct_data, task="instruct") 335 | solve_rate_ds = DatasetDict({"complete": complete_solve_rate, "instruct": instruct_solve_rate}) 336 | push_ds(solve_rate_ds, f"bigcode/bigcodebench{suffix}-solve-rate") 337 | 338 | with open(f"results{suffix}.json", "w") as f: 339 | json.dump(results, f, indent=4) 340 | ds = get_hf_ds(results) 341 | push_ds(ds, f"bigcode/bigcodebench{suffix}-results") -------------------------------------------------------------------------------- /analysis/lib2domain.json: -------------------------------------------------------------------------------- 1 | { 2 | "Crypto": "Cryptography", 3 | "PIL": "Visualization", 4 | "array": "General", 5 | "base64": "Cryptography", 6 | "binascii": "Cryptography", 7 | "bisect": "General", 8 | "blake3": "Cryptography", 9 | "bs4": "Network", 10 | "calendar": "Time", 11 | "cgi": "Network", 12 | "chardet": "Network", 13 | "cmath": "Computation", 14 | "codecs": "Cryptography", 15 | "collections": "General", 16 | "cryptography": "Cryptography", 17 | "csv": "System", 18 | "ctypes": "System", 19 | "datetime": "Time", 20 | "dateutil": "Time", 21 | "difflib": "General", 22 | "django": "Network", 23 | "docx": "System", 24 | "email": "Network", 25 | "faker": "General", 26 | "flask": "Network", 27 | "flask_login": "Network", 28 | "flask_mail": "Network", 29 | "flask_restful": "Network", 30 | "fnmatch": "General", 31 | "folium": "Visualization", 32 | "functools": "General", 33 | "geopy": "Network", 34 | "getpass": "System", 
35 | "glob": "System", 36 | "gzip": "System", 37 | "hashlib": "Cryptography", 38 | "heapq": "General", 39 | "hmac": "Cryptography", 40 | "html": "Network", 41 | "http": "Network", 42 | "importlib": "General", 43 | "inspect": "General", 44 | "io": "System", 45 | "ipaddress": "Network", 46 | "itertools": "General", 47 | "json": "System", 48 | "keras": "Computation", 49 | "librosa": "Computation", 50 | "logging": "System", 51 | "lxml": "Network", 52 | "math": "Computation", 53 | "matplotlib": "Visualization", 54 | "mechanize": "Network", 55 | "mimetypes": "Network", 56 | "multiprocessing": "System", 57 | "nltk": "Computation", 58 | "numpy": "Computation", 59 | "openpyxl": "System", 60 | "operator": "General", 61 | "os": "System", 62 | "pandas": "Computation", 63 | "pathlib": "System", 64 | "pickle": "System", 65 | "pkgutil": "General", 66 | "platform": "System", 67 | "prettytable": "General", 68 | "psutil": "System", 69 | "pytesseract": "Computation", 70 | "pytz": "Time", 71 | "queue": "General", 72 | "random": "General", 73 | "re": "General", 74 | "requests": "Network", 75 | "rsa": "Cryptography", 76 | "scipy": "Computation", 77 | "seaborn": "Visualization", 78 | "secrets": "Cryptography", 79 | "select": "System", 80 | "sendgrid": "Network", 81 | "shutil": "System", 82 | "sklearn": "Computation", 83 | "smtplib": "Network", 84 | "socket": "Network", 85 | "soundfile": "Computation", 86 | "sqlite3": "System", 87 | "ssl": "Network", 88 | "statistics": "Computation", 89 | "statsmodels": "Computation", 90 | "string": "General", 91 | "struct": "System", 92 | "subprocess": "System", 93 | "sys": "System", 94 | "tarfile": "System", 95 | "tensorflow": "Computation", 96 | "texttable": "General", 97 | "textwrap": "General", 98 | "threading": "System", 99 | "time": "Time", 100 | "turtle": "Visualization", 101 | "types": "General", 102 | "unicodedata": "General", 103 | "urllib": "Network", 104 | "uuid": "General", 105 | "warnings": "General", 106 | "werkzeug": "Network", 107 | "wordninja": "Computation", 108 | "wtforms": "Network", 109 | "xlwt": "System", 110 | "xml": "Network", 111 | "xmltodict": "Network", 112 | "yaml": "System", 113 | "zipfile": "System", 114 | "Levenshtein": "Computation", 115 | "ast": "General", 116 | "configparser": "System", 117 | "cv2": "Computation", 118 | "decimal": "General", 119 | "enum": "General", 120 | "errno": "System", 121 | "flask_wtf": "Network", 122 | "ftplib": "Network", 123 | "gensim": "Computation", 124 | "geopandas": "Computation", 125 | "holidays": "Time", 126 | "mpl_toolkits": "Visualization", 127 | "natsort": "General", 128 | "pyquery": "Network", 129 | "python_http_client": "Network", 130 | "regex": "General", 131 | "shapely": "Computation", 132 | "shlex": "System", 133 | "signal": "System", 134 | "skimage": "Computation", 135 | "sympy": "Computation", 136 | "textblob": "Computation", 137 | "typing": "General", 138 | "wikipedia": "Network", 139 | "wordcloud": "Visualization", 140 | "zlib": "System", 141 | "aspose": "System", 142 | "builtins": "General", 143 | "locale": "System", 144 | "imp": "System", 145 | "docxtpl": "System", 146 | "selenium": "Network", 147 | "IPython": "Computation", 148 | "filecmp": "System", 149 | "multidict": "General", 150 | "sqlalchemy": "System", 151 | "obspy": "Computation", 152 | "pprint": "General", 153 | "xlrd": "System", 154 | "argparse": "General", 155 | "torch": "Computation", 156 | "copy": "General" 157 | } -------------------------------------------------------------------------------- /bigcodebench/__init__.py: 
-------------------------------------------------------------------------------- 1 | try: 2 | from bigcodebench._version import __version__, __version_tuple__ 3 | except ImportError: 4 | __version__ = "local-dev" 5 | -------------------------------------------------------------------------------- /bigcodebench/data/__init__.py: -------------------------------------------------------------------------------- 1 | from bigcodebench.data.bigcodebench import get_bigcodebench, get_bigcodebench_hash 2 | from bigcodebench.data.utils import load_solutions, write_directory, write_jsonl 3 | -------------------------------------------------------------------------------- /bigcodebench/data/bigcodebench.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import json 3 | import os 4 | from typing import Dict 5 | 6 | from bigcodebench.data.utils import ( 7 | CACHE_DIR, 8 | completeness_check, 9 | get_dataset_metadata, 10 | make_cache, 11 | stream_jsonl, 12 | ) 13 | from datasets import load_dataset 14 | 15 | BIGCODEBENCH_OVERRIDE_PATH = os.environ.get("BIGCODEBENCH_OVERRIDE_PATH", None) 16 | BIGCODEBENCH_HF = "bigcode/bigcodebench" 17 | BIGCODEBENCH_VERSION = "v0.1.4" 18 | 19 | def _ready_bigcodebench_path(subset="full", version="default") -> str: 20 | if BIGCODEBENCH_OVERRIDE_PATH: 21 | return BIGCODEBENCH_OVERRIDE_PATH 22 | 23 | version = BIGCODEBENCH_VERSION if version == "default" else version 24 | url, path = get_dataset_metadata( 25 | version, subset 26 | ) 27 | 28 | extra = "-" + subset if subset != "full" else "" 29 | dataset = load_dataset(BIGCODEBENCH_HF+extra, split=version) 30 | make_cache(url, dataset, path) 31 | 32 | return path 33 | 34 | 35 | def get_bigcodebench( 36 | err_incomplete=True, subset="full", version="default" 37 | ) -> Dict[str, Dict]: 38 | """Get BigCodeBench from the Hugging Face Hub (or the local cache) and return it as a dict of parsed tasks keyed by task_id. 39 | 40 | Returns: 41 | Dict[str, Dict]: Mapping from "task_id" to a task dict with keys "complete_prompt", "instruct_prompt", "canonical_solution", "code_prompt", "test", "entry_point" 42 | 43 | Notes: 44 | "task_id" is the identifier string for the task. 45 | "complete_prompt" is the prompt to be used for BigCodeBench-Complete. 46 | "instruct_prompt" is the prompt to be used for BigCodeBench-Instruct. 47 | "canonical_solution" is the ground-truth implementation. 48 | "test" is the `unittest.TestCase` class. 49 | "entry_point" is the name of the function. 50 | """ 51 | # Check if the cached dataset file exists in CACHE_DIR 52 | data_path = _ready_bigcodebench_path( 53 | subset=subset, version=version 54 | ) 55 | data = {task["task_id"]: task for task in stream_jsonl(data_path)} 56 | if err_incomplete: 57 | completeness_check("BigCodeBench", data) 58 | return data 59 | 60 | def get_bigcodebench_hash(subset="full", version="default") -> str: 61 | """Get the hash of BigCodeBench. 
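The hash is the MD5 digest of the locally cached JSONL file, so it changes whenever the cached copy of the dataset is refreshed.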
62 | Returns: 63 | str: The hash of BigCodeBench 64 | """ 65 | data_path = _ready_bigcodebench_path(subset, version="default") 66 | with open(data_path, "rb") as f: 67 | data = f.read() 68 | return hashlib.md5(data).hexdigest() 69 | -------------------------------------------------------------------------------- /bigcodebench/data/utils.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import json 3 | import os 4 | from os import PathLike 5 | from typing import Dict, Iterable 6 | 7 | import tempdir 8 | import wget 9 | from appdirs import user_cache_dir 10 | 11 | CACHE_DIR = user_cache_dir("bigcodebench") 12 | 13 | 14 | def get_dataset_metadata(version: str, subset: str="full"): 15 | extra = "-" + subset.capitalize() if subset != "full" else "" 16 | url = f"https://github.com/bigcode-project/bigcodebench-annotation/releases/download/{version}/BigCodeBench{extra}.jsonl.gz" 17 | cache_path = os.path.join(CACHE_DIR, f"BigCodeBench{extra}-{version}.jsonl") 18 | return url, cache_path 19 | 20 | 21 | def make_cache(gzip_url, hf_data, cache_path, gh=False): 22 | # Check if open eval file exists in CACHE_DIR 23 | 24 | if not os.path.exists(cache_path): 25 | if gh: 26 | # Install BigCodeBench dataset and parse as jsonl 27 | print(f"Downloading dataset from {gzip_url}") 28 | with tempdir.TempDir() as tmpdir: 29 | gz_path = os.path.join(tmpdir, f"data.jsonl.gz") 30 | wget.download(gzip_url, gz_path) 31 | 32 | with gzip.open(gz_path, "rb") as f: 33 | data = f.read().decode("utf-8") 34 | 35 | # create CACHE_DIR if not exists 36 | if not os.path.exists(CACHE_DIR): 37 | os.makedirs(CACHE_DIR) 38 | 39 | # Write the original open eval file to CACHE_DIR 40 | with open(cache_path, "w") as f: 41 | f.write(data) 42 | else: 43 | hf_data.to_json(cache_path) 44 | 45 | 46 | def write_jsonl( 47 | filename: str, data: Iterable[Dict], append: bool = False, drop_builtin: bool = True 48 | ): 49 | """ 50 | Writes an iterable of dictionaries to jsonl 51 | """ 52 | if append: 53 | mode = "ab" 54 | else: 55 | mode = "wb" 56 | filename = os.path.expanduser(filename) 57 | if filename.endswith(".gz"): 58 | with open(filename, mode) as fp: 59 | with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp: 60 | for x in data: 61 | if drop_builtin: 62 | x = {k: v for k, v in x.items() if not k.startswith("_")} 63 | gzfp.write((json.dumps(x) + "\n").encode("utf-8")) 64 | else: 65 | with open(filename, mode) as fp: 66 | for x in data: 67 | if drop_builtin: 68 | x = {k: v for k, v in x.items() if not k.startswith("_")} 69 | fp.write((json.dumps(x) + "\n").encode("utf-8")) 70 | 71 | 72 | def stream_jsonl(filename: str) -> Iterable[Dict]: 73 | """ 74 | Parses each jsonl line and yields it as a dictionary 75 | """ 76 | if filename.endswith(".gz"): 77 | with open(filename, "rb") as gzfp: 78 | with gzip.open(gzfp, "rt") as fp: 79 | for line in fp: 80 | if any(not x.isspace() for x in line): 81 | yield json.loads(line) 82 | else: 83 | with open(filename, "r") as fp: 84 | for line in fp: 85 | if any(not x.isspace() for x in line): 86 | yield json.loads(line) 87 | 88 | 89 | def load_solutions(sample_path: PathLike) -> Iterable[Dict]: 90 | """We accept two formats of inputs. 91 | + `sample.jsonl` which is the format from BigCodeBench, i.e., {task_id, completion or solution}. 92 | + A folder which contains sub-folders named after the task_id. Each sub-folder 93 | contains samples named in `[?].py` where `?` is the solution id starting with 0. 
94 | Different from `sample.jsonl`, the solutions must be complete (with prompt prefix). 95 | """ 96 | 97 | # if it is a file 98 | if os.path.isfile(sample_path): 99 | for i, sample in enumerate(stream_jsonl(sample_path)): 100 | assert ( 101 | "completion" in sample or "solution" in sample 102 | ), "No completion or solution found in sample!" 103 | assert "solution" not in sample or isinstance( 104 | sample["solution"], str 105 | ), "Solution must be a string! If you have multiple solutions, please repeat the task_id." 106 | assert "completion" not in sample or isinstance( 107 | sample["completion"], str 108 | ), "Completion must be a string! If you have multiple solutions, please repeat the task_id." 109 | 110 | sample["_identifier"] = ( 111 | sample["task_id"] + f" (line {i+1} in {sample_path})" 112 | ) 113 | yield sample 114 | else: 115 | # if it is a folder 116 | for task_id in os.listdir(sample_path): 117 | task_path = os.path.join(sample_path, task_id) 118 | if not os.path.isdir(task_path): 119 | continue 120 | 121 | for solution_id in os.listdir(task_path): 122 | solution_path = os.path.join(task_path, solution_id) 123 | if os.path.isfile(solution_path) and solution_path.endswith(".py"): 124 | with open(solution_path, "r") as f: 125 | completion = f.read() 126 | yield { 127 | "_identifier": solution_path, 128 | "_path": solution_path, 129 | "task_id": task_id.replace("_", "/"), 130 | "solution": completion, 131 | } 132 | 133 | 134 | def write_directory(directory: PathLike, data: Iterable[Dict]): 135 | os.makedirs(directory, exist_ok=True) 136 | counters = {} 137 | for sample in data: 138 | assert "solution" in sample, "Samples must come with `solution` field!" 139 | task_id = sample["task_id"].replace("/", "_") 140 | task_dir = os.path.join(directory, task_id) 141 | os.makedirs(task_dir, exist_ok=True) 142 | if task_id not in counters: 143 | counters[task_id] = 0 144 | sample_id = counters[task_id] 145 | with open(os.path.join(task_dir, f"{sample_id}.py"), "w") as f: 146 | f.write(sample["solution"]) 147 | counters[task_id] += 1 148 | 149 | 150 | def completeness_check(name, data): 151 | for task_id, task in data.items(): 152 | for key in [ 153 | "complete_prompt", 154 | "instruct_prompt", 155 | "canonical_solution", 156 | "code_prompt", 157 | "test", 158 | "entry_point" 159 | ]: 160 | assert key in task, f"{key} not found in {name} #{task_id}!" 161 | 162 | 163 | def to_raw(string): 164 | return string.encode("unicode-escape").decode().replace("\\\\", "\\") 165 | -------------------------------------------------------------------------------- /bigcodebench/eval/__init__.py: -------------------------------------------------------------------------------- 1 | # The MIT License 2 | # 3 | # Copyright (c) OpenAI (https://openai.com) 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 
14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import itertools 24 | import multiprocessing 25 | import os 26 | import sys 27 | import time 28 | import types 29 | import unittest 30 | from multiprocessing import Array, Value, Manager 31 | from typing import Any, Dict, List, Tuple, Union 32 | 33 | import numpy as np 34 | 35 | from bigcodebench.eval._special_oracle import ( 36 | _poly, 37 | ) 38 | from bigcodebench.eval.utils import ( 39 | create_tempdir, 40 | reliability_guard, 41 | swallow_io, 42 | time_limit, 43 | safe_environment, 44 | TIMEOUT_LIMIT, 45 | ) 46 | 47 | 48 | def compatible_eval_result(results: Dict) -> Dict: 49 | # compatibility 50 | for task_results in results["eval"].values(): 51 | # update the "files" field to "nfiles" 52 | if "files" in task_results and "nfiles" not in task_results: 53 | task_results["nfiles"] = len(task_results.pop("files")) 54 | return results 55 | 56 | 57 | # unbiased estimator from https://github.com/openai/human-eval 58 | def estimate_pass_at_k( 59 | num_samples: Union[int, List[int], np.ndarray], 60 | num_correct: Union[List[int], np.ndarray], 61 | k: int, 62 | ) -> np.ndarray: 63 | """ 64 | Estimates pass@k of each problem and returns them in an array. 65 | """ 66 | 67 | def estimator(n: int, c: int, k: int) -> float: 68 | """ 69 | Calculates 1 - comb(n - c, k) / comb(n, k). 70 | """ 71 | if n - c < k: 72 | return 1.0 73 | return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) 74 | 75 | if isinstance(num_samples, int): 76 | num_samples_it = itertools.repeat(num_samples, len(num_correct)) 77 | else: 78 | assert len(num_samples) == len(num_correct) 79 | num_samples_it = iter(num_samples) 80 | 81 | return np.array( 82 | [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)] 83 | ) 84 | 85 | 86 | PASS = "pass" 87 | FAIL = "fail" 88 | TIMEOUT = "timeout" 89 | 90 | _SUCCESS = 0 91 | _FAILED = 1 92 | _TIMEOUT = 2 93 | _UNKNOWN = 3 94 | 95 | _mapping = {_SUCCESS: PASS, _FAILED: FAIL, _TIMEOUT: TIMEOUT, _UNKNOWN: None} 96 | 97 | 98 | def is_floats(x) -> bool: 99 | # check if it is float; List[float]; Tuple[float] 100 | if isinstance(x, float): 101 | return True 102 | if isinstance(x, (list, tuple)): 103 | return all(isinstance(i, float) for i in x) 104 | if isinstance(x, np.ndarray): 105 | return x.dtype == np.float64 or x.dtype == np.float32 106 | return False 107 | 108 | 109 | def unsafe_execute( 110 | entry_point: str, 111 | code: str, 112 | test_code: str, 113 | timeout: float, 114 | max_as_limit: float, 115 | max_data_limit: float, 116 | max_stack_limit: float, 117 | stat, # Value 118 | details, # Array 119 | ): 120 | with safe_environment(), create_tempdir(): 121 | # These system calls are needed when cleaning up tempdir. 122 | import os 123 | import shutil 124 | import builtins 125 | 126 | rmtree = shutil.rmtree 127 | rmdir = os.rmdir 128 | chdir = os.chdir 129 | # Disable functionalities that can make destructive changes to the test. 
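# The block below then builds a throw-away "__test__" module, exec()s the sample code together with its
# test code under swallow_io(), runs the generated `TestCases` suite within time_limit(), and records any
# failure/error tracebacks in `details` and the overall outcome in `stat`.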
130 | reliability_guard(max_as_limit, max_data_limit, max_stack_limit) 131 | module_name = "__test__" 132 | new_module = types.ModuleType(module_name) 133 | # Set necessary attributes for the module 134 | new_module.__dict__.update({ 135 | '__builtins__': builtins, 136 | '__file__': f"{module_name}.py", 137 | '__package__': None, 138 | '__doc__': None, 139 | 'sys': sys, 140 | 'os': os, 141 | 'environ': os.environ, 142 | }) 143 | 144 | try: 145 | full_code = code + "\n" + test_code 146 | 147 | with swallow_io(): 148 | exec(compile(full_code, f"{module_name}.py", 'exec'), new_module.__dict__) 149 | sys.modules[module_name] = new_module 150 | TestCases = getattr(new_module, 'TestCases') 151 | loader = unittest.TestLoader() 152 | suite = loader.loadTestsFromTestCase(TestCases) 153 | test_result = unittest.TestResult() 154 | start_time = time.time() 155 | with time_limit(timeout): 156 | suite.run(test_result) 157 | 158 | issues = test_result.failures + test_result.errors 159 | for test, trace in issues: 160 | details[test.id().split(".")[-1]] = trace 161 | stat.value = _SUCCESS 162 | except BaseException as e: 163 | details["ALL"] = str(e) 164 | stat.value = _FAILED 165 | # Needed for cleaning up. 166 | shutil.rmtree = rmtree 167 | os.rmdir = rmdir 168 | os.chdir = chdir 169 | 170 | 171 | def untrusted_check( 172 | code: str, 173 | test_code: str, 174 | entry_point: str, 175 | max_as_limit: float, 176 | max_data_limit: float, 177 | max_stack_limit: float, 178 | min_time_limit: float = 10, 179 | gt_time_limit: float = 60 180 | ) -> Tuple[str, np.ndarray]: 181 | min_time_limit = max(min_time_limit, gt_time_limit) 182 | timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", TIMEOUT_LIMIT), min_time_limit) + 1 183 | # shared memory objects 184 | stat = Value("i", _UNKNOWN) 185 | manager = Manager() 186 | details = manager.dict() 187 | 188 | p = multiprocessing.Process( 189 | target=unsafe_execute, 190 | args=( 191 | entry_point, 192 | code, 193 | test_code, 194 | timeout, 195 | max_as_limit, 196 | max_data_limit, 197 | max_stack_limit, 198 | stat, 199 | details, 200 | ), 201 | ) 202 | p.start() 203 | p.join(timeout=timeout+1) 204 | if p.is_alive(): 205 | p.terminate() 206 | time.sleep(0.1) 207 | if p.is_alive(): 208 | p.kill() 209 | time.sleep(0.1) 210 | 211 | stat = _mapping[stat.value] 212 | # convert details to a dict 213 | details = dict(details) 214 | 215 | if not stat: 216 | stat = TIMEOUT 217 | if stat == PASS: 218 | if details: 219 | stat = FAIL 220 | 221 | return stat, details 222 | 223 | 224 | def evaluate_files( 225 | files: List[str], 226 | inputs: List, 227 | entry_point: str, 228 | min_time_limit: float = 0.1, 229 | gt_time_limit_factor: float = 2.0, 230 | ) -> List[Tuple[str, List[bool]]]: 231 | ret = [] 232 | # sort files by the id in name (i.e., "../n.py") 233 | files = sorted(files, key=lambda x: int(x.split("/")[-1].split(".")[0])) 234 | for file in files: 235 | code = open(file, "r").read() 236 | stat, det = untrusted_check( 237 | code, 238 | inputs, 239 | entry_point, 240 | ) 241 | ret.append((stat, det.tolist())) 242 | return ret 243 | -------------------------------------------------------------------------------- /bigcodebench/eval/_special_oracle.py: -------------------------------------------------------------------------------- 1 | """Special oracle handlings for problems where direct differential testing is not applicable.""" 2 | 3 | import math 4 | 5 | # oracle for HumaneEval/032 6 | def _poly(xs: list, x: float): 7 | """ 8 | Evaluates polynomial with coefficients xs 
at point x. 9 | return xs[0] + xs[1] * x + xs[1] * x^2 + .... xs[n] * x^n 10 | """ 11 | return sum([coeff * math.pow(x, i) for i, coeff in enumerate(xs)]) 12 | -------------------------------------------------------------------------------- /bigcodebench/eval/utils.py: -------------------------------------------------------------------------------- 1 | # The MIT License 2 | # 3 | # Copyright (c) OpenAI (https://openai.com) 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | # THE SOFTWARE. 22 | 23 | import contextlib 24 | import faulthandler 25 | import io 26 | import os 27 | import platform 28 | import signal 29 | import tempfile 30 | import subprocess 31 | import multiprocessing 32 | import time 33 | from typing import Optional 34 | 35 | TIMEOUT_LIMIT=240.0 36 | 37 | @contextlib.contextmanager 38 | def swallow_subprocess_output(): 39 | """Context manager to swallow stdout and stderr for subprocesses.""" 40 | original_popen = subprocess.Popen 41 | original_run = subprocess.run 42 | 43 | def _popen_patch(*args, **kwargs): 44 | if 'capture_output' in kwargs and kwargs['capture_output']: 45 | # Avoid setting stdout or stderr if capture_output is True 46 | kwargs.pop('stdout', None) 47 | kwargs.pop('stderr', None) 48 | else: 49 | kwargs.setdefault('stdout', subprocess.PIPE) 50 | kwargs.setdefault('stderr', subprocess.PIPE) 51 | return original_popen(*args, **kwargs) 52 | 53 | def _run_patch(*args, **kwargs): 54 | if 'capture_output' in kwargs and kwargs['capture_output']: 55 | # Avoid setting stdout or stderr if capture_output is True 56 | kwargs.pop('stdout', None) 57 | kwargs.pop('stderr', None) 58 | else: 59 | kwargs.setdefault('stdout', subprocess.PIPE) 60 | kwargs.setdefault('stderr', subprocess.PIPE) 61 | return original_run(*args, **kwargs) 62 | 63 | subprocess.Popen = _popen_patch 64 | subprocess.run = _run_patch 65 | try: 66 | yield 67 | finally: 68 | subprocess.Popen = original_popen 69 | subprocess.run = original_run 70 | 71 | @contextlib.contextmanager 72 | def swallow_io(): 73 | stream = WriteOnlyStringIO() 74 | with contextlib.redirect_stdout(stream): 75 | with contextlib.redirect_stderr(stream): 76 | with redirect_stdin(stream): 77 | with swallow_subprocess_output(): 78 | yield 79 | 80 | 81 | @contextlib.contextmanager 82 | def time_limit(seconds: float): 83 | def signal_handler(signum, frame): 84 | raise TimeoutException("Timed out!") 85 | 86 | signal.setitimer(signal.ITIMER_REAL, seconds) 87 | signal.signal(signal.SIGALRM, 
signal_handler) 88 | try: 89 | yield 90 | finally: 91 | signal.setitimer(signal.ITIMER_REAL, 0) 92 | 93 | 94 | @contextlib.contextmanager 95 | def create_tempdir(): 96 | with tempfile.TemporaryDirectory() as dirname: 97 | with chdir(dirname): 98 | yield dirname 99 | 100 | 101 | @contextlib.contextmanager 102 | def chdir(root): 103 | if root == ".": 104 | yield 105 | return 106 | cwd = os.getcwd() 107 | os.chdir(root) 108 | try: 109 | yield 110 | except BaseException as exc: 111 | raise exc 112 | finally: 113 | os.chdir(cwd) 114 | 115 | 116 | @contextlib.contextmanager 117 | def safe_environment(): 118 | # Save original functions 119 | original_kill = os.kill 120 | original_killpg = os.killpg 121 | original_system = os.system 122 | original_subprocess_call = subprocess.call 123 | original_subprocess_check_output = subprocess.check_output 124 | original_subprocess_run = subprocess.run 125 | original_subprocess_popen = subprocess.Popen 126 | original_os_popen = os.popen 127 | original_os_execv = os.execv 128 | original_os_execvp = os.execvp 129 | original_os_execvpe = os.execvpe 130 | 131 | current_pid = os.getpid() 132 | current_pgid = os.getpgid(current_pid) 133 | manager = multiprocessing.Manager() 134 | child_pids = manager.list() 135 | 136 | def safe_kill(pid, sig): 137 | try: 138 | pgid = os.getpgid(pid) 139 | if pid == current_pid or pid in child_pids: 140 | original_kill(pid, sig) 141 | else: 142 | print(f"Prevented attempt to kill PID {pid} with signal {sig}") 143 | except ProcessLookupError: 144 | pass 145 | 146 | def safe_killpg(pgid, sig): 147 | if pgid == current_pgid or pgid in {os.getpgid(pid) for pid in child_pids}: 148 | original_killpg(pgid, sig) 149 | else: 150 | print(f"Prevented attempt to kill PGID {pgid} with signal {sig}") 151 | 152 | def safe_system(command): 153 | print(f"Intercepted system command: {command}") 154 | if 'kill' in command or 'killall' in command: 155 | return 0 # Simulate successful execution without doing anything 156 | return original_system(command) 157 | 158 | def safe_subprocess_call(command, *args, **kwargs): 159 | print(f"Intercepted subprocess call: {command}") 160 | if 'kill' in command or 'killall' in command: 161 | return 0 # Simulate successful execution without doing anything 162 | return original_subprocess_call(command, *args, **kwargs) 163 | 164 | def safe_subprocess_check_output(command, *args, **kwargs): 165 | print(f"Intercepted command: {command}") 166 | if 'ps' in command: 167 | return b"" # Simulate no processes found 168 | return original_subprocess_check_output(command, *args, **kwargs) 169 | 170 | def safe_subprocess_run(*args, **kwargs): 171 | print(f"Intercepted subprocess run command: {args}") 172 | if 'kill' in args[0] or 'killall' in args[0]: 173 | return subprocess.CompletedProcess(args, 0, b'', b'') # Simulate successful execution 174 | return original_subprocess_run(*args, **kwargs) 175 | 176 | class SafePopen(subprocess.Popen): 177 | def __init__(self, *args, **kwargs): 178 | print(f"Intercepted Popen command: {args}") 179 | kwargs['preexec_fn'] = os.setsid # Start the process in a new session 180 | super().__init__(*args, **kwargs) 181 | child_pids.append(self.pid) 182 | 183 | def communicate(self, *args, **kwargs): 184 | try: 185 | return super().communicate(*args, **kwargs) 186 | except subprocess.TimeoutExpired: 187 | print("Timeout expired, intercepted and returning None") 188 | return None, None 189 | 190 | def kill(self): 191 | print(f"Intercepted kill call for PID {self.pid}") 192 | safe_kill(self.pid, 
signal.SIGTERM) 193 | 194 | def terminate(self): 195 | print(f"Intercepted terminate call for PID {self.pid}") 196 | safe_kill(self.pid, signal.SIGTERM) 197 | 198 | def safe_os_popen(command): 199 | print(f"Intercepted os.popen command: {command}") 200 | if 'kill' in command or 'killall' in command: 201 | return os.popen('echo Intercepted') 202 | return original_os_popen(command) 203 | 204 | def safe_exec(*args, **kwargs): 205 | print(f"Intercepted exec command: {args}") 206 | 207 | # Override the risky functions with the safe versions 208 | os.kill = safe_kill 209 | os.killpg = safe_killpg 210 | os.system = safe_system 211 | subprocess.call = safe_subprocess_call 212 | subprocess.check_output = safe_subprocess_check_output 213 | subprocess.run = safe_subprocess_run 214 | subprocess.Popen = SafePopen 215 | os.popen = safe_os_popen 216 | os.execv = safe_exec 217 | os.execvp = safe_exec 218 | os.execvpe = safe_exec 219 | 220 | try: 221 | yield 222 | finally: 223 | for pid in child_pids: 224 | try: 225 | os.kill(pid, signal.SIGTERM) 226 | for _ in range(10): 227 | time.sleep(0.1) 228 | try: 229 | os.kill(pid, 0) 230 | except ProcessLookupError: 231 | break 232 | else: 233 | os.kill(pid, signal.SIGKILL) 234 | except ProcessLookupError: 235 | pass 236 | except Exception as e: 237 | print(f"Error handling process {pid}: {e}") 238 | 239 | os.kill = original_kill 240 | os.killpg = original_killpg 241 | os.system = original_system 242 | subprocess.call = original_subprocess_call 243 | subprocess.check_output = original_subprocess_check_output 244 | subprocess.run = original_subprocess_run 245 | subprocess.Popen = original_subprocess_popen 246 | os.popen = original_os_popen 247 | os.execv = original_os_execv 248 | os.execvp = original_os_execvp 249 | os.execvpe = original_os_execvpe 250 | 251 | 252 | class TimeoutException(Exception): 253 | pass 254 | 255 | 256 | class WriteOnlyStringIO(io.StringIO): 257 | """StringIO that throws an exception when it's read from""" 258 | 259 | def read(self, *args, **kwargs): 260 | raise IOError 261 | 262 | def readline(self, *args, **kwargs): 263 | raise IOError 264 | 265 | def readlines(self, *args, **kwargs): 266 | raise IOError 267 | 268 | def readable(self, *args, **kwargs): 269 | """Returns True if the IO object can be read.""" 270 | return False 271 | 272 | 273 | class redirect_stdin(contextlib._RedirectStream): # type: ignore 274 | _stream = "stdin" 275 | 276 | 277 | def reliability_guard(max_as_limit, max_data_limit, max_stack_limit): 278 | """ 279 | This disables various destructive functions and prevents the generated code 280 | from interfering with the test (e.g. fork bomb, killing other processes, 281 | removing filesystem files, etc.) 282 | 283 | WARNING 284 | This function is NOT a security sandbox. Untrusted code, including, model- 285 | generated code, should not be blindly executed outside of one. See the 286 | Codex paper for more information about OpenAI's code sandbox, and proceed 287 | with caution. 
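Concretely, the guard caps the address-space, data and (except on macOS) stack rlimits, with the limits
interpreted as MiB, pins TZ=UTC and OMP_NUM_THREADS=1, disables faulthandler and the exit()/quit()
builtins, and closes any open matplotlib figures.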
288 | """ 289 | 290 | import os 291 | import time 292 | from datetime import datetime 293 | 294 | os.environ['TZ'] = 'UTC' 295 | time.tzset() 296 | 297 | os.environ["OMP_NUM_THREADS"] = "1" 298 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" 299 | os.environ['TF_ENABLE_ONEDNN_OPTS'] = "0" 300 | 301 | if max_as_limit and max_data_limit and max_stack_limit: 302 | import resource 303 | 304 | max_as_limit = max_as_limit * 1024 * 1024 305 | max_data_limit = max_data_limit * 1024 * 1024 306 | max_stack_limit = max_stack_limit * 1024 * 1024 307 | 308 | resource.setrlimit( 309 | resource.RLIMIT_AS, (max_as_limit, max_as_limit) 310 | ) 311 | resource.setrlimit( 312 | resource.RLIMIT_DATA, (max_data_limit, max_data_limit) 313 | ) 314 | if not platform.uname().system == "Darwin": 315 | resource.setrlimit( 316 | resource.RLIMIT_STACK, (max_stack_limit, max_stack_limit) 317 | ) 318 | 319 | faulthandler.disable() 320 | 321 | import builtins 322 | 323 | builtins.exit = None 324 | builtins.quit = None 325 | 326 | import matplotlib.pyplot as plt 327 | plt.close('all') 328 | -------------------------------------------------------------------------------- /bigcodebench/gen/__init__.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Any, List 3 | 4 | 5 | class BaseGen(object): 6 | def __init__(self, inputs: List[Any], entry_point: str, contract: str): 7 | """Initializing a input mutator. 8 | 9 | Args: 10 | inputs (List[Any]): The set of initial inputs (i.e., seeds) 11 | entry_point (str): The function name to invoke with the input 12 | contract (str): The contract to verify input validity 13 | """ 14 | self.contract = contract 15 | self.entry_point = entry_point 16 | self.seed_pool: List[Any] = copy.deepcopy(inputs) 17 | self.new_inputs = [] 18 | self.seed_hash = set([hash(str(x)) for x in self.seed_pool]) 19 | 20 | def generate(self, num: int) -> List[Any]: 21 | raise NotImplementedError 22 | -------------------------------------------------------------------------------- /bigcodebench/gen/util/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import sys 4 | import types 5 | import unittest 6 | import tempfile 7 | import multiprocessing 8 | from multiprocessing import Array, Value, Manager 9 | from bigcodebench.eval.utils import ( 10 | create_tempdir, 11 | reliability_guard, 12 | swallow_io, 13 | time_limit, 14 | safe_environment, 15 | TIMEOUT_LIMIT, 16 | ) 17 | 18 | 19 | def trusted_exec(code, test_code, task_id, max_as_limit, max_data_limit, max_stack_limit, times): 20 | """Execute trusted code in place.""" 21 | # Specify a unique cache dir by modifying XDG_CONFIG_HOME 22 | old_xdg = os.environ.get("XDG_CONFIG_HOME") 23 | temp_xdg = tempfile.mkdtemp(prefix="xdg_config_") 24 | os.environ["XDG_CONFIG_HOME"] = temp_xdg 25 | 26 | try: 27 | with create_tempdir(): 28 | import shutil 29 | import builtins 30 | 31 | rmtree = shutil.rmtree 32 | rmdir = os.rmdir 33 | chdir = os.chdir 34 | module_name = "__test__" 35 | new_module = types.ModuleType(module_name) 36 | 37 | reliability_guard(max_as_limit, max_data_limit, max_stack_limit) 38 | 39 | # Set necessary attributes for the module 40 | new_module.__dict__.update({ 41 | '__builtins__': builtins, 42 | '__file__': f"{module_name}.py", 43 | '__package__': None, 44 | '__doc__': None, 45 | 'sys': sys, 46 | 'os': os, 47 | 'environ': os.environ, 48 | }) 49 | 50 | # Combine the user code and the test code 51 | full_code = code + 
"\n" + test_code 52 | 53 | # Compile and execute the combined code within the new module 54 | exec(compile(full_code, f"{module_name}.py", 'exec'), 55 | new_module.__dict__) 56 | sys.modules[module_name] = new_module 57 | TestCases = getattr(new_module, 'TestCases') 58 | loader = unittest.TestLoader() 59 | suite = loader.loadTestsFromTestCase(TestCases) 60 | test_result = unittest.TestResult() 61 | start = time.time() 62 | with safe_environment(), swallow_io(), time_limit(seconds=TIMEOUT_LIMIT): 63 | suite.run(test_result) 64 | 65 | errors = test_result.failures + test_result.errors 66 | if len(errors) > 0: 67 | print(errors) 68 | times.value = -1 69 | else: 70 | times.value = time.time() - start 71 | 72 | # Needed for cleaning up. 73 | shutil.rmtree = rmtree 74 | os.rmdir = rmdir 75 | os.chdir = chdir 76 | 77 | finally: 78 | # Restore the original environment variable 79 | if old_xdg is None: 80 | os.environ.pop("XDG_CONFIG_HOME", None) 81 | else: 82 | os.environ["XDG_CONFIG_HOME"] = old_xdg 83 | shutil.rmtree(temp_xdg, ignore_errors=True) 84 | 85 | 86 | def trusted_check_exec(code, inputs): 87 | """Check trusted_exec success.""" 88 | try: 89 | with time_limit(seconds=TIMEOUT_LIMIT): 90 | trusted_exec(code, inputs) 91 | except Exception: 92 | return False 93 | return True 94 | 95 | 96 | def trusted_check( 97 | code: str, 98 | test_code: str, 99 | task_id: str, 100 | max_as_limit: float, 101 | max_data_limit: float, 102 | max_stack_limit: float, 103 | min_time_limit: float = 10, 104 | ): 105 | timeout = max(os.getenv("BIGCODEBENCH_TIMEOUT_PER_TASK", TIMEOUT_LIMIT), min_time_limit) + 1 106 | # shared memory objects 107 | times = Value("d", -1) 108 | manager = Manager() 109 | 110 | p = multiprocessing.Process( 111 | target=trusted_exec, 112 | args=( 113 | code, 114 | test_code, 115 | task_id, 116 | max_as_limit, 117 | max_data_limit, 118 | max_stack_limit, 119 | times, 120 | ), 121 | ) 122 | p.start() 123 | p.join(timeout=timeout+1) 124 | if p.is_alive(): 125 | p.terminate() 126 | time.sleep(0.1) 127 | if p.is_alive(): 128 | p.kill() 129 | time.sleep(0.1) 130 | 131 | if times.value == -1: 132 | times = None 133 | else: 134 | times = times.value 135 | 136 | return {"task_id": task_id, "time": times} -------------------------------------------------------------------------------- /bigcodebench/gen/util/anthropic_request.py: -------------------------------------------------------------------------------- 1 | import signal 2 | import time 3 | 4 | import anthropic 5 | from anthropic.types import Message 6 | 7 | 8 | def handler(signum, frame): 9 | # swallow signum and frame 10 | raise Exception("end of time") 11 | 12 | 13 | def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message: 14 | ret = None 15 | while ret is None: 16 | try: 17 | signal.signal(signal.SIGALRM, handler) 18 | signal.alarm(100) 19 | if "reasoning_budget" in kwargs and "reasoning_beta" in kwargs: 20 | kwargs["thinking"] = { 21 | "type": "enabled", 22 | "budget_tokens": kwargs["reasoning_budget"], 23 | } 24 | kwargs["betas"] = [kwargs["reasoning_beta"]] 25 | kwargs.pop("reasoning_budget") 26 | kwargs.pop("reasoning_beta") 27 | kwargs.pop("temperature") 28 | if "thinking" in kwargs: 29 | ret = client.beta.messages.create(*args, **kwargs, stream=True) 30 | else: 31 | ret = client.messages.create(*args, **kwargs) 32 | signal.alarm(0) 33 | except anthropic.RateLimitError: 34 | print("Rate limit exceeded. 
Waiting...") 35 | signal.alarm(0) 36 | time.sleep(5) 37 | except anthropic.APIConnectionError: 38 | print("API connection error. Waiting...") 39 | signal.alarm(0) 40 | time.sleep(5) 41 | except anthropic.InternalServerError: 42 | print("Internal server error. Waiting...") 43 | signal.alarm(0) 44 | time.sleep(5) 45 | except anthropic.APIError as e: 46 | print("Unknown API error") 47 | print(e) 48 | if ( 49 | e.body["error"]["message"] 50 | == "Output blocked by content filtering policy" 51 | ): 52 | raise Exception("Content filtering policy blocked output") 53 | signal.alarm(0) 54 | except Exception as e: 55 | print("Unknown error. Waiting...") 56 | print(e) 57 | signal.alarm(0) 58 | time.sleep(1) 59 | return ret -------------------------------------------------------------------------------- /bigcodebench/gen/util/google_request.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from google import genai 4 | from google.api_core.exceptions import GoogleAPICallError, ResourceExhausted 5 | 6 | 7 | def make_request( 8 | model: str, 9 | client: genai.Client, 10 | message: str, 11 | temperature: float, 12 | n: int, 13 | max_new_tokens: int = 2048, 14 | ) -> genai.types.GenerateContentResponse: 15 | kwargs = {"temperature": temperature, "max_output_tokens": max_new_tokens} 16 | 17 | if "-thinking-" in model: 18 | kwargs.pop("max_output_tokens") 19 | 20 | response = client.models.generate_content( 21 | model=model, 22 | contents=message, 23 | config=genai.types.GenerateContentConfig( 24 | candidate_count=n, 25 | safety_settings=[ 26 | genai.types.SafetySetting( 27 | category='HARM_CATEGORY_DANGEROUS_CONTENT', 28 | threshold='BLOCK_NONE' 29 | ), 30 | genai.types.SafetySetting( 31 | category='HARM_CATEGORY_SEXUALLY_EXPLICIT', 32 | threshold='BLOCK_NONE' 33 | ), 34 | genai.types.SafetySetting( 35 | category='HARM_CATEGORY_HATE_SPEECH', 36 | threshold='BLOCK_NONE' 37 | ), 38 | genai.types.SafetySetting( 39 | category='HARM_CATEGORY_HARASSMENT', 40 | threshold='BLOCK_NONE' 41 | ), 42 | ], 43 | **kwargs 44 | ), 45 | ) 46 | 47 | return response 48 | 49 | 50 | def make_auto_request(*args, **kwargs) -> genai.types.GenerateContentResponse: 51 | ret = None 52 | while ret is None: 53 | try: 54 | ret = make_request(*args, **kwargs) 55 | except ResourceExhausted as e: 56 | print("Rate limit exceeded. Waiting...", e.message) 57 | time.sleep(10) 58 | except GoogleAPICallError as e: 59 | print(e.message) 60 | time.sleep(1) 61 | except Exception as e: 62 | print("Unknown error. 
Waiting...") 63 | print(e) 64 | time.sleep(1) 65 | return ret -------------------------------------------------------------------------------- /bigcodebench/gen/util/hf_inference_request.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from huggingface_hub import InferenceClient 4 | from huggingface_hub.inference._generated.types import TextGenerationOutput 5 | 6 | 7 | def make_request( 8 | client: InferenceClient, 9 | message: str, 10 | model: str, 11 | temperature: float, 12 | n: int, 13 | max_new_tokens: int = 2048, 14 | ) -> TextGenerationOutput: 15 | response = client.text_generation( 16 | model=model, 17 | prompt=message, 18 | do_sample=False, 19 | max_new_tokens=max_new_tokens, 20 | ) 21 | 22 | return response 23 | 24 | 25 | def make_auto_request(*args, **kwargs) -> TextGenerationOutput: 26 | ret = None 27 | while ret is None: 28 | try: 29 | ret = make_request(*args, **kwargs) 30 | except Exception as e: 31 | print("Unknown error. Waiting...") 32 | print(e) 33 | time.sleep(1) 34 | return ret 35 | -------------------------------------------------------------------------------- /bigcodebench/gen/util/mistral_request.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from mistralai.client import MistralClient 4 | from mistralai.models.chat_completion import ChatMessage 5 | 6 | def make_auto_request(client: MistralClient, *args, **kwargs) -> ChatMessage: 7 | ret = None 8 | while ret is None: 9 | try: 10 | ret = client.chat(*args, **kwargs) 11 | except Exception as e: 12 | print("Unknown error. Waiting...") 13 | print(e) 14 | time.sleep(1) 15 | return ret -------------------------------------------------------------------------------- /bigcodebench/gen/util/openai_request.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import openai 4 | from openai.types.chat import ChatCompletion 5 | 6 | 7 | def make_request( 8 | client: openai.Client, 9 | message: str, 10 | model: str, 11 | max_tokens: int = 512, 12 | temperature: float = 1, 13 | reasoning_effort: str = "medium", 14 | n: int = 1, 15 | **kwargs 16 | ) -> ChatCompletion: 17 | kwargs["top_p"] = 0.95 18 | kwargs["max_completion_tokens"] = max_tokens 19 | kwargs["temperature"] = temperature 20 | if any(model.startswith(m) or model.endswith(m) for m in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]): # pop top-p and max_completion_tokens 21 | kwargs.pop("top_p") 22 | kwargs.pop("max_completion_tokens") 23 | kwargs.pop("temperature") 24 | kwargs["reasoning_effort"] = reasoning_effort 25 | 26 | return client.chat.completions.create( 27 | model=model, 28 | messages=[ 29 | {"role": "user", "content": message}, 30 | ], 31 | n=n, 32 | **kwargs 33 | ) 34 | 35 | 36 | def make_auto_request(*args, **kwargs) -> ChatCompletion: 37 | ret = None 38 | while ret is None: 39 | try: 40 | ret = make_request(*args, **kwargs) 41 | except openai.RateLimitError: 42 | print("Rate limit exceeded. Waiting...") 43 | time.sleep(5) 44 | except openai.APIConnectionError: 45 | print("API connection error. Waiting...") 46 | time.sleep(5) 47 | except openai.APIError as e: 48 | print(e) 49 | except Exception as e: 50 | print("Unknown error. 
Waiting...") 51 | print(e) 52 | time.sleep(1) 53 | return ret -------------------------------------------------------------------------------- /bigcodebench/generate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | from typing import Optional, Tuple 5 | 6 | from bigcodebench.provider import DecoderBase, make_model 7 | from bigcodebench.data import get_bigcodebench, write_jsonl 8 | from bigcodebench.sanitize import sanitize 9 | from rich.progress import ( 10 | BarColumn, 11 | MofNCompleteColumn, 12 | Progress, 13 | TextColumn, 14 | TimeElapsedColumn, 15 | ) 16 | 17 | 18 | def codegen( 19 | model: DecoderBase, 20 | target_path: str, 21 | split: str, 22 | subset: str, 23 | greedy: bool = False, 24 | strip_newlines: bool = False, 25 | n_samples: int = 1, 26 | id_range: Tuple[int, int] = None, 27 | resume: bool = True, 28 | batch_size: int = -1, 29 | ): 30 | with Progress( 31 | TextColumn(f"BigCodeBench--{split.capitalize()} ({subset.capitalize()}) •" + "[progress.percentage]{task.percentage:>3.0f}%"), 32 | BarColumn(), 33 | MofNCompleteColumn(), 34 | TextColumn("•"), 35 | TimeElapsedColumn(), 36 | ) as p: 37 | 38 | dataset = get_bigcodebench(subset=subset) 39 | 40 | if model.is_direct_completion() and split == "instruct": 41 | raise Exception("Base model does not support direct completion for instruct tasks") 42 | 43 | # create target_path if it doesn't exist, e.g., a/b.jsonl 44 | dirname = os.path.dirname(target_path) 45 | if not os.path.exists(dirname) and dirname != "": 46 | os.makedirs(dirname) 47 | 48 | batch_prompts = [] 49 | batch_task_ids = [] 50 | batch_nsamples = [] 51 | batch_entry_points = [] 52 | 53 | # Read existing data once if resuming 54 | task2nexist = {} 55 | if resume and os.path.exists(target_path): 56 | with open(target_path, "r") as f: 57 | for line in f: 58 | item = json.loads(line) 59 | task2nexist[item["task_id"]] = task2nexist.get(item["task_id"], 0) + 1 60 | 61 | for id_num, (task_id, task) in enumerate(p.track(dataset.items())): 62 | if id_range is not None: 63 | low, high = id_range 64 | if id_num < low: 65 | p.console.print(f"Skipping {task_id} as it is not in {id_range}") 66 | continue 67 | if id_num >= id_range[1]: 68 | break 69 | 70 | p_name = task_id.replace("/", "_") 71 | 72 | n_existing = task2nexist.get(task_id, 0) 73 | nsamples = n_samples - n_existing 74 | 75 | try: 76 | prompt = task[f"{split}_prompt"] 77 | except: 78 | raise Exception(f"Invalid split {split} for bigcodebench-{subset}") 79 | if strip_newlines: 80 | prompt = prompt.strip("\n") 81 | 82 | if nsamples > 0: 83 | batch_prompts.append(prompt) 84 | batch_task_ids.append(task_id) 85 | batch_nsamples.append(nsamples) 86 | batch_entry_points.append(task["entry_point"]) 87 | 88 | log = f"Codegen: {p_name} @ {model}" 89 | if n_existing > 0: 90 | log += f" (resuming from {n_existing})" 91 | p.console.print(log) 92 | 93 | if (batch_size and len(batch_prompts) == batch_size) or id_num == len(dataset) - 1 or (id_range and id_num == id_range[1] - 1): 94 | if not batch_prompts and (id_num == len(dataset) - 1 or (id_range and id_num == id_range[1] - 1)): 95 | break 96 | outputs = model.codegen( 97 | batch_prompts, 98 | do_sample=not greedy, 99 | num_samples=max(batch_nsamples), 100 | ) 101 | assert outputs, "No outputs from model!" 
102 | 103 | samples = [] 104 | for task_id, content, entry_point, nsamples, task_outputs in zip(batch_task_ids, batch_prompts, batch_entry_points, batch_nsamples, outputs): 105 | if model.is_direct_completion(): 106 | samples.extend([ 107 | dict(task_id=task_id, solution=sanitize(content+completion, entry_point), raw_solution=content+completion) 108 | for completion in task_outputs[:nsamples] 109 | ]) 110 | else: 111 | samples.extend([ 112 | dict(task_id=task_id, solution=sanitize(completion, entry_point), raw_solution=completion) 113 | for completion in task_outputs[:nsamples] 114 | ]) 115 | 116 | print(f"Generated {len(samples)} samples") 117 | write_jsonl(target_path, samples, append=True) 118 | 119 | # Clear batches 120 | batch_prompts = [] 121 | batch_task_ids = [] 122 | batch_nsamples = [] 123 | 124 | 125 | def run_codegen( 126 | model: str, 127 | split: str, 128 | subset: str, 129 | root: str = "bcb_results", 130 | lora_path: str = None, 131 | bs: Optional[int] = None, 132 | n_samples: int = 1, 133 | temperature: float = 0.0, 134 | max_new_tokens: int = 1280, 135 | # vllm 136 | max_model_len: int = 12800, 137 | greedy: bool = False, 138 | # openai 139 | reasoning_effort: str = "medium", 140 | # anthropic 141 | reasoning_budget: int = 0, 142 | reasoning_beta: str = "output-128k-2025-02-19", 143 | strip_newlines: bool = False, 144 | direct_completion: bool = False, 145 | resume: bool = True, 146 | id_range: str = None, 147 | backend: str = "vllm", 148 | base_url: str = None, 149 | tp: int = 1, 150 | instruction_prefix: str = "Please provide a self-contained Python script that solves the following problem in a markdown code block:", 151 | response_prefix: str ="Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:", 152 | skip_prefill: bool = False, 153 | revision: str = "main", 154 | trust_remote_code: bool = False, 155 | tokenizer_name: str = None, 156 | tokenizer_legacy: bool = False, 157 | ): 158 | 159 | if greedy or (temperature == 0 and n_samples == 1): 160 | temperature = 0 161 | n_samples = 1 162 | greedy = True 163 | print("Greedy decoding ON (--greedy): setting n_samples=1, temperature=0") 164 | 165 | if id_range is not None: 166 | id_range = [int(i) for i in id_range.split("-")] 167 | assert len(id_range) == 2, "id_range must be a list of length 2" 168 | assert id_range[0] < id_range[1], "id_range must be increasing" 169 | id_range = tuple(id_range) 170 | 171 | # Make project dir 172 | os.makedirs(root, exist_ok=True) 173 | 174 | # Make dir for codes generated by each model 175 | model_runner = make_model( 176 | model=model, 177 | backend=backend, 178 | subset=subset, 179 | split=split, 180 | lora_path=lora_path, 181 | temperature=temperature, 182 | max_new_tokens=max_new_tokens, 183 | max_model_len=max_model_len, 184 | reasoning_effort=reasoning_effort, 185 | reasoning_budget=reasoning_budget, 186 | reasoning_beta=reasoning_beta, 187 | instruction_prefix=instruction_prefix, 188 | response_prefix=response_prefix, 189 | prefill=not skip_prefill, 190 | base_url=base_url, 191 | tp=tp, 192 | revision=revision, 193 | trust_remote_code=trust_remote_code, 194 | direct_completion=direct_completion, 195 | tokenizer_name=tokenizer_name, 196 | tokenizer_legacy=tokenizer_legacy 197 | ) 198 | 199 | extra = "-" + subset if subset != "full" else "" 200 | if backend == "openai" and reasoning_effort and any(model.startswith(m) or model.endswith(m) for m in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]): 201 | model = model + 
f"--{reasoning_effort}" 202 | 203 | if lora_path: 204 | model = model + f"--lora-{lora_path}" 205 | 206 | if backend == "anthropic" and reasoning_budget and reasoning_beta: 207 | model = model + f"--{reasoning_budget}-{reasoning_beta}" 208 | 209 | if skip_prefill: 210 | identifier = model.replace("/", "--") + "--skip_prefill" + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl" 211 | else: 212 | identifier = model.replace("/", "--") + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl" 213 | 214 | target_path = os.path.join(root, identifier) 215 | 216 | if not resume: 217 | os.remove(target_path) 218 | 219 | codegen( 220 | model=model_runner, 221 | target_path=target_path, 222 | split=split, 223 | subset=subset, 224 | greedy=greedy, 225 | strip_newlines=strip_newlines, 226 | n_samples=n_samples, 227 | resume=resume, 228 | id_range=id_range, 229 | batch_size=bs 230 | ) 231 | 232 | return target_path 233 | 234 | 235 | def main(): 236 | from fire import Fire 237 | Fire(run_codegen) 238 | 239 | 240 | if __name__ == "__main__": 241 | main() 242 | -------------------------------------------------------------------------------- /bigcodebench/inspect.py: -------------------------------------------------------------------------------- 1 | from bigcodebench.data import get_bigcodebench 2 | import os 3 | import shutil 4 | import json 5 | import argparse 6 | 7 | def inspection(args): 8 | """ 9 | Write a series of files for each task into a directory. 10 | 11 | Each Directory Structure: 12 | -- task_id 13 | -- ground_truth.py: prompt + canonical_solution 14 | -- completion.py: prompt + completion 15 | -- execution_trace.txt: execution trace 16 | """ 17 | path = os.path.join(args.save_path, args.eval_results.split("/")[-1].replace(".json", "")) 18 | if args.in_place: 19 | shutil.rmtree(path, ignore_errors=True) 20 | if not os.path.exists(path): 21 | os.makedirs(path) 22 | problems = get_bigcodebench(subset=args.subset) 23 | 24 | eval_results = json.load(open(args.eval_results, "r")) 25 | for task_id, results in eval_results["eval"].items(): 26 | if task_id not in problems: 27 | continue 28 | if all(result["status"] == "pass" for result in results): 29 | continue 30 | task_path = os.path.join(path, task_id) 31 | if not os.path.exists(task_path): 32 | os.makedirs(task_path) 33 | task_id_data = problems[task_id] 34 | with open(os.path.join(task_path, "ground_truth.py"), "w") as f: 35 | f.write(task_id_data[f"{args.split}_prompt"] + "\n\n" + task_id_data["canonical_solution"]) 36 | 37 | # write test 38 | with open(os.path.join(task_path, "test_case.py"), "w") as f: 39 | f.write(task_id_data["test"]) 40 | 41 | for i, result in enumerate(results): 42 | with open(os.path.join(task_path, f"completion_{i}.py"), "w") as f: 43 | f.write(result["solution"]) 44 | 45 | for i, result in enumerate(results): 46 | with open(os.path.join(task_path, f"complete_{i}_execution_trace.txt"), "w") as f: 47 | for test_case, execution_trace in result["details"].items(): 48 | f.write(f"Test Case: {test_case}\n\n") 49 | f.write(execution_trace) 50 | f.write("="*50 + "\n") 51 | def main(): 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--eval_results", required=True, type=str) 54 | parser.add_argument( 55 | "--split", required=True, type=str, choices=["complete", "instruct"] 56 | ) 57 | parser.add_argument("--subset", default="hard", type=str, choices=["full", "hard"]) 58 | 
parser.add_argument("--save_path", default="inspect", type=str) 59 | parser.add_argument("--in_place", action="store_true") 60 | args = parser.parse_args() 61 | 62 | inspection(args) 63 | 64 | if __name__ == "__main__": 65 | main() -------------------------------------------------------------------------------- /bigcodebench/provider/__init__.py: -------------------------------------------------------------------------------- 1 | from bigcodebench.provider.base import DecoderBase 2 | 3 | 4 | def make_model( 5 | model: str, 6 | backend: str, 7 | subset: str, 8 | split: str, 9 | lora_path: str = None, 10 | dataset: str = "bigcodebench", 11 | temperature: float = 0.0, 12 | max_new_tokens: int = 1280, 13 | max_model_len: int = 12800, 14 | # openai only 15 | reasoning_effort: str = "medium", 16 | # anthropic only 17 | reasoning_budget: int = 0, 18 | reasoning_beta: str = "output-128k-2025-02-19", 19 | # instruction model only 20 | instruction_prefix: str = None, 21 | response_prefix: str = None, 22 | prefill: bool = True, 23 | # vllm and hf only 24 | revision: str = "main", 25 | # vllm only 26 | tp: int = 1, 27 | direct_completion: bool = False, 28 | base_url: str = None, 29 | trust_remote_code: bool = False, 30 | # hf only 31 | attn_implementation: str = "eager", 32 | # tokenizer 33 | tokenizer_name: str = None, 34 | tokenizer_legacy: bool = True, 35 | ) -> DecoderBase: 36 | if backend == "vllm": 37 | from bigcodebench.provider.vllm import VllmDecoder 38 | 39 | return VllmDecoder( 40 | name=model, 41 | subset=subset, 42 | split=split, 43 | lora_path=lora_path, 44 | temperature=temperature, 45 | max_new_tokens=max_new_tokens, 46 | max_model_len=max_model_len, 47 | revision=revision, 48 | dataset=dataset, 49 | direct_completion=direct_completion, 50 | tp=tp, 51 | instruction_prefix=instruction_prefix, 52 | response_prefix=response_prefix, 53 | prefill=prefill, 54 | trust_remote_code=trust_remote_code, 55 | tokenizer_name=tokenizer_name, 56 | tokenizer_legacy=tokenizer_legacy, 57 | ) 58 | elif backend == "hf": 59 | from bigcodebench.provider.hf import HuggingFaceDecoder 60 | 61 | return HuggingFaceDecoder( 62 | name=model, 63 | subset=subset, 64 | split=split, 65 | lora_path=lora_path, 66 | temperature=temperature, 67 | max_new_tokens=max_new_tokens, 68 | revision=revision, 69 | dataset=dataset, 70 | direct_completion=direct_completion, 71 | instruction_prefix=instruction_prefix, 72 | response_prefix=response_prefix, 73 | prefill=prefill, 74 | attn_implementation=attn_implementation, 75 | trust_remote_code=trust_remote_code, 76 | tokenizer_name=tokenizer_name, 77 | tokenizer_legacy=tokenizer_legacy, 78 | ) 79 | elif backend == "hf-inference": 80 | from bigcodebench.provider.hf_inference import HuggingFaceInferenceDecoder 81 | 82 | return HuggingFaceInferenceDecoder( 83 | name=model, 84 | subset=subset, 85 | split=split, 86 | temperature=temperature, 87 | max_new_tokens=max_new_tokens, 88 | direct_completion=direct_completion, 89 | instruction_prefix=instruction_prefix, 90 | response_prefix=response_prefix, 91 | ) 92 | elif backend == "openai": 93 | from bigcodebench.provider.openai import OpenAIChatDecoder 94 | 95 | assert not direct_completion, f"{backend} backend does not serve base model" 96 | return OpenAIChatDecoder( 97 | name=model, 98 | subset=subset, 99 | split=split, 100 | temperature=temperature, 101 | max_new_tokens=max_new_tokens, 102 | reasoning_effort=reasoning_effort, 103 | base_url=base_url, 104 | instruction_prefix=instruction_prefix, 105 | response_prefix=response_prefix, 106 | 
) 107 | elif backend == "mistral": 108 | from bigcodebench.provider.mistral import MistralChatDecoder 109 | 110 | return MistralChatDecoder( 111 | name=model, 112 | subset=subset, 113 | split=split, 114 | temperature=temperature, 115 | max_new_tokens=max_new_tokens, 116 | instruction_prefix=instruction_prefix, 117 | response_prefix=response_prefix, 118 | ) 119 | elif backend == "anthropic": 120 | from bigcodebench.provider.anthropic import AnthropicDecoder 121 | 122 | assert not direct_completion, f"{backend} backend does not serve base model" 123 | return AnthropicDecoder( 124 | name=model, 125 | subset=subset, 126 | split=split, 127 | temperature=temperature, 128 | max_new_tokens=max_new_tokens, 129 | reasoning_budget=reasoning_budget, 130 | reasoning_beta=reasoning_beta, 131 | instruction_prefix=instruction_prefix, 132 | response_prefix=response_prefix, 133 | ) 134 | elif backend == "google": 135 | from bigcodebench.provider.google import GoogleDecoder 136 | 137 | assert not direct_completion, f"{backend} backend does not serve base model" 138 | return GoogleDecoder( 139 | name=model, 140 | subset=subset, 141 | split=split, 142 | temperature=temperature, 143 | max_new_tokens=max_new_tokens, 144 | instruction_prefix=instruction_prefix, 145 | response_prefix=response_prefix, 146 | ) -------------------------------------------------------------------------------- /bigcodebench/provider/anthropic.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | from tqdm import tqdm 4 | 5 | import anthropic 6 | 7 | from bigcodebench.gen.util.anthropic_request import make_auto_request 8 | from bigcodebench.provider.base import DecoderBase 9 | from bigcodebench.provider.utility import make_raw_chat_prompt 10 | 11 | class AnthropicDecoder(DecoderBase): 12 | def __init__(self, name: str, reasoning_budget: int = 0, reasoning_beta: str = "output-128k-2025-02-19", **kwargs) -> None: 13 | super().__init__(name, **kwargs) 14 | self.client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_KEY")) 15 | self.reasoning_budget = reasoning_budget 16 | self.reasoning_beta = reasoning_beta 17 | 18 | def codegen( 19 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 20 | ) -> List[str]: 21 | if do_sample: 22 | assert self.temperature > 0, "Temperature must be positive for sampling" 23 | 24 | all_outputs = [] 25 | for prompt in tqdm(prompts): 26 | outputs = [] 27 | 28 | for _ in range(num_samples): 29 | ret = make_auto_request( 30 | client=self.client, 31 | model=self.name, 32 | messages=[ 33 | { 34 | "role": "user", 35 | "content": make_raw_chat_prompt( 36 | task_prompt=prompt, 37 | subset=self.subset, 38 | split=self.split, 39 | instruction_prefix=self.instruction_prefix, 40 | response_prefix=self.response_prefix, 41 | tokenizer=None, 42 | ) 43 | } 44 | ], 45 | max_tokens=self.max_new_tokens, 46 | temperature=self.temperature, 47 | stop_sequences=self.eos, 48 | reasoning_budget=self.reasoning_budget, 49 | reasoning_beta=self.reasoning_beta, 50 | ) 51 | if isinstance(ret, anthropic.Stream): 52 | output = "" 53 | for chunk in ret: 54 | if chunk.type == "content_block_delta": 55 | # if chunk.delta.type == "thinking_delta": 56 | # output += chunk.delta.thinking 57 | if chunk.delta.type == "text_delta": 58 | output += chunk.delta.text 59 | outputs.append(output) 60 | else: 61 | outputs.append(ret.content[0].text) 62 | all_outputs.append(outputs) 63 | return all_outputs 64 | 65 | def is_direct_completion(self) -> bool: 66 | 
return False -------------------------------------------------------------------------------- /bigcodebench/provider/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | 4 | from bigcodebench.provider.utility import EOS 5 | 6 | 7 | class DecoderBase(ABC): 8 | def __init__( 9 | self, 10 | name: str, 11 | subset: str, 12 | split: str, 13 | temperature: float = 0.8, 14 | max_new_tokens: int = 1280, 15 | revision: str = "main", 16 | dtype: str = "bfloat16", # default 17 | direct_completion: bool = False, 18 | trust_remote_code: bool = False, 19 | tokenizer_name: str = None, 20 | tokenizer_legacy: bool = False, 21 | instruction_prefix: str = None, 22 | response_prefix: str = None, 23 | prefill: bool = True, 24 | ) -> None: 25 | print("Initializing a decoder model: {} ...".format(name)) 26 | self.name = name 27 | self.subset = subset 28 | self.split = split 29 | self.temperature = temperature 30 | self.eos = EOS 31 | self.skip_special_tokens = False 32 | self.max_new_tokens = max_new_tokens 33 | self.dtype = dtype 34 | self.revision = revision 35 | self.direct_completion = direct_completion 36 | self.trust_remote_code = trust_remote_code 37 | self.tokenizer_name = tokenizer_name 38 | self.tokenizer_legacy = tokenizer_legacy 39 | self.instruction_prefix = instruction_prefix 40 | self.response_prefix = response_prefix 41 | self.prefill = prefill 42 | 43 | @abstractmethod 44 | def codegen( 45 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 46 | ) -> List[str]: 47 | pass 48 | 49 | @abstractmethod 50 | def is_direct_completion(self) -> bool: 51 | pass 52 | 53 | def __repr__(self) -> str: 54 | return self.name 55 | 56 | def __str__(self) -> str: 57 | return self.name -------------------------------------------------------------------------------- /bigcodebench/provider/google.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | from tqdm import tqdm 4 | 5 | from google import genai 6 | 7 | from bigcodebench.provider.base import DecoderBase 8 | from bigcodebench.gen.util.google_request import make_auto_request 9 | from bigcodebench.provider.utility import make_raw_chat_prompt 10 | 11 | 12 | class GoogleDecoder(DecoderBase): 13 | def __init__(self, name: str, **kwargs): 14 | super().__init__(name, **kwargs) 15 | self.model = name 16 | self.client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY")) 17 | 18 | def codegen( 19 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 20 | ) -> List[str]: 21 | if do_sample: 22 | assert self.temperature > 0, "Temperature must be positive for sampling" 23 | 24 | all_outputs = [] 25 | 26 | for prompt in tqdm(prompts): 27 | outputs = [] 28 | message = make_raw_chat_prompt( 29 | task_prompt=prompt, 30 | subset=self.subset, 31 | split=self.split, 32 | instruction_prefix=self.instruction_prefix, 33 | response_prefix=self.response_prefix, 34 | tokenizer=None, 35 | ) 36 | ret = make_auto_request( 37 | model=self.model, 38 | client=self.client, 39 | message=message, 40 | n=num_samples, 41 | temperature=self.temperature, 42 | max_new_tokens=self.max_new_tokens, 43 | ) 44 | for candidate in ret.candidates: 45 | parts = candidate.content.parts 46 | if parts: 47 | outputs.append(parts[0].text) 48 | else: 49 | print("Empty response!") 50 | outputs.append("") 51 | print(f"{candidate.safety_ratings = }") 52 | all_outputs.append(outputs) 53 | return 
all_outputs 54 | 55 | def is_direct_completion(self) -> bool: 56 | return False -------------------------------------------------------------------------------- /bigcodebench/provider/hf.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | from stop_sequencer import StopSequencer 5 | from transformers import AutoModelForCausalLM, AutoTokenizer 6 | 7 | from bigcodebench.provider.base import DecoderBase 8 | from bigcodebench.provider.utility import ( 9 | extra_eos_for_direct_completion, 10 | make_raw_chat_prompt, 11 | ) 12 | 13 | 14 | class HuggingFaceDecoder(DecoderBase): 15 | def __init__( 16 | self, 17 | name: str, 18 | dataset: str, 19 | attn_implementation: str = "eager", 20 | **kwargs, 21 | ): 22 | super().__init__(name=name, **kwargs) 23 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 24 | 25 | kwargs = { 26 | "device_map": "auto", 27 | "trust_remote_code": self.trust_remote_code, 28 | "torch_dtype": getattr(torch, self.dtype), 29 | "attn_implementation": attn_implementation, # "eager", "flash_attention_2", "sdpa" 30 | "revision": self.revision, 31 | } 32 | self.skip_special_tokens = True 33 | 34 | print(f"{kwargs = }") 35 | 36 | self.tokenizer = AutoTokenizer.from_pretrained(name, use_fast=False, legacy=self.tokenizer_legacy) 37 | self.tokenizer.pad_token = self.tokenizer.eos_token 38 | # assume the model is decoder-only 39 | self.tokenizer.padding_side = 'left' 40 | 41 | if self.is_direct_completion(): # no chat template 42 | self.eos += extra_eos_for_direct_completion(dataset) 43 | else: # with chat template 44 | if self.prefill and "```" in self.response_prefix: 45 | self.eos += ["\n```\n"] 46 | 47 | print(f"{self.eos = }") 48 | self.model = AutoModelForCausalLM.from_pretrained(name, **kwargs) 49 | 50 | def is_direct_completion(self) -> bool: 51 | return self.direct_completion or self.tokenizer.chat_template is None 52 | 53 | @torch.inference_mode() 54 | def codegen( 55 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 56 | ) -> List[str]: 57 | if self.temperature == 0: 58 | assert not do_sample 59 | assert num_samples == 1 60 | 61 | prompts = [ 62 | prompt 63 | if self.is_direct_completion() 64 | else make_raw_chat_prompt( 65 | prompt, self.subset, self.split, self.instruction_prefix, self.response_prefix, self.tokenizer, self.direct_completion 66 | ) 67 | for prompt in prompts 68 | ] 69 | 70 | input_tokens = self.tokenizer(prompts, return_tensors="pt", padding=True, truncation=True).to( 71 | self.device 72 | )["input_ids"] 73 | 74 | kwargs = {} 75 | if do_sample: 76 | kwargs["top_p"] = 0.95 77 | kwargs["temperature"] = self.temperature 78 | ret = self.model.generate( 79 | input_tokens, 80 | max_new_tokens=self.max_new_tokens, 81 | do_sample=do_sample, 82 | num_return_sequences=num_samples, 83 | pad_token_id=self.tokenizer.eos_token_id, 84 | stop_strings=self.eos, 85 | tokenizer=self.tokenizer, 86 | **kwargs, 87 | ) 88 | 89 | # Reshape ret into a list of lists, each sublist containing num_samples elements 90 | ret_chunks = [ret[i:i + num_samples] for i in range(0, len(ret), num_samples)] 91 | 92 | all_outputs = [] 93 | # Process each chunk in ret_chunks 94 | for i, ret_chunk in enumerate(ret_chunks): 95 | gen_strs = self.tokenizer.batch_decode( 96 | ret_chunk[:, input_tokens[i].size(-1):], 97 | skip_special_tokens=self.skip_special_tokens, 98 | ) 99 | outputs = [] 100 | for output in gen_strs: 101 | min_index = 10000 102 | for eos in self.eos: 
103 | if eos in output: 104 | min_index = min(min_index, output.index(eos)) 105 | outputs.append(output[:min_index].replace("\t", " ")) 106 | all_outputs.append(outputs) 107 | return all_outputs -------------------------------------------------------------------------------- /bigcodebench/provider/hf_inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | from tqdm import tqdm 4 | 5 | from huggingface_hub import InferenceClient 6 | 7 | from bigcodebench.provider.base import DecoderBase 8 | from bigcodebench.gen.util.hf_inference_request import make_auto_request 9 | from bigcodebench.provider.utility import make_raw_chat_prompt 10 | 11 | 12 | class HuggingFaceInferenceDecoder(DecoderBase): 13 | def __init__(self, name: str, **kwargs): 14 | super().__init__(name, **kwargs) 15 | self.client = InferenceClient( 16 | provider="hf-inference", api_key=os.getenv("HF_INFERENCE_API_KEY") 17 | ) 18 | 19 | def codegen( 20 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 21 | ) -> List[str]: 22 | if do_sample: 23 | assert self.temperature > 0, "Temperature must be positive for sampling" 24 | 25 | all_outputs = [] 26 | 27 | for prompt in tqdm(prompts): 28 | outputs = [] 29 | message = ( 30 | prompt 31 | if self.is_direct_completion() 32 | else make_raw_chat_prompt( 33 | task_prompt=prompt, 34 | subset=self.subset, 35 | split=self.split, 36 | instruction_prefix=self.instruction_prefix, 37 | response_prefix=self.response_prefix, 38 | tokenizer=None, 39 | ) 40 | ) 41 | ret = make_auto_request( 42 | self.client, 43 | message=message, 44 | model=self.name, 45 | n=num_samples, 46 | temperature=self.temperature, 47 | max_new_tokens=self.max_new_tokens, 48 | ) 49 | outputs.append(ret) 50 | all_outputs.append(outputs) 51 | return all_outputs 52 | 53 | def is_direct_completion(self) -> bool: 54 | return self.direct_completion 55 | -------------------------------------------------------------------------------- /bigcodebench/provider/mistral.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | from tqdm import tqdm 4 | 5 | from mistralai.client import MistralClient 6 | from mistralai.models.chat_completion import ChatMessage 7 | 8 | from bigcodebench.provider.base import DecoderBase 9 | from bigcodebench.gen.util.mistral_request import make_auto_request 10 | from bigcodebench.provider.utility import make_raw_chat_prompt 11 | 12 | class MistralChatDecoder(DecoderBase): 13 | def __init__(self, name: str, **kwargs) -> None: 14 | super().__init__(name, **kwargs) 15 | self.client = MistralClient(api_key=os.getenv("MISTRAL_API_KEY")) 16 | 17 | def codegen( 18 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 19 | ) -> List[str]: 20 | if do_sample: 21 | assert self.temperature > 0, "Temperature must be positive for sampling" 22 | 23 | all_outputs = [] 24 | for prompt in tqdm(prompts): 25 | outputs = [] 26 | 27 | for _ in range(num_samples): 28 | ret = make_auto_request( 29 | client=self.client, 30 | model=self.name, 31 | messages=[ 32 | ChatMessage( 33 | role="user", 34 | content=make_raw_chat_prompt( 35 | task_prompt=prompt, 36 | subset=self.subset, 37 | split=self.split, 38 | instruction_prefix=self.instruction_prefix, 39 | response_prefix=self.response_prefix, 40 | tokenizer=None, 41 | direct_completion=None, 42 | ) 43 | ) 44 | ], 45 | max_tokens=self.max_new_tokens, 46 | ) 47 | 
outputs.append(ret.choices[0].message.content) 48 | all_outputs.append(outputs) 49 | return all_outputs 50 | 51 | def is_direct_completion(self) -> bool: 52 | return False -------------------------------------------------------------------------------- /bigcodebench/provider/openai.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | from tqdm import tqdm 4 | import openai 5 | 6 | from bigcodebench.gen.util.openai_request import make_auto_request 7 | from bigcodebench.provider.utility import make_raw_chat_prompt 8 | from bigcodebench.provider.base import DecoderBase 9 | from bigcodebench.provider.utility import concurrent_call 10 | 11 | class OpenAIChatDecoder(DecoderBase): 12 | def __init__(self, name: str, base_url=None, reasoning_effort="medium", **kwargs) -> None: 13 | super().__init__(name, **kwargs) 14 | self.base_url = base_url 15 | self.reasoning_effort = reasoning_effort 16 | 17 | def codegen( 18 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 19 | ) -> List[str]: 20 | if do_sample: 21 | assert self.temperature > 0, "Temperature must be positive for sampling" 22 | messages = [make_raw_chat_prompt( 23 | task_prompt=prompt, 24 | subset=self.subset, 25 | split=self.split, 26 | instruction_prefix=self.instruction_prefix, 27 | response_prefix=self.response_prefix, 28 | tokenizer=None, 29 | ) for prompt in prompts] 30 | # use concurrency based batching for o1 and deepseek models 31 | if any(self.name.startswith(model) or self.name.endswith(model) for model in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]): 32 | return self._codegen_batch_via_concurrency(messages, num_samples) 33 | 34 | return self._codegen_api_batch(messages, num_samples) 35 | 36 | def _codegen_api_batch(self, messages: List[str], num_samples: int) -> List[str]: 37 | client = openai.OpenAI( 38 | api_key=os.getenv("OPENAI_API_KEY", "none"), base_url=self.base_url 39 | ) 40 | 41 | all_outputs = [] 42 | for message in tqdm(messages): 43 | ret = make_auto_request( 44 | client, 45 | message=message, 46 | model=self.name, 47 | max_tokens=self.max_new_tokens, 48 | temperature=self.temperature, 49 | reasoning_effort=self.reasoning_effort, 50 | n=num_samples, 51 | ) 52 | outputs = [] 53 | for item in ret.choices: 54 | outputs.append(item.message.content) 55 | all_outputs.append(outputs) 56 | return all_outputs 57 | 58 | def _codegen_batch_via_concurrency(self, messages: List[str], num_samples: int) -> List[str]: 59 | batches = concurrent_call( 60 | num_samples, self._codegen_api_batch, messages, num_samples=1 61 | ) 62 | return [[element for sublist in item for element in sublist] for item in zip(*batches)] 63 | 64 | def is_direct_completion(self) -> bool: 65 | return False -------------------------------------------------------------------------------- /bigcodebench/provider/utility.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from transformers import AutoTokenizer 3 | from concurrent.futures import ThreadPoolExecutor 4 | 5 | EOS = [ 6 | "<|endoftext|>", 7 | "<|endofmask|>", 8 | "", 9 | "\nif __name__", 10 | "\ndef main(", 11 | "\nprint(", 12 | ] 13 | 14 | 15 | def extra_eos_for_direct_completion(dataset) -> List[str]: 16 | if dataset.lower() == "bigcodebench": 17 | return ["\ndef ", "\nclass ", "\nimport ", "\nfrom ", "\nassert "] 18 | raise ValueError(f"Unknown dataset: {dataset}") 19 | 20 | 21 | # some random words which serves as the splitter 22 | 
_MAGIC_SPLITTER_ = "-[[]]-this-is-really-our-highest-priority-[[]]-" 23 | 24 | 25 | def make_raw_chat_prompt( 26 | task_prompt: str, 27 | subset: str, 28 | split: str, 29 | instruction_prefix: str, 30 | response_prefix: str, 31 | tokenizer: AutoTokenizer, 32 | prefill: bool = True, 33 | direct_completion: bool = False, 34 | ) -> str: 35 | # directly return prompt if it does not have a tokenizer.chat_template 36 | if tokenizer: 37 | if tokenizer.chat_template is None or direct_completion: 38 | return task_prompt 39 | 40 | assert instruction_prefix is not None, "Instruction prefix is required!" 41 | assert response_prefix is not None, "Response prefix is required!" 42 | 43 | if split == "complete": 44 | task_prompt = f"""\ 45 | {instruction_prefix} 46 | ``` 47 | {task_prompt.strip()} 48 | ``` 49 | """ 50 | else: 51 | task_prompt = f"""\ 52 | {instruction_prefix} 53 | {task_prompt.strip()} 54 | """ 55 | response = f"""\ 56 | {response_prefix} 57 | ```python 58 | {_MAGIC_SPLITTER_} 59 | ``` 60 | """ 61 | if tokenizer: 62 | if prefill: 63 | task_prompt = tokenizer.apply_chat_template( 64 | [ 65 | {"role": "user", "content": task_prompt}, 66 | {"role": "assistant", "content": response}, 67 | ], 68 | tokenize=False, 69 | ).split(_MAGIC_SPLITTER_)[0] 70 | else: 71 | task_prompt = tokenizer.apply_chat_template( 72 | [ 73 | {"role": "user", "content": task_prompt}, 74 | ], 75 | tokenize=False, add_generation_prompt=True 76 | ).split(_MAGIC_SPLITTER_)[0] 77 | return task_prompt 78 | 79 | 80 | def concurrent_call(n, callback, /, *args, **kwargs): 81 | with ThreadPoolExecutor(max_workers=n) as executor: 82 | futures = [executor.submit(callback, *args, **kwargs) for _ in range(n)] 83 | return [future.result() for future in futures] -------------------------------------------------------------------------------- /bigcodebench/provider/vllm.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List 3 | 4 | from transformers import AutoTokenizer 5 | from vllm import LLM, SamplingParams 6 | from vllm.lora.request import LoRARequest 7 | from huggingface_hub import snapshot_download 8 | 9 | from bigcodebench.provider.base import DecoderBase 10 | from bigcodebench.provider.utility import ( 11 | extra_eos_for_direct_completion, 12 | make_raw_chat_prompt, 13 | ) 14 | 15 | class VllmDecoder(DecoderBase): 16 | def __init__(self, name: str, lora_path: str, dataset: str, tp: int, max_model_len: int, **kwargs) -> None: 17 | super().__init__(name, **kwargs) 18 | 19 | kwargs = { 20 | "tensor_parallel_size": int(os.getenv("VLLM_N_GPUS", tp)), 21 | "dtype": self.dtype, 22 | "trust_remote_code": self.trust_remote_code, 23 | "revision": self.revision, 24 | } 25 | if self.tokenizer_name is None: 26 | self.tokenizer_name = self.name 27 | 28 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, **kwargs, legacy=self.tokenizer_legacy) 29 | if self.is_direct_completion(): 30 | self.eos += extra_eos_for_direct_completion(dataset) 31 | else: 32 | if self.prefill and "```" in self.response_prefix: 33 | self.eos += ["\n```\n"] 34 | 35 | self.lora_request = None 36 | if lora_path: 37 | local_lora_path = snapshot_download(lora_path) 38 | self.lora_request = LoRARequest( 39 | "lora", 40 | 1, 41 | local_lora_path, 42 | ) 43 | 44 | self.llm = LLM(model=name, max_model_len=max_model_len, enable_lora=True if self.lora_request else False, **kwargs) 45 | self.llm.set_tokenizer(tokenizer=self.tokenizer) 46 | 47 | def is_direct_completion(self) -> bool: 48 | 
return self.tokenizer.chat_template is None or self.direct_completion 49 | 50 | def codegen( 51 | self, prompts: List[str], do_sample: bool = True, num_samples: int = 200 52 | ) -> List[str]: 53 | if do_sample: 54 | assert self.temperature > 0, "Temperature must be greater than 0!" 55 | 56 | prompts = [ 57 | make_raw_chat_prompt( 58 | task_prompt=prompt, 59 | subset=self.subset, 60 | split=self.split, 61 | instruction_prefix=self.instruction_prefix, 62 | response_prefix=self.response_prefix, 63 | prefill=self.prefill, 64 | tokenizer=self.tokenizer, 65 | direct_completion=self.direct_completion, 66 | ) 67 | for prompt in prompts 68 | ] 69 | vllm_outputs = self.llm.generate( 70 | prompts, 71 | SamplingParams( 72 | n=num_samples, 73 | temperature=self.temperature, 74 | max_tokens=self.max_new_tokens, 75 | top_p=0.95 if do_sample else 1.0, 76 | stop=self.eos, 77 | skip_special_tokens=self.skip_special_tokens, 78 | ), 79 | lora_request=self.lora_request, 80 | use_tqdm=True, 81 | ) 82 | 83 | gen_strs = [[x.text.replace("\t", " ") for x in output.outputs] for output in vllm_outputs] 84 | return gen_strs -------------------------------------------------------------------------------- /bigcodebench/sanitize.py: -------------------------------------------------------------------------------- 1 | """Post-processing LLM-generated Python code implemented using tree-sitter.""" 2 | 3 | import os 4 | import pathlib 5 | from typing import Dict, Generator, List, Optional, Set, Tuple 6 | from pqdm.processes import pqdm 7 | 8 | from tqdm import tqdm 9 | import tree_sitter_python 10 | from tree_sitter import Language, Node, Parser 11 | 12 | from bigcodebench.data import ( 13 | get_bigcodebench, 14 | load_solutions, 15 | write_directory, 16 | write_jsonl, 17 | ) 18 | from bigcodebench.syncheck import syntax_check 19 | 20 | CLASS_TYPE = "class_definition" 21 | FUNCTION_TYPE = "function_definition" 22 | IMPORT_TYPE = ["import_statement", "import_from_statement"] 23 | IDENTIFIER_TYPE = "identifier" 24 | ATTRIBUTE_TYPE = "attribute" 25 | RETURN_TYPE = "return_statement" 26 | EXPRESSION_TYPE = "expression_statement" 27 | ASSIGNMENT_TYPE = "assignment" 28 | 29 | 30 | def code_extract(text: str) -> str: 31 | lines = text.split("\n") 32 | longest_line_pair = (0, 0) 33 | longest_so_far = 0 34 | 35 | for i in range(len(lines)): 36 | for j in range(i + 1, len(lines)): 37 | current_lines = "\n".join(lines[i : j + 1]) 38 | if syntax_check(current_lines): 39 | current_length = sum(1 for line in lines[i : j + 1] if line.strip()) 40 | if current_length > longest_so_far: 41 | longest_so_far = current_length 42 | longest_line_pair = (i, j) 43 | 44 | return "\n".join(lines[longest_line_pair[0] : longest_line_pair[1] + 1]) 45 | 46 | 47 | def get_deps(nodes: List[Tuple[str, Node]]) -> Dict[str, Set[str]]: 48 | 49 | def dfs_get_deps(node: Node, deps: Set[str]) -> None: 50 | for child in node.children: 51 | if child.type == IDENTIFIER_TYPE: 52 | deps.add(child.text.decode("utf8")) 53 | else: 54 | dfs_get_deps(child, deps) 55 | 56 | name2deps = {} 57 | for name, node in nodes: 58 | deps = set() 59 | dfs_get_deps(node, deps) 60 | name2deps[name] = deps 61 | return name2deps 62 | 63 | 64 | def get_function_dependency(entrypoint: str, call_graph: Dict[str, str]) -> Set[str]: 65 | queue = [entrypoint] 66 | visited = {entrypoint} 67 | while queue: 68 | current = queue.pop(0) 69 | if current not in call_graph: 70 | continue 71 | for neighbour in call_graph[current]: 72 | if not (neighbour in visited): 73 | visited.add(neighbour) 74 | 
queue.append(neighbour) 75 | return visited 76 | 77 | 78 | def get_definition_name(node: Node) -> str: 79 | for child in node.children: 80 | if child.type == IDENTIFIER_TYPE: 81 | return child.text.decode("utf8") 82 | 83 | 84 | def traverse_tree(node: Node) -> Generator[Node, None, None]: 85 | cursor = node.walk() 86 | depth = 0 87 | 88 | visited_children = False 89 | while True: 90 | if not visited_children: 91 | yield cursor.node 92 | if not cursor.goto_first_child(): 93 | depth += 1 94 | visited_children = True 95 | elif cursor.goto_next_sibling(): 96 | visited_children = False 97 | elif not cursor.goto_parent() or depth == 0: 98 | break 99 | else: 100 | depth -= 1 101 | 102 | 103 | def has_return_statement(node: Node) -> bool: 104 | traverse_nodes = traverse_tree(node) 105 | for node in traverse_nodes: 106 | if node.type == RETURN_TYPE: 107 | return True 108 | return False 109 | 110 | 111 | def extract_target_code_or_empty(code: str, entrypoint: Optional[str] = None) -> str: 112 | code = code_extract(code.strip()) 113 | code_bytes = bytes(code, "utf8") 114 | parser = Parser(Language(tree_sitter_python.language())) 115 | tree = parser.parse(code_bytes) 116 | class_names = set() 117 | function_names = set() 118 | variable_names = set() 119 | 120 | root_node = tree.root_node 121 | import_nodes = [] 122 | definition_nodes = [] 123 | 124 | for child in root_node.children: 125 | if child.type in IMPORT_TYPE: 126 | import_nodes.append(child) 127 | elif child.type == CLASS_TYPE: 128 | name = get_definition_name(child) 129 | if not ( 130 | name in class_names or name in variable_names or name in function_names 131 | ): 132 | definition_nodes.append((name, child)) 133 | class_names.add(name) 134 | elif child.type == FUNCTION_TYPE: 135 | name = get_definition_name(child) 136 | if not ( 137 | name in function_names or name in variable_names or name in class_names 138 | ): 139 | definition_nodes.append((name, child)) 140 | function_names.add(get_definition_name(child)) 141 | elif ( 142 | child.type == EXPRESSION_TYPE and child.children[0].type == ASSIGNMENT_TYPE 143 | ): 144 | subchild = child.children[0] 145 | name = get_definition_name(subchild) 146 | if not ( 147 | name in variable_names or name in function_names or name in class_names 148 | ): 149 | definition_nodes.append((name, subchild)) 150 | variable_names.add(name) 151 | 152 | if entrypoint: 153 | name2deps = get_deps(definition_nodes) 154 | reacheable = get_function_dependency(entrypoint, name2deps) 155 | 156 | sanitized_output = b"" 157 | 158 | for node in import_nodes: 159 | sanitized_output += code_bytes[node.start_byte : node.end_byte] + b"\n" 160 | 161 | for pair in definition_nodes: 162 | name, node = pair 163 | if entrypoint and not (name in reacheable): 164 | continue 165 | sanitized_output += code_bytes[node.start_byte : node.end_byte] + b"\n" 166 | 167 | sanitized_output = sanitized_output[:-1].decode("utf8") 168 | 169 | # ad-hoc approach to remove unnecessary lines, but it works 170 | lines = sanitized_output.splitlines() 171 | outer_lines = [] 172 | for i in range(len(lines) - 1, -1, -1): 173 | if lines[i].startswith(" "): 174 | break 175 | if not lines[i].startswith(" ") and entrypoint in lines[i]: 176 | outer_lines.append(i) 177 | if outer_lines: 178 | sanitized_output = "\n".join(lines[: outer_lines[-1]]) 179 | return sanitized_output 180 | 181 | 182 | def sanitize(code: str, entrypoint: Optional[str] = None) -> str: 183 | sanitized_code = extract_target_code_or_empty(code, entrypoint).strip() 184 | if not sanitized_code: 
185 | return code_extract(code) 186 | return sanitized_code 187 | 188 | 189 | def process_solution( 190 | sample_solution: Dict, 191 | dataset: Dict, 192 | entry_point: Dict, 193 | debug_task: str = None, 194 | calibrate: bool = False, 195 | is_folder: bool = False, 196 | target_path: str = None, 197 | ): 198 | 199 | task_id = sample_solution.get("task_id") 200 | if not task_id or task_id not in dataset: 201 | return None 202 | 203 | dbg_identifier = sample_solution["_identifier"] 204 | if debug_task is not None and task_id != debug_task: 205 | return None 206 | 207 | function_name = entry_point.get(task_id) 208 | old_code = sample_solution.get("solution") 209 | 210 | if old_code is None: 211 | assert "completion" in sample_solution, sample_solution 212 | old_code = dataset[task_id]["complete_prompt"] + "\n" + sample_solution.get("completion") 213 | else: 214 | if calibrate: 215 | old_code = old_code.replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ") 216 | 217 | new_code = sanitize(code=old_code, entrypoint=function_name) 218 | 219 | # if old code and new code are different, print msg 220 | if new_code != old_code: 221 | msg = "Sanitized: " + dbg_identifier 222 | if is_folder: 223 | msg += " -> " + dbg_identifier.replace(samples, target_path) 224 | print(msg) 225 | 226 | return {"task_id": task_id, "solution": new_code} 227 | 228 | 229 | def script( 230 | samples: str, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 231 | ): 232 | # task_id -> entry_point 233 | entry_point = {} 234 | # merge two datasets 235 | dataset = {**get_bigcodebench()} 236 | 237 | for task_id, problem in dataset.items(): 238 | entry_point[task_id] = problem["entry_point"] 239 | 240 | # make a new folder with "-sanitized" suffix 241 | is_folder = os.path.isdir(samples) 242 | target_path = pathlib.Path(samples) 243 | if not inplace: 244 | if is_folder: 245 | if calibrate: 246 | new_name = target_path.name + "-sanitized-calibrated" 247 | else: 248 | new_name = target_path.name + "-sanitized" 249 | else: 250 | if calibrate: 251 | new_name = target_path.name.replace(".jsonl", "-sanitized-calibrated.jsonl") 252 | else: 253 | new_name = target_path.name.replace(".jsonl", "-sanitized.jsonl") 254 | target_path = target_path.parent / new_name 255 | target_path = str(target_path) 256 | 257 | nsan = 0 258 | ntotal = 0 259 | 260 | new_solutions = [] 261 | 262 | parallel_arg_list = [ 263 | { 264 | "sample_solution": sample_solution, 265 | "dataset": dataset, 266 | "entry_point": entry_point, 267 | "debug_task": debug_task, 268 | "calibrate": calibrate, 269 | "is_folder": is_folder, 270 | "target_path": target_path 271 | } 272 | for sample_solution in load_solutions(samples) 273 | ] 274 | 275 | results = pqdm(parallel_arg_list, process_solution, n_jobs=min(parallel, os.cpu_count()), argument_type="kwargs") 276 | 277 | for result in results: 278 | if result is not None: 279 | new_solutions.append(result) 280 | nsan += 1 281 | ntotal += 1 282 | 283 | if is_folder: 284 | write_directory(target_path, new_solutions) 285 | else: 286 | write_jsonl(target_path, new_solutions) 287 | 288 | if nsan > 0: 289 | print(f"Sanitized {nsan} out of {ntotal} files.") 290 | else: 291 | print(f"All files seems valid -- no files are sanitized.") 292 | print(f"Check the sanitized files at {target_path}") 293 | 294 | 295 | def main(): 296 | from fire import Fire 297 | 298 | Fire(script) 299 | 300 | 301 | if __name__ == "__main__": 302 | main() 303 | 
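A minimal usage sketch of the sanitizer defined above, driving it programmatically instead of through the Fire CLI. The raw model output and the `task_func` entry point below are made-up illustrations, not taken from the repository; the import path matches the package layout shown in the tests.

from bigcodebench.sanitize import sanitize

# Hypothetical raw LLM output: prose wrapped around a code block.
raw_output = '''Here is the solution:
```python
import math

def task_func(x):
    return math.sqrt(x)

print(task_func(4))
```'''

# code_extract() keeps the longest syntactically valid span, then the tree-sitter
# pass keeps only imports plus definitions reachable from the entry point, so the
# surrounding prose and the top-level print() call are dropped.
clean = sanitize(raw_output, entrypoint="task_func")
print(clean)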
-------------------------------------------------------------------------------- /bigcodebench/syncheck.py: -------------------------------------------------------------------------------- 1 | """This file checks two things: 2 | 1. Is the LLMs codegen completed for each benchmark? 3 | 2. Warn the code that are not compilable (it could be some impl issues). 4 | """ 5 | 6 | import ast 7 | import traceback 8 | 9 | from termcolor import colored 10 | 11 | from bigcodebench.data import load_solutions 12 | 13 | 14 | def syntax_check(code, verbose=False): 15 | try: 16 | ast.parse(code) 17 | return True 18 | except (SyntaxError, MemoryError): 19 | if verbose: 20 | traceback.print_exc() 21 | return False 22 | 23 | 24 | def script( 25 | samples: str, nsample_check: int = None, verbose: bool = False 26 | ): 27 | # List[Dict{"task_id", "solution"}] 28 | solutions = load_solutions(samples) 29 | 30 | from bigcodebench.data import get_bigcodebench 31 | 32 | dataset = get_bigcodebench() 33 | dataset_name = "BigCodeBench" 34 | 35 | print(colored(f"Dataset: {dataset_name}", "blue")) 36 | 37 | id2solutions = {} 38 | for solution in solutions: 39 | task_id = solution["task_id"] 40 | if task_id not in id2solutions: 41 | id2solutions[task_id] = [] 42 | if "solution" not in solution: 43 | assert "completion" in solution, "solution or completion must exist!" 44 | solution["solution"] = dataset[task_id]["complete_prompt"] + solution["completion"] 45 | id2solutions[task_id].append(solution) 46 | 47 | print(colored("==============================", "blue")) 48 | print(colored(" ::: Checking completeness... ", "blue")) 49 | print(colored(" ::::: All tasks complete? ", "blue")) 50 | ndone = 0 51 | 52 | task_ids = dataset.keys() 53 | ntask = len(task_ids) 54 | for task_id in task_ids: 55 | if task_id not in id2solutions: 56 | print(colored(f" ⚠️ {task_id} is missing!", "red")) 57 | continue 58 | nfiles = len(id2solutions[task_id]) 59 | 60 | if nsample_check is None or nfiles <= nsample_check: 61 | ndone += 1 62 | continue 63 | 64 | print( 65 | colored( 66 | f" ⚠️ {task_id} only has {nfiles} samples! But {nsample_check} are expected.", 67 | "red", 68 | ) 69 | ) 70 | 71 | # check if there is enough number of samples here. 72 | if nsample_check is not None: 73 | if ntask != ndone: 74 | ntbd = ntask - ndone 75 | print(colored(f" ::::: ⚠️ {ntbd}/{ntask} tasks incomplete!", "red")) 76 | else: 77 | print(colored(f" ::::: All {ntask} tasks complete!", "green")) 78 | 79 | print(colored("==============================", "blue")) 80 | print(colored(" ::: Checking compilation... ", "blue")) 81 | print(colored(" ::::: All code compilable? 
", "blue")) 82 | ncode = 0 83 | nwrong = 0 84 | for task_id in task_ids: 85 | # task_id must exist 86 | if task_id not in id2solutions: 87 | continue 88 | 89 | for solution in id2solutions[task_id]: 90 | ncode += 1 91 | code = solution["solution"] 92 | dbg_identifier = solution["_identifier"] 93 | if code.strip() == "": 94 | print(colored(f" ⚠️ {dbg_identifier} is empty!", "red")) 95 | nwrong += 1 96 | elif not syntax_check(code, verbose): 97 | print(colored(f" ⚠️ {dbg_identifier} is not compilable!", "red")) 98 | nwrong += 1 99 | if 0 != nwrong: 100 | print(colored(f" ::::: ⚠️ {nwrong}/{ncode} code are not compilable!", "red")) 101 | else: 102 | print(colored(f" ::::: All {ncode} code are compilable!", "green")) 103 | 104 | 105 | def main(): 106 | from fire import Fire 107 | 108 | Fire(script) 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /decontamination/n_gram_check.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, load_from_disk 2 | from collections import Counter 3 | import tiktoken 4 | from nltk import ngrams 5 | from tqdm import tqdm 6 | import datasets 7 | 8 | def has_overlap(sample_1, sample_2): 9 | """Check if there is any N-gram overlap between the long string and a given string.""" 10 | return not set(sample_1).isdisjoint(set(sample_2)) 11 | 12 | from concurrent.futures import ThreadPoolExecutor, as_completed 13 | 14 | def calculate_overlap_percentage(samples_1, samples_2): 15 | def check_sample(sample): 16 | for long_sample in samples_2: 17 | if has_overlap(sample, long_sample["ngram"]): 18 | return 1 19 | return 0 20 | 21 | count = 0 22 | with ThreadPoolExecutor() as executor: 23 | futures = [executor.submit(check_sample, sample) for sample in samples_1] 24 | for future in tqdm(as_completed(futures), total=len(futures)): 25 | count += future.result() 26 | 27 | return count / len(samples_1) * 100 28 | 29 | def load_odex_data(n=10): 30 | def map_ngram(sample): 31 | return {"ngram": set([" ".join(ngram) for ngram in ngrams(sample["intent"].split(), n)])} 32 | dataset = load_dataset("neulab/odex", "en", split="test") 33 | dataset = dataset.map(map_ngram, num_proc=16, batch_size=16, remove_columns=dataset.column_names) 34 | return dataset 35 | 36 | def load_stackoverflow(n=10): 37 | def map_ngram(sample): 38 | return {"ngram": set([" ".join(ngram) for ngram in ngrams(sample["question"].split(), n)])} 39 | dataset = load_dataset("bigcode/stack-exchange-preferences-20230914-clean-anonymization", split="train") 40 | dataset = dataset.map(map_ngram, num_proc=16, batch_size=16, remove_columns=dataset.column_names) 41 | dataset.push_to_hub(f"stackoverflow_ngram_{n}") 42 | return dataset 43 | 44 | 45 | def load_starcoderdata(n=10): 46 | def map_ngram(sample): 47 | return {"ngram": set([" ".join(ngram) for ngram in ngrams(sample["content"].split(), n)])} 48 | dataset = load_dataset("bigcode/starcoderdata", data_dir="python", split="train") 49 | dataset = dataset.map(map_ngram, num_proc=16, batch_size=16, remove_columns=dataset.column_names) 50 | dataset.push_to_hub(f"starcoderdata_ngram_{n}") 51 | return dataset 52 | 53 | def load_bigcodebench(n=10): 54 | def map_ngram(sample): 55 | return {"ngram": set([" ".join(ngram) for ngram in ngrams(sample["instruct_prompt"].split("```")[0].split(), n)])} 56 | dataset = load_dataset("bigcode/bigcodebench", split="v0.1.0_hf") 57 | dataset = dataset.map(map_ngram, num_proc=16, 
batch_size=16, remove_columns=dataset.column_names) 58 | dataset.push_to_hub(f"bigcodebench_ngram_{n}") 59 | return dataset 60 | 61 | 62 | if __name__ == "__main__": 63 | n_gram_size = 10 64 | N_SHARDS = 50 65 | user_name = "terryyz" 66 | bigcodebench = load_dataset(f"{user_name}/bigcodebench_ngram_{n_gram_size}", split="train") 67 | 68 | dataset_name = "starcoderdata" 69 | print(dataset_name, n_gram_size) 70 | indices = [] 71 | for i in tqdm(range(N_SHARDS)): 72 | ds = load_dataset(f"{user_name}/{dataset_name}_ngram_{n_gram_size}_overlap_{i}", split="train") 73 | overlap_indices = [idx for idx, example in enumerate(ds) if example["overlap"]] 74 | indices.extend(overlap_indices) 75 | with open(f"{dataset_name}_ngram_{n_gram_size}_overlap.txt", "w") as f: 76 | f.write(f"{len(set(indices))/1140*100:.2f}%") -------------------------------------------------------------------------------- /decontamination/odex_10_overlap.txt: -------------------------------------------------------------------------------- 1 | 0.09% -------------------------------------------------------------------------------- /decontamination/odex_13_overlap.txt: -------------------------------------------------------------------------------- 1 | odex: 0.00% -------------------------------------------------------------------------------- /decontamination/stackoverflow_10_overlap.txt: -------------------------------------------------------------------------------- 1 | 1.49% -------------------------------------------------------------------------------- /decontamination/stackoverflow_13_overlap.txt: -------------------------------------------------------------------------------- 1 | 0.18% -------------------------------------------------------------------------------- /decontamination/starcoderdata_10_overlap.txt: -------------------------------------------------------------------------------- 1 | 2.54% -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.setuptools_scm] 6 | write_to = "bigcodebench/_version.py" 7 | version_scheme = "release-branch-semver" 8 | local_scheme = "no-local-version" 9 | -------------------------------------------------------------------------------- /release.sh: -------------------------------------------------------------------------------- 1 | # argument version 2 | 3 | set -eux 4 | 5 | while getopts "v:" opt; do 6 | case $opt in 7 | v) 8 | version=$OPTARG 9 | ;; 10 | \?) 11 | echo "Invalid option: -$OPTARG" >&2 12 | ;; 13 | esac 14 | done 15 | 16 | if [ -z "$version" ]; then 17 | echo "version is required" 18 | exit 1 19 | fi 20 | 21 | export PYTHONPATH=$PWD pytest tests 22 | 23 | git tag $version 24 | 25 | rm -rf dist 26 | python3 -m build 27 | python3 -m twine upload dist/* 28 | 29 | git push 30 | git push --tags -------------------------------------------------------------------------------- /release_docker.sh: -------------------------------------------------------------------------------- 1 | # argument version 2 | 3 | set -eux 4 | 5 | while getopts "v:" opt; do 6 | case $opt in 7 | v) 8 | version=$OPTARG 9 | ;; 10 | \?) 
11 | echo "Invalid option: -$OPTARG" >&2 12 | ;; 13 | esac 14 | done 15 | 16 | if [ -z "$version" ]; then 17 | echo "version is required" 18 | exit 1 19 | fi 20 | 21 | export PYTHONPATH=$PWD pytest tests 22 | 23 | docker buildx create --name multiplatform-builder --use || true 24 | docker buildx use multiplatform-builder 25 | 26 | # Build and push evaluate image 27 | docker buildx build --platform linux/amd64 \ 28 | -f Docker/Evaluate.Dockerfile . \ 29 | -t bigcodebench/bigcodebench-evaluate:$version \ 30 | -t bigcodebench/bigcodebench-evaluate:latest \ 31 | --push 32 | 33 | # Build and push gradio image 34 | docker buildx build --platform linux/amd64 \ 35 | -f Docker/Gradio.Dockerfile . \ 36 | -t bigcodebench/bigcodebench-gradio:$version \ 37 | -t bigcodebench/bigcodebench-gradio:latest \ 38 | --push -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | DATASET=bigcodebench 2 | MODEL=meta-llama/Meta-Llama-3.1-8B-Instruct 3 | BACKEND=vllm 4 | NUM_GPU=2 5 | SPLIT=complete 6 | SUBSET=full 7 | export E2B_API_KEY="e2b_0a231fa3b0a2b01690ab6c66a23b55c0979ce4ee" 8 | 9 | bigcodebench.evaluate \ 10 | --model $MODEL \ 11 | --split $SPLIT \ 12 | --subset $SUBSET \ 13 | --backend $BACKEND -------------------------------------------------------------------------------- /sandbox-templates/e2b.Dockerfile: -------------------------------------------------------------------------------- 1 | # Better use newer Python as generated code can use new features 2 | FROM python:3.10-slim 3 | 4 | # install git, g++ and python3-tk 5 | RUN apt-get update && apt-get install -y \ 6 | git \ 7 | g++ \ 8 | python3-tk \ 9 | zip \ 10 | unzip \ 11 | procps \ 12 | r-base \ 13 | libgdal-dev \ 14 | # Add these new dependencies for matplotlib 15 | libfreetype6-dev \ 16 | libpng-dev \ 17 | pkg-config \ 18 | python3-dev \ 19 | python3-matplotlib \ 20 | && rm -rf /var/lib/apt/lists/* 21 | 22 | # upgrade to latest pip 23 | RUN pip install --upgrade pip 24 | 25 | # Add a new user "bigcodebenchuser" 26 | RUN adduser --disabled-password --gecos "" bigcodebenchuser 27 | 28 | RUN rm -rf /bigcodebench 29 | 30 | RUN echo 1 31 | # Acquire benchmark code to local 32 | ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit 33 | RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench 34 | 35 | RUN pip install numpy==1.24.3 pyarrow==14.0.1 36 | 37 | RUN cd /bigcodebench && \ 38 | pip install . 
--no-deps 39 | 40 | RUN pip install --timeout 2000 \ 41 | appdirs \ 42 | fire \ 43 | multipledispatch \ 44 | pqdm \ 45 | tempdir \ 46 | termcolor \ 47 | tqdm \ 48 | transformers \ 49 | tree_sitter \ 50 | tree-sitter-python \ 51 | wget \ 52 | datasets \ 53 | gradio-client \ 54 | numpy \ 55 | rich \ 56 | e2b 57 | 58 | RUN pip install -I --timeout 2000 -r https://raw.githubusercontent.com/bigcode-project/bigcodebench/refs/heads/main/Requirements/requirements-eval.txt 59 | 60 | # Ensure the numpy version is compatible with the datasets version 61 | RUN pip install datasets==2.17.0 62 | 63 | WORKDIR /app 64 | 65 | RUN chown -R bigcodebenchuser:bigcodebenchuser /app 66 | 67 | RUN chmod -R 777 /app && rm -rf /root/.cache/pip 68 | 69 | USER bigcodebenchuser -------------------------------------------------------------------------------- /sandbox-templates/e2b.toml: -------------------------------------------------------------------------------- 1 | # This is a config for E2B sandbox template. 2 | # You can use template ID (xs3c9i0hy53751xam77h) or template name (bigcodebench_evaluator) to create a sandbox: 3 | 4 | # Python SDK 5 | # from e2b import Sandbox, AsyncSandbox 6 | # sandbox = Sandbox("bigcodebench_evaluator") # Sync sandbox 7 | # sandbox = await AsyncSandbox.create("bigcodebench_evaluator") # Async sandbox 8 | 9 | # JS SDK 10 | # import { Sandbox } from 'e2b' 11 | # const sandbox = await Sandbox.create('bigcodebench_evaluator') 12 | 13 | team_id = "f317d0d2-ba02-44c5-8b77-e4a2d7830c7c" 14 | dockerfile = "e2b.Dockerfile" 15 | template_name = "bigcodebench_evaluator" 16 | template_id = "xs3c9i0hy53751xam77h" 17 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = bigcodebench 3 | description = "Evaluation package for BigCodeBench" 4 | long_description = file: README.md 5 | long_description_content_type = text/markdown 6 | url = https://github.com/bigcode-project/bigcodebench 7 | license = Apache-2.0 8 | license_files = LICENSE 9 | platform = any 10 | classifiers = 11 | Operating System :: OS Independent 12 | Programming Language :: Python :: 3 13 | License :: OSI Approved :: Apache Software License 14 | 15 | [options] 16 | packages = find: 17 | python_requires = >=3.8 18 | dependency_links = 19 | install_requires = 20 | appdirs>=1.4.4 21 | fire>=0.6.0 22 | multipledispatch>=0.6.0 23 | pqdm>=0.2.0 24 | tempdir>=0.7.1 25 | termcolor>=2.0.0 26 | tqdm>=4.56.0 27 | tree_sitter>=0.22.0 28 | tree-sitter-python>=0.21.0 29 | wget>=3.2 30 | transformers 31 | datasets 32 | gradio-client 33 | vllm 34 | numpy 35 | rich 36 | accelerate>=0.30.1 37 | anthropic>=0.26.1 38 | google-genai 39 | mistralai>=0.2.0,<1.0.0 40 | openai>=1.11.1 41 | e2b 42 | 43 | [options.entry_points] 44 | console_scripts = 45 | bigcodebench.evaluate = bigcodebench.evaluate:main 46 | bigcodebench.sanitize = bigcodebench.sanitize:main 47 | bigcodebench.syncheck = bigcodebench.syncheck:main 48 | bigcodebench.legacy_sanitize = bigcodebench.legacy_sanitize:main 49 | bigcodebench.generate = bigcodebench.generate:main 50 | bigcodebench.inspect = bigcodebench.inspect:main 51 | -------------------------------------------------------------------------------- /tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | -------------------------------------------------------------------------------- /tests/test_legacy_sanitizer.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from bigcodebench.lecacy_sanitize import sanitize 4 | 5 | 6 | def test_inline_fn(): 7 | assert ( 8 | sanitize( 9 | """\ 10 | def f(n): 11 | def factorial(i): 12 | if i == 0: 13 | return 1 14 | else: 15 | return i * factorial(i-1) 16 | 17 | result = [] 18 | for i in range(1, n+1): 19 | if i % 2 == 0: 20 | result.append(factorial(i)) 21 | else: 22 | result.append(sum(range(1, i+1))) 23 | return result 24 | 25 | # Test the function 26 | print(f(5))""", 27 | entry_point="f", 28 | ) 29 | == """\ 30 | def f(n): 31 | def factorial(i): 32 | if i == 0: 33 | return 1 34 | else: 35 | return i * factorial(i-1) 36 | 37 | result = [] 38 | for i in range(1, n+1): 39 | if i % 2 == 0: 40 | result.append(factorial(i)) 41 | else: 42 | result.append(sum(range(1, i+1))) 43 | return result""" 44 | ) 45 | -------------------------------------------------------------------------------- /tests/test_treesitter_sanitizer.py: -------------------------------------------------------------------------------- 1 | from bigcodebench.sanitize import code_extract, sanitize 2 | 3 | 4 | def test_code_extract(): 5 | test_simple = r"""Here is some python code generated 6 | import numpy as np 7 | Sorry, I made a mistake, let me try again 8 | from numpy import sin, cos, tan 9 | 10 | def f(x): 11 | return tan(x) 12 | As you can observe from above 13 | """ 14 | assert ( 15 | code_extract(test_simple) 16 | == r"""from numpy import sin, cos, tan 17 | 18 | def f(x): 19 | return tan(x)""" 20 | ) 21 | 22 | test_empty_lines = r"""import numpy as np 23 | 24 | 25 | import pandas 26 | Sorry, let me try again 27 | from numpy import sin, cos, tan 28 | def f(x): 29 | return tan(x) 30 | """ 31 | assert ( 32 | code_extract(test_empty_lines) 33 | == r"""from numpy import sin, cos, tan 34 | def f(x): 35 | return tan(x)""" 36 | ) 37 | 38 | 39 | def test_sanitize_simple(): 40 | icode = r"""Following is the code snippet: 41 | ```python 42 | import numpy as np 43 | from numpy import sin, cos 44 | 45 | def f(x): 46 | return np.tan(x) 47 | 48 | def g(x): 49 | return cos(f(x)) 50 | 51 | def g(x): 52 | return sin(f(x)) 53 | 54 | def c(x): 55 | assert 1==1 56 | 57 | assert g(0) == 1 58 | ``` 59 | """ 60 | assert ( 61 | sanitize(icode) 62 | == r"""import numpy as np 63 | from numpy import sin, cos 64 | def f(x): 65 | return np.tan(x) 66 | def g(x): 67 | return cos(f(x))""" 68 | ) 69 | 70 | 71 | def test_sanitize_class(): 72 | icode = r"""Following is the code snippet: 73 | ```python 74 | import numpy as np 75 | from numpy import sin, cos 76 | class g(): 77 | def hello_world(): 78 | return 0 79 | def f(x): 80 | print(g.hello_world()) 81 | return np.tan(x) 82 | ``` 83 | """ 84 | 85 | assert ( 86 | sanitize(icode) 87 | == r"""import numpy as np 88 | from numpy import sin, cos 89 | class g(): 90 | def hello_world(): 91 | return 0 92 | def f(x): 93 | print(g.hello_world()) 94 | return np.tan(x)""" 95 | ) 96 | 97 | 98 | def test_entrypoint_basic(): 99 | icode = r"""Following is the code snippet: 100 | ```python 101 | import numpy as np 102 | from numpy import sin, cos 103 | 104 | def f(x): 105 | return np.tan(x) 106 | 107 | def g(x): 108 | return cos(f(x)) 109 | 110 | def g(x): 111 | return sin(f(x)) 112 | 113 | def c(x): 114 | return 0 115 | 116 | assert g(0) == 1 117 | ``` 118 | """ 119 | assert ( 120 | sanitize(icode, "g") 121 | == r"""import numpy as np 122 | from numpy import sin, cos 123 | def f(x): 124 | return np.tan(x) 125 | def g(x): 126 | return 
cos(f(x))""" 127 | ) 128 | 129 | 130 | def test_entrypoint_chain(): 131 | icode = r"""Following is the code snippet: 132 | ```python 133 | import numpy as np 134 | from numpy import sin, cos 135 | 136 | def f(x): 137 | return c(x) 138 | assert f(1) == 5 139 | def g(x): 140 | return cos(f(x)) 141 | 142 | def c(x): 143 | newObj = h() 144 | return x 145 | 146 | class h(): 147 | def hello_world(): 148 | return 0 149 | 150 | class h(): 151 | def goodbye_world(): 152 | return 0 153 | 154 | 155 | assert g(0) == 1 156 | ``` 157 | """ 158 | print(sanitize(icode, "g")) 159 | assert ( 160 | sanitize(icode, "g") 161 | == r"""import numpy as np 162 | from numpy import sin, cos 163 | def f(x): 164 | return c(x) 165 | def g(x): 166 | return cos(f(x)) 167 | def c(x): 168 | newObj = h() 169 | return x 170 | class h(): 171 | def hello_world(): 172 | return 0""" 173 | ) 174 | 175 | 176 | def test_entrypoint_no_chain(): 177 | icode = r"""Following is the code snippet: 178 | ```python 179 | import numpy as np 180 | from numpy import sin, cos, sum 181 | 182 | def f(x): 183 | return np.sum(x) 184 | assert f(1) == 5 185 | def g(x): 186 | return cos(f(x)) 187 | 188 | def c(x): 189 | newObj = h() 190 | return x 191 | 192 | class h(): 193 | def hello_world(): 194 | return 0 195 | 196 | 197 | assert g(0) == 1 198 | ``` 199 | """ 200 | assert ( 201 | sanitize(icode, "g") 202 | == r"""import numpy as np 203 | from numpy import sin, cos, sum 204 | def f(x): 205 | return np.sum(x) 206 | def g(x): 207 | return cos(f(x))""" 208 | ) 209 | 210 | 211 | def test_entrypoint_variable(): 212 | icode = r"""Following is the code snippet: 213 | ```python 214 | import numpy as np 215 | from numpy import sin, cos 216 | 217 | SOME_CONSTANT = 5 218 | 219 | def f(x): 220 | return c(x) 221 | assert f(1) == 5 222 | def g(x): 223 | return cos(f(x)) 224 | 225 | def c(x): 226 | newObj = h() 227 | return x 228 | 229 | class h(): 230 | def hello_world(): 231 | return SOME_CONSTANT 232 | 233 | def d(x): 234 | return g(x) 235 | 236 | 237 | assert g(0) == 1 238 | ``` 239 | """ 240 | 241 | assert ( 242 | sanitize(icode, "g") 243 | == r"""import numpy as np 244 | from numpy import sin, cos 245 | SOME_CONSTANT = 5 246 | def f(x): 247 | return c(x) 248 | def g(x): 249 | return cos(f(x)) 250 | def c(x): 251 | newObj = h() 252 | return x 253 | class h(): 254 | def hello_world(): 255 | return SOME_CONSTANT""" 256 | ) 257 | -------------------------------------------------------------------------------- /tools/fix_v019.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, DatasetDict 2 | from huggingface_hub import HfApi 3 | 4 | import json 5 | import copy 6 | 7 | BIGCODEBENCH_HF = "bigcode/bigcodebench" 8 | BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard" 9 | BIGCODEBENCH_VERSION = "v0.1.0_hf" 10 | BIGCODEBENCH_UPDATE = "bigcode/bcb_update" 11 | BIGCODEBENCH_NEW_VERSION = "v0.1.1" 12 | 13 | def map_ds(sample): 14 | 15 | if sample["task_id"] in ["BigCodeBench/1006"]: 16 | sample["test"] = sample["test"].replace( 17 | '''\ 18 | def test_valid_zip_url(self): 19 | """Test a valid ZIP URL.""" 20 | url = "https://getsamplefiles.com/download/zip/sample-1.zip" 21 | result = task_func(url) 22 | self.assertTrue(result.startswith("mnt/data/downloads/")) 23 | self.assertTrue(result.endswith("sample-1")) 24 | shutil.rmtree("mnt/data/downloads") 25 | ''', 26 | '''\ 27 | @patch("requests.get") 28 | def test_non_zip_content(self, mock_get): 29 | """Test a valid ZIP URL.""" 30 | 
mock_get.return_value.status_code = 200 31 | mock_get.return_value.headers = {"Content-Type": "application/zip"} 32 | mock_get.return_value.content = b"1" 33 | url = "https://valid-url.com/sample.zip" 34 | result = task_func(url) 35 | ''', 36 | ) 37 | 38 | if sample["task_id"] in ["BigCodeBench/760"]: 39 | for k in sample.keys(): 40 | if "prompt" in k: 41 | sample[k] = sample[k].replace( 42 | "from datetime import datetime", 43 | "import datetime" 44 | ) 45 | 46 | if sample["task_id"] in ["BigCodeBench/178"]: 47 | for k in sample.keys(): 48 | sample[k] = sample[k].replace( 49 | "from urllib import request\n", 50 | "" 51 | ) 52 | sample[k] = sample[k].replace( 53 | " - urllib.request\n", 54 | "" 55 | ) 56 | 57 | return sample 58 | 59 | if __name__ == "__main__": 60 | api = HfApi() 61 | ds_dict = load_dataset(BIGCODEBENCH_HF) 62 | hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF) 63 | ds = ds_dict[BIGCODEBENCH_VERSION] 64 | hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION] 65 | function_id = [178, 760, 1006] 66 | 67 | new_ds = ds.map(map_ds) 68 | new_ds.to_json("BigCodeBench.jsonl") 69 | ds_dict[BIGCODEBENCH_NEW_VERSION] = new_ds 70 | ds_dict.push_to_hub(BIGCODEBENCH_HF) 71 | 72 | new_hard_ds = hard_ds.map(map_ds) 73 | new_hard_ds.to_json("BigCodeBench-Hard.jsonl") 74 | hard_ds_dict[BIGCODEBENCH_NEW_VERSION] = new_hard_ds 75 | hard_ds_dict.push_to_hub(BIGCODEBENCH_HARD_HF) 76 | 77 | for i in function_id: 78 | old_sample = ds.select([i]) 79 | new_sample = new_ds.select([i]) 80 | old_sample.to_json("old.jsonl") 81 | new_sample.to_json("new.jsonl") 82 | api.upload_file( 83 | path_or_fileobj="old.jsonl", 84 | path_in_repo=f"{i}/old.jsonl", 85 | repo_id=BIGCODEBENCH_UPDATE, 86 | # repo_type="dataset" 87 | ) 88 | api.upload_file( 89 | path_or_fileobj="new.jsonl", 90 | path_in_repo=f"{i}/new.jsonl", 91 | repo_id=BIGCODEBENCH_UPDATE, 92 | # repo_type="dataset" 93 | ) 94 | 95 | -------------------------------------------------------------------------------- /tools/fix_v020.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, DatasetDict 2 | from huggingface_hub import HfApi 3 | 4 | import json 5 | import copy 6 | 7 | BIGCODEBENCH_HF = "bigcode/bigcodebench" 8 | BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard" 9 | BIGCODEBENCH_VERSION = "v0.1.1" 10 | BIGCODEBENCH_UPDATE = "bigcode/bcb_update" 11 | BIGCODEBENCH_NEW_VERSION = "v0.1.2" 12 | 13 | def map_ds(sample): 14 | if sample["task_id"] in ["BigCodeBench/16"]: 15 | for k in sample.keys(): 16 | sample[k] = sample[k].replace( 17 | "No logs found to backup.", "No logs found to backup" 18 | ) 19 | 20 | if sample["task_id"] in ["BigCodeBench/37"]: 21 | for k in sample.keys(): 22 | if "prompt" in k: 23 | sample[k] = "import pandas as pd\n" + sample[k] 24 | sample[k] = sample[k].replace( 25 | "Requirements:\n - sklearn.ensemble\n", 26 | "Requirements:\n - pandas\n - sklearn.ensemble\n" 27 | ) 28 | 29 | if sample["task_id"] in ["BigCodeBench/241"]: 30 | for k in sample.keys(): 31 | if "prompt" in k: 32 | sample[k] = sample[k].replace( 33 | "The function will plot the original and normalized arrays using matplotlib.", 34 | "The function will plot the original and normalized arrays with a title of 'Original vs. Normalized Data'." 
35 | ) 36 | 37 | if sample["task_id"] in ["BigCodeBench/267"]: 38 | for k in sample.keys(): 39 | if "prompt" in k: 40 | sample[k] = sample[k].replace( 41 | "Plots and returns the FFT of the signal.", 42 | "Plots and returns the FFT of the signal with a title of 'FFT of the signal'." 43 | ) 44 | 45 | return sample 46 | 47 | if __name__ == "__main__": 48 | api = HfApi() 49 | ds_dict = load_dataset(BIGCODEBENCH_HF) 50 | hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF) 51 | ds = ds_dict[BIGCODEBENCH_VERSION] 52 | hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION] 53 | function_id = [16, 37, 241, 267] 54 | 55 | new_ds = ds.map(map_ds) 56 | new_ds.to_json("BigCodeBench.jsonl") 57 | ds_dict[BIGCODEBENCH_NEW_VERSION] = new_ds 58 | ds_dict.push_to_hub(BIGCODEBENCH_HF) 59 | 60 | new_hard_ds = hard_ds.map(map_ds) 61 | new_hard_ds.to_json("BigCodeBench-Hard.jsonl") 62 | hard_ds_dict[BIGCODEBENCH_NEW_VERSION] = new_hard_ds 63 | hard_ds_dict.push_to_hub(BIGCODEBENCH_HARD_HF) 64 | 65 | for i in function_id: 66 | old_sample = ds.select([i]) 67 | new_sample = new_ds.select([i]) 68 | old_sample.to_json("old.jsonl") 69 | new_sample.to_json("new.jsonl") 70 | api.upload_file( 71 | path_or_fileobj="old.jsonl", 72 | path_in_repo=f"{i}/old.jsonl", 73 | repo_id=BIGCODEBENCH_UPDATE, 74 | # repo_type="dataset" 75 | ) 76 | api.upload_file( 77 | path_or_fileobj="new.jsonl", 78 | path_in_repo=f"{i}/new.jsonl", 79 | repo_id=BIGCODEBENCH_UPDATE, 80 | # repo_type="dataset" 81 | ) 82 | -------------------------------------------------------------------------------- /tools/fix_v022.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, DatasetDict 2 | from huggingface_hub import HfApi 3 | 4 | import json 5 | import copy 6 | 7 | BIGCODEBENCH_HF = "bigcode/bigcodebench" 8 | BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard" 9 | BIGCODEBENCH_VERSION = "v0.1.2" 10 | BIGCODEBENCH_UPDATE = "bigcode/bcb_update" 11 | BIGCODEBENCH_NEW_VERSION = "v0.1.3" 12 | 13 | def map_ds(sample): 14 | if sample["task_id"] in ["BigCodeBench/1005"]: 15 | for k in sample.keys(): 16 | sample[k] = sample[k].replace( 17 | "https://getsamplefiles.com/download/zip/sample-2.zip", "https://getsamplefiles.com/download/zip/sample-5.zip" 18 | ).replace( 19 | "sample_2", "sample_5" 20 | ).replace( 21 | "Sample 2", "Sample 5" 22 | ) 23 | return sample 24 | 25 | if __name__ == "__main__": 26 | api = HfApi() 27 | ds_dict = load_dataset(BIGCODEBENCH_HF) 28 | hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF) 29 | ds = ds_dict[BIGCODEBENCH_VERSION] 30 | hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION] 31 | function_id = [1005] 32 | 33 | new_ds = ds.map(map_ds) 34 | new_ds.to_json("BigCodeBench.jsonl") 35 | ds_dict[BIGCODEBENCH_NEW_VERSION] = new_ds 36 | ds_dict.push_to_hub(BIGCODEBENCH_HF) 37 | 38 | new_hard_ds = hard_ds.map(map_ds) 39 | new_hard_ds.to_json("BigCodeBench-Hard.jsonl") 40 | hard_ds_dict[BIGCODEBENCH_NEW_VERSION] = new_hard_ds 41 | hard_ds_dict.push_to_hub(BIGCODEBENCH_HARD_HF) 42 | 43 | for i in function_id: 44 | old_sample = ds.select([i]) 45 | new_sample = new_ds.select([i]) 46 | old_sample.to_json("old.jsonl") 47 | new_sample.to_json("new.jsonl") 48 | api.upload_file( 49 | path_or_fileobj="old.jsonl", 50 | path_in_repo=f"{i}/old.jsonl", 51 | repo_id=BIGCODEBENCH_UPDATE, 52 | # repo_type="dataset" 53 | ) 54 | api.upload_file( 55 | path_or_fileobj="new.jsonl", 56 | path_in_repo=f"{i}/new.jsonl", 57 | repo_id=BIGCODEBENCH_UPDATE, 58 | # repo_type="dataset" 59 | ) 60 | 
-------------------------------------------------------------------------------- /tools/fix_v023.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset, DatasetDict 2 | from huggingface_hub import HfApi 3 | 4 | import json 5 | import copy 6 | 7 | BIGCODEBENCH_HF = "bigcode/bigcodebench" 8 | BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard" 9 | BIGCODEBENCH_VERSION = "v0.1.3" 10 | BIGCODEBENCH_UPDATE = "bigcode/bcb_update" 11 | BIGCODEBENCH_NEW_VERSION = "v0.1.4" 12 | 13 | def map_ds(sample): 14 | if sample["task_id"] in ["BigCodeBench/211"]: 15 | sample['test'] = sample['test'].replace( 16 | """ 17 | mock_response = MagicMock() 18 | mock_response.content = MOCK_CONTENT 19 | """, 20 | """ 21 | mock_response = MagicMock() 22 | mock_response.content = MOCK_CONTENT 23 | mock_response.status_code = 200 24 | """ 25 | ) 26 | if sample["task_id"] in ["BigCodeBench/215"]: 27 | sample['test'] = sample['test'].replace( 28 | """ 29 | mock_response = Mock() 30 | """, 31 | """ 32 | mock_response = Mock() 33 | mock_response.status_code = 200 34 | """ 35 | ) 36 | sample['test'] = sample['test'].replace( 37 | """ 38 | mock_response.text =""", 39 | """ 40 | MOCK_TEXT =""" 41 | ) 42 | sample['test'] = sample['test'].replace( 43 | """ 44 | mock_get.return_value = mock_response 45 | """, 46 | """ 47 | mock_response.text = MOCK_TEXT 48 | mock_response.json = lambda: json.loads(MOCK_TEXT) 49 | mock_get.return_value = mock_response 50 | """ 51 | ) 52 | sample['complete_prompt'] = sample['complete_prompt'].replace("Thif function will raise", "This function will raise") 53 | sample['instruct_prompt'] = sample['instruct_prompt'].replace("Thif function will raise", "This function will raise") 54 | sample['doc_struct'] = sample['doc_struct'].replace("Thif function will raise", "This function will raise") 55 | return sample 56 | 57 | if __name__ == "__main__": 58 | api = HfApi() 59 | ds_dict = load_dataset(BIGCODEBENCH_HF) 60 | hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF) 61 | ds = ds_dict[BIGCODEBENCH_VERSION] 62 | hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION] 63 | function_id = [211, 215] 64 | 65 | new_ds = ds.map(map_ds) 66 | new_ds.to_json("BigCodeBench.jsonl") 67 | ds_dict[BIGCODEBENCH_NEW_VERSION] = new_ds 68 | ds_dict.push_to_hub(BIGCODEBENCH_HF) 69 | 70 | new_hard_ds = hard_ds.map(map_ds) 71 | new_hard_ds.to_json("BigCodeBench-Hard.jsonl") 72 | hard_ds_dict[BIGCODEBENCH_NEW_VERSION] = new_hard_ds 73 | hard_ds_dict.push_to_hub(BIGCODEBENCH_HARD_HF) 74 | 75 | for i in function_id: 76 | old_sample = ds.select([i]) 77 | new_sample = new_ds.select([i]) 78 | old_sample.to_json("old.jsonl") 79 | new_sample.to_json("new.jsonl") 80 | api.upload_file( 81 | path_or_fileobj="old.jsonl", 82 | path_in_repo=f"{i}/old.jsonl", 83 | repo_id=BIGCODEBENCH_UPDATE, 84 | # repo_type="dataset" 85 | ) 86 | api.upload_file( 87 | path_or_fileobj="new.jsonl", 88 | path_in_repo=f"{i}/new.jsonl", 89 | repo_id=BIGCODEBENCH_UPDATE, 90 | # repo_type="dataset" 91 | ) 92 | -------------------------------------------------------------------------------- /tools/fix_v025.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | from huggingface_hub import HfApi 3 | 4 | BIGCODEBENCH_HF = "bigcode/bigcodebench" 5 | BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard" 6 | BIGCODEBENCH_VERSION = "v0.1.4" 7 | BIGCODEBENCH_UPDATE = "bigcode/bcb_update" 8 | BIGCODEBENCH_NEW_VERSION = "v0.1.5" 9 | 10 | def 
map_ds(sample): 11 | if sample["task_id"] in ["BigCodeBench/332"]: 12 | sample['code_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['code_prompt'] 13 | sample['complete_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt'] 14 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 15 | "\nYou should write self-contained code starting with:\n```\n", 16 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n" 17 | ) 18 | 19 | if sample["task_id"] in ["BigCodeBench/334"]: 20 | sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt'] 21 | sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt'] 22 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 23 | "\nYou should write self-contained code starting with:\n```\n", 24 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n" 25 | ) 26 | 27 | if sample["task_id"] in ["BigCodeBench/376"]: 28 | sample['code_prompt'] = sample['code_prompt'].replace( 29 | "import nltk\n", 30 | "import nltk\nnltk.download('stopwords')\n", 31 | 1 32 | ) 33 | sample['complete_prompt'] = sample['complete_prompt'].replace( 34 | "import nltk\n", 35 | "import nltk\nnltk.download('stopwords')\n", 36 | 1 37 | ) 38 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 39 | "\nYou should write self-contained code starting with:\n```\nimport nltk\n", 40 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n" 41 | ) 42 | 43 | if sample["task_id"] in ["BigCodeBench/383"]: 44 | sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt'] 45 | sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt'] 46 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 47 | "\nYou should write self-contained code starting with:\n```\n", 48 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n" 49 | ) 50 | 51 | if sample["task_id"] in ["BigCodeBench/633"]: 52 | sample['code_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['code_prompt'] 53 | sample['complete_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt'] 54 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 55 | "\nYou should write self-contained code starting with:\n```\n", 56 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n" 57 | ) 58 | 59 | if sample["task_id"] in ["BigCodeBench/635"]: 60 | sample['code_prompt'] = sample['code_prompt'].replace( 61 | "# Importing the required libraries", 62 | "# Importing the required libraries\nimport nltk\nnltk.download('stopwords')\n" 63 | ) 64 | 65 | sample['complete_prompt'] = sample['complete_prompt'].replace( 66 | "# Importing the required libraries", 67 | "# Importing the required libraries\nimport nltk\nnltk.download('stopwords')\n" 68 | ) 69 | 70 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 71 | "\nYou should write self-contained code starting with:\n```\n", 72 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n" 73 | ) 74 | 75 | if sample["task_id"] in ["BigCodeBench/849"]: 76 | sample['code_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['code_prompt'] 77 | sample['complete_prompt'] 
= "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt'] 78 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 79 | "\nYou should write self-contained code starting with:\n```\n", 80 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n" 81 | ) 82 | 83 | if sample["task_id"] in ["BigCodeBench/940"]: 84 | sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt'] 85 | sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt'] 86 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 87 | "\nYou should write self-contained code starting with:\n```\n", 88 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n" 89 | ) 90 | 91 | if sample["task_id"] in ["BigCodeBench/1109"]: 92 | sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt'] 93 | sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt'] 94 | sample['instruct_prompt'] = sample['instruct_prompt'].replace( 95 | "\nYou should write self-contained code starting with:\n```\n", 96 | "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n" 97 | ) 98 | 99 | return sample 100 | 101 | if __name__ == "__main__": 102 | api = HfApi() 103 | ds_dict = load_dataset(BIGCODEBENCH_HF) 104 | hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF) 105 | ds = ds_dict[BIGCODEBENCH_VERSION] 106 | hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION] 107 | function_id = [332, 334, 376, 383, 633, 635, 849, 940, 1109] 108 | 109 | new_ds = ds.map(map_ds) 110 | new_ds.to_json("BigCodeBench.jsonl") 111 | ds_dict[BIGCODEBENCH_NEW_VERSION] = new_ds 112 | ds_dict.push_to_hub(BIGCODEBENCH_HF) 113 | 114 | new_hard_ds = hard_ds.map(map_ds) 115 | new_hard_ds.to_json("BigCodeBench-Hard.jsonl") 116 | hard_ds_dict[BIGCODEBENCH_NEW_VERSION] = new_hard_ds 117 | hard_ds_dict.push_to_hub(BIGCODEBENCH_HARD_HF) 118 | 119 | for i in function_id: 120 | old_sample = ds.select([i]) 121 | new_sample = new_ds.select([i]) 122 | old_sample.to_json("old.jsonl") 123 | new_sample.to_json("new.jsonl") 124 | api.upload_file( 125 | path_or_fileobj="old.jsonl", 126 | path_in_repo=f"{i}/old.jsonl", 127 | repo_id=BIGCODEBENCH_UPDATE, 128 | # repo_type="dataset" 129 | ) 130 | api.upload_file( 131 | path_or_fileobj="new.jsonl", 132 | path_in_repo=f"{i}/new.jsonl", 133 | repo_id=BIGCODEBENCH_UPDATE, 134 | # repo_type="dataset" 135 | ) --------------------------------------------------------------------------------