├── .gitignore ├── HISTORY.md ├── LICENSE ├── README.md ├── asset └── code_mmlu_banner.png ├── data ├── README.md ├── codemmlu │ └── codemmlu.py └── submission_sample.txt ├── paper └── 2410.01999v4.pdf ├── pyproject.toml ├── requirements.txt └── src └── codemmlu ├── __init__.py ├── __main__.py ├── backends ├── __init__.py ├── base.py ├── hf.py └── vllm.py ├── evaluator.py ├── prompts ├── __init__.py ├── _codecomp.py ├── _coderepair.py ├── _defect.py ├── _fim.py └── _general.py └── task_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | 164 | *.json 165 | *.jsonl -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | # Releases 2 | 3 | ## Version 0.0.1 4 | Release date: 14 Oct, 2024 5 | - Support backends: native huggingface, VLLMs 6 | - Support model's checkpoint from huggingface 7 | - Support lora model -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright © 2024 FPT Software AI Center 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | 6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | 8 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CodeMMLU: A Multi-Task Benchmark for Assessing Code Understanding Capabilities 2 | 3 |
 4 | [banner image: asset/code_mmlu_banner.png ("CodeMMLU")]
 5 |
16 | 📰 News • 🚀 Quick Start • 📋 Evaluation • 📌 Citation
21 | 22 | ## 📌 About 23 | 24 | ### CodeMMLU 25 | 26 | **CodeMMLU** is a comprehensive benchmark designed to evaluate the capabilities of large language models (LLMs) in coding and software knowledge. 27 | It builds upon the structure of multiple-choice question answering (MCQA) to cover a wide range of programming tasks and domains, including code generation, defect detection, software engineering principles, and much more. 28 | 29 | ### Why CodeMMLU? 30 | 31 | - **CodeMMLU** comprises over 10,000 questions curated from diverse, high-quality sources. It covers a wide spectrum of software knowledge, including general QA, code generation, defect detection, and code repair across various domains and more than 10 programming languages. 32 | 33 | - **Precise and comprehensive:** Checkout our [LEADERBOARD](https://fsoft-ai4code.github.io/codemmlu/) for latest LLM rankings. 34 | 35 | ## 🚀 Quick Start 36 | 37 | Install CodeMMLU and setup dependencies via `pip`: 38 | ```bash 39 | pip install codemmlu 40 | ``` 41 | 42 | Generate response for CodeMMLU MCQs benchmark: 43 | ```bash 44 | codemmlu --model_name \ 45 | --subset \ 46 | --backend \ 47 | --output_dir 48 | ``` 49 | 50 | 51 | ## 📋 Evaluation 52 | 53 | Build `codemmlu` from source: 54 | ```bash 55 | git clone https://github.com/Fsoft-AI4Code/CodeMMLU.git 56 | cd CodeMMLU 57 | pip install -e . 58 | ``` 59 | 60 | > [!Note] 61 | > 62 | > If you prefer `vllm` backend, we highly recommend you install [vllm from official project](https://github.com/vllm-project/vllm/) before install `codemmlu`. 63 | 64 | Generating with CodeMMLU questions: 65 | ```bash 66 | codemmlu --model_name \ 67 | --peft_model \ 68 | --subset all \ 69 | --batch_size 16 \ 70 | --backend [vllm|hf] \ 71 | --max_new_tokens 1024 \ 72 | --temperature 0.0 \ 73 | --output_dir \ 74 | --instruction_prefix \ 75 | --assistant_prefix \ 76 | --cache_dir 77 | ``` 78 | 79 |
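For example, a concrete zero-shot run with the default `hf` backend might look like this (the checkpoint name is only a placeholder; any local path or Hugging Face Hub ID accepted by `--model_name` should work):

```bash
codemmlu --model_name deepseek-ai/deepseek-coder-1.3b-instruct \
    --subset programming_syntax \
    --backend hf \
    --batch_size 16 \
    --prompt_mode zeroshot \
    --max_new_tokens 1024 \
    --temperature 0.0 \
    --output_dir ./output
```

Each generated record is saved in JSONL form with `task_id`, `prompt`, and `response` fields, so answer extraction and scoring can be run as a separate step.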
⏬ API Usage :: click to expand :: 80 |
81 | 82 | ```bash 83 | codemmlu [-h] [-V] [--subset SUBSET] [--batch_size BATCH_SIZE] [--instruction_prefix INSTRUCTION_PREFIX] 84 | [--assistant_prefix ASSISTANT_PREFIX] [--output_dir OUTPUT_DIR] [--model_name MODEL_NAME] 85 | [--peft_model PEFT_MODEL] [--backend BACKEND] [--max_new_tokens MAX_NEW_TOKENS] 86 | [--temperature TEMPERATURE] [--prompt_mode PROMPT_MODE] [--cache_dir CACHE_DIR] [--trust_remote_code] 87 | 88 | ==================== CodeMMLU ==================== 89 | 90 | optional arguments: 91 | -h, --help show this help message and exit 92 | -V, --version Get version 93 | --subset SUBSET Select evaluate subset 94 | --batch_size BATCH_SIZE 95 | --instruction_prefix INSTRUCTION_PREFIX 96 | --assistant_prefix ASSISTANT_PREFIX 97 | --output_dir OUTPUT_DIR 98 | Save generation and result path 99 | --model_name MODEL_NAME 100 | Local path or Huggingface Hub link to load model 101 | --peft_model PEFT_MODEL 102 | Lora config 103 | --backend BACKEND LLM generation backend (default: hf) 104 | --max_new_tokens MAX_NEW_TOKENS 105 | Number of max new tokens 106 | --temperature TEMPERATURE 107 | --prompt_mode PROMPT_MODE 108 | Prompt available: zeroshot, fewshot, cot_zs, cot_fs 109 | --cache_dir CACHE_DIR 110 | Cache for save model download checkpoint and dataset 111 | --trust_remote_code 112 | ``` 113 | 114 |
115 |
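The same run can be driven programmatically. Below is a minimal sketch built on `codemmlu.evaluator.Evaluator` (again, the checkpoint name is illustrative only):

```python
from codemmlu.evaluator import Evaluator

# Placeholder checkpoint; substitute any model accepted by the chosen backend.
evaluator = Evaluator(
    model_name="deepseek-ai/deepseek-coder-1.3b-instruct",
    subset="programming_syntax",   # any subset listed in task_utils.ALL_TASK
    backend="hf",                  # or "vllm"
    batch_size=16,
    prompt_mode="zeroshot",        # zeroshot, fewshot, cot_zs, cot_fs
    output_dir="./output",
)

# Returns the evaluation split with a `generation` column added.
results = evaluator.generate(max_new_tokens=1024, temperature=0.0)
```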
116 | 117 | 118 | List of supported backends: 119 | 120 | | Backend | DecoderModel | LoRA | 121 | |------------------ |-------------- |------ | 122 | | [Transformers](https://github.com/huggingface/transformers) (hf) | ✅ | ✅ | 123 | | [Vllm](https://github.com/vllm-project/vllm) (vllm) | ✅ | ✅ | 124 | 125 | ### Leaderboard 126 | To evaluate your model and submit your results to the [leaderboard](https://fsoft-ai4code.github.io/codemmlu/), please follow the instruction in [data/README.md](data/README.md). 127 | 128 | ## 📌 Citation 129 | If you find this repository useful, please consider citing our paper: 130 | 131 | ``` 132 | @article{nguyen2024codemmlu, 133 | title={CodeMMLU: A Multi-Task Benchmark for Assessing Code Understanding Capabilities}, 134 | author={Nguyen, Dung Manh and Phan, Thang Chau and Le, Nam Hai and Doan, Thong T. and Nguyen, Nam V. and Pham, Quang and Bui, Nghi D. Q.}, 135 | journal={arXiv preprint}, 136 | year={2024} 137 | } 138 | ``` 139 | -------------------------------------------------------------------------------- /asset/code_mmlu_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FSoft-AI4Code/CodeMMLU/2999a6a888734a9dc48cc84d8a77a8311e9ca245/asset/code_mmlu_banner.png -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | ## Evaluation Submission 2 | 3 | To submit your model's results to the [leaderboard](https://fsoft-ai4code.github.io/leaderboards/codemmlu/), please send us an email at `dungnm31@fpt.com` with the following information: 4 | 5 | - **Model Name**: The name of your model. 6 | - **Model Description**: A brief description of your model. 7 | - **Model Configuration**: 8 | - Base or Instruct 9 | - Base or LoRA 10 | - **Model Answer**: The response generated by your model followed the format: 11 | 12 | ``` 13 | 14 | ``` 15 | 16 | Checkout the example in `submission_sample.txt` 17 | 18 | -------------------------------------------------------------------------------- /data/codemmlu/codemmlu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """The CodeMMLU benchmark.""" 15 | 16 | import os 17 | import json 18 | from glob import glob 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @article{nguyen2024codemmlu, 25 | title={CodeMMLU: A Multi-Task Benchmark for Assessing Code Understanding Capabilities}, 26 | author={Nguyen, Dung Manh and Phan, Thang Chau and Le, Nam Hai and Doan, Thong T. and Nguyen, Nam V. and Pham, Quang and Bui, Nghi D. 
Q.}, 27 | journal={arXiv preprint}, 28 | year={2024} 29 | } 30 | """ 31 | 32 | _DESCRIPTION = """\ 33 | CodeMMLU is a comprehensive benchmark designed to evaluate the capabilities of large language models (LLMs) in coding and software knowledge 34 | """ 35 | 36 | _HOMEPAGE = "https://fsoft-ai4code.github.io/codemmlu/" 37 | 38 | _URL = "./data/test" 39 | 40 | _SUBJECTS = [ 41 | "programming_syntax", "api_frameworks", 42 | "software_principles", "dbms_sql", "others", 43 | "code_completion", "fill_in_the_middle", "code_repair", "defect_detection" 44 | ] 45 | 46 | 47 | class CodeMMLU(datasets.GeneratorBasedBuilder): 48 | """CodeMMLU: A Multi-Task Benchmark for Assessing Code Understanding Capabilities""" 49 | # Version history: 50 | # 0.0.1: Initial release. 51 | VERSION = datasets.Version("0.0.1") 52 | 53 | BUILDER_CONFIGS = [ 54 | datasets.BuilderConfig( 55 | name=sub, version=datasets.Version("0.0.1"), 56 | description="CodeMMLU test subject {}".format(sub) 57 | ) for sub in _SUBJECTS 58 | ] 59 | 60 | 61 | def _info(self): 62 | features = datasets.Features( 63 | { 64 | "task_id": datasets.Value("string"), 65 | "question": datasets.Value("string"), 66 | "choices": datasets.features.Sequence(datasets.Value("string")), 67 | } 68 | ) 69 | 70 | if self.config.name == "fill_in_the_middle": 71 | features["problem_description"] = datasets.Value("string") 72 | 73 | return datasets.DatasetInfo( 74 | description=_DESCRIPTION, 75 | features=features, 76 | homepage=_HOMEPAGE, 77 | citation=_CITATION, 78 | ) 79 | 80 | def _split_generators(self, dl_manager): 81 | """Returns SplitGenerators.""" 82 | path = os.path.join(_URL, self.config.name + ".jsonl") 83 | dl_dir = dl_manager.download(path) 84 | return [ 85 | datasets.SplitGenerator( 86 | name=datasets.Split.TEST, 87 | gen_kwargs={"data_path": dl_dir}, 88 | ), 89 | ] 90 | 91 | def _generate_examples(self, data_path): 92 | """This function returns the examples in the raw (text) form.""" 93 | if data_path.endswith(".jsonl"): 94 | lines = open(data_path, "r", encoding="utf-8").readlines() 95 | reader = [json.loads(line) for line in lines] 96 | for idx, data in enumerate(reader): 97 | return_dict = { 98 | "task_id": data['task_id'], 99 | "question": data['question'], 100 | "choices": data['choices'], 101 | } 102 | 103 | if "fill_in_the_middle" in data_path: 104 | return_dict['problem_description'] = data['problem_description'] 105 | 106 | yield idx, return_dict 107 | -------------------------------------------------------------------------------- /data/submission_sample.txt: -------------------------------------------------------------------------------- 1 | 1 A 2 | 2 B 3 | 3 C 4 | 4 D -------------------------------------------------------------------------------- /paper/2410.01999v4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FSoft-AI4Code/CodeMMLU/2999a6a888734a9dc48cc84d8a77a8311e9ca245/paper/2410.01999v4.pdf -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "codemmlu" 7 | version = "0.0.2.1" 8 | authors = [ 9 | { name="Dung Manh Nguyen", email="dungnm.workspace@gmail.com" }, 10 | ] 11 | description = "CodeMMLU Evaluator: A framework for evaluating language models on CodeMMLU benchmark." 
12 | readme = "README.md" 13 | requires-python = ">=3.9" 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: MIT License", 17 | "Operating System :: OS Independent", 18 | ] 19 | dependencies = [ 20 | "transformers>=4.39.0", 21 | "datasets>=2.17.1", 22 | "accelerate>=0.27.2", 23 | "deepspeed>=0.13.2", 24 | "peft>=0.10.0", 25 | "vllm" 26 | ] 27 | 28 | [project.urls] 29 | "Homepage" = "https://fsoft-ai4code.github.io/codemmlu/" 30 | "Bug Tracker" = "https://github.com/FSoft-AI4Code/CodeMMLU/issues" 31 | 32 | [project.scripts] 33 | codemmlu = "codemmlu.__main__:main" 34 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.39.0 2 | accelerate==0.27.2 3 | bitsandbytes==0.42.0 4 | datasets==2.17.1 5 | deepspeed==0.13.2 6 | peft==0.10.0 7 | vllm -------------------------------------------------------------------------------- /src/codemmlu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FSoft-AI4Code/CodeMMLU/2999a6a888734a9dc48cc84d8a77a8311e9ca245/src/codemmlu/__init__.py -------------------------------------------------------------------------------- /src/codemmlu/__main__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pkg_resources 4 | 5 | from codemmlu.task_utils import ALL_TASK 6 | 7 | def get_args(): 8 | parser = argparse.ArgumentParser(description=f"{20*'='} CodeMMLU {20*'='}") 9 | 10 | parser.add_argument("-V", "--version", action="version", help="Get version", 11 | version=pkg_resources.get_distribution("codemmlu").version) 12 | 13 | # Data args 14 | parser.add_argument("--subset", default="programming_syntax", type=str, 15 | help='Select evaluate subset') 16 | parser.add_argument("--batch_size", default=16, type=int) 17 | parser.add_argument("--instruction_prefix", default="", type=str) 18 | parser.add_argument("--assistant_prefix", default="", type=str) 19 | parser.add_argument("--output_dir", default="./output", type=str, 20 | help='Save generation and result path') 21 | 22 | # Generation args 23 | parser.add_argument("--model_name", type=str, 24 | help='Local path or Huggingface Hub link to load model') 25 | parser.add_argument("--peft_model", default=None, type=str, 26 | help='Lora config') 27 | parser.add_argument("--backend", default="hf", type=str, 28 | help="LLM generation backend (default: hf)") 29 | parser.add_argument("--max_new_tokens", default=128, type=int, 30 | help='Number of max new tokens') 31 | parser.add_argument("--temperature", default=0.0, type=float) 32 | parser.add_argument("--prompt_mode", default='zeroshot', type=str, 33 | help='Prompt available: zeroshot, fewshot, cot_zs, cot_fs') 34 | parser.add_argument("--cache_dir", default=None, type=str, 35 | help='Cache for save model download checkpoint and dataset') 36 | parser.add_argument("--trust_remote_code", action='store_true') 37 | 38 | args = parser.parse_args() 39 | 40 | if not args.cache_dir: 41 | TRANSFORMER_CACHE = os.getenv("TRANSFORMER_CACHE") 42 | HF_HOME = os.getenv("HF_HOME") 43 | if TRANSFORMER_CACHE: 44 | args.cache_dir = TRANSFORMER_CACHE 45 | else: 46 | args.cache_dir = HF_HOME 47 | 48 | assert args.subset in ALL_TASK, f"Invalid subset name, expect {ALL_TASK}, but got {args.subset}" 49 | 50 | return args, parser 51 | 52 | 53 | def main(): 54 | 
args, parsre = get_args() 55 | if args.model_name: 56 | generate(args=args) 57 | else: 58 | parsre.print_help() 59 | 60 | 61 | def generate(args): 62 | from codemmlu.evaluator import Evaluator 63 | 64 | evaluator = Evaluator( 65 | subset=args.subset, 66 | model_name=args.model_name, 67 | peft_model=args.peft_model, 68 | backend=args.backend, 69 | batch_size=args.batch_size, 70 | cache_dir=args.cache_dir, 71 | output_dir=args.output_dir, 72 | trust_remote_code=args.trust_remote_code, 73 | instruction_prefix=args.instruction_prefix, 74 | assistant_prefix=args.assistant_prefix, 75 | prompt_mode=args.prompt_mode, 76 | ) 77 | 78 | evaluator.generate( 79 | temperature=args.temperature, 80 | max_new_tokens=args.max_new_tokens, 81 | ) 82 | 83 | print("======= Finish generated =======") 84 | 85 | if __name__ == '__main__': 86 | main() -------------------------------------------------------------------------------- /src/codemmlu/backends/__init__.py: -------------------------------------------------------------------------------- 1 | from codemmlu.backends.base import Backend 2 | from codemmlu.task_utils import CodeMMLU 3 | 4 | SUPPORTED_BACKENDS = ["vllm", "hf"] 5 | 6 | def make_model( 7 | model_name: str, 8 | backend: str, 9 | subset: str, 10 | split: str, 11 | output_dir: str, 12 | temperature: float = 0.0, 13 | max_new_tokens: int = 1280, 14 | batch_size: int = 16, 15 | prompt_mode: str = "zeroshot", 16 | # instruction model only 17 | instruction_prefix: str = None, 18 | assistant_prefix: str = None, 19 | trust_remote_code: bool = False, 20 | # peft model only 21 | peft_model: str = None, 22 | # cache dir 23 | cache_dir: str = None, 24 | 25 | ) -> Backend: 26 | # Load dataset 27 | dataset = CodeMMLU(subset=subset, 28 | split=split, 29 | prompt_mode=prompt_mode, 30 | instruction_prefix=instruction_prefix, 31 | assistant_prefix=assistant_prefix) 32 | 33 | # Initialize backend 34 | if backend == "vllm": 35 | from codemmlu.backends.vllm import VllmEngine 36 | 37 | return VllmEngine( 38 | model_name=model_name, 39 | peft_model=peft_model, 40 | dataset=dataset, 41 | temperature=temperature, 42 | batch_size=batch_size, 43 | max_new_tokens=max_new_tokens, 44 | trust_remote_code=trust_remote_code, 45 | cache_dir=cache_dir, 46 | output_dir=output_dir 47 | ) 48 | elif backend == "hf": 49 | from codemmlu.backends.hf import HuggingfaceEngine 50 | 51 | return HuggingfaceEngine( 52 | model_name=model_name, 53 | peft_model=peft_model, 54 | dataset=dataset, 55 | temperature=temperature, 56 | batch_size=batch_size, 57 | max_new_tokens=max_new_tokens, 58 | trust_remote_code=trust_remote_code, 59 | cache_dir=cache_dir, 60 | output_dir=output_dir 61 | ) 62 | else: 63 | raise ValueError(f"Unknown backend: {backend}") 64 | -------------------------------------------------------------------------------- /src/codemmlu/backends/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from codemmlu.task_utils import CodeMMLU 3 | 4 | class Backend(ABC): 5 | def __init__(self, 6 | dataset: CodeMMLU, 7 | model_name: str, 8 | temperature: float, 9 | max_new_tokens: int, 10 | peft_model: str = None, 11 | batch_size: int = 16, 12 | trust_remote_code: bool = False, 13 | cache_dir: str = None, 14 | output_dir: str='./'): 15 | print(f"Initializing {self.__class__.__name__} backend") 16 | print(f"Initializing a decoding model: {model_name}") 17 | 18 | self.TASK_NAME = dataset.TASK_NAME 19 | self.subset = dataset.subset 20 | self.split = dataset.split 21 | 
self.model_name = model_name 22 | self.batch_size = batch_size 23 | self.peft_model = peft_model 24 | self.cache_dir = cache_dir 25 | self.output_dir = output_dir 26 | self.dataset = dataset.prepare_dataset() 27 | self.temperature = temperature 28 | self.max_new_tokens = max_new_tokens 29 | self.trust_remote_code = trust_remote_code 30 | 31 | 32 | @abstractmethod 33 | def generate(self) -> str: 34 | raise NotImplementedError 35 | 36 | def get_dataset(self) -> CodeMMLU: 37 | return self.dataset 38 | -------------------------------------------------------------------------------- /src/codemmlu/backends/hf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from typing import Dict 4 | from tqdm import tqdm 5 | 6 | from accelerate import Accelerator 7 | from accelerate.utils import gather_object 8 | from transformers import ( 9 | GenerationConfig, 10 | AutoModelForCausalLM, 11 | AutoModelForSeq2SeqLM, 12 | AutoTokenizer 13 | ) 14 | 15 | from codemmlu.backends.base import Backend 16 | 17 | class HuggingfaceEngine(Backend): 18 | def __init__(self, model_name: str, **kwargs): 19 | super().__init__(model_name=model_name, **kwargs) 20 | self.accelerator = Accelerator() 21 | 22 | # TODO: add generation args 23 | generate_args = dict( 24 | temperature=self.temperature, 25 | max_new_tokens=self.max_new_tokens, 26 | ) 27 | self.generation_config = GenerationConfig(**generate_args) 28 | 29 | model_kwargs = dict( 30 | cache_dir=self.cache_dir, 31 | trust_remote_code=self.trust_remote_code, 32 | load_in_8bit=False 33 | ) 34 | try: 35 | self.model = AutoModelForCausalLM.from_pretrained( 36 | self.model_name, **model_kwargs) 37 | 38 | except KeyError: # Except load seq2seq model 39 | self.model = AutoModelForSeq2SeqLM.from_pretrained( 40 | self.model_name, **model_kwargs) 41 | 42 | if self.peft_model: 43 | from peft import PeftModel 44 | self.model = PeftModel.from_pretrained(self.model, self.peft_model) 45 | 46 | self.model.to(self.accelerator.device) 47 | 48 | self.tokenizer = AutoTokenizer.from_pretrained( 49 | self.model_name, 50 | trust_remote_code=self.trust_remote_code, 51 | padding_side="left" 52 | ) 53 | 54 | if not self.tokenizer.pad_token: 55 | print("Set EOS_TOKEN to PAD_TOKEN") 56 | self.tokenizer.pad_token = self.tokenizer.eos_token 57 | 58 | def generate(self) -> str: 59 | # ``Accelerate`` distribute data and model 60 | assert self.accelerator 61 | 62 | ds_loader = [self.dataset[i:i+self.batch_size] 63 | for i in range(0, len(self.dataset), self.batch_size)] 64 | 65 | for i in range(len(ds_loader)): 66 | question = ds_loader[i]['question'] 67 | ds_loader[i]['question_ids'] = self.tokenizer(question, return_tensors="pt", padding=True) 68 | 69 | result = [] 70 | with self.accelerator.split_between_processes(ds_loader, apply_padding=True) as batched_prompts: 71 | index = self.accelerator.process_index 72 | for batch in tqdm(batched_prompts, desc=f"Process: {index} | Generating", position=index): 73 | input_ids = batch['question_ids'].to(self.accelerator.device) 74 | outputs = self.model.generate(**input_ids, 75 | generation_config=self.generation_config, 76 | pad_token_id=self.tokenizer.eos_token_id, 77 | eos_token_id=self.tokenizer.eos_token_id) 78 | 79 | outputs = [output[len(prompt) :] for prompt, output in zip(input_ids["input_ids"], outputs)] 80 | batch_results = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) 81 | 82 | batch['generation'] = batch_results 83 | result.extend(batch['generation']) 84 | 
self._save_result(batch) 85 | 86 | 87 | result_gather = gather_object(result)[: len(self.dataset)] 88 | self.dataset = self.dataset.add_column('generation', result_gather) 89 | # TODO: process response and extract answer 90 | return self.dataset 91 | 92 | def _save_result(self, batched_outputs: Dict): 93 | assert 'question' in batched_outputs.keys() 94 | assert 'generation' in batched_outputs.keys() 95 | 96 | if self.accelerator.distributed_type == "MULTI_GPU": 97 | save_path = os.path.join(self.save_dir, 98 | f"{self.subset}.raw.generated.{self.accelerator.process_index}.jsonl") 99 | else: 100 | save_path = os.path.join(self.save_dir, f"{self.subset}.final.generated.jsonl") 101 | 102 | with open(save_path, "a") as writer: 103 | for idx in range(len(batched_outputs['question'])): 104 | res = dict( 105 | task_id=batched_outputs['task_id'][idx], 106 | prompt=batched_outputs['question'][idx], 107 | response=batched_outputs['generation'][idx] 108 | ) 109 | 110 | json.dump(res, writer) 111 | writer.write("\n") 112 | -------------------------------------------------------------------------------- /src/codemmlu/backends/vllm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from tqdm import tqdm 4 | from typing import Dict 5 | 6 | import torch 7 | from vllm import LLM, SamplingParams 8 | from vllm.lora.request import LoRARequest 9 | 10 | from codemmlu.backends.base import Backend 11 | 12 | class VllmEngine(Backend): 13 | def __init__(self, model_name: str, **kwargs): 14 | super().__init__(model_name=model_name, **kwargs) 15 | ngpus = torch.cuda.device_count() 16 | backend_kwargs = dict( 17 | disable_log_stats=True, 18 | tensor_parallel_size=ngpus, 19 | download_dir=self.cache_dir, 20 | trust_remote_code=self.trust_remote_code, 21 | ) 22 | 23 | self.model = LLM(self.model_name, 24 | enable_lora=True if self.peft_model else None, 25 | **backend_kwargs) 26 | 27 | self.lora_request = None 28 | if self.peft_model: 29 | self.lora_request=LoRARequest("lora", 1, self.peft_model) 30 | 31 | self.sampling_params = SamplingParams( 32 | max_tokens=self.max_new_tokens, 33 | temperature=self.temperature, 34 | ) 35 | 36 | def generate(self): 37 | ds_loader = [self.dataset[i:i+self.batch_size] 38 | for i in range(0, len(self.dataset), self.batch_size)] 39 | 40 | result = [] 41 | for batch in tqdm(ds_loader, total=len(ds_loader), desc="Generating"): 42 | outputs = self.model.generate(batch['question'], 43 | self.sampling_params, 44 | lora_request=self.lora_request) 45 | 46 | batch['generation'] = [output.outputs[0].text for output in outputs] 47 | result.extend(batch['generation']) 48 | self._save_result(batch) 49 | 50 | self.dataset = self.dataset.add_column('generation', result) 51 | # TODO: process response and extract answer 52 | return self.dataset 53 | 54 | def _save_result(self, batched_outputs: Dict): 55 | assert 'question' in batched_outputs.keys() 56 | assert 'generation' in batched_outputs.keys() 57 | 58 | save_path = os.path.join(self.output_dir, f"{self.subset}.final.generated.jsonl") 59 | 60 | with open(save_path, "a") as writer: 61 | for idx in range(len(batched_outputs['question'])): 62 | res = dict( 63 | task_id=batched_outputs['task_id'][idx], 64 | prompt=batched_outputs['question'][idx], 65 | response=batched_outputs['generation'][idx] 66 | ) 67 | 68 | json.dump(res, writer) 69 | writer.write("\n") 70 | -------------------------------------------------------------------------------- /src/codemmlu/evaluator.py: 
-------------------------------------------------------------------------------- 1 | """Evaluator to load CodeMMLU and extract answer from response. 2 | 3 | For example: 4 | 5 | .. code-block:: python 6 | 7 | >>> from codemmlu import Evaluator 8 | >>> evaluator = Evaluator(subset="semantic") 9 | >>> response = evaluator.generate(temperature=0.9, num_return_sequences=3) 10 | 11 | """ 12 | import os 13 | import sys 14 | import json 15 | import time 16 | from warnings import warn 17 | from typing import Optional, Dict, List 18 | 19 | import torch 20 | 21 | from codemmlu.backends import make_model, SUPPORTED_BACKENDS, Backend 22 | 23 | class Evaluator: 24 | """Evaluator class. 25 | 26 | :param model_name: Selected model for evaluating 27 | :type model_name: str 28 | :param peft_model: Adapter model, defaults to None 29 | :type peft_model: Optional[str], optional 30 | :param trust_remote_code: Huggingface argument, defaults to False 31 | :type trust_remote_code: Optional[bool], optional 32 | :param cache_dir: Downloaded cache directory, defaults to None 33 | :type cache_dir: Optional[str], optional 34 | :param batch_size: Generation batch size, defaults to 16 35 | :type batch_size: Optional[int], optional 36 | :param output_dir: Saving generation directory, defaults to "./output" 37 | :type output_dir: Optional[str], optional 38 | """ 39 | 40 | def __init__(self, 41 | model_name: str, 42 | subset: Optional[str] = None, 43 | split: Optional[str] = "test", 44 | peft_model: Optional[str] = None, 45 | backend: str = "hf", 46 | trust_remote_code: Optional[bool] = False, 47 | cache_dir: Optional[str] = None, 48 | batch_size: Optional[int] = 16, 49 | output_dir: Optional[str] = "./output", 50 | instruction_prefix: Optional[str] = "", 51 | assistant_prefix: Optional[str] = "", 52 | prompt_mode: Optional[str] = None, 53 | ) -> None: 54 | 55 | # Dataset args 56 | self.split = split 57 | self.subset = subset 58 | self.instruction_prefix = instruction_prefix 59 | self.assistant_prefix = assistant_prefix 60 | 61 | # Generation args 62 | self.backend = backend 63 | self.model_name = model_name 64 | self.peft_model = peft_model 65 | self.output_dir = output_dir 66 | self.trust_remote_code = trust_remote_code 67 | self.cache_dir = cache_dir 68 | self.batch_size = batch_size 69 | self.prompt_mode = prompt_mode 70 | 71 | if backend not in SUPPORTED_BACKENDS: 72 | raise ValueError(f"Backend {backend} is not supported. Please choose from {SUPPORTED_BACKENDS}") 73 | 74 | os.makedirs(self.output_dir, exist_ok=True) 75 | 76 | 77 | def generate(self, 78 | max_new_tokens: int = 1024, 79 | temperature: float = 0.0, 80 | ) -> List: 81 | """Start backend, generate and extract answer from response 82 | 83 | :param max_new_tokens: Max new tokens, defaults to 256 84 | :type max_new_tokens: Optional[int], optional 85 | :param temperature: Model generate temperature, defaults to 0.9 86 | :type temperature: Optional[float], optional 87 | 88 | :return: List of generated result, stored in dictionary object 89 | with ``task_id``, ``prompt`` and ``answer`` key. 
90 | :rtype: List 91 | """ 92 | self.engine : Backend = make_model( 93 | subset=self.subset, 94 | split=self.split, 95 | model_name=self.model_name, 96 | backend=self.backend, 97 | peft_model=self.peft_model, 98 | trust_remote_code=self.trust_remote_code, 99 | batch_size=self.batch_size, 100 | temperature=temperature, 101 | max_new_tokens=max_new_tokens, 102 | cache_dir=self.cache_dir, 103 | instruction_prefix=self.instruction_prefix, 104 | assistant_prefix=self.assistant_prefix, 105 | output_dir=self.output_dir, 106 | prompt_mode=self.prompt_mode, 107 | ) 108 | 109 | 110 | print(f"Evaluating task: [{self.engine.TASK_NAME}]") 111 | print(f"pt={torch.__version__}, cuda={torch.version.cuda}, nccl={torch.cuda.nccl.version()}") 112 | print(f"device compute capabilities={torch.cuda.get_device_capability()}") 113 | print(f"pytorch compute capabilities={torch.cuda.get_arch_list()}") 114 | 115 | start_time = time.time() 116 | results = self.engine.generate() 117 | 118 | print("======= Finished {} =======".format(self.engine.TASK_NAME)) 119 | print("Completion time: %d s", (time.time() - start_time)) 120 | 121 | return results 122 | 123 | 124 | def acc_evaluate( 125 | subset: str, 126 | response_path: str): 127 | 128 | pass -------------------------------------------------------------------------------- /src/codemmlu/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | from codemmlu.prompts._general import GENERAL_PROMPT 2 | from codemmlu.prompts._codecomp import CODECOMP_PROMPT 3 | from codemmlu.prompts._fim import FIM_PROMPT 4 | from codemmlu.prompts._coderepair import CODEREPAIR_PROMPT 5 | from codemmlu.prompts._defect import DEFECT_PROMPT 6 | 7 | __all__ = [ 8 | "GENERAL_PROMPT", 9 | "CODECOMP_PROMPT", 10 | "FIM_PROMPT", 11 | "CODEREPAIR_PROMPT", 12 | "DEFECT_PROMPT" 13 | ] -------------------------------------------------------------------------------- /src/codemmlu/prompts/_codecomp.py: -------------------------------------------------------------------------------- 1 | 2 | zeroshot = """The following are multiple choice questions (with answers) about 3 | programming problem. 4 | 5 | Question: Which solution below is the most likely completion the following 6 | code snippet to achieve the desired goal? 7 | {question} 8 | 9 | {choices} 10 | 11 | Answer: """ 12 | 13 | fewshot = """The following are multiple choice questions (with answers) about 14 | programming problem. 15 | 16 | Question: Which solution below is the most likely completion the following 17 | code snippet to achieve the desired goal? 18 | ```python 19 | from typing import List 20 | 21 | def two_sum(nums: List[int], target: int) -> List[int]: 22 | ''' 23 | Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. 24 | You may assume that each input would have exactly one solution, and you may not use the same element twice. 
25 | 26 | >>> two_sum([2,7,11,15], 9) 27 | [0,1] 28 | >>> two_sum([3,2,4], 6) 29 | [1,2] 30 | >>> two_sum([3,3], 6) 31 | [0,1] 32 | ''' 33 | ``` 34 | (A) ```python 35 | n = len(nums) 36 | for i in range(n - 1): 37 | for j in range(i + 1, n): 38 | if nums[i] + nums[j] == target: 39 | return [i, j] 40 | return [] 41 | ``` 42 | (B) ```python 43 | for num in nums: 44 | if target - num in nums: 45 | return [nums.index(num), nums.index(target - num)] 46 | return [] 47 | ``` 48 | (C) ```python 49 | for i in range(len(nums)): 50 | if nums[i] * 2 == target: 51 | return [i, i] 52 | return [] 53 | ``` 54 | (D) ```python 55 | num_dict = {{}} 56 | for i, num in enumerate(nums): 57 | if target - num in num_dict: 58 | return [num_dict[target - num], i] 59 | num_dict[i] = num 60 | return [] 61 | ``` 62 | Answer: The answer is (A). 63 | 64 | Question: Which solution below is the most likely completion the following 65 | code snippet to achieve the desired goal? 66 | ```python 67 | {question} 68 | ``` 69 | 70 | {choices} 71 | 72 | Answer: """ 73 | 74 | cot_zs = '''The following are multiple choice questions (with answers) about 75 | programming problem. 76 | 77 | Question: Which solution below is the most likely completion the following 78 | code snippet to achieve the desired goal? 79 | ```python 80 | {question} 81 | ``` 82 | {choices} 83 | 84 | Answer: Let's think step by step. ''' 85 | 86 | cot_fs = """The following are multiple choice questions (with answers) about 87 | programming problem. 88 | 89 | Question: Which solution below is the most likely completion the following 90 | code snippet to achieve the desired goal? 91 | ```python 92 | from typing import List 93 | 94 | def two_sum(nums: List[int], target: int) -> List[int]: 95 | ''' 96 | Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. 97 | You may assume that each input would have exactly one solution, and you may not use the same element twice. 98 | 99 | >>> two_sum([2,7,11,15], 9) 100 | [0,1] 101 | >>> two_sum([3,2,4], 6) 102 | [1,2] 103 | >>> two_sum([3,3], 6) 104 | [0,1] 105 | ''' 106 | ``` 107 | (A) ```python 108 | n = len(nums) 109 | for i in range(n - 1): 110 | for j in range(i + 1, n): 111 | if nums[i] + nums[j] == target: 112 | return [i, j] 113 | return [] 114 | ``` 115 | (B) ```python 116 | for num in nums: 117 | if target - num in nums: 118 | return [nums.index(num), nums.index(target - num)] 119 | return [] 120 | ``` 121 | (C) ```python 122 | for i in range(len(nums)): 123 | if nums[i] * 2 == target: 124 | return [i, i] 125 | return [] 126 | ``` 127 | (D) ```python 128 | num_dict = {{}} 129 | for i, num in enumerate(nums): 130 | if target - num in num_dict: 131 | return [num_dict[target - num], i] 132 | num_dict[i] = num 133 | return [] 134 | ``` 135 | 136 | Answer: Let's think step by step. The answer (A) uses a straightforward brute-force approach by checking every possible pair of indices to see if their corresponding values sum to the target. While this method has a time complexity of O(n^2), it is simple and guaranteed to find the correct solution for small input sizes, as it exhaustively evaluates all pairs. This solution works reliably within the problem's constraints and ensures the correct indices are returned when the target sum is found. The other solutions have issues such as incorrect handling of duplicate values or incorrect logic (as in C) that disqualify them. 137 | The answer is (A). 
138 | 139 | Question: Which solution below is the most likely completion the following 140 | code snippet to achieve the desired goal? 141 | ```python 142 | {question} 143 | ``` 144 | {choices} 145 | 146 | Answer: Let's think step by step. """ 147 | 148 | CODECOMP_PROMPT = dict(zeroshot=zeroshot, 149 | fewshot=fewshot, 150 | cot_zs=cot_zs, 151 | cot_fs=cot_fs) -------------------------------------------------------------------------------- /src/codemmlu/prompts/_coderepair.py: -------------------------------------------------------------------------------- 1 | 2 | zeroshot = """The following are multiple-choice questions (with answers) about debugging a programming problem. 3 | 4 | Question: The implementation below is producing incorrect results. 5 | Which solution below correctly identifies the bug and repairs it to achieve the desired goal? 6 | {question} 7 | 8 | {choices} 9 | 10 | Answer: """ 11 | 12 | fewshot = """The following are multiple-choice questions (with answers) about debugging a programming problem. 13 | 14 | Question: The implementation below is producing incorrect results. 15 | Which solution below correctly identifies the bug and repairs it to achieve the desired goal? 16 | 17 | 1 def two_sum(nums, target): 18 | 2 complement_map = {{}} 19 | 3 for i, num in enumerate(nums): 20 | 4 complement = target - num 21 | 5 complement_map[num] = i 22 | 6 if complement in complement_map: 23 | 7 return [complement_map[complement], i] 24 | 8 return None 25 | 26 | (A) Remove line 5. 27 | 28 | (B) Remove line 5. Add at line 7: 29 | ``` complement_map[num] = i``` 30 | 31 | (C) Modify line 7: 32 | ``` return [i, complement_map[complement]]``` 33 | 34 | (D) Remove line 5. Add at line 7: 35 | ``` if i == len(nums) - 1: 36 | return None 37 | complement_map[num] = i``` 38 | 39 | Answer: The answer is (B). 40 | 41 | Question: The implementation below is producing incorrect results. 42 | Which solution below correctly identifies the bug and repairs it to achieve the desired goal? 43 | {question} 44 | 45 | {choices} 46 | 47 | Answer: """ 48 | 49 | cot_zs = zeroshot + "Let's think step by step. " 50 | 51 | cot_fs = """The following are multiple-choice questions (with answers) about debugging a programming problem. 52 | 53 | Question: The implementation below is producing incorrect results. 54 | Which solution below correctly identifies the bug and repairs it to achieve the desired goal? 55 | 56 | 1 def two_sum(nums, target): 57 | 2 complement_map = {{}} 58 | 3 for i, num in enumerate(nums): 59 | 4 complement = target - num 60 | 5 complement_map[num] = i 61 | 6 if complement in complement_map: 62 | 7 return [complement_map[complement], i] 63 | 8 return None 64 | 65 | (A) Remove line 5. 66 | 67 | (B) Remove line 5. Add at line 7: 68 | ``` complement_map[num] = i``` 69 | 70 | (C) Modify line 7: 71 | ``` return [i, complement_map[complement]]``` 72 | 73 | (D) Remove line 5. Add at line 7: 74 | ``` if i == len(nums) - 1: 75 | return None 76 | complement_map[num] = i``` 77 | 78 | Answer: Let's think step by step. The bug in the code occurs because the current number is added to the complement_map before checking if its complement already exists, which can lead to incorrectly matching a number with itself. To fix this, the number should only be added to the map after checking for its complement. Solution (B) does exactly this by moving the line that adds the current number to the map after the complement check, ensuring the logic works as intended without self-matching errors. 79 | The answer is (B). 
80 | 81 | Question: The implementation below is producing incorrect results. 82 | Which solution below correctly identifies the bug and repairs it to achieve the desired goal? 83 | {question} 84 | 85 | {choices} 86 | 87 | Answer: Let's think step by step. """ 88 | 89 | CODEREPAIR_PROMPT = dict(zeroshot=zeroshot, 90 | fewshot=fewshot, 91 | cot_zs=cot_zs, 92 | cot_fs=cot_fs) -------------------------------------------------------------------------------- /src/codemmlu/prompts/_defect.py: -------------------------------------------------------------------------------- 1 | zeroshot = """The following are multiple choice questions (with answers) about programming problem. 2 | 3 | Question: Given a code snippet below, which behavior most likely to occur when execute it? 4 | {question} 5 | 6 | {choices} 7 | 8 | Answer: """ 9 | 10 | fewshot = """The following are multiple choice questions (with answers) about programming problem. 11 | 12 | Question: Given a code snippet below, which behavior most likely to occur when execute it? 13 | ```python 14 | def chkPair(A, size, x): 15 | for i in range(0, size - 1): 16 | for j in range(i + 1, size): 17 | if (A[i] + A[j] == x): 18 | return 1 19 | return 0 20 | 21 | ``` 22 | 23 | (A). The code contain no issue. 24 | (B). Memory Limit Exceeded 25 | (C). Internal error 26 | (D). Runtime Error 27 | 28 | Answer: The answer is (A). 29 | 30 | Question: Given a code snippet below, which behavior most likely to occur when execute it? 31 | {question} 32 | 33 | {choices} 34 | 35 | Answer: """ 36 | 37 | cot_zs = zeroshot + "Let's think step by step. " 38 | 39 | cot_fs = """The following are multiple choice questions (with answers) about programming problem. 40 | 41 | Question: Given a code snippet below, which behavior most likely to occur when execute it? 42 | ```python 43 | def chkPair(A, size, x): 44 | for i in range(0, size - 1): 45 | for j in range(i + 1, size): 46 | if (A[i] + A[j] == x): 47 | return 1 48 | return 0 49 | 50 | ``` 51 | 52 | (A). The code contain no issue. 53 | (B). Memory Limit Exceeded 54 | (C). Internal error 55 | (D). Runtime Error 56 | 57 | Answer: Let's think step by step. The code defines a function `chkPair` that checks for a pair of elements in an array A whose sum equals x. It uses two nested loops to iterate over all possible pairs and returns 1 if a valid pair is found, or 0 otherwise. The function has a time complexity of O(n^2) due to the nested loops, which could slow down performance for large inputs, but it doesn't involve excessive memory usage or problematic operations that would lead to errors like memory limit exceeded, runtime errors, or internal issues. Hence, the most likely outcome is that the code contains no issue. 58 | The answer is (A). 59 | 60 | Question: Given a code snippet below, which behavior most likely to occur when execute it? 61 | {question} 62 | 63 | {choices} 64 | 65 | Answer: Let's think step by step.""" 66 | 67 | DEFECT_PROMPT = dict(zeroshot=zeroshot, 68 | fewshot=fewshot, 69 | cot_zs=cot_zs, 70 | cot_fs=cot_fs) -------------------------------------------------------------------------------- /src/codemmlu/prompts/_fim.py: -------------------------------------------------------------------------------- 1 | zeroshot = """The following are multiple-choice questions (with answers) about a programming problem with incomplete solution. 
2 | 3 | Problem statement: {problem_description} 4 | 5 | Incomplete Solution: 6 | {question} 7 | 8 | Question: The provided solution is missing a part, Which option below is the most likely to complete the solution and achieve the desired goal? 9 | 10 | {choices} 11 | 12 | Answer: """ 13 | 14 | fewshot = """The following are multiple-choice questions (with answers) about a programming problem with incomplete solution. 15 | 16 | Problem statement: You are given an array of intervals, where intervals[i] = [starti, endi] and each starti is unique. 17 | The right interval for an interval i is an interval j such that startj >= endi and startj is minimized. 18 | Note that i may equal j. Return an array of right interval indices for each interval i. 19 | If no right interval exists for interval i, then put -1 at index i. 20 | 21 | Incomplete Solution: 22 | python``` 23 | def find_right_interval(intervals): 24 | n = len(intervals) 25 | res = [-1] * n 26 | for i in range(n): 27 | intervals[i].append(i) 28 | 29 | def binary_search(ele): 30 | left, right = 0, n-1 31 | ans = float('inf') 32 | while left <= right: 33 | mid = (left + right) // 2 34 | if intervals[mid][0] >= ele: 35 | ans = min(ans, mid) 36 | right = mid - 1 37 | else: 38 | left = mid + 1 39 | return ans 40 | 41 | intervals.sort() 42 | for i in intervals: 43 | _________________ 44 | 45 | return res 46 | ``` 47 | 48 | Question: The provided solution is missing a part, Which option below is the most likely to complete the solution and achieve the desired goal? 49 | 50 | (A) ```python 51 | val = binary_search(i[1]) 52 | if val != float('inf'): 53 | res[i[2]] = intervals[val][2] 54 | ``` 55 | (B) ```python 56 | if val != float('inf'): 57 | res[i[2]] = intervals[val][2] 58 | else: 59 | continue 60 | ``` 61 | (C) ```python 62 | val = binary_search(i[1]) 63 | if val != float('inf'): res[i[2] + 1] = intervals[val][2] 64 | ``` 65 | (D) ```python 66 | if val != float('inf'): 67 | res[i[2]] = intervals[val][2] 68 | else: 69 | continue 70 | ``` 71 | 72 | Answer: The answer is (A). 73 | 74 | Problem statement: {problem_description} 75 | 76 | Incomplete Solution: 77 | {question} 78 | 79 | Question: The provided solution is missing a part, Which option below is the most likely to complete the solution and achieve the desired goal? 80 | 81 | {choices} 82 | 83 | Answer: """ 84 | 85 | cot_zs = """The following are multiple-choice questions (with answers) about a programming problem with incomplete solution. 86 | 87 | Problem statement: {problem_description} 88 | 89 | Incomplete Solution: 90 | {question} 91 | 92 | Question: The provided solution is missing a part, Which option below is the most likely to 93 | complete the solution and achieve the desired goal? 94 | 95 | {choices} 96 | 97 | Answer: Let's think step by step. """ 98 | 99 | cot_fs = """The following are multiple-choice questions (with answers) about a programming problem 100 | with incomplete solution. 101 | 102 | Problem statement: You are given an array of intervals, where intervals[i] = [starti, endi] and each starti is unique. 103 | The right interval for an interval i is an interval j such that startj >= endi and startj is minimized. 104 | Note that i may equal j. Return an array of right interval indices for each interval i. 105 | If no right interval exists for interval i, then put -1 at index i. 
106 | 107 | Incomplete Solution: 108 | python``` 109 | def find_right_interval(intervals): 110 | n = len(intervals) 111 | res = [-1] * n 112 | for i in range(n): 113 | intervals[i].append(i) 114 | 115 | def binary_search(ele): 116 | left, right = 0, n-1 117 | ans = float('inf') 118 | while left <= right: 119 | mid = (left + right) // 2 120 | if intervals[mid][0] >= ele: 121 | ans = min(ans, mid) 122 | right = mid - 1 123 | else: 124 | left = mid + 1 125 | return ans 126 | 127 | intervals.sort() 128 | for i in intervals: 129 | _________________ 130 | 131 | return res 132 | ``` 133 | 134 | Question: The provided solution is missing a part, Which option below is the most likely to 135 | complete the solution and achieve the desired goal? 136 | 137 | (A) ```python 138 | val = binary_search(i[1]) 139 | if val != float('inf'): 140 | res[i[2]] = intervals[val][2] 141 | ``` 142 | (B) ```python 143 | if val != float('inf'): 144 | res[i[2]] = intervals[val][2] 145 | else: 146 | continue 147 | ``` 148 | (C) ```python 149 | val = binary_search(i[1]) 150 | if val != float('inf'): res[i[2] + 1] = intervals[val][2] 151 | ``` 152 | (D) ```python 153 | if val != float('inf'): 154 | res[i[2]] = intervals[val][2] 155 | else: 156 | continue 157 | ``` 158 | 159 | Answer: Let's think step by step. The incomplete solution first sorts the intervals and then iterates over the sorted intervals. For each interval, it finds the right interval using a binary search. 160 | This option (A) finds the right interval index using the binary search and updates the result array accordingly. 161 | The option (B) is similar to (A), but it does not increment the index when finding the right interval index. This could lead to incorrect results. 162 | The option (C) increments the index when finding the right interval index. However, this is incorrect because the problem statement asks for the index of the right interval, not the offset from the original index. 163 | The option (D) uses the same index for both the original interval and the right interval, which could lead to incorrect results. 164 | The answer is (A). 165 | 166 | Problem statement: {problem_description} 167 | 168 | Incomplete Solution: 169 | {question} 170 | 171 | Question: The provided solution is missing a part, Which option below is the most likely to 172 | complete the solution and achieve the desired goal? 173 | 174 | {choices} 175 | 176 | Answer: Let's think step by step. """ 177 | 178 | FIM_PROMPT = dict(zeroshot=zeroshot, 179 | fewshot=fewshot, 180 | cot_zs=cot_zs, 181 | cot_fs=cot_fs) -------------------------------------------------------------------------------- /src/codemmlu/prompts/_general.py: -------------------------------------------------------------------------------- 1 | zeroshot = """The following are multiple choice questions (with answers) about software development. 2 | 3 | Question: {question} 4 | {choices} 5 | 6 | Answer: """ 7 | 8 | fewshot = """The following are multiple choice questions (with answers) about software development. 9 | 10 | Question: If a sorted array of integers is guaranteed to not contain duplicate values, 11 | in order to search a for a specific value which of the following algorithms is the most efficient for this task? 12 | 13 | (A) Bubble Sort (B) Linear Search (C) Insertion Sort (D) Binary Search 14 | 15 | Answer: The answer is (D). 16 | 17 | Question: {question} 18 | {choices} 19 | 20 | Answer: """ 21 | 22 | cot_zs = """The following are multiple choice questions (with answers) about software devopment. 
23 | 24 | Question: {question} 25 | {choices} 26 | 27 | Answer: Let's think step by step. """ 28 | 29 | cot_fs = '''The following are multiple choice questions (with answers) about software devopment. 30 | 31 | Question: If a sorted array of integers is guaranteed to not contain duplicate values, in order to search a for a specific value which of the following algorithms is the most efficient for this task? 32 | 33 | (A) Bubble Sort (B) Linear Search (C) Insertion Sort (D) Binary Search 34 | 35 | Answer: Let's think step by step. Binary Search is a divide-and-conquer algorithm that works by repeatedly dividing the search interval in half and searching for the value in the appropriate half. Since the array is already sorted and does not contain any duplicate value, this algorithm is optimal to find the desired value. The answer is (D). 36 | 37 | Question: {question} 38 | {choices} 39 | 40 | Answer: Let's think step by step. ''' 41 | 42 | GENERAL_PROMPT = dict(zeroshot=zeroshot, 43 | fewshot=fewshot, 44 | cot_zs=cot_zs, 45 | cot_fs=cot_fs) -------------------------------------------------------------------------------- /src/codemmlu/task_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import json 4 | import glob 5 | from string import ascii_uppercase 6 | from typing import Optional 7 | from datasets import Dataset, load_dataset 8 | 9 | from codemmlu.prompts import GENERAL_PROMPT, CODECOMP_PROMPT, FIM_PROMPT, CODEREPAIR_PROMPT, DEFECT_PROMPT 10 | 11 | 12 | SEMANTIC_TASK = ["software_principles", "dbms_sql", "others"] 13 | 14 | SYNTACTIC_TASK = ["programming_syntax", "api_frameworks"] 15 | 16 | REALWORLD_TASK = ["code_completion", "fill_in_the_middle", "code_repair", "defect_detection"] 17 | 18 | ALL_TASK = SEMANTIC_TASK + SYNTACTIC_TASK + REALWORLD_TASK 19 | 20 | 21 | def get_prompt(subset: str, prompt_mode: str) -> str: 22 | """Get prompt for a given task.""" 23 | assert prompt_mode in ["zeroshot", "fewshot", "cot_zs", "cot_fs"] 24 | 25 | if subset in SEMANTIC_TASK + SYNTACTIC_TASK: 26 | return GENERAL_PROMPT[prompt_mode] 27 | else: 28 | if subset == "code_completion": 29 | return CODECOMP_PROMPT[prompt_mode] 30 | elif subset == "fill_in_the_middle": 31 | return FIM_PROMPT[prompt_mode] 32 | elif subset == "code_repair": 33 | return CODEREPAIR_PROMPT[prompt_mode] 34 | elif subset == "defect_detection": 35 | return DEFECT_PROMPT[prompt_mode] 36 | else: 37 | raise ValueError(f"Invalid subset: {subset}") 38 | 39 | 40 | class CodeMMLU: 41 | """CodeMMLU benchmark loader.""" 42 | TASK_NAME = "codemmlu" 43 | DATASET_NAME_OR_PATH = "Fsoft-AIC/codemmlu" 44 | 45 | def __init__(self, 46 | split: str, 47 | subset: str, 48 | prompt_mode: str = "zeroshot", 49 | instruction_prefix: Optional[str] = "", 50 | assistant_prefix: Optional[str] = "") -> None: 51 | 52 | self.stop_words = ['\n\nQ:', '\n\nQuestion:', '\n\n###', '\n#', "\n<|/", "\n```"] 53 | self.instruction_prefix = instruction_prefix 54 | self.assistant_prefix = assistant_prefix 55 | self.split = split 56 | self.subset = subset 57 | self.prompt_mode = prompt_mode 58 | 59 | self.dataset = load_dataset(self.DATASET_NAME_OR_PATH, subset, 60 | split=split, use_auth_token=True) 61 | 62 | def __len__(self): 63 | return len(self.dataset) 64 | 65 | def get_dataset(self) -> Dataset: 66 | return self.dataset 67 | 68 | def prepare_dataset(self) -> Dataset: 69 | """Preprocess CodeMMLU question. 70 | 71 | - Default CodeMMLU prompt is zeroshot. 
All support prompt modes are: 72 | - zeroshot 73 | - fewshot 74 | - cot_zs (Chain-of-Thought zershot) 75 | - cot_fs (Chain-of-Thought fewshot) 76 | """ 77 | 78 | TEMPLATE = get_prompt(self.subset, self.prompt_mode) 79 | 80 | def _preprocess(example): 81 | model_inputs = dict(task_id=[], question=[]) 82 | 83 | # for idx in range(len(examples[key_column])): 84 | # question = examples[key_column][idx] 85 | task_id = example.pop('task_id') 86 | example['choices'] = "\n".join([f"({ascii_uppercase[idx]}) {choice}" for idx, choice in enumerate(example['choices'])]) 87 | 88 | # MODEL INPUTS HERE 89 | question = TEMPLATE.format(**example) 90 | question = self.instruction_prefix + question + self.assistant_prefix 91 | model_inputs['question'] = question 92 | model_inputs['task_id'] = task_id 93 | 94 | return model_inputs 95 | 96 | preprocessed_ds = self.dataset.map(_preprocess, 97 | batched=False, 98 | remove_columns=self.dataset.column_names) 99 | 100 | print(f"Preprocessed dataset: {preprocessed_ds}") 101 | # Visualize 3 sample 102 | print("Preprocessed prompts:") 103 | for i in range(3): 104 | print(preprocessed_ds['question'][i]) 105 | return preprocessed_ds 106 | 107 | 108 | @staticmethod 109 | def _stop_at_stop_token(decoded_string, stop_tokens): 110 | """ 111 | Produces the prefix of decoded_string that ends at the first occurrence of 112 | a stop_token. 113 | WARNING: the decoded_string *must not* include the prompt, 114 | which may have stop tokens itself. 115 | """ 116 | min_stop_index = len(decoded_string) 117 | for stop_token in stop_tokens: 118 | stop_index = decoded_string.find(stop_token) 119 | if stop_index != -1 and stop_index < min_stop_index: 120 | min_stop_index = stop_index 121 | return decoded_string[:min_stop_index] 122 | 123 | def process_response(self, example): 124 | answer = self._stop_at_stop_token(example, self.stop_words) 125 | 126 | # Substitute special characters with empty string 127 | answer = re.sub(r'[^A-Za-z0-9 \n]', "", answer) 128 | new_answer = [] 129 | for item in answer.splitlines(): 130 | for subitem in item.split(" "): 131 | if len(subitem) != 1: 132 | new_answer.append(subitem.lower()) 133 | else: 134 | new_answer.append(subitem) 135 | 136 | new_answer = ' '.join(new_answer) 137 | new_answer = re.sub(r'\s+', ' ', new_answer).strip() 138 | 139 | return new_answer 140 | 141 | def parse_answer(self, example): 142 | """Answer extract function. 143 | 144 | Args: 145 | example (str): The example to extract the answer from 146 | Returns: 147 | str: The extracted answer 148 | """ 149 | extract = re.search(r"answer is (\(*[A-E][\).]*)", example, flags=re.IGNORECASE) 150 | if extract: 151 | return extract.group(1).replace("(", "").replace(")", "").replace(".", "").strip() 152 | 153 | 154 | extract = re.search(r"(\(*[A-E][\).]*) is correct", example, flags=re.IGNORECASE) 155 | if extract: 156 | return extract.group(1).replace("(", "").replace(")", "").replace(".", "").strip() 157 | 158 | 159 | match = re.findall(r"(A|B|C|D|E)", example) 160 | 161 | if match: 162 | # if len(match) > 1: 163 | # return None 164 | return list(match)[0] # Take the first one 165 | return None 166 | --------------------------------------------------------------------------------
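As a closing illustration of how choice letters are recovered from raw model responses, the sketch below mirrors the "answer is (X)" pattern and the bare-letter fallback used by `CodeMMLU.parse_answer` above (the sample responses are invented):

```python
import re

def extract_choice(response: str):
    # Prefer an explicit "The answer is (B)."-style conclusion, case-insensitive.
    m = re.search(r"answer is (\(*[A-E][\).]*)", response, flags=re.IGNORECASE)
    if m:
        return m.group(1).replace("(", "").replace(")", "").replace(".", "").strip()
    # Otherwise fall back to the first bare A-E letter, as the benchmark code does.
    letters = re.findall(r"(A|B|C|D|E)", response)
    return letters[0] if letters else None

print(extract_choice("Let's think step by step. ... The answer is (B)."))  # -> B
print(extract_choice("I would go with option C here."))                    # -> C
```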