├── .gitignore ├── HISTORY.md ├── LICENSE ├── README.md ├── asset └── code_mmlu_banner.png ├── data ├── README.md ├── codemmlu │ └── codemmlu.py └── submission_sample.txt ├── paper └── 2410.01999v4.pdf ├── pyproject.toml ├── requirements.txt └── src └── codemmlu ├── __init__.py ├── __main__.py ├── backends ├── __init__.py ├── base.py ├── hf.py └── vllm.py ├── evaluator.py ├── prompts ├── __init__.py ├── _codecomp.py ├── _coderepair.py ├── _defect.py ├── _fim.py └── _general.py └── task_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | 164 | *.json 165 | *.jsonl -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | # Releases 2 | 3 | ## Version 0.0.1 4 | Release date: 14 Oct, 2024 5 | - Support backends: native huggingface, VLLMs 6 | - Support model's checkpoint from huggingface 7 | - Support lora model -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright © 2024 FPT Software AI Center 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | 6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | 8 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CodeMMLU: A Multi-Task Benchmark for Assessing Code Understanding Capabilities 2 | 3 |
 4 | [banner image: asset/code_mmlu_banner.png ("CodeMMLU")]
 5 |
16 | 📰 News • 🚀 Quick Start • 📋 Evaluation • 📌 Citation
21 | 22 | ## 📌 About 23 | 24 | ### CodeMMLU 25 | 26 | **CodeMMLU** is a comprehensive benchmark designed to evaluate the capabilities of large language models (LLMs) in coding and software knowledge. 27 | It builds upon the structure of multiple-choice question answering (MCQA) to cover a wide range of programming tasks and domains, including code generation, defect detection, software engineering principles, and much more. 28 | 29 | ### Why CodeMMLU? 30 | 31 | - **CodeMMLU** comprises over 10,000 questions curated from diverse, high-quality sources. It covers a wide spectrum of software knowledge, including general QA, code generation, defect detection, and code repair across various domains and more than 10 programming languages. 32 | 33 | - **Precise and comprehensive:** Checkout our [LEADERBOARD](https://fsoft-ai4code.github.io/codemmlu/) for latest LLM rankings. 34 | 35 | ## 🚀 Quick Start 36 | 37 | Install CodeMMLU and setup dependencies via `pip`: 38 | ```bash 39 | pip install codemmlu 40 | ``` 41 | 42 | Generate response for CodeMMLU MCQs benchmark: 43 | ```bash 44 | codemmlu --model_name \ 45 | --subset \ 46 | --backend \ 47 | --output_dir 48 | ``` 49 | 50 | 51 | ## 📋 Evaluation 52 | 53 | Build `codemmlu` from source: 54 | ```bash 55 | git clone https://github.com/Fsoft-AI4Code/CodeMMLU.git 56 | cd CodeMMLU 57 | pip install -e . 58 | ``` 59 | 60 | > [!Note] 61 | > 62 | > If you prefer `vllm` backend, we highly recommend you install [vllm from official project](https://github.com/vllm-project/vllm/) before install `codemmlu`. 63 | 64 | Generating with CodeMMLU questions: 65 | ```bash 66 | codemmlu --model_name \ 67 | --peft_model \ 68 | --subset all \ 69 | --batch_size 16 \ 70 | --backend [vllm|hf] \ 71 | --max_new_tokens 1024 \ 72 | --temperature 0.0 \ 73 | --output_dir \ 74 | --instruction_prefix \ 75 | --assistant_prefix \ 76 | --cache_dir 77 | ``` 78 | 79 |
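For example, a concrete zero-shot run with the default `hf` backend might look like this (the checkpoint name is only a placeholder; any local path or Hugging Face Hub ID accepted by `--model_name` should work):

```bash
codemmlu --model_name deepseek-ai/deepseek-coder-1.3b-instruct \
    --subset programming_syntax \
    --backend hf \
    --batch_size 16 \
    --prompt_mode zeroshot \
    --max_new_tokens 1024 \
    --temperature 0.0 \
    --output_dir ./output
```

Each generated record is saved in JSONL form with `task_id`, `prompt`, and `response` fields, so answer extraction and scoring can be run as a separate step.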
⏬ API Usage :: click to expand :: 80 |
81 | 82 | ```bash 83 | codemmlu [-h] [-V] [--subset SUBSET] [--batch_size BATCH_SIZE] [--instruction_prefix INSTRUCTION_PREFIX] 84 | [--assistant_prefix ASSISTANT_PREFIX] [--output_dir OUTPUT_DIR] [--model_name MODEL_NAME] 85 | [--peft_model PEFT_MODEL] [--backend BACKEND] [--max_new_tokens MAX_NEW_TOKENS] 86 | [--temperature TEMPERATURE] [--prompt_mode PROMPT_MODE] [--cache_dir CACHE_DIR] [--trust_remote_code] 87 | 88 | ==================== CodeMMLU ==================== 89 | 90 | optional arguments: 91 | -h, --help show this help message and exit 92 | -V, --version Get version 93 | --subset SUBSET Select evaluate subset 94 | --batch_size BATCH_SIZE 95 | --instruction_prefix INSTRUCTION_PREFIX 96 | --assistant_prefix ASSISTANT_PREFIX 97 | --output_dir OUTPUT_DIR 98 | Save generation and result path 99 | --model_name MODEL_NAME 100 | Local path or Huggingface Hub link to load model 101 | --peft_model PEFT_MODEL 102 | Lora config 103 | --backend BACKEND LLM generation backend (default: hf) 104 | --max_new_tokens MAX_NEW_TOKENS 105 | Number of max new tokens 106 | --temperature TEMPERATURE 107 | --prompt_mode PROMPT_MODE 108 | Prompt available: zeroshot, fewshot, cot_zs, cot_fs 109 | --cache_dir CACHE_DIR 110 | Cache for save model download checkpoint and dataset 111 | --trust_remote_code 112 | ``` 113 | 114 |
115 |
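The same run can be driven programmatically. Below is a minimal sketch built on `codemmlu.evaluator.Evaluator` (again, the checkpoint name is illustrative only):

```python
from codemmlu.evaluator import Evaluator

# Placeholder checkpoint; substitute any model accepted by the chosen backend.
evaluator = Evaluator(
    model_name="deepseek-ai/deepseek-coder-1.3b-instruct",
    subset="programming_syntax",   # any subset listed in task_utils.ALL_TASK
    backend="hf",                  # or "vllm"
    batch_size=16,
    prompt_mode="zeroshot",        # zeroshot, fewshot, cot_zs, cot_fs
    output_dir="./output",
)

# Returns the evaluation split with a `generation` column added.
results = evaluator.generate(max_new_tokens=1024, temperature=0.0)
```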
116 | 117 | 118 | List of supported backends: 119 | 120 | | Backend | DecoderModel | LoRA | 121 | |------------------ |-------------- |------ | 122 | | [Transformers](https://github.com/huggingface/transformers) (hf) | ✅ | ✅ | 123 | | [Vllm](https://github.com/vllm-project/vllm) (vllm) | ✅ | ✅ | 124 | 125 | ### Leaderboard 126 | To evaluate your model and submit your results to the [leaderboard](https://fsoft-ai4code.github.io/codemmlu/), please follow the instruction in [data/README.md](data/README.md). 127 | 128 | ## 📌 Citation 129 | If you find this repository useful, please consider citing our paper: 130 | 131 | ``` 132 | @article{nguyen2024codemmlu, 133 | title={CodeMMLU: A Multi-Task Benchmark for Assessing Code Understanding Capabilities}, 134 | author={Nguyen, Dung Manh and Phan, Thang Chau and Le, Nam Hai and Doan, Thong T. and Nguyen, Nam V. and Pham, Quang and Bui, Nghi D. Q.}, 135 | journal={arXiv preprint}, 136 | year={2024} 137 | } 138 | ``` 139 | -------------------------------------------------------------------------------- /asset/code_mmlu_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FSoft-AI4Code/CodeMMLU/2999a6a888734a9dc48cc84d8a77a8311e9ca245/asset/code_mmlu_banner.png -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | ## Evaluation Submission 2 | 3 | To submit your model's results to the [leaderboard](https://fsoft-ai4code.github.io/leaderboards/codemmlu/), please send us an email at `dungnm31@fpt.com` with the following information: 4 | 5 | - **Model Name**: The name of your model. 6 | - **Model Description**: A brief description of your model. 7 | - **Model Configuration**: 8 | - Base or Instruct 9 | - Base or LoRA 10 | - **Model Answer**: The response generated by your model followed the format: 11 | 12 | ``` 13 | 14 | ``` 15 | 16 | Checkout the example in `submission_sample.txt` 17 | 18 | -------------------------------------------------------------------------------- /data/codemmlu/codemmlu.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """The CodeMMLU benchmark.""" 15 | 16 | import os 17 | import json 18 | from glob import glob 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @article{nguyen2024codemmlu, 25 | title={CodeMMLU: A Multi-Task Benchmark for Assessing Code Understanding Capabilities}, 26 | author={Nguyen, Dung Manh and Phan, Thang Chau and Le, Nam Hai and Doan, Thong T. and Nguyen, Nam V. and Pham, Quang and Bui, Nghi D. 
Q.}, 27 | journal={arXiv preprint}, 28 | year={2024} 29 | } 30 | """ 31 | 32 | _DESCRIPTION = """\ 33 | CodeMMLU is a comprehensive benchmark designed to evaluate the capabilities of large language models (LLMs) in coding and software knowledge 34 | """ 35 | 36 | _HOMEPAGE = "https://fsoft-ai4code.github.io/codemmlu/" 37 | 38 | _URL = "./data/test" 39 | 40 | _SUBJECTS = [ 41 | "programming_syntax", "api_frameworks", 42 | "software_principles", "dbms_sql", "others", 43 | "code_completion", "fill_in_the_middle", "code_repair", "defect_detection" 44 | ] 45 | 46 | 47 | class CodeMMLU(datasets.GeneratorBasedBuilder): 48 | """CodeMMLU: A Multi-Task Benchmark for Assessing Code Understanding Capabilities""" 49 | # Version history: 50 | # 0.0.1: Initial release. 51 | VERSION = datasets.Version("0.0.1") 52 | 53 | BUILDER_CONFIGS = [ 54 | datasets.BuilderConfig( 55 | name=sub, version=datasets.Version("0.0.1"), 56 | description="CodeMMLU test subject {}".format(sub) 57 | ) for sub in _SUBJECTS 58 | ] 59 | 60 | 61 | def _info(self): 62 | features = datasets.Features( 63 | { 64 | "task_id": datasets.Value("string"), 65 | "question": datasets.Value("string"), 66 | "choices": datasets.features.Sequence(datasets.Value("string")), 67 | } 68 | ) 69 | 70 | if self.config.name == "fill_in_the_middle": 71 | features["problem_description"] = datasets.Value("string") 72 | 73 | return datasets.DatasetInfo( 74 | description=_DESCRIPTION, 75 | features=features, 76 | homepage=_HOMEPAGE, 77 | citation=_CITATION, 78 | ) 79 | 80 | def _split_generators(self, dl_manager): 81 | """Returns SplitGenerators.""" 82 | path = os.path.join(_URL, self.config.name + ".jsonl") 83 | dl_dir = dl_manager.download(path) 84 | return [ 85 | datasets.SplitGenerator( 86 | name=datasets.Split.TEST, 87 | gen_kwargs={"data_path": dl_dir}, 88 | ), 89 | ] 90 | 91 | def _generate_examples(self, data_path): 92 | """This function returns the examples in the raw (text) form.""" 93 | if data_path.endswith(".jsonl"): 94 | lines = open(data_path, "r", encoding="utf-8").readlines() 95 | reader = [json.loads(line) for line in lines] 96 | for idx, data in enumerate(reader): 97 | return_dict = { 98 | "task_id": data['task_id'], 99 | "question": data['question'], 100 | "choices": data['choices'], 101 | } 102 | 103 | if "fill_in_the_middle" in data_path: 104 | return_dict['problem_description'] = data['problem_description'] 105 | 106 | yield idx, return_dict 107 | -------------------------------------------------------------------------------- /data/submission_sample.txt: -------------------------------------------------------------------------------- 1 | 1 A 2 | 2 B 3 | 3 C 4 | 4 D -------------------------------------------------------------------------------- /paper/2410.01999v4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FSoft-AI4Code/CodeMMLU/2999a6a888734a9dc48cc84d8a77a8311e9ca245/paper/2410.01999v4.pdf -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "codemmlu" 7 | version = "0.0.2.1" 8 | authors = [ 9 | { name="Dung Manh Nguyen", email="dungnm.workspace@gmail.com" }, 10 | ] 11 | description = "CodeMMLU Evaluator: A framework for evaluating language models on CodeMMLU benchmark." 
12 | readme = "README.md" 13 | requires-python = ">=3.9" 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: MIT License", 17 | "Operating System :: OS Independent", 18 | ] 19 | dependencies = [ 20 | "transformers>=4.39.0", 21 | "datasets>=2.17.1", 22 | "accelerate>=0.27.2", 23 | "deepspeed>=0.13.2", 24 | "peft>=0.10.0", 25 | "vllm" 26 | ] 27 | 28 | [project.urls] 29 | "Homepage" = "https://fsoft-ai4code.github.io/codemmlu/" 30 | "Bug Tracker" = "https://github.com/FSoft-AI4Code/CodeMMLU/issues" 31 | 32 | [project.scripts] 33 | codemmlu = "codemmlu.__main__:main" 34 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.39.0 2 | accelerate==0.27.2 3 | bitsandbytes==0.42.0 4 | datasets==2.17.1 5 | deepspeed==0.13.2 6 | peft==0.10.0 7 | vllm -------------------------------------------------------------------------------- /src/codemmlu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FSoft-AI4Code/CodeMMLU/2999a6a888734a9dc48cc84d8a77a8311e9ca245/src/codemmlu/__init__.py -------------------------------------------------------------------------------- /src/codemmlu/__main__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pkg_resources 4 | 5 | from codemmlu.task_utils import ALL_TASK 6 | 7 | def get_args(): 8 | parser = argparse.ArgumentParser(description=f"{20*'='} CodeMMLU {20*'='}") 9 | 10 | parser.add_argument("-V", "--version", action="version", help="Get version", 11 | version=pkg_resources.get_distribution("codemmlu").version) 12 | 13 | # Data args 14 | parser.add_argument("--subset", default="programming_syntax", type=str, 15 | help='Select evaluate subset') 16 | parser.add_argument("--batch_size", default=16, type=int) 17 | parser.add_argument("--instruction_prefix", default="", type=str) 18 | parser.add_argument("--assistant_prefix", default="", type=str) 19 | parser.add_argument("--output_dir", default="./output", type=str, 20 | help='Save generation and result path') 21 | 22 | # Generation args 23 | parser.add_argument("--model_name", type=str, 24 | help='Local path or Huggingface Hub link to load model') 25 | parser.add_argument("--peft_model", default=None, type=str, 26 | help='Lora config') 27 | parser.add_argument("--backend", default="hf", type=str, 28 | help="LLM generation backend (default: hf)") 29 | parser.add_argument("--max_new_tokens", default=128, type=int, 30 | help='Number of max new tokens') 31 | parser.add_argument("--temperature", default=0.0, type=float) 32 | parser.add_argument("--prompt_mode", default='zeroshot', type=str, 33 | help='Prompt available: zeroshot, fewshot, cot_zs, cot_fs') 34 | parser.add_argument("--cache_dir", default=None, type=str, 35 | help='Cache for save model download checkpoint and dataset') 36 | parser.add_argument("--trust_remote_code", action='store_true') 37 | 38 | args = parser.parse_args() 39 | 40 | if not args.cache_dir: 41 | TRANSFORMER_CACHE = os.getenv("TRANSFORMER_CACHE") 42 | HF_HOME = os.getenv("HF_HOME") 43 | if TRANSFORMER_CACHE: 44 | args.cache_dir = TRANSFORMER_CACHE 45 | else: 46 | args.cache_dir = HF_HOME 47 | 48 | assert args.subset in ALL_TASK, f"Invalid subset name, expect {ALL_TASK}, but got {args.subset}" 49 | 50 | return args, parser 51 | 52 | 53 | def main(): 54 | 
args, parsre = get_args() 55 | if args.model_name: 56 | generate(args=args) 57 | else: 58 | parsre.print_help() 59 | 60 | 61 | def generate(args): 62 | from codemmlu.evaluator import Evaluator 63 | 64 | evaluator = Evaluator( 65 | subset=args.subset, 66 | model_name=args.model_name, 67 | peft_model=args.peft_model, 68 | backend=args.backend, 69 | batch_size=args.batch_size, 70 | cache_dir=args.cache_dir, 71 | output_dir=args.output_dir, 72 | trust_remote_code=args.trust_remote_code, 73 | instruction_prefix=args.instruction_prefix, 74 | assistant_prefix=args.assistant_prefix, 75 | prompt_mode=args.prompt_mode, 76 | ) 77 | 78 | evaluator.generate( 79 | temperature=args.temperature, 80 | max_new_tokens=args.max_new_tokens, 81 | ) 82 | 83 | print("======= Finish generated =======") 84 | 85 | if __name__ == '__main__': 86 | main() -------------------------------------------------------------------------------- /src/codemmlu/backends/__init__.py: -------------------------------------------------------------------------------- 1 | from codemmlu.backends.base import Backend 2 | from codemmlu.task_utils import CodeMMLU 3 | 4 | SUPPORTED_BACKENDS = ["vllm", "hf"] 5 | 6 | def make_model( 7 | model_name: str, 8 | backend: str, 9 | subset: str, 10 | split: str, 11 | output_dir: str, 12 | temperature: float = 0.0, 13 | max_new_tokens: int = 1280, 14 | batch_size: int = 16, 15 | prompt_mode: str = "zeroshot", 16 | # instruction model only 17 | instruction_prefix: str = None, 18 | assistant_prefix: str = None, 19 | trust_remote_code: bool = False, 20 | # peft model only 21 | peft_model: str = None, 22 | # cache dir 23 | cache_dir: str = None, 24 | 25 | ) -> Backend: 26 | # Load dataset 27 | dataset = CodeMMLU(subset=subset, 28 | split=split, 29 | prompt_mode=prompt_mode, 30 | instruction_prefix=instruction_prefix, 31 | assistant_prefix=assistant_prefix) 32 | 33 | # Initialize backend 34 | if backend == "vllm": 35 | from codemmlu.backends.vllm import VllmEngine 36 | 37 | return VllmEngine( 38 | model_name=model_name, 39 | peft_model=peft_model, 40 | dataset=dataset, 41 | temperature=temperature, 42 | batch_size=batch_size, 43 | max_new_tokens=max_new_tokens, 44 | trust_remote_code=trust_remote_code, 45 | cache_dir=cache_dir, 46 | output_dir=output_dir 47 | ) 48 | elif backend == "hf": 49 | from codemmlu.backends.hf import HuggingfaceEngine 50 | 51 | return HuggingfaceEngine( 52 | model_name=model_name, 53 | peft_model=peft_model, 54 | dataset=dataset, 55 | temperature=temperature, 56 | batch_size=batch_size, 57 | max_new_tokens=max_new_tokens, 58 | trust_remote_code=trust_remote_code, 59 | cache_dir=cache_dir, 60 | output_dir=output_dir 61 | ) 62 | else: 63 | raise ValueError(f"Unknown backend: {backend}") 64 | -------------------------------------------------------------------------------- /src/codemmlu/backends/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from codemmlu.task_utils import CodeMMLU 3 | 4 | class Backend(ABC): 5 | def __init__(self, 6 | dataset: CodeMMLU, 7 | model_name: str, 8 | temperature: float, 9 | max_new_tokens: int, 10 | peft_model: str = None, 11 | batch_size: int = 16, 12 | trust_remote_code: bool = False, 13 | cache_dir: str = None, 14 | output_dir: str='./'): 15 | print(f"Initializing {self.__class__.__name__} backend") 16 | print(f"Initializing a decoding model: {model_name}") 17 | 18 | self.TASK_NAME = dataset.TASK_NAME 19 | self.subset = dataset.subset 20 | self.split = dataset.split 21 | 
self.model_name = model_name 22 | self.batch_size = batch_size 23 | self.peft_model = peft_model 24 | self.cache_dir = cache_dir 25 | self.output_dir = output_dir 26 | self.dataset = dataset.prepare_dataset() 27 | self.temperature = temperature 28 | self.max_new_tokens = max_new_tokens 29 | self.trust_remote_code = trust_remote_code 30 | 31 | 32 | @abstractmethod 33 | def generate(self) -> str: 34 | raise NotImplementedError 35 | 36 | def get_dataset(self) -> CodeMMLU: 37 | return self.dataset 38 | -------------------------------------------------------------------------------- /src/codemmlu/backends/hf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from typing import Dict 4 | from tqdm import tqdm 5 | 6 | from accelerate import Accelerator 7 | from accelerate.utils import gather_object 8 | from transformers import ( 9 | GenerationConfig, 10 | AutoModelForCausalLM, 11 | AutoModelForSeq2SeqLM, 12 | AutoTokenizer 13 | ) 14 | 15 | from codemmlu.backends.base import Backend 16 | 17 | class HuggingfaceEngine(Backend): 18 | def __init__(self, model_name: str, **kwargs): 19 | super().__init__(model_name=model_name, **kwargs) 20 | self.accelerator = Accelerator() 21 | 22 | # TODO: add generation args 23 | generate_args = dict( 24 | temperature=self.temperature, 25 | max_new_tokens=self.max_new_tokens, 26 | ) 27 | self.generation_config = GenerationConfig(**generate_args) 28 | 29 | model_kwargs = dict( 30 | cache_dir=self.cache_dir, 31 | trust_remote_code=self.trust_remote_code, 32 | load_in_8bit=False 33 | ) 34 | try: 35 | self.model = AutoModelForCausalLM.from_pretrained( 36 | self.model_name, **model_kwargs) 37 | 38 | except KeyError: # Except load seq2seq model 39 | self.model = AutoModelForSeq2SeqLM.from_pretrained( 40 | self.model_name, **model_kwargs) 41 | 42 | if self.peft_model: 43 | from peft import PeftModel 44 | self.model = PeftModel.from_pretrained(self.model, self.peft_model) 45 | 46 | self.model.to(self.accelerator.device) 47 | 48 | self.tokenizer = AutoTokenizer.from_pretrained( 49 | self.model_name, 50 | trust_remote_code=self.trust_remote_code, 51 | padding_side="left" 52 | ) 53 | 54 | if not self.tokenizer.pad_token: 55 | print("Set EOS_TOKEN to PAD_TOKEN") 56 | self.tokenizer.pad_token = self.tokenizer.eos_token 57 | 58 | def generate(self) -> str: 59 | # ``Accelerate`` distribute data and model 60 | assert self.accelerator 61 | 62 | ds_loader = [self.dataset[i:i+self.batch_size] 63 | for i in range(0, len(self.dataset), self.batch_size)] 64 | 65 | for i in range(len(ds_loader)): 66 | question = ds_loader[i]['question'] 67 | ds_loader[i]['question_ids'] = self.tokenizer(question, return_tensors="pt", padding=True) 68 | 69 | result = [] 70 | with self.accelerator.split_between_processes(ds_loader, apply_padding=True) as batched_prompts: 71 | index = self.accelerator.process_index 72 | for batch in tqdm(batched_prompts, desc=f"Process: {index} | Generating", position=index): 73 | input_ids = batch['question_ids'].to(self.accelerator.device) 74 | outputs = self.model.generate(**input_ids, 75 | generation_config=self.generation_config, 76 | pad_token_id=self.tokenizer.eos_token_id, 77 | eos_token_id=self.tokenizer.eos_token_id) 78 | 79 | outputs = [output[len(prompt) :] for prompt, output in zip(input_ids["input_ids"], outputs)] 80 | batch_results = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) 81 | 82 | batch['generation'] = batch_results 83 | result.extend(batch['generation']) 84 | 
self._save_result(batch) 85 | 86 | 87 | result_gather = gather_object(result)[: len(self.dataset)] 88 | self.dataset = self.dataset.add_column('generation', result_gather) 89 | # TODO: process response and extract answer 90 | return self.dataset 91 | 92 | def _save_result(self, batched_outputs: Dict): 93 | assert 'question' in batched_outputs.keys() 94 | assert 'generation' in batched_outputs.keys() 95 | 96 | if self.accelerator.distributed_type == "MULTI_GPU": 97 | save_path = os.path.join(self.save_dir, 98 | f"{self.subset}.raw.generated.{self.accelerator.process_index}.jsonl") 99 | else: 100 | save_path = os.path.join(self.save_dir, f"{self.subset}.final.generated.jsonl") 101 | 102 | with open(save_path, "a") as writer: 103 | for idx in range(len(batched_outputs['question'])): 104 | res = dict( 105 | task_id=batched_outputs['task_id'][idx], 106 | prompt=batched_outputs['question'][idx], 107 | response=batched_outputs['generation'][idx] 108 | ) 109 | 110 | json.dump(res, writer) 111 | writer.write("\n") 112 | -------------------------------------------------------------------------------- /src/codemmlu/backends/vllm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from tqdm import tqdm 4 | from typing import Dict 5 | 6 | import torch 7 | from vllm import LLM, SamplingParams 8 | from vllm.lora.request import LoRARequest 9 | 10 | from codemmlu.backends.base import Backend 11 | 12 | class VllmEngine(Backend): 13 | def __init__(self, model_name: str, **kwargs): 14 | super().__init__(model_name=model_name, **kwargs) 15 | ngpus = torch.cuda.device_count() 16 | backend_kwargs = dict( 17 | disable_log_stats=True, 18 | tensor_parallel_size=ngpus, 19 | download_dir=self.cache_dir, 20 | trust_remote_code=self.trust_remote_code, 21 | ) 22 | 23 | self.model = LLM(self.model_name, 24 | enable_lora=True if self.peft_model else None, 25 | **backend_kwargs) 26 | 27 | self.lora_request = None 28 | if self.peft_model: 29 | self.lora_request=LoRARequest("lora", 1, self.peft_model) 30 | 31 | self.sampling_params = SamplingParams( 32 | max_tokens=self.max_new_tokens, 33 | temperature=self.temperature, 34 | ) 35 | 36 | def generate(self): 37 | ds_loader = [self.dataset[i:i+self.batch_size] 38 | for i in range(0, len(self.dataset), self.batch_size)] 39 | 40 | result = [] 41 | for batch in tqdm(ds_loader, total=len(ds_loader), desc="Generating"): 42 | outputs = self.model.generate(batch['question'], 43 | self.sampling_params, 44 | lora_request=self.lora_request) 45 | 46 | batch['generation'] = [output.outputs[0].text for output in outputs] 47 | result.extend(batch['generation']) 48 | self._save_result(batch) 49 | 50 | self.dataset = self.dataset.add_column('generation', result) 51 | # TODO: process response and extract answer 52 | return self.dataset 53 | 54 | def _save_result(self, batched_outputs: Dict): 55 | assert 'question' in batched_outputs.keys() 56 | assert 'generation' in batched_outputs.keys() 57 | 58 | save_path = os.path.join(self.output_dir, f"{self.subset}.final.generated.jsonl") 59 | 60 | with open(save_path, "a") as writer: 61 | for idx in range(len(batched_outputs['question'])): 62 | res = dict( 63 | task_id=batched_outputs['task_id'][idx], 64 | prompt=batched_outputs['question'][idx], 65 | response=batched_outputs['generation'][idx] 66 | ) 67 | 68 | json.dump(res, writer) 69 | writer.write("\n") 70 | -------------------------------------------------------------------------------- /src/codemmlu/evaluator.py: 
-------------------------------------------------------------------------------- 1 | """Evaluator to load CodeMMLU and extract answer from response. 2 | 3 | For example: 4 | 5 | .. code-block:: python 6 | 7 | >>> from codemmlu import Evaluator 8 | >>> evaluator = Evaluator(subset="semantic") 9 | >>> response = evaluator.generate(temperature=0.9, num_return_sequences=3) 10 | 11 | """ 12 | import os 13 | import sys 14 | import json 15 | import time 16 | from warnings import warn 17 | from typing import Optional, Dict, List 18 | 19 | import torch 20 | 21 | from codemmlu.backends import make_model, SUPPORTED_BACKENDS, Backend 22 | 23 | class Evaluator: 24 | """Evaluator class. 25 | 26 | :param model_name: Selected model for evaluating 27 | :type model_name: str 28 | :param peft_model: Adapter model, defaults to None 29 | :type peft_model: Optional[str], optional 30 | :param trust_remote_code: Huggingface argument, defaults to False 31 | :type trust_remote_code: Optional[bool], optional 32 | :param cache_dir: Downloaded cache directory, defaults to None 33 | :type cache_dir: Optional[str], optional 34 | :param batch_size: Generation batch size, defaults to 16 35 | :type batch_size: Optional[int], optional 36 | :param output_dir: Saving generation directory, defaults to "./output" 37 | :type output_dir: Optional[str], optional 38 | """ 39 | 40 | def __init__(self, 41 | model_name: str, 42 | subset: Optional[str] = None, 43 | split: Optional[str] = "test", 44 | peft_model: Optional[str] = None, 45 | backend: str = "hf", 46 | trust_remote_code: Optional[bool] = False, 47 | cache_dir: Optional[str] = None, 48 | batch_size: Optional[int] = 16, 49 | output_dir: Optional[str] = "./output", 50 | instruction_prefix: Optional[str] = "", 51 | assistant_prefix: Optional[str] = "", 52 | prompt_mode: Optional[str] = None, 53 | ) -> None: 54 | 55 | # Dataset args 56 | self.split = split 57 | self.subset = subset 58 | self.instruction_prefix = instruction_prefix 59 | self.assistant_prefix = assistant_prefix 60 | 61 | # Generation args 62 | self.backend = backend 63 | self.model_name = model_name 64 | self.peft_model = peft_model 65 | self.output_dir = output_dir 66 | self.trust_remote_code = trust_remote_code 67 | self.cache_dir = cache_dir 68 | self.batch_size = batch_size 69 | self.prompt_mode = prompt_mode 70 | 71 | if backend not in SUPPORTED_BACKENDS: 72 | raise ValueError(f"Backend {backend} is not supported. Please choose from {SUPPORTED_BACKENDS}") 73 | 74 | os.makedirs(self.output_dir, exist_ok=True) 75 | 76 | 77 | def generate(self, 78 | max_new_tokens: int = 1024, 79 | temperature: float = 0.0, 80 | ) -> List: 81 | """Start backend, generate and extract answer from response 82 | 83 | :param max_new_tokens: Max new tokens, defaults to 256 84 | :type max_new_tokens: Optional[int], optional 85 | :param temperature: Model generate temperature, defaults to 0.9 86 | :type temperature: Optional[float], optional 87 | 88 | :return: List of generated result, stored in dictionary object 89 | with ``task_id``, ``prompt`` and ``answer`` key. 
90 | :rtype: List 91 | """ 92 | self.engine : Backend = make_model( 93 | subset=self.subset, 94 | split=self.split, 95 | model_name=self.model_name, 96 | backend=self.backend, 97 | peft_model=self.peft_model, 98 | trust_remote_code=self.trust_remote_code, 99 | batch_size=self.batch_size, 100 | temperature=temperature, 101 | max_new_tokens=max_new_tokens, 102 | cache_dir=self.cache_dir, 103 | instruction_prefix=self.instruction_prefix, 104 | assistant_prefix=self.assistant_prefix, 105 | output_dir=self.output_dir, 106 | prompt_mode=self.prompt_mode, 107 | ) 108 | 109 | 110 | print(f"Evaluating task: [{self.engine.TASK_NAME}]") 111 | print(f"pt={torch.__version__}, cuda={torch.version.cuda}, nccl={torch.cuda.nccl.version()}") 112 | print(f"device compute capabilities={torch.cuda.get_device_capability()}") 113 | print(f"pytorch compute capabilities={torch.cuda.get_arch_list()}") 114 | 115 | start_time = time.time() 116 | results = self.engine.generate() 117 | 118 | print("======= Finished {} =======".format(self.engine.TASK_NAME)) 119 | print("Completion time: %d s", (time.time() - start_time)) 120 | 121 | return results 122 | 123 | 124 | def acc_evaluate( 125 | subset: str, 126 | response_path: str): 127 | 128 | pass -------------------------------------------------------------------------------- /src/codemmlu/prompts/__init__.py: -------------------------------------------------------------------------------- 1 | from codemmlu.prompts._general import GENERAL_PROMPT 2 | from codemmlu.prompts._codecomp import CODECOMP_PROMPT 3 | from codemmlu.prompts._fim import FIM_PROMPT 4 | from codemmlu.prompts._coderepair import CODEREPAIR_PROMPT 5 | from codemmlu.prompts._defect import DEFECT_PROMPT 6 | 7 | __all__ = [ 8 | "GENERAL_PROMPT", 9 | "CODECOMP_PROMPT", 10 | "FIM_PROMPT", 11 | "CODEREPAIR_PROMPT", 12 | "DEFECT_PROMPT" 13 | ] -------------------------------------------------------------------------------- /src/codemmlu/prompts/_codecomp.py: -------------------------------------------------------------------------------- 1 | 2 | zeroshot = """The following are multiple choice questions (with answers) about 3 | programming problem. 4 | 5 | Question: Which solution below is the most likely completion the following 6 | code snippet to achieve the desired goal? 7 | {question} 8 | 9 | {choices} 10 | 11 | Answer: """ 12 | 13 | fewshot = """The following are multiple choice questions (with answers) about 14 | programming problem. 15 | 16 | Question: Which solution below is the most likely completion the following 17 | code snippet to achieve the desired goal? 18 | ```python 19 | from typing import List 20 | 21 | def two_sum(nums: List[int], target: int) -> List[int]: 22 | ''' 23 | Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. 24 | You may assume that each input would have exactly one solution, and you may not use the same element twice. 
25 | 26 | >>> two_sum([2,7,11,15], 9) 27 | [0,1] 28 | >>> two_sum([3,2,4], 6) 29 | [1,2] 30 | >>> two_sum([3,3], 6) 31 | [0,1] 32 | ''' 33 | ``` 34 | (A) ```python 35 | n = len(nums) 36 | for i in range(n - 1): 37 | for j in range(i + 1, n): 38 | if nums[i] + nums[j] == target: 39 | return [i, j] 40 | return [] 41 | ``` 42 | (B) ```python 43 | for num in nums: 44 | if target - num in nums: 45 | return [nums.index(num), nums.index(target - num)] 46 | return [] 47 | ``` 48 | (C) ```python 49 | for i in range(len(nums)): 50 | if nums[i] * 2 == target: 51 | return [i, i] 52 | return [] 53 | ``` 54 | (D) ```python 55 | num_dict = {{}} 56 | for i, num in enumerate(nums): 57 | if target - num in num_dict: 58 | return [num_dict[target - num], i] 59 | num_dict[i] = num 60 | return [] 61 | ``` 62 | Answer: The answer is (A). 63 | 64 | Question: Which solution below is the most likely completion the following 65 | code snippet to achieve the desired goal? 66 | ```python 67 | {question} 68 | ``` 69 | 70 | {choices} 71 | 72 | Answer: """ 73 | 74 | cot_zs = '''The following are multiple choice questions (with answers) about 75 | programming problem. 76 | 77 | Question: Which solution below is the most likely completion the following 78 | code snippet to achieve the desired goal? 79 | ```python 80 | {question} 81 | ``` 82 | {choices} 83 | 84 | Answer: Let's think step by step. ''' 85 | 86 | cot_fs = """The following are multiple choice questions (with answers) about 87 | programming problem. 88 | 89 | Question: Which solution below is the most likely completion the following 90 | code snippet to achieve the desired goal? 91 | ```python 92 | from typing import List 93 | 94 | def two_sum(nums: List[int], target: int) -> List[int]: 95 | ''' 96 | Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target. 97 | You may assume that each input would have exactly one solution, and you may not use the same element twice. 98 | 99 | >>> two_sum([2,7,11,15], 9) 100 | [0,1] 101 | >>> two_sum([3,2,4], 6) 102 | [1,2] 103 | >>> two_sum([3,3], 6) 104 | [0,1] 105 | ''' 106 | ``` 107 | (A) ```python 108 | n = len(nums) 109 | for i in range(n - 1): 110 | for j in range(i + 1, n): 111 | if nums[i] + nums[j] == target: 112 | return [i, j] 113 | return [] 114 | ``` 115 | (B) ```python 116 | for num in nums: 117 | if target - num in nums: 118 | return [nums.index(num), nums.index(target - num)] 119 | return [] 120 | ``` 121 | (C) ```python 122 | for i in range(len(nums)): 123 | if nums[i] * 2 == target: 124 | return [i, i] 125 | return [] 126 | ``` 127 | (D) ```python 128 | num_dict = {{}} 129 | for i, num in enumerate(nums): 130 | if target - num in num_dict: 131 | return [num_dict[target - num], i] 132 | num_dict[i] = num 133 | return [] 134 | ``` 135 | 136 | Answer: Let's think step by step. The answer (A) uses a straightforward brute-force approach by checking every possible pair of indices to see if their corresponding values sum to the target. While this method has a time complexity of O(n^2), it is simple and guaranteed to find the correct solution for small input sizes, as it exhaustively evaluates all pairs. This solution works reliably within the problem's constraints and ensures the correct indices are returned when the target sum is found. The other solutions have issues such as incorrect handling of duplicate values or incorrect logic (as in C) that disqualify them. 137 | The answer is (A). 
138 | 139 | Question: Which solution below is the most likely completion the following 140 | code snippet to achieve the desired goal? 141 | ```python 142 | {question} 143 | ``` 144 | {choices} 145 | 146 | Answer: Let's think step by step. """ 147 | 148 | CODECOMP_PROMPT = dict(zeroshot=zeroshot, 149 | fewshot=fewshot, 150 | cot_zs=cot_zs, 151 | cot_fs=cot_fs) -------------------------------------------------------------------------------- /src/codemmlu/prompts/_coderepair.py: -------------------------------------------------------------------------------- 1 | 2 | zeroshot = """The following are multiple-choice questions (with answers) about debugging a programming problem. 3 | 4 | Question: The implementation below is producing incorrect results. 5 | Which solution below correctly identifies the bug and repairs it to achieve the desired goal? 6 | {question} 7 | 8 | {choices} 9 | 10 | Answer: """ 11 | 12 | fewshot = """The following are multiple-choice questions (with answers) about debugging a programming problem. 13 | 14 | Question: The implementation below is producing incorrect results. 15 | Which solution below correctly identifies the bug and repairs it to achieve the desired goal? 16 | 17 | 1 def two_sum(nums, target): 18 | 2 complement_map = {{}} 19 | 3 for i, num in enumerate(nums): 20 | 4 complement = target - num 21 | 5 complement_map[num] = i 22 | 6 if complement in complement_map: 23 | 7 return [complement_map[complement], i] 24 | 8 return None 25 | 26 | (A) Remove line 5. 27 | 28 | (B) Remove line 5. Add at line 7: 29 | ``` complement_map[num] = i``` 30 | 31 | (C) Modify line 7: 32 | ``` return [i, complement_map[complement]]``` 33 | 34 | (D) Remove line 5. Add at line 7: 35 | ``` if i == len(nums) - 1: 36 | return None 37 | complement_map[num] = i``` 38 | 39 | Answer: The answer is (B). 40 | 41 | Question: The implementation below is producing incorrect results. 42 | Which solution below correctly identifies the bug and repairs it to achieve the desired goal? 43 | {question} 44 | 45 | {choices} 46 | 47 | Answer: """ 48 | 49 | cot_zs = zeroshot + "Let's think step by step. " 50 | 51 | cot_fs = """The following are multiple-choice questions (with answers) about debugging a programming problem. 52 | 53 | Question: The implementation below is producing incorrect results. 54 | Which solution below correctly identifies the bug and repairs it to achieve the desired goal? 55 | 56 | 1 def two_sum(nums, target): 57 | 2 complement_map = {{}} 58 | 3 for i, num in enumerate(nums): 59 | 4 complement = target - num 60 | 5 complement_map[num] = i 61 | 6 if complement in complement_map: 62 | 7 return [complement_map[complement], i] 63 | 8 return None 64 | 65 | (A) Remove line 5. 66 | 67 | (B) Remove line 5. Add at line 7: 68 | ``` complement_map[num] = i``` 69 | 70 | (C) Modify line 7: 71 | ``` return [i, complement_map[complement]]``` 72 | 73 | (D) Remove line 5. Add at line 7: 74 | ``` if i == len(nums) - 1: 75 | return None 76 | complement_map[num] = i``` 77 | 78 | Answer: Let's think step by step. The bug in the code occurs because the current number is added to the complement_map before checking if its complement already exists, which can lead to incorrectly matching a number with itself. To fix this, the number should only be added to the map after checking for its complement. Solution (B) does exactly this by moving the line that adds the current number to the map after the complement check, ensuring the logic works as intended without self-matching errors. 79 | The answer is (B). 
80 | 81 | Question: The implementation below is producing incorrect results. 82 | Which solution below correctly identifies the bug and repairs it to achieve the desired goal? 83 | {question} 84 | 85 | {choices} 86 | 87 | Answer: Let's think step by step. """ 88 | 89 | CODEREPAIR_PROMPT = dict(zeroshot=zeroshot, 90 | fewshot=fewshot, 91 | cot_zs=cot_zs, 92 | cot_fs=cot_fs) -------------------------------------------------------------------------------- /src/codemmlu/prompts/_defect.py: -------------------------------------------------------------------------------- 1 | zeroshot = """The following are multiple choice questions (with answers) about programming problem. 2 | 3 | Question: Given a code snippet below, which behavior most likely to occur when execute it? 4 | {question} 5 | 6 | {choices} 7 | 8 | Answer: """ 9 | 10 | fewshot = """The following are multiple choice questions (with answers) about programming problem. 11 | 12 | Question: Given a code snippet below, which behavior most likely to occur when execute it? 13 | ```python 14 | def chkPair(A, size, x): 15 | for i in range(0, size - 1): 16 | for j in range(i + 1, size): 17 | if (A[i] + A[j] == x): 18 | return 1 19 | return 0 20 | 21 | ``` 22 | 23 | (A). The code contain no issue. 24 | (B). Memory Limit Exceeded 25 | (C). Internal error 26 | (D). Runtime Error 27 | 28 | Answer: The answer is (A). 29 | 30 | Question: Given a code snippet below, which behavior most likely to occur when execute it? 31 | {question} 32 | 33 | {choices} 34 | 35 | Answer: """ 36 | 37 | cot_zs = zeroshot + "Let's think step by step. " 38 | 39 | cot_fs = """The following are multiple choice questions (with answers) about programming problem. 40 | 41 | Question: Given a code snippet below, which behavior most likely to occur when execute it? 42 | ```python 43 | def chkPair(A, size, x): 44 | for i in range(0, size - 1): 45 | for j in range(i + 1, size): 46 | if (A[i] + A[j] == x): 47 | return 1 48 | return 0 49 | 50 | ``` 51 | 52 | (A). The code contain no issue. 53 | (B). Memory Limit Exceeded 54 | (C). Internal error 55 | (D). Runtime Error 56 | 57 | Answer: Let's think step by step. The code defines a function `chkPair` that checks for a pair of elements in an array A whose sum equals x. It uses two nested loops to iterate over all possible pairs and returns 1 if a valid pair is found, or 0 otherwise. The function has a time complexity of O(n^2) due to the nested loops, which could slow down performance for large inputs, but it doesn't involve excessive memory usage or problematic operations that would lead to errors like memory limit exceeded, runtime errors, or internal issues. Hence, the most likely outcome is that the code contains no issue. 58 | The answer is (A). 59 | 60 | Question: Given a code snippet below, which behavior most likely to occur when execute it? 61 | {question} 62 | 63 | {choices} 64 | 65 | Answer: Let's think step by step.""" 66 | 67 | DEFECT_PROMPT = dict(zeroshot=zeroshot, 68 | fewshot=fewshot, 69 | cot_zs=cot_zs, 70 | cot_fs=cot_fs) -------------------------------------------------------------------------------- /src/codemmlu/prompts/_fim.py: -------------------------------------------------------------------------------- 1 | zeroshot = """The following are multiple-choice questions (with answers) about a programming problem with incomplete solution. 
2 | 3 | Problem statement: {problem_description} 4 | 5 | Incomplete Solution: 6 | {question} 7 | 8 | Question: The provided solution is missing a part, Which option below is the most likely to complete the solution and achieve the desired goal? 9 | 10 | {choices} 11 | 12 | Answer: """ 13 | 14 | fewshot = """The following are multiple-choice questions (with answers) about a programming problem with incomplete solution. 15 | 16 | Problem statement: You are given an array of intervals, where intervals[i] = [starti, endi] and each starti is unique. 17 | The right interval for an interval i is an interval j such that startj >= endi and startj is minimized. 18 | Note that i may equal j. Return an array of right interval indices for each interval i. 19 | If no right interval exists for interval i, then put -1 at index i. 20 | 21 | Incomplete Solution: 22 | python``` 23 | def find_right_interval(intervals): 24 | n = len(intervals) 25 | res = [-1] * n 26 | for i in range(n): 27 | intervals[i].append(i) 28 | 29 | def binary_search(ele): 30 | left, right = 0, n-1 31 | ans = float('inf') 32 | while left <= right: 33 | mid = (left + right) // 2 34 | if intervals[mid][0] >= ele: 35 | ans = min(ans, mid) 36 | right = mid - 1 37 | else: 38 | left = mid + 1 39 | return ans 40 | 41 | intervals.sort() 42 | for i in intervals: 43 | _________________ 44 | 45 | return res 46 | ``` 47 | 48 | Question: The provided solution is missing a part, Which option below is the most likely to complete the solution and achieve the desired goal? 49 | 50 | (A) ```python 51 | val = binary_search(i[1]) 52 | if val != float('inf'): 53 | res[i[2]] = intervals[val][2] 54 | ``` 55 | (B) ```python 56 | if val != float('inf'): 57 | res[i[2]] = intervals[val][2] 58 | else: 59 | continue 60 | ``` 61 | (C) ```python 62 | val = binary_search(i[1]) 63 | if val != float('inf'): res[i[2] + 1] = intervals[val][2] 64 | ``` 65 | (D) ```python 66 | if val != float('inf'): 67 | res[i[2]] = intervals[val][2] 68 | else: 69 | continue 70 | ``` 71 | 72 | Answer: The answer is (A). 73 | 74 | Problem statement: {problem_description} 75 | 76 | Incomplete Solution: 77 | {question} 78 | 79 | Question: The provided solution is missing a part, Which option below is the most likely to complete the solution and achieve the desired goal? 80 | 81 | {choices} 82 | 83 | Answer: """ 84 | 85 | cot_zs = """The following are multiple-choice questions (with answers) about a programming problem with incomplete solution. 86 | 87 | Problem statement: {problem_description} 88 | 89 | Incomplete Solution: 90 | {question} 91 | 92 | Question: The provided solution is missing a part, Which option below is the most likely to 93 | complete the solution and achieve the desired goal? 94 | 95 | {choices} 96 | 97 | Answer: Let's think step by step. """ 98 | 99 | cot_fs = """The following are multiple-choice questions (with answers) about a programming problem 100 | with incomplete solution. 101 | 102 | Problem statement: You are given an array of intervals, where intervals[i] = [starti, endi] and each starti is unique. 103 | The right interval for an interval i is an interval j such that startj >= endi and startj is minimized. 104 | Note that i may equal j. Return an array of right interval indices for each interval i. 105 | If no right interval exists for interval i, then put -1 at index i. 
106 | 107 | Incomplete Solution: 108 | python``` 109 | def find_right_interval(intervals): 110 | n = len(intervals) 111 | res = [-1] * n 112 | for i in range(n): 113 | intervals[i].append(i) 114 | 115 | def binary_search(ele): 116 | left, right = 0, n-1 117 | ans = float('inf') 118 | while left <= right: 119 | mid = (left + right) // 2 120 | if intervals[mid][0] >= ele: 121 | ans = min(ans, mid) 122 | right = mid - 1 123 | else: 124 | left = mid + 1 125 | return ans 126 | 127 | intervals.sort() 128 | for i in intervals: 129 | _________________ 130 | 131 | return res 132 | ``` 133 | 134 | Question: The provided solution is missing a part, Which option below is the most likely to 135 | complete the solution and achieve the desired goal? 136 | 137 | (A) ```python 138 | val = binary_search(i[1]) 139 | if val != float('inf'): 140 | res[i[2]] = intervals[val][2] 141 | ``` 142 | (B) ```python 143 | if val != float('inf'): 144 | res[i[2]] = intervals[val][2] 145 | else: 146 | continue 147 | ``` 148 | (C) ```python 149 | val = binary_search(i[1]) 150 | if val != float('inf'): res[i[2] + 1] = intervals[val][2] 151 | ``` 152 | (D) ```python 153 | if val != float('inf'): 154 | res[i[2]] = intervals[val][2] 155 | else: 156 | continue 157 | ``` 158 | 159 | Answer: Let's think step by step. The incomplete solution first sorts the intervals and then iterates over the sorted intervals. For each interval, it finds the right interval using a binary search. 160 | This option (A) finds the right interval index using the binary search and updates the result array accordingly. 161 | The option (B) is similar to (A), but it does not increment the index when finding the right interval index. This could lead to incorrect results. 162 | The option (C) increments the index when finding the right interval index. However, this is incorrect because the problem statement asks for the index of the right interval, not the offset from the original index. 163 | The option (D) uses the same index for both the original interval and the right interval, which could lead to incorrect results. 164 | The answer is (A). 165 | 166 | Problem statement: {problem_description} 167 | 168 | Incomplete Solution: 169 | {question} 170 | 171 | Question: The provided solution is missing a part, Which option below is the most likely to 172 | complete the solution and achieve the desired goal? 173 | 174 | {choices} 175 | 176 | Answer: Let's think step by step. """ 177 | 178 | FIM_PROMPT = dict(zeroshot=zeroshot, 179 | fewshot=fewshot, 180 | cot_zs=cot_zs, 181 | cot_fs=cot_fs) -------------------------------------------------------------------------------- /src/codemmlu/prompts/_general.py: -------------------------------------------------------------------------------- 1 | zeroshot = """The following are multiple choice questions (with answers) about software development. 2 | 3 | Question: {question} 4 | {choices} 5 | 6 | Answer: """ 7 | 8 | fewshot = """The following are multiple choice questions (with answers) about software development. 9 | 10 | Question: If a sorted array of integers is guaranteed to not contain duplicate values, 11 | in order to search a for a specific value which of the following algorithms is the most efficient for this task? 12 | 13 | (A) Bubble Sort (B) Linear Search (C) Insertion Sort (D) Binary Search 14 | 15 | Answer: The answer is (D). 16 | 17 | Question: {question} 18 | {choices} 19 | 20 | Answer: """ 21 | 22 | cot_zs = """The following are multiple choice questions (with answers) about software devopment. 
23 | 24 | Question: {question} 25 | {choices} 26 | 27 | Answer: Let's think step by step. """ 28 | 29 | cot_fs = '''The following are multiple choice questions (with answers) about software devopment. 30 | 31 | Question: If a sorted array of integers is guaranteed to not contain duplicate values, in order to search a for a specific value which of the following algorithms is the most efficient for this task? 32 | 33 | (A) Bubble Sort (B) Linear Search (C) Insertion Sort (D) Binary Search 34 | 35 | Answer: Let's think step by step. Binary Search is a divide-and-conquer algorithm that works by repeatedly dividing the search interval in half and searching for the value in the appropriate half. Since the array is already sorted and does not contain any duplicate value, this algorithm is optimal to find the desired value. The answer is (D). 36 | 37 | Question: {question} 38 | {choices} 39 | 40 | Answer: Let's think step by step. ''' 41 | 42 | GENERAL_PROMPT = dict(zeroshot=zeroshot, 43 | fewshot=fewshot, 44 | cot_zs=cot_zs, 45 | cot_fs=cot_fs) -------------------------------------------------------------------------------- /src/codemmlu/task_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import json 4 | import glob 5 | from string import ascii_uppercase 6 | from typing import Optional 7 | from datasets import Dataset, load_dataset 8 | 9 | from codemmlu.prompts import GENERAL_PROMPT, CODECOMP_PROMPT, FIM_PROMPT, CODEREPAIR_PROMPT, DEFECT_PROMPT 10 | 11 | 12 | SEMANTIC_TASK = ["software_principles", "dbms_sql", "others"] 13 | 14 | SYNTACTIC_TASK = ["programming_syntax", "api_frameworks"] 15 | 16 | REALWORLD_TASK = ["code_completion", "fill_in_the_middle", "code_repair", "defect_detection"] 17 | 18 | ALL_TASK = SEMANTIC_TASK + SYNTACTIC_TASK + REALWORLD_TASK 19 | 20 | 21 | def get_prompt(subset: str, prompt_mode: str) -> str: 22 | """Get prompt for a given task.""" 23 | assert prompt_mode in ["zeroshot", "fewshot", "cot_zs", "cot_fs"] 24 | 25 | if subset in SEMANTIC_TASK + SYNTACTIC_TASK: 26 | return GENERAL_PROMPT[prompt_mode] 27 | else: 28 | if subset == "code_completion": 29 | return CODECOMP_PROMPT[prompt_mode] 30 | elif subset == "fill_in_the_middle": 31 | return FIM_PROMPT[prompt_mode] 32 | elif subset == "code_repair": 33 | return CODEREPAIR_PROMPT[prompt_mode] 34 | elif subset == "defect_detection": 35 | return DEFECT_PROMPT[prompt_mode] 36 | else: 37 | raise ValueError(f"Invalid subset: {subset}") 38 | 39 | 40 | class CodeMMLU: 41 | """CodeMMLU benchmark loader.""" 42 | TASK_NAME = "codemmlu" 43 | DATASET_NAME_OR_PATH = "Fsoft-AIC/codemmlu" 44 | 45 | def __init__(self, 46 | split: str, 47 | subset: str, 48 | prompt_mode: str = "zeroshot", 49 | instruction_prefix: Optional[str] = "", 50 | assistant_prefix: Optional[str] = "") -> None: 51 | 52 | self.stop_words = ['\n\nQ:', '\n\nQuestion:', '\n\n###', '\n#', "\n<|/", "\n```"] 53 | self.instruction_prefix = instruction_prefix 54 | self.assistant_prefix = assistant_prefix 55 | self.split = split 56 | self.subset = subset 57 | self.prompt_mode = prompt_mode 58 | 59 | self.dataset = load_dataset(self.DATASET_NAME_OR_PATH, subset, 60 | split=split, use_auth_token=True) 61 | 62 | def __len__(self): 63 | return len(self.dataset) 64 | 65 | def get_dataset(self) -> Dataset: 66 | return self.dataset 67 | 68 | def prepare_dataset(self) -> Dataset: 69 | """Preprocess CodeMMLU question. 70 | 71 | - Default CodeMMLU prompt is zeroshot. 
All support prompt modes are: 72 | - zeroshot 73 | - fewshot 74 | - cot_zs (Chain-of-Thought zershot) 75 | - cot_fs (Chain-of-Thought fewshot) 76 | """ 77 | 78 | TEMPLATE = get_prompt(self.subset, self.prompt_mode) 79 | 80 | def _preprocess(example): 81 | model_inputs = dict(task_id=[], question=[]) 82 | 83 | # for idx in range(len(examples[key_column])): 84 | # question = examples[key_column][idx] 85 | task_id = example.pop('task_id') 86 | example['choices'] = "\n".join([f"({ascii_uppercase[idx]}) {choice}" for idx, choice in enumerate(example['choices'])]) 87 | 88 | # MODEL INPUTS HERE 89 | question = TEMPLATE.format(**example) 90 | question = self.instruction_prefix + question + self.assistant_prefix 91 | model_inputs['question'] = question 92 | model_inputs['task_id'] = task_id 93 | 94 | return model_inputs 95 | 96 | preprocessed_ds = self.dataset.map(_preprocess, 97 | batched=False, 98 | remove_columns=self.dataset.column_names) 99 | 100 | print(f"Preprocessed dataset: {preprocessed_ds}") 101 | # Visualize 3 sample 102 | print("Preprocessed prompts:") 103 | for i in range(3): 104 | print(preprocessed_ds['question'][i]) 105 | return preprocessed_ds 106 | 107 | 108 | @staticmethod 109 | def _stop_at_stop_token(decoded_string, stop_tokens): 110 | """ 111 | Produces the prefix of decoded_string that ends at the first occurrence of 112 | a stop_token. 113 | WARNING: the decoded_string *must not* include the prompt, 114 | which may have stop tokens itself. 115 | """ 116 | min_stop_index = len(decoded_string) 117 | for stop_token in stop_tokens: 118 | stop_index = decoded_string.find(stop_token) 119 | if stop_index != -1 and stop_index < min_stop_index: 120 | min_stop_index = stop_index 121 | return decoded_string[:min_stop_index] 122 | 123 | def process_response(self, example): 124 | answer = self._stop_at_stop_token(example, self.stop_words) 125 | 126 | # Substitute special characters with empty string 127 | answer = re.sub(r'[^A-Za-z0-9 \n]', "", answer) 128 | new_answer = [] 129 | for item in answer.splitlines(): 130 | for subitem in item.split(" "): 131 | if len(subitem) != 1: 132 | new_answer.append(subitem.lower()) 133 | else: 134 | new_answer.append(subitem) 135 | 136 | new_answer = ' '.join(new_answer) 137 | new_answer = re.sub(r'\s+', ' ', new_answer).strip() 138 | 139 | return new_answer 140 | 141 | def parse_answer(self, example): 142 | """Answer extract function. 143 | 144 | Args: 145 | example (str): The example to extract the answer from 146 | Returns: 147 | str: The extracted answer 148 | """ 149 | extract = re.search(r"answer is (\(*[A-E][\).]*)", example, flags=re.IGNORECASE) 150 | if extract: 151 | return extract.group(1).replace("(", "").replace(")", "").replace(".", "").strip() 152 | 153 | 154 | extract = re.search(r"(\(*[A-E][\).]*) is correct", example, flags=re.IGNORECASE) 155 | if extract: 156 | return extract.group(1).replace("(", "").replace(")", "").replace(".", "").strip() 157 | 158 | 159 | match = re.findall(r"(A|B|C|D|E)", example) 160 | 161 | if match: 162 | # if len(match) > 1: 163 | # return None 164 | return list(match)[0] # Take the first one 165 | return None 166 | --------------------------------------------------------------------------------
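As a closing illustration of how choice letters are recovered from raw model responses, the sketch below mirrors the "answer is (X)" pattern and the bare-letter fallback used by `CodeMMLU.parse_answer` above (the sample responses are invented):

```python
import re

def extract_choice(response: str):
    # Prefer an explicit "The answer is (B)."-style conclusion, case-insensitive.
    m = re.search(r"answer is (\(*[A-E][\).]*)", response, flags=re.IGNORECASE)
    if m:
        return m.group(1).replace("(", "").replace(")", "").replace(".", "").strip()
    # Otherwise fall back to the first bare A-E letter, as the benchmark code does.
    letters = re.findall(r"(A|B|C|D|E)", response)
    return letters[0] if letters else None

print(extract_choice("Let's think step by step. ... The answer is (B)."))  # -> B
print(extract_choice("I would go with option C here."))                    # -> C
```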