├── .gitignore
├── HISTORY.md
├── LICENSE
├── README.md
├── asset
│   └── code_mmlu_banner.png
├── data
│   ├── README.md
│   ├── codemmlu
│   │   └── codemmlu.py
│   └── submission_sample.txt
├── paper
│   └── 2410.01999v4.pdf
├── pyproject.toml
├── requirements.txt
└── src
    └── codemmlu
        ├── __init__.py
        ├── __main__.py
        ├── backends
        │   ├── __init__.py
        │   ├── base.py
        │   ├── hf.py
        │   └── vllm.py
        ├── evaluator.py
        ├── prompts
        │   ├── __init__.py
        │   ├── _codecomp.py
        │   ├── _coderepair.py
        │   ├── _defect.py
        │   ├── _fim.py
        │   └── _general.py
        └── task_utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
164 | *.json
165 | *.jsonl
--------------------------------------------------------------------------------
/HISTORY.md:
--------------------------------------------------------------------------------
1 | # Releases
2 |
3 | ## Version 0.0.1
4 | Release date: 14 Oct, 2024
5 | - Supported backends: native HuggingFace, vLLM
6 | - Support model checkpoints from the HuggingFace Hub
7 | - Support LoRA models
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | Copyright © 2024 FPT Software AI Center
3 |
4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
5 |
6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
7 |
8 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CodeMMLU: A Multi-Task Benchmark for Assessing Code Understanding Capabilities
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 | 📰 News •
17 | 🚀 Quick Start •
18 | 📋 Evaluation •
19 | 📌 Citation
20 |
21 |
22 | ## 📌 About
23 |
24 | ### CodeMMLU
25 |
26 | **CodeMMLU** is a comprehensive benchmark designed to evaluate the capabilities of large language models (LLMs) in coding and software knowledge.
27 | It builds upon the structure of multiple-choice question answering (MCQA) to cover a wide range of programming tasks and domains, including code generation, defect detection, software engineering principles, and much more.
28 |
29 | ### Why CodeMMLU?
30 |
31 | - **CodeMMLU** comprises over 10,000 questions curated from diverse, high-quality sources. It covers a wide spectrum of software knowledge, including general QA, code generation, defect detection, and code repair across various domains and more than 10 programming languages.
32 |
33 | - **Precise and comprehensive:** Check out our [LEADERBOARD](https://fsoft-ai4code.github.io/codemmlu/) for the latest LLM rankings.
34 |
35 | ## 🚀 Quick Start
36 |
37 | Install CodeMMLU and set up dependencies via `pip`:
38 | ```bash
39 | pip install codemmlu
40 | ```
41 |
42 | Generate responses for the CodeMMLU MCQA benchmark:
43 | ```bash
44 | codemmlu --model_name <model_name_or_path> \
45 |     --subset <subset> \
46 |     --backend <hf|vllm> \
47 |     --output_dir <output_dir>
48 | ```
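
You can also drive the same pipeline from Python. The snippet below is only a sketch: it follows the `Evaluator` signature in `src/codemmlu/evaluator.py` (and the import path from its docstring), and the checkpoint name is a placeholder, not a recommendation:

```python
# Minimal sketch of programmatic use, mirroring what the CLI does internally
# (see src/codemmlu/__main__.py and src/codemmlu/evaluator.py).
from codemmlu import Evaluator  # import path taken from the evaluator.py docstring

evaluator = Evaluator(
    model_name="<model_name_or_path>",  # placeholder: any HF Hub id or local path
    subset="programming_syntax",        # one of the subsets listed in task_utils.py
    backend="hf",                       # "hf" or "vllm"
    prompt_mode="zeroshot",             # zeroshot, fewshot, cot_zs, or cot_fs
    batch_size=16,
    output_dir="./output",
)
# Runs generation; the backends write the raw generations as .jsonl files under output_dir.
results = evaluator.generate(max_new_tokens=1024, temperature=0.0)
```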
49 |
50 |
51 | ## 📋 Evaluation
52 |
53 | Build `codemmlu` from source:
54 | ```bash
55 | git clone https://github.com/Fsoft-AI4Code/CodeMMLU.git
56 | cd CodeMMLU
57 | pip install -e .
58 | ```
59 |
60 | > [!NOTE]
61 | >
62 | > If you prefer the `vllm` backend, we highly recommend installing [vllm from the official project](https://github.com/vllm-project/vllm/) before installing `codemmlu`.
63 |
64 | Generate responses to CodeMMLU questions:
65 | ```bash
66 | codemmlu --model_name <model_name_or_path> \
67 |     --peft_model <lora_adapter_path> \
68 |     --subset all \
69 |     --batch_size 16 \
70 |     --backend [vllm|hf] \
71 |     --max_new_tokens 1024 \
72 |     --temperature 0.0 \
73 |     --output_dir <output_dir> \
74 |     --instruction_prefix <instruction_prefix> \
75 |     --assistant_prefix <assistant_prefix> \
76 |     --cache_dir <cache_dir>
77 | ```
78 |
79 | ⏬ API Usage:
80 |
81 |
82 | ```bash
83 | codemmlu [-h] [-V] [--subset SUBSET] [--batch_size BATCH_SIZE] [--instruction_prefix INSTRUCTION_PREFIX]
84 | [--assistant_prefix ASSISTANT_PREFIX] [--output_dir OUTPUT_DIR] [--model_name MODEL_NAME]
85 | [--peft_model PEFT_MODEL] [--backend BACKEND] [--max_new_tokens MAX_NEW_TOKENS]
86 | [--temperature TEMPERATURE] [--prompt_mode PROMPT_MODE] [--cache_dir CACHE_DIR] [--trust_remote_code]
87 |
88 | ==================== CodeMMLU ====================
89 |
90 | optional arguments:
91 | -h, --help show this help message and exit
92 | -V, --version Get version
93 | --subset SUBSET Select evaluate subset
94 | --batch_size BATCH_SIZE
95 | --instruction_prefix INSTRUCTION_PREFIX
96 | --assistant_prefix ASSISTANT_PREFIX
97 | --output_dir OUTPUT_DIR
98 | Save generation and result path
99 | --model_name MODEL_NAME
100 | Local path or Huggingface Hub link to load model
101 | --peft_model PEFT_MODEL
102 | Lora config
103 | --backend BACKEND LLM generation backend (default: hf)
104 | --max_new_tokens MAX_NEW_TOKENS
105 | Number of max new tokens
106 | --temperature TEMPERATURE
107 | --prompt_mode PROMPT_MODE
108 | Prompt available: zeroshot, fewshot, cot_zs, cot_fs
109 | --cache_dir CACHE_DIR
110 | Cache for save model download checkpoint and dataset
111 | --trust_remote_code
112 | ```
113 |
114 |
115 |
116 |
117 |
118 | List of supported backends:
119 |
120 | | Backend | DecoderModel | LoRA |
121 | |------------------ |-------------- |------ |
122 | | [Transformers](https://github.com/huggingface/transformers) (hf) | ✅ | ✅ |
123 | | [Vllm](https://github.com/vllm-project/vllm) (vllm) | ✅ | ✅ |
124 |
125 | ### Leaderboard
126 | To evaluate your model and submit your results to the [leaderboard](https://fsoft-ai4code.github.io/codemmlu/), please follow the instructions in [data/README.md](data/README.md).
127 |
128 | ## 📌 Citation
129 | If you find this repository useful, please consider citing our paper:
130 |
131 | ```
132 | @article{nguyen2024codemmlu,
133 | title={CodeMMLU: A Multi-Task Benchmark for Assessing Code Understanding Capabilities},
134 | author={Nguyen, Dung Manh and Phan, Thang Chau and Le, Nam Hai and Doan, Thong T. and Nguyen, Nam V. and Pham, Quang and Bui, Nghi D. Q.},
135 | journal={arXiv preprint},
136 | year={2024}
137 | }
138 | ```
139 |
--------------------------------------------------------------------------------
/asset/code_mmlu_banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeMMLU/2999a6a888734a9dc48cc84d8a77a8311e9ca245/asset/code_mmlu_banner.png
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | ## Evaluation Submission
2 |
3 | To submit your model's results to the [leaderboard](https://fsoft-ai4code.github.io/leaderboards/codemmlu/), please send us an email at `dungnm31@fpt.com` with the following information:
4 |
5 | - **Model Name**: The name of your model.
6 | - **Model Description**: A brief description of your model.
7 | - **Model Configuration**:
8 | - Base or Instruct
9 | - Base or LoRA
10 | - **Model Answer**: The responses generated by your model, following the format below (one `<question_id> <answer_letter>` pair per line):
11 | 
12 | ```
13 | <question_id> <answer_letter>
14 | ```
15 |
16 | Check out the example in `submission_sample.txt`.
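
If it helps, here is a rough, unofficial sketch for turning a generated `<subset>.final.generated.jsonl` file (as written by the backends in `src/codemmlu/backends/`) into such answer lines. The answer-extraction regex loosely mirrors `CodeMMLU.parse_answer` in `src/codemmlu/task_utils.py`, and the file names are assumptions:

```python
# Unofficial helper sketch: convert generated responses into "<id> <letter>" lines.
import json
import re

def extract_letter(response: str) -> str:
    """Pull an A-E choice out of a free-form model response (best effort)."""
    m = re.search(r"answer is \(?([A-E])", response, flags=re.IGNORECASE)
    if not m:
        m = re.search(r"\b([A-E])\b", response)  # fall back to the first bare letter
    return m.group(1).upper() if m else ""

with open("output/programming_syntax.final.generated.jsonl") as fin, \
     open("submission.txt", "w") as fout:
    for line in fin:
        record = json.loads(line)  # keys: task_id, prompt, response
        fout.write(f"{record['task_id']} {extract_letter(record['response'])}\n")
```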
17 |
18 |
--------------------------------------------------------------------------------
/data/codemmlu/codemmlu.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """The CodeMMLU benchmark."""
15 |
16 | import os
17 | import json
18 | from glob import glob
19 |
20 | import datasets
21 |
22 |
23 | _CITATION = """\
24 | @article{nguyen2024codemmlu,
25 | title={CodeMMLU: A Multi-Task Benchmark for Assessing Code Understanding Capabilities},
26 | author={Nguyen, Dung Manh and Phan, Thang Chau and Le, Nam Hai and Doan, Thong T. and Nguyen, Nam V. and Pham, Quang and Bui, Nghi D. Q.},
27 | journal={arXiv preprint},
28 | year={2024}
29 | }
30 | """
31 |
32 | _DESCRIPTION = """\
33 | CodeMMLU is a comprehensive benchmark designed to evaluate the capabilities of large language models (LLMs) in coding and software knowledge
34 | """
35 |
36 | _HOMEPAGE = "https://fsoft-ai4code.github.io/codemmlu/"
37 |
38 | _URL = "./data/test"
39 |
40 | _SUBJECTS = [
41 | "programming_syntax", "api_frameworks",
42 | "software_principles", "dbms_sql", "others",
43 | "code_completion", "fill_in_the_middle", "code_repair", "defect_detection"
44 | ]
45 |
46 |
47 | class CodeMMLU(datasets.GeneratorBasedBuilder):
48 | """CodeMMLU: A Multi-Task Benchmark for Assessing Code Understanding Capabilities"""
49 | # Version history:
50 | # 0.0.1: Initial release.
51 | VERSION = datasets.Version("0.0.1")
52 |
53 | BUILDER_CONFIGS = [
54 | datasets.BuilderConfig(
55 | name=sub, version=datasets.Version("0.0.1"),
56 | description="CodeMMLU test subject {}".format(sub)
57 | ) for sub in _SUBJECTS
58 | ]
59 |
60 |
61 | def _info(self):
62 | features = datasets.Features(
63 | {
64 | "task_id": datasets.Value("string"),
65 | "question": datasets.Value("string"),
66 | "choices": datasets.features.Sequence(datasets.Value("string")),
67 | }
68 | )
69 |
70 | if self.config.name == "fill_in_the_middle":
71 | features["problem_description"] = datasets.Value("string")
72 |
73 | return datasets.DatasetInfo(
74 | description=_DESCRIPTION,
75 | features=features,
76 | homepage=_HOMEPAGE,
77 | citation=_CITATION,
78 | )
79 |
80 | def _split_generators(self, dl_manager):
81 | """Returns SplitGenerators."""
82 | path = os.path.join(_URL, self.config.name + ".jsonl")
83 | dl_dir = dl_manager.download(path)
84 | return [
85 | datasets.SplitGenerator(
86 | name=datasets.Split.TEST,
87 | gen_kwargs={"data_path": dl_dir},
88 | ),
89 | ]
90 |
91 | def _generate_examples(self, data_path):
92 | """This function returns the examples in the raw (text) form."""
93 | if data_path.endswith(".jsonl"):
94 | lines = open(data_path, "r", encoding="utf-8").readlines()
95 | reader = [json.loads(line) for line in lines]
96 | for idx, data in enumerate(reader):
97 | return_dict = {
98 | "task_id": data['task_id'],
99 | "question": data['question'],
100 | "choices": data['choices'],
101 | }
102 |
103 | if "fill_in_the_middle" in data_path:
104 | return_dict['problem_description'] = data['problem_description']
105 |
106 | yield idx, return_dict
107 |
--------------------------------------------------------------------------------
/data/submission_sample.txt:
--------------------------------------------------------------------------------
1 | 1 A
2 | 2 B
3 | 3 C
4 | 4 D
--------------------------------------------------------------------------------
/paper/2410.01999v4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeMMLU/2999a6a888734a9dc48cc84d8a77a8311e9ca245/paper/2410.01999v4.pdf
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "codemmlu"
7 | version = "0.0.2.1"
8 | authors = [
9 | { name="Dung Manh Nguyen", email="dungnm.workspace@gmail.com" },
10 | ]
11 | description = "CodeMMLU Evaluator: A framework for evaluating language models on CodeMMLU benchmark."
12 | readme = "README.md"
13 | requires-python = ">=3.9"
14 | classifiers = [
15 | "Programming Language :: Python :: 3",
16 | "License :: OSI Approved :: MIT License",
17 | "Operating System :: OS Independent",
18 | ]
19 | dependencies = [
20 | "transformers>=4.39.0",
21 | "datasets>=2.17.1",
22 | "accelerate>=0.27.2",
23 | "deepspeed>=0.13.2",
24 | "peft>=0.10.0",
25 | "vllm"
26 | ]
27 |
28 | [project.urls]
29 | "Homepage" = "https://fsoft-ai4code.github.io/codemmlu/"
30 | "Bug Tracker" = "https://github.com/FSoft-AI4Code/CodeMMLU/issues"
31 |
32 | [project.scripts]
33 | codemmlu = "codemmlu.__main__:main"
34 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.39.0
2 | accelerate==0.27.2
3 | bitsandbytes==0.42.0
4 | datasets==2.17.1
5 | deepspeed==0.13.2
6 | peft==0.10.0
7 | vllm
--------------------------------------------------------------------------------
/src/codemmlu/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeMMLU/2999a6a888734a9dc48cc84d8a77a8311e9ca245/src/codemmlu/__init__.py
--------------------------------------------------------------------------------
/src/codemmlu/__main__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import pkg_resources
4 |
5 | from codemmlu.task_utils import ALL_TASK
6 |
7 | def get_args():
8 | parser = argparse.ArgumentParser(description=f"{20*'='} CodeMMLU {20*'='}")
9 |
10 | parser.add_argument("-V", "--version", action="version", help="Get version",
11 | version=pkg_resources.get_distribution("codemmlu").version)
12 |
13 | # Data args
14 | parser.add_argument("--subset", default="programming_syntax", type=str,
15 | help='Select evaluate subset')
16 | parser.add_argument("--batch_size", default=16, type=int)
17 | parser.add_argument("--instruction_prefix", default="", type=str)
18 | parser.add_argument("--assistant_prefix", default="", type=str)
19 | parser.add_argument("--output_dir", default="./output", type=str,
20 | help='Save generation and result path')
21 |
22 | # Generation args
23 | parser.add_argument("--model_name", type=str,
24 | help='Local path or Huggingface Hub link to load model')
25 | parser.add_argument("--peft_model", default=None, type=str,
26 | help='Lora config')
27 | parser.add_argument("--backend", default="hf", type=str,
28 | help="LLM generation backend (default: hf)")
29 | parser.add_argument("--max_new_tokens", default=128, type=int,
30 | help='Number of max new tokens')
31 | parser.add_argument("--temperature", default=0.0, type=float)
32 | parser.add_argument("--prompt_mode", default='zeroshot', type=str,
33 | help='Prompt available: zeroshot, fewshot, cot_zs, cot_fs')
34 | parser.add_argument("--cache_dir", default=None, type=str,
35 | help='Cache for save model download checkpoint and dataset')
36 | parser.add_argument("--trust_remote_code", action='store_true')
37 |
38 | args = parser.parse_args()
39 |
40 | if not args.cache_dir:
41 | TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE")
42 | HF_HOME = os.getenv("HF_HOME")
43 | if TRANSFORMERS_CACHE:
44 | args.cache_dir = TRANSFORMERS_CACHE
45 | else:
46 | args.cache_dir = HF_HOME
47 |
48 | assert args.subset in ALL_TASK, f"Invalid subset name, expect {ALL_TASK}, but got {args.subset}"
49 |
50 | return args, parser
51 |
52 |
53 | def main():
54 | args, parser = get_args()
55 | if args.model_name:
56 | generate(args=args)
57 | else:
58 | parser.print_help()
59 |
60 |
61 | def generate(args):
62 | from codemmlu.evaluator import Evaluator
63 |
64 | evaluator = Evaluator(
65 | subset=args.subset,
66 | model_name=args.model_name,
67 | peft_model=args.peft_model,
68 | backend=args.backend,
69 | batch_size=args.batch_size,
70 | cache_dir=args.cache_dir,
71 | output_dir=args.output_dir,
72 | trust_remote_code=args.trust_remote_code,
73 | instruction_prefix=args.instruction_prefix,
74 | assistant_prefix=args.assistant_prefix,
75 | prompt_mode=args.prompt_mode,
76 | )
77 |
78 | evaluator.generate(
79 | temperature=args.temperature,
80 | max_new_tokens=args.max_new_tokens,
81 | )
82 |
83 | print("======= Finish generated =======")
84 |
85 | if __name__ == '__main__':
86 | main()
--------------------------------------------------------------------------------
/src/codemmlu/backends/__init__.py:
--------------------------------------------------------------------------------
1 | from codemmlu.backends.base import Backend
2 | from codemmlu.task_utils import CodeMMLU
3 |
4 | SUPPORTED_BACKENDS = ["vllm", "hf"]
5 |
6 | def make_model(
7 | model_name: str,
8 | backend: str,
9 | subset: str,
10 | split: str,
11 | output_dir: str,
12 | temperature: float = 0.0,
13 | max_new_tokens: int = 1280,
14 | batch_size: int = 16,
15 | prompt_mode: str = "zeroshot",
16 | # instruction model only
17 | instruction_prefix: str = None,
18 | assistant_prefix: str = None,
19 | trust_remote_code: bool = False,
20 | # peft model only
21 | peft_model: str = None,
22 | # cache dir
23 | cache_dir: str = None,
24 |
25 | ) -> Backend:
26 | # Load dataset
27 | dataset = CodeMMLU(subset=subset,
28 | split=split,
29 | prompt_mode=prompt_mode,
30 | instruction_prefix=instruction_prefix,
31 | assistant_prefix=assistant_prefix)
32 |
33 | # Initialize backend
34 | if backend == "vllm":
35 | from codemmlu.backends.vllm import VllmEngine
36 |
37 | return VllmEngine(
38 | model_name=model_name,
39 | peft_model=peft_model,
40 | dataset=dataset,
41 | temperature=temperature,
42 | batch_size=batch_size,
43 | max_new_tokens=max_new_tokens,
44 | trust_remote_code=trust_remote_code,
45 | cache_dir=cache_dir,
46 | output_dir=output_dir
47 | )
48 | elif backend == "hf":
49 | from codemmlu.backends.hf import HuggingfaceEngine
50 |
51 | return HuggingfaceEngine(
52 | model_name=model_name,
53 | peft_model=peft_model,
54 | dataset=dataset,
55 | temperature=temperature,
56 | batch_size=batch_size,
57 | max_new_tokens=max_new_tokens,
58 | trust_remote_code=trust_remote_code,
59 | cache_dir=cache_dir,
60 | output_dir=output_dir
61 | )
62 | else:
63 | raise ValueError(f"Unknown backend: {backend}")
64 |
--------------------------------------------------------------------------------
/src/codemmlu/backends/base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from codemmlu.task_utils import CodeMMLU
3 |
4 | class Backend(ABC):
5 | def __init__(self,
6 | dataset: CodeMMLU,
7 | model_name: str,
8 | temperature: float,
9 | max_new_tokens: int,
10 | peft_model: str = None,
11 | batch_size: int = 16,
12 | trust_remote_code: bool = False,
13 | cache_dir: str = None,
14 | output_dir: str='./'):
15 | print(f"Initializing {self.__class__.__name__} backend")
16 | print(f"Initializing a decoding model: {model_name}")
17 |
18 | self.TASK_NAME = dataset.TASK_NAME
19 | self.subset = dataset.subset
20 | self.split = dataset.split
21 | self.model_name = model_name
22 | self.batch_size = batch_size
23 | self.peft_model = peft_model
24 | self.cache_dir = cache_dir
25 | self.output_dir = output_dir
26 | self.dataset = dataset.prepare_dataset()
27 | self.temperature = temperature
28 | self.max_new_tokens = max_new_tokens
29 | self.trust_remote_code = trust_remote_code
30 |
31 |
32 | @abstractmethod
33 | def generate(self) -> str:
34 | raise NotImplementedError
35 |
36 | def get_dataset(self) -> CodeMMLU:
37 | return self.dataset
38 |
--------------------------------------------------------------------------------
/src/codemmlu/backends/hf.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from typing import Dict
4 | from tqdm import tqdm
5 |
6 | from accelerate import Accelerator
7 | from accelerate.utils import gather_object
8 | from transformers import (
9 | GenerationConfig,
10 | AutoModelForCausalLM,
11 | AutoModelForSeq2SeqLM,
12 | AutoTokenizer
13 | )
14 |
15 | from codemmlu.backends.base import Backend
16 |
17 | class HuggingfaceEngine(Backend):
18 | def __init__(self, model_name: str, **kwargs):
19 | super().__init__(model_name=model_name, **kwargs)
20 | self.accelerator = Accelerator()
21 |
22 | # TODO: add generation args
23 | generate_args = dict(
24 | temperature=self.temperature,
25 | max_new_tokens=self.max_new_tokens,
26 | )
27 | self.generation_config = GenerationConfig(**generate_args)
28 |
29 | model_kwargs = dict(
30 | cache_dir=self.cache_dir,
31 | trust_remote_code=self.trust_remote_code,
32 | load_in_8bit=False
33 | )
34 | try:
35 | self.model = AutoModelForCausalLM.from_pretrained(
36 | self.model_name, **model_kwargs)
37 |
38 | except KeyError: # Except load seq2seq model
39 | self.model = AutoModelForSeq2SeqLM.from_pretrained(
40 | self.model_name, **model_kwargs)
41 |
42 | if self.peft_model:
43 | from peft import PeftModel
44 | self.model = PeftModel.from_pretrained(self.model, self.peft_model)
45 |
46 | self.model.to(self.accelerator.device)
47 |
48 | self.tokenizer = AutoTokenizer.from_pretrained(
49 | self.model_name,
50 | trust_remote_code=self.trust_remote_code,
51 | padding_side="left"
52 | )
53 |
54 | if not self.tokenizer.pad_token:
55 | print("Set EOS_TOKEN to PAD_TOKEN")
56 | self.tokenizer.pad_token = self.tokenizer.eos_token
57 |
58 | def generate(self) -> str:
59 | # ``Accelerate`` distribute data and model
60 | assert self.accelerator
61 |
62 | ds_loader = [self.dataset[i:i+self.batch_size]
63 | for i in range(0, len(self.dataset), self.batch_size)]
64 |
65 | for i in range(len(ds_loader)):
66 | question = ds_loader[i]['question']
67 | ds_loader[i]['question_ids'] = self.tokenizer(question, return_tensors="pt", padding=True)
68 |
69 | result = []
70 | with self.accelerator.split_between_processes(ds_loader, apply_padding=True) as batched_prompts:
71 | index = self.accelerator.process_index
72 | for batch in tqdm(batched_prompts, desc=f"Process: {index} | Generating", position=index):
73 | input_ids = batch['question_ids'].to(self.accelerator.device)
74 | outputs = self.model.generate(**input_ids,
75 | generation_config=self.generation_config,
76 | pad_token_id=self.tokenizer.eos_token_id,
77 | eos_token_id=self.tokenizer.eos_token_id)
78 |
79 | outputs = [output[len(prompt) :] for prompt, output in zip(input_ids["input_ids"], outputs)]
80 | batch_results = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
81 |
82 | batch['generation'] = batch_results
83 | result.extend(batch['generation'])
84 | self._save_result(batch)
85 |
86 |
87 | result_gather = gather_object(result)[: len(self.dataset)]
88 | self.dataset = self.dataset.add_column('generation', result_gather)
89 | # TODO: process response and extract answer
90 | return self.dataset
91 |
92 | def _save_result(self, batched_outputs: Dict):
93 | assert 'question' in batched_outputs.keys()
94 | assert 'generation' in batched_outputs.keys()
95 |
96 | if self.accelerator.distributed_type == "MULTI_GPU":
97 | save_path = os.path.join(self.output_dir,
98 | f"{self.subset}.raw.generated.{self.accelerator.process_index}.jsonl")
99 | else:
100 | save_path = os.path.join(self.output_dir, f"{self.subset}.final.generated.jsonl")
101 |
102 | with open(save_path, "a") as writer:
103 | for idx in range(len(batched_outputs['question'])):
104 | res = dict(
105 | task_id=batched_outputs['task_id'][idx],
106 | prompt=batched_outputs['question'][idx],
107 | response=batched_outputs['generation'][idx]
108 | )
109 |
110 | json.dump(res, writer)
111 | writer.write("\n")
112 |
--------------------------------------------------------------------------------
/src/codemmlu/backends/vllm.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | from tqdm import tqdm
4 | from typing import Dict
5 |
6 | import torch
7 | from vllm import LLM, SamplingParams
8 | from vllm.lora.request import LoRARequest
9 |
10 | from codemmlu.backends.base import Backend
11 |
12 | class VllmEngine(Backend):
13 | def __init__(self, model_name: str, **kwargs):
14 | super().__init__(model_name=model_name, **kwargs)
15 | ngpus = torch.cuda.device_count()
16 | backend_kwargs = dict(
17 | disable_log_stats=True,
18 | tensor_parallel_size=ngpus,
19 | download_dir=self.cache_dir,
20 | trust_remote_code=self.trust_remote_code,
21 | )
22 |
23 | self.model = LLM(self.model_name,
24 | enable_lora=True if self.peft_model else None,
25 | **backend_kwargs)
26 |
27 | self.lora_request = None
28 | if self.peft_model:
29 | self.lora_request=LoRARequest("lora", 1, self.peft_model)
30 |
31 | self.sampling_params = SamplingParams(
32 | max_tokens=self.max_new_tokens,
33 | temperature=self.temperature,
34 | )
35 |
36 | def generate(self):
37 | ds_loader = [self.dataset[i:i+self.batch_size]
38 | for i in range(0, len(self.dataset), self.batch_size)]
39 |
40 | result = []
41 | for batch in tqdm(ds_loader, total=len(ds_loader), desc="Generating"):
42 | outputs = self.model.generate(batch['question'],
43 | self.sampling_params,
44 | lora_request=self.lora_request)
45 |
46 | batch['generation'] = [output.outputs[0].text for output in outputs]
47 | result.extend(batch['generation'])
48 | self._save_result(batch)
49 |
50 | self.dataset = self.dataset.add_column('generation', result)
51 | # TODO: process response and extract answer
52 | return self.dataset
53 |
54 | def _save_result(self, batched_outputs: Dict):
55 | assert 'question' in batched_outputs.keys()
56 | assert 'generation' in batched_outputs.keys()
57 |
58 | save_path = os.path.join(self.output_dir, f"{self.subset}.final.generated.jsonl")
59 |
60 | with open(save_path, "a") as writer:
61 | for idx in range(len(batched_outputs['question'])):
62 | res = dict(
63 | task_id=batched_outputs['task_id'][idx],
64 | prompt=batched_outputs['question'][idx],
65 | response=batched_outputs['generation'][idx]
66 | )
67 |
68 | json.dump(res, writer)
69 | writer.write("\n")
70 |
--------------------------------------------------------------------------------
/src/codemmlu/evaluator.py:
--------------------------------------------------------------------------------
1 | """Evaluator to load CodeMMLU and extract answer from response.
2 |
3 | For example:
4 |
5 | .. code-block:: python
6 |
7 | >>> from codemmlu import Evaluator
8 |     >>> evaluator = Evaluator(model_name="<model_name_or_path>", subset="programming_syntax")
9 |     >>> response = evaluator.generate(temperature=0.9, max_new_tokens=1024)
10 |
11 | """
12 | import os
13 | import sys
14 | import json
15 | import time
16 | from warnings import warn
17 | from typing import Optional, Dict, List
18 |
19 | import torch
20 |
21 | from codemmlu.backends import make_model, SUPPORTED_BACKENDS, Backend
22 |
23 | class Evaluator:
24 | """Evaluator class.
25 |
26 | :param model_name: Selected model for evaluating
27 | :type model_name: str
28 | :param peft_model: Adapter model, defaults to None
29 | :type peft_model: Optional[str], optional
30 | :param trust_remote_code: Huggingface argument, defaults to False
31 | :type trust_remote_code: Optional[bool], optional
32 | :param cache_dir: Downloaded cache directory, defaults to None
33 | :type cache_dir: Optional[str], optional
34 | :param batch_size: Generation batch size, defaults to 16
35 | :type batch_size: Optional[int], optional
36 | :param output_dir: Saving generation directory, defaults to "./output"
37 | :type output_dir: Optional[str], optional
38 | """
39 |
40 | def __init__(self,
41 | model_name: str,
42 | subset: Optional[str] = None,
43 | split: Optional[str] = "test",
44 | peft_model: Optional[str] = None,
45 | backend: str = "hf",
46 | trust_remote_code: Optional[bool] = False,
47 | cache_dir: Optional[str] = None,
48 | batch_size: Optional[int] = 16,
49 | output_dir: Optional[str] = "./output",
50 | instruction_prefix: Optional[str] = "",
51 | assistant_prefix: Optional[str] = "",
52 | prompt_mode: Optional[str] = None,
53 | ) -> None:
54 |
55 | # Dataset args
56 | self.split = split
57 | self.subset = subset
58 | self.instruction_prefix = instruction_prefix
59 | self.assistant_prefix = assistant_prefix
60 |
61 | # Generation args
62 | self.backend = backend
63 | self.model_name = model_name
64 | self.peft_model = peft_model
65 | self.output_dir = output_dir
66 | self.trust_remote_code = trust_remote_code
67 | self.cache_dir = cache_dir
68 | self.batch_size = batch_size
69 | self.prompt_mode = prompt_mode
70 |
71 | if backend not in SUPPORTED_BACKENDS:
72 | raise ValueError(f"Backend {backend} is not supported. Please choose from {SUPPORTED_BACKENDS}")
73 |
74 | os.makedirs(self.output_dir, exist_ok=True)
75 |
76 |
77 | def generate(self,
78 | max_new_tokens: int = 1024,
79 | temperature: float = 0.0,
80 | ) -> List:
81 | """Start backend, generate and extract answer from response
82 |
83 | :param max_new_tokens: Max new tokens, defaults to 1024
84 | :type max_new_tokens: Optional[int], optional
85 | :param temperature: Model generation temperature, defaults to 0.0
86 | :type temperature: Optional[float], optional
87 |
88 | :return: List of generated result, stored in dictionary object
89 | with ``task_id``, ``prompt`` and ``answer`` key.
90 | :rtype: List
91 | """
92 | self.engine : Backend = make_model(
93 | subset=self.subset,
94 | split=self.split,
95 | model_name=self.model_name,
96 | backend=self.backend,
97 | peft_model=self.peft_model,
98 | trust_remote_code=self.trust_remote_code,
99 | batch_size=self.batch_size,
100 | temperature=temperature,
101 | max_new_tokens=max_new_tokens,
102 | cache_dir=self.cache_dir,
103 | instruction_prefix=self.instruction_prefix,
104 | assistant_prefix=self.assistant_prefix,
105 | output_dir=self.output_dir,
106 | prompt_mode=self.prompt_mode,
107 | )
108 |
109 |
110 | print(f"Evaluating task: [{self.engine.TASK_NAME}]")
111 | print(f"pt={torch.__version__}, cuda={torch.version.cuda}, nccl={torch.cuda.nccl.version()}")
112 | print(f"device compute capabilities={torch.cuda.get_device_capability()}")
113 | print(f"pytorch compute capabilities={torch.cuda.get_arch_list()}")
114 |
115 | start_time = time.time()
116 | results = self.engine.generate()
117 |
118 | print("======= Finished {} =======".format(self.engine.TASK_NAME))
119 | print("Completion time: %d s", (time.time() - start_time))
120 |
121 | return results
122 |
123 |
124 | def acc_evaluate(
125 | subset: str,
126 | response_path: str):
127 |
128 | pass
--------------------------------------------------------------------------------
/src/codemmlu/prompts/__init__.py:
--------------------------------------------------------------------------------
1 | from codemmlu.prompts._general import GENERAL_PROMPT
2 | from codemmlu.prompts._codecomp import CODECOMP_PROMPT
3 | from codemmlu.prompts._fim import FIM_PROMPT
4 | from codemmlu.prompts._coderepair import CODEREPAIR_PROMPT
5 | from codemmlu.prompts._defect import DEFECT_PROMPT
6 |
7 | __all__ = [
8 | "GENERAL_PROMPT",
9 | "CODECOMP_PROMPT",
10 | "FIM_PROMPT",
11 | "CODEREPAIR_PROMPT",
12 | "DEFECT_PROMPT"
13 | ]
--------------------------------------------------------------------------------
/src/codemmlu/prompts/_codecomp.py:
--------------------------------------------------------------------------------
1 |
2 | zeroshot = """The following are multiple choice questions (with answers) about
3 | programming problem.
4 |
5 | Question: Which solution below is the most likely completion the following
6 | code snippet to achieve the desired goal?
7 | {question}
8 |
9 | {choices}
10 |
11 | Answer: """
12 |
13 | fewshot = """The following are multiple choice questions (with answers) about
14 | programming problem.
15 |
16 | Question: Which solution below is the most likely completion the following
17 | code snippet to achieve the desired goal?
18 | ```python
19 | from typing import List
20 |
21 | def two_sum(nums: List[int], target: int) -> List[int]:
22 | '''
23 | Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target.
24 | You may assume that each input would have exactly one solution, and you may not use the same element twice.
25 |
26 | >>> two_sum([2,7,11,15], 9)
27 | [0,1]
28 | >>> two_sum([3,2,4], 6)
29 | [1,2]
30 | >>> two_sum([3,3], 6)
31 | [0,1]
32 | '''
33 | ```
34 | (A) ```python
35 | n = len(nums)
36 | for i in range(n - 1):
37 | for j in range(i + 1, n):
38 | if nums[i] + nums[j] == target:
39 | return [i, j]
40 | return []
41 | ```
42 | (B) ```python
43 | for num in nums:
44 | if target - num in nums:
45 | return [nums.index(num), nums.index(target - num)]
46 | return []
47 | ```
48 | (C) ```python
49 | for i in range(len(nums)):
50 | if nums[i] * 2 == target:
51 | return [i, i]
52 | return []
53 | ```
54 | (D) ```python
55 | num_dict = {{}}
56 | for i, num in enumerate(nums):
57 | if target - num in num_dict:
58 | return [num_dict[target - num], i]
59 | num_dict[i] = num
60 | return []
61 | ```
62 | Answer: The answer is (A).
63 |
64 | Question: Which solution below is the most likely completion the following
65 | code snippet to achieve the desired goal?
66 | ```python
67 | {question}
68 | ```
69 |
70 | {choices}
71 |
72 | Answer: """
73 |
74 | cot_zs = '''The following are multiple choice questions (with answers) about
75 | programming problem.
76 |
77 | Question: Which solution below is the most likely completion the following
78 | code snippet to achieve the desired goal?
79 | ```python
80 | {question}
81 | ```
82 | {choices}
83 |
84 | Answer: Let's think step by step. '''
85 |
86 | cot_fs = """The following are multiple choice questions (with answers) about
87 | programming problem.
88 |
89 | Question: Which solution below is the most likely completion the following
90 | code snippet to achieve the desired goal?
91 | ```python
92 | from typing import List
93 |
94 | def two_sum(nums: List[int], target: int) -> List[int]:
95 | '''
96 | Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target.
97 | You may assume that each input would have exactly one solution, and you may not use the same element twice.
98 |
99 | >>> two_sum([2,7,11,15], 9)
100 | [0,1]
101 | >>> two_sum([3,2,4], 6)
102 | [1,2]
103 | >>> two_sum([3,3], 6)
104 | [0,1]
105 | '''
106 | ```
107 | (A) ```python
108 | n = len(nums)
109 | for i in range(n - 1):
110 | for j in range(i + 1, n):
111 | if nums[i] + nums[j] == target:
112 | return [i, j]
113 | return []
114 | ```
115 | (B) ```python
116 | for num in nums:
117 | if target - num in nums:
118 | return [nums.index(num), nums.index(target - num)]
119 | return []
120 | ```
121 | (C) ```python
122 | for i in range(len(nums)):
123 | if nums[i] * 2 == target:
124 | return [i, i]
125 | return []
126 | ```
127 | (D) ```python
128 | num_dict = {{}}
129 | for i, num in enumerate(nums):
130 | if target - num in num_dict:
131 | return [num_dict[target - num], i]
132 | num_dict[i] = num
133 | return []
134 | ```
135 |
136 | Answer: Let's think step by step. The answer (A) uses a straightforward brute-force approach by checking every possible pair of indices to see if their corresponding values sum to the target. While this method has a time complexity of O(n^2), it is simple and guaranteed to find the correct solution for small input sizes, as it exhaustively evaluates all pairs. This solution works reliably within the problem's constraints and ensures the correct indices are returned when the target sum is found. The other solutions have issues such as incorrect handling of duplicate values or incorrect logic (as in C) that disqualify them.
137 | The answer is (A).
138 |
139 | Question: Which solution below is the most likely completion the following
140 | code snippet to achieve the desired goal?
141 | ```python
142 | {question}
143 | ```
144 | {choices}
145 |
146 | Answer: Let's think step by step. """
147 |
148 | CODECOMP_PROMPT = dict(zeroshot=zeroshot,
149 | fewshot=fewshot,
150 | cot_zs=cot_zs,
151 | cot_fs=cot_fs)
--------------------------------------------------------------------------------
/src/codemmlu/prompts/_coderepair.py:
--------------------------------------------------------------------------------
1 |
2 | zeroshot = """The following are multiple-choice questions (with answers) about debugging a programming problem.
3 |
4 | Question: The implementation below is producing incorrect results.
5 | Which solution below correctly identifies the bug and repairs it to achieve the desired goal?
6 | {question}
7 |
8 | {choices}
9 |
10 | Answer: """
11 |
12 | fewshot = """The following are multiple-choice questions (with answers) about debugging a programming problem.
13 |
14 | Question: The implementation below is producing incorrect results.
15 | Which solution below correctly identifies the bug and repairs it to achieve the desired goal?
16 |
17 | 1 def two_sum(nums, target):
18 | 2 complement_map = {{}}
19 | 3 for i, num in enumerate(nums):
20 | 4 complement = target - num
21 | 5 complement_map[num] = i
22 | 6 if complement in complement_map:
23 | 7 return [complement_map[complement], i]
24 | 8 return None
25 |
26 | (A) Remove line 5.
27 |
28 | (B) Remove line 5. Add at line 7:
29 | ``` complement_map[num] = i```
30 |
31 | (C) Modify line 7:
32 | ``` return [i, complement_map[complement]]```
33 |
34 | (D) Remove line 5. Add at line 7:
35 | ``` if i == len(nums) - 1:
36 | return None
37 | complement_map[num] = i```
38 |
39 | Answer: The answer is (B).
40 |
41 | Question: The implementation below is producing incorrect results.
42 | Which solution below correctly identifies the bug and repairs it to achieve the desired goal?
43 | {question}
44 |
45 | {choices}
46 |
47 | Answer: """
48 |
49 | cot_zs = zeroshot + "Let's think step by step. "
50 |
51 | cot_fs = """The following are multiple-choice questions (with answers) about debugging a programming problem.
52 |
53 | Question: The implementation below is producing incorrect results.
54 | Which solution below correctly identifies the bug and repairs it to achieve the desired goal?
55 |
56 | 1 def two_sum(nums, target):
57 | 2 complement_map = {{}}
58 | 3 for i, num in enumerate(nums):
59 | 4 complement = target - num
60 | 5 complement_map[num] = i
61 | 6 if complement in complement_map:
62 | 7 return [complement_map[complement], i]
63 | 8 return None
64 |
65 | (A) Remove line 5.
66 |
67 | (B) Remove line 5. Add at line 7:
68 | ``` complement_map[num] = i```
69 |
70 | (C) Modify line 7:
71 | ``` return [i, complement_map[complement]]```
72 |
73 | (D) Remove line 5. Add at line 7:
74 | ``` if i == len(nums) - 1:
75 | return None
76 | complement_map[num] = i```
77 |
78 | Answer: Let's think step by step. The bug in the code occurs because the current number is added to the complement_map before checking if its complement already exists, which can lead to incorrectly matching a number with itself. To fix this, the number should only be added to the map after checking for its complement. Solution (B) does exactly this by moving the line that adds the current number to the map after the complement check, ensuring the logic works as intended without self-matching errors.
79 | The answer is (B).
80 |
81 | Question: The implementation below is producing incorrect results.
82 | Which solution below correctly identifies the bug and repairs it to achieve the desired goal?
83 | {question}
84 |
85 | {choices}
86 |
87 | Answer: Let's think step by step. """
88 |
89 | CODEREPAIR_PROMPT = dict(zeroshot=zeroshot,
90 | fewshot=fewshot,
91 | cot_zs=cot_zs,
92 | cot_fs=cot_fs)
--------------------------------------------------------------------------------
/src/codemmlu/prompts/_defect.py:
--------------------------------------------------------------------------------
1 | zeroshot = """The following are multiple choice questions (with answers) about programming problem.
2 |
3 | Question: Given a code snippet below, which behavior most likely to occur when execute it?
4 | {question}
5 |
6 | {choices}
7 |
8 | Answer: """
9 |
10 | fewshot = """The following are multiple choice questions (with answers) about programming problem.
11 |
12 | Question: Given a code snippet below, which behavior most likely to occur when execute it?
13 | ```python
14 | def chkPair(A, size, x):
15 | for i in range(0, size - 1):
16 | for j in range(i + 1, size):
17 | if (A[i] + A[j] == x):
18 | return 1
19 | return 0
20 |
21 | ```
22 |
23 | (A). The code contain no issue.
24 | (B). Memory Limit Exceeded
25 | (C). Internal error
26 | (D). Runtime Error
27 |
28 | Answer: The answer is (A).
29 |
30 | Question: Given a code snippet below, which behavior most likely to occur when execute it?
31 | {question}
32 |
33 | {choices}
34 |
35 | Answer: """
36 |
37 | cot_zs = zeroshot + "Let's think step by step. "
38 |
39 | cot_fs = """The following are multiple choice questions (with answers) about programming problem.
40 |
41 | Question: Given a code snippet below, which behavior most likely to occur when execute it?
42 | ```python
43 | def chkPair(A, size, x):
44 | for i in range(0, size - 1):
45 | for j in range(i + 1, size):
46 | if (A[i] + A[j] == x):
47 | return 1
48 | return 0
49 |
50 | ```
51 |
52 | (A). The code contain no issue.
53 | (B). Memory Limit Exceeded
54 | (C). Internal error
55 | (D). Runtime Error
56 |
57 | Answer: Let's think step by step. The code defines a function `chkPair` that checks for a pair of elements in an array A whose sum equals x. It uses two nested loops to iterate over all possible pairs and returns 1 if a valid pair is found, or 0 otherwise. The function has a time complexity of O(n^2) due to the nested loops, which could slow down performance for large inputs, but it doesn't involve excessive memory usage or problematic operations that would lead to errors like memory limit exceeded, runtime errors, or internal issues. Hence, the most likely outcome is that the code contains no issue.
58 | The answer is (A).
59 |
60 | Question: Given a code snippet below, which behavior most likely to occur when execute it?
61 | {question}
62 |
63 | {choices}
64 |
65 | Answer: Let's think step by step."""
66 |
67 | DEFECT_PROMPT = dict(zeroshot=zeroshot,
68 | fewshot=fewshot,
69 | cot_zs=cot_zs,
70 | cot_fs=cot_fs)
--------------------------------------------------------------------------------
/src/codemmlu/prompts/_fim.py:
--------------------------------------------------------------------------------
1 | zeroshot = """The following are multiple-choice questions (with answers) about a programming problem with incomplete solution.
2 |
3 | Problem statement: {problem_description}
4 |
5 | Incomplete Solution:
6 | {question}
7 |
8 | Question: The provided solution is missing a part, Which option below is the most likely to complete the solution and achieve the desired goal?
9 |
10 | {choices}
11 |
12 | Answer: """
13 |
14 | fewshot = """The following are multiple-choice questions (with answers) about a programming problem with incomplete solution.
15 |
16 | Problem statement: You are given an array of intervals, where intervals[i] = [starti, endi] and each starti is unique.
17 | The right interval for an interval i is an interval j such that startj >= endi and startj is minimized.
18 | Note that i may equal j. Return an array of right interval indices for each interval i.
19 | If no right interval exists for interval i, then put -1 at index i.
20 |
21 | Incomplete Solution:
22 | ```python
23 | def find_right_interval(intervals):
24 | n = len(intervals)
25 | res = [-1] * n
26 | for i in range(n):
27 | intervals[i].append(i)
28 |
29 | def binary_search(ele):
30 | left, right = 0, n-1
31 | ans = float('inf')
32 | while left <= right:
33 | mid = (left + right) // 2
34 | if intervals[mid][0] >= ele:
35 | ans = min(ans, mid)
36 | right = mid - 1
37 | else:
38 | left = mid + 1
39 | return ans
40 |
41 | intervals.sort()
42 | for i in intervals:
43 | _________________
44 |
45 | return res
46 | ```
47 |
48 | Question: The provided solution is missing a part, Which option below is the most likely to complete the solution and achieve the desired goal?
49 |
50 | (A) ```python
51 | val = binary_search(i[1])
52 | if val != float('inf'):
53 | res[i[2]] = intervals[val][2]
54 | ```
55 | (B) ```python
56 | if val != float('inf'):
57 | res[i[2]] = intervals[val][2]
58 | else:
59 | continue
60 | ```
61 | (C) ```python
62 | val = binary_search(i[1])
63 | if val != float('inf'): res[i[2] + 1] = intervals[val][2]
64 | ```
65 | (D) ```python
66 | if val != float('inf'):
67 | res[i[2]] = intervals[val][2]
68 | else:
69 | continue
70 | ```
71 |
72 | Answer: The answer is (A).
73 |
74 | Problem statement: {problem_description}
75 |
76 | Incomplete Solution:
77 | {question}
78 |
79 | Question: The provided solution is missing a part, Which option below is the most likely to complete the solution and achieve the desired goal?
80 |
81 | {choices}
82 |
83 | Answer: """
84 |
85 | cot_zs = """The following are multiple-choice questions (with answers) about a programming problem with incomplete solution.
86 |
87 | Problem statement: {problem_description}
88 |
89 | Incomplete Solution:
90 | {question}
91 |
92 | Question: The provided solution is missing a part, Which option below is the most likely to
93 | complete the solution and achieve the desired goal?
94 |
95 | {choices}
96 |
97 | Answer: Let's think step by step. """
98 |
99 | cot_fs = """The following are multiple-choice questions (with answers) about a programming problem
100 | with incomplete solution.
101 |
102 | Problem statement: You are given an array of intervals, where intervals[i] = [starti, endi] and each starti is unique.
103 | The right interval for an interval i is an interval j such that startj >= endi and startj is minimized.
104 | Note that i may equal j. Return an array of right interval indices for each interval i.
105 | If no right interval exists for interval i, then put -1 at index i.
106 |
107 | Incomplete Solution:
108 | ```python
109 | def find_right_interval(intervals):
110 | n = len(intervals)
111 | res = [-1] * n
112 | for i in range(n):
113 | intervals[i].append(i)
114 |
115 | def binary_search(ele):
116 | left, right = 0, n-1
117 | ans = float('inf')
118 | while left <= right:
119 | mid = (left + right) // 2
120 | if intervals[mid][0] >= ele:
121 | ans = min(ans, mid)
122 | right = mid - 1
123 | else:
124 | left = mid + 1
125 | return ans
126 |
127 | intervals.sort()
128 | for i in intervals:
129 | _________________
130 |
131 | return res
132 | ```
133 |
134 | Question: The provided solution is missing a part, Which option below is the most likely to
135 | complete the solution and achieve the desired goal?
136 |
137 | (A) ```python
138 | val = binary_search(i[1])
139 | if val != float('inf'):
140 | res[i[2]] = intervals[val][2]
141 | ```
142 | (B) ```python
143 | if val != float('inf'):
144 | res[i[2]] = intervals[val][2]
145 | else:
146 | continue
147 | ```
148 | (C) ```python
149 | val = binary_search(i[1])
150 | if val != float('inf'): res[i[2] + 1] = intervals[val][2]
151 | ```
152 | (D) ```python
153 | if val != float('inf'):
154 | res[i[2]] = intervals[val][2]
155 | else:
156 | continue
157 | ```
158 |
159 | Answer: Let's think step by step. The incomplete solution first sorts the intervals and then iterates over the sorted intervals. For each interval, it finds the right interval using a binary search.
160 | This option (A) finds the right interval index using the binary search and updates the result array accordingly.
161 | The option (B) is similar to (A), but it does not increment the index when finding the right interval index. This could lead to incorrect results.
162 | The option (C) increments the index when finding the right interval index. However, this is incorrect because the problem statement asks for the index of the right interval, not the offset from the original index.
163 | The option (D) uses the same index for both the original interval and the right interval, which could lead to incorrect results.
164 | The answer is (A).
165 |
166 | Problem statement: {problem_description}
167 |
168 | Incomplete Solution:
169 | {question}
170 |
171 | Question: The provided solution is missing a part, Which option below is the most likely to
172 | complete the solution and achieve the desired goal?
173 |
174 | {choices}
175 |
176 | Answer: Let's think step by step. """
177 |
178 | FIM_PROMPT = dict(zeroshot=zeroshot,
179 | fewshot=fewshot,
180 | cot_zs=cot_zs,
181 | cot_fs=cot_fs)
--------------------------------------------------------------------------------
/src/codemmlu/prompts/_general.py:
--------------------------------------------------------------------------------
1 | zeroshot = """The following are multiple choice questions (with answers) about software development.
2 |
3 | Question: {question}
4 | {choices}
5 |
6 | Answer: """
7 |
8 | fewshot = """The following are multiple choice questions (with answers) about software development.
9 |
10 | Question: If a sorted array of integers is guaranteed to not contain duplicate values,
11 | in order to search a for a specific value which of the following algorithms is the most efficient for this task?
12 |
13 | (A) Bubble Sort (B) Linear Search (C) Insertion Sort (D) Binary Search
14 |
15 | Answer: The answer is (D).
16 |
17 | Question: {question}
18 | {choices}
19 |
20 | Answer: """
21 |
22 | cot_zs = """The following are multiple choice questions (with answers) about software development.
23 |
24 | Question: {question}
25 | {choices}
26 |
27 | Answer: Let's think step by step. """
28 |
29 | cot_fs = '''The following are multiple choice questions (with answers) about software development.
30 |
31 | Question: If a sorted array of integers is guaranteed to not contain duplicate values, in order to search a for a specific value which of the following algorithms is the most efficient for this task?
32 |
33 | (A) Bubble Sort (B) Linear Search (C) Insertion Sort (D) Binary Search
34 |
35 | Answer: Let's think step by step. Binary Search is a divide-and-conquer algorithm that works by repeatedly dividing the search interval in half and searching for the value in the appropriate half. Since the array is already sorted and does not contain any duplicate value, this algorithm is optimal to find the desired value. The answer is (D).
36 |
37 | Question: {question}
38 | {choices}
39 |
40 | Answer: Let's think step by step. '''
41 |
42 | GENERAL_PROMPT = dict(zeroshot=zeroshot,
43 | fewshot=fewshot,
44 | cot_zs=cot_zs,
45 | cot_fs=cot_fs)
--------------------------------------------------------------------------------
/src/codemmlu/task_utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import json
4 | import glob
5 | from string import ascii_uppercase
6 | from typing import Optional
7 | from datasets import Dataset, load_dataset
8 |
9 | from codemmlu.prompts import GENERAL_PROMPT, CODECOMP_PROMPT, FIM_PROMPT, CODEREPAIR_PROMPT, DEFECT_PROMPT
10 |
11 |
12 | SEMANTIC_TASK = ["software_principles", "dbms_sql", "others"]
13 |
14 | SYNTACTIC_TASK = ["programming_syntax", "api_frameworks"]
15 |
16 | REALWORLD_TASK = ["code_completion", "fill_in_the_middle", "code_repair", "defect_detection"]
17 |
18 | ALL_TASK = SEMANTIC_TASK + SYNTACTIC_TASK + REALWORLD_TASK
19 |
20 |
21 | def get_prompt(subset: str, prompt_mode: str) -> str:
22 | """Get prompt for a given task."""
23 | assert prompt_mode in ["zeroshot", "fewshot", "cot_zs", "cot_fs"]
24 |
25 | if subset in SEMANTIC_TASK + SYNTACTIC_TASK:
26 | return GENERAL_PROMPT[prompt_mode]
27 | else:
28 | if subset == "code_completion":
29 | return CODECOMP_PROMPT[prompt_mode]
30 | elif subset == "fill_in_the_middle":
31 | return FIM_PROMPT[prompt_mode]
32 | elif subset == "code_repair":
33 | return CODEREPAIR_PROMPT[prompt_mode]
34 | elif subset == "defect_detection":
35 | return DEFECT_PROMPT[prompt_mode]
36 | else:
37 | raise ValueError(f"Invalid subset: {subset}")
38 |
39 |
40 | class CodeMMLU:
41 | """CodeMMLU benchmark loader."""
42 | TASK_NAME = "codemmlu"
43 | DATASET_NAME_OR_PATH = "Fsoft-AIC/codemmlu"
44 |
45 | def __init__(self,
46 | split: str,
47 | subset: str,
48 | prompt_mode: str = "zeroshot",
49 | instruction_prefix: Optional[str] = "",
50 | assistant_prefix: Optional[str] = "") -> None:
51 |
52 | self.stop_words = ['\n\nQ:', '\n\nQuestion:', '\n\n###', '\n#', "\n<|/", "\n```"]
53 | self.instruction_prefix = instruction_prefix
54 | self.assistant_prefix = assistant_prefix
55 | self.split = split
56 | self.subset = subset
57 | self.prompt_mode = prompt_mode
58 |
59 | self.dataset = load_dataset(self.DATASET_NAME_OR_PATH, subset,
60 | split=split, use_auth_token=True)
61 |
62 | def __len__(self):
63 | return len(self.dataset)
64 |
65 | def get_dataset(self) -> Dataset:
66 | return self.dataset
67 |
68 | def prepare_dataset(self) -> Dataset:
69 | """Preprocess CodeMMLU question.
70 |
71 | - Default CodeMMLU prompt is zeroshot. All support prompt modes are:
72 | - zeroshot
73 | - fewshot
74 | - cot_zs (Chain-of-Thought zeroshot)
75 | - cot_fs (Chain-of-Thought fewshot)
76 | """
77 |
78 | TEMPLATE = get_prompt(self.subset, self.prompt_mode)
79 |
80 | def _preprocess(example):
81 | model_inputs = dict(task_id=[], question=[])
82 |
83 | # for idx in range(len(examples[key_column])):
84 | # question = examples[key_column][idx]
85 | task_id = example.pop('task_id')
86 | example['choices'] = "\n".join([f"({ascii_uppercase[idx]}) {choice}" for idx, choice in enumerate(example['choices'])])
87 |
88 | # MODEL INPUTS HERE
89 | question = TEMPLATE.format(**example)
90 | question = self.instruction_prefix + question + self.assistant_prefix
91 | model_inputs['question'] = question
92 | model_inputs['task_id'] = task_id
93 |
94 | return model_inputs
95 |
96 | preprocessed_ds = self.dataset.map(_preprocess,
97 | batched=False,
98 | remove_columns=self.dataset.column_names)
99 |
100 | print(f"Preprocessed dataset: {preprocessed_ds}")
101 | # Visualize 3 samples
102 | print("Preprocessed prompts:")
103 | for i in range(3):
104 | print(preprocessed_ds['question'][i])
105 | return preprocessed_ds
106 |
107 |
108 | @staticmethod
109 | def _stop_at_stop_token(decoded_string, stop_tokens):
110 | """
111 | Produces the prefix of decoded_string that ends at the first occurrence of
112 | a stop_token.
113 | WARNING: the decoded_string *must not* include the prompt,
114 | which may have stop tokens itself.
115 | """
116 | min_stop_index = len(decoded_string)
117 | for stop_token in stop_tokens:
118 | stop_index = decoded_string.find(stop_token)
119 | if stop_index != -1 and stop_index < min_stop_index:
120 | min_stop_index = stop_index
121 | return decoded_string[:min_stop_index]
122 |
123 | def process_response(self, example):
124 | answer = self._stop_at_stop_token(example, self.stop_words)
125 |
126 | # Substitute special characters with empty string
127 | answer = re.sub(r'[^A-Za-z0-9 \n]', "", answer)
128 | new_answer = []
129 | for item in answer.splitlines():
130 | for subitem in item.split(" "):
131 | if len(subitem) != 1:
132 | new_answer.append(subitem.lower())
133 | else:
134 | new_answer.append(subitem)
135 |
136 | new_answer = ' '.join(new_answer)
137 | new_answer = re.sub(r'\s+', ' ', new_answer).strip()
138 |
139 | return new_answer
140 |
141 | def parse_answer(self, example):
142 | """Answer extract function.
143 |
144 | Args:
145 | example (str): The example to extract the answer from
146 | Returns:
147 | str: The extracted answer
148 | """
149 | extract = re.search(r"answer is (\(*[A-E][\).]*)", example, flags=re.IGNORECASE)
150 | if extract:
151 | return extract.group(1).replace("(", "").replace(")", "").replace(".", "").strip()
152 |
153 |
154 | extract = re.search(r"(\(*[A-E][\).]*) is correct", example, flags=re.IGNORECASE)
155 | if extract:
156 | return extract.group(1).replace("(", "").replace(")", "").replace(".", "").strip()
157 |
158 |
159 | match = re.findall(r"(A|B|C|D|E)", example)
160 |
161 | if match:
162 | # if len(match) > 1:
163 | # return None
164 | return list(match)[0] # Take the first one
165 | return None
166 |
--------------------------------------------------------------------------------