├── .gitignore
├── CONTRIBUTING.md
├── DATASET_CARD.md
├── LICENSE
├── README.md
├── compute_eval
    ├── __init__.py
    ├── data.py
    ├── evaluation.py
    ├── execution.py
    ├── generate_completions.py
    ├── main.py
    ├── models
    │   ├── claude.py
    │   ├── model_interface.py
    │   ├── nim_model.py
    │   └── openAI_model.py
    └── prompts.py
├── data
    ├── LICENSE
    ├── cccl_problems_033125.jsonl
    ├── combined_problems_033125.jsonl
    ├── cuda_problems_033125.jsonl
    ├── example_test.jsonl
    └── example_test_generation.jsonl
├── example_config_evalcorrectness.yaml
├── example_config_gen_samples.yaml
├── poetry.lock
└── pyproject.toml


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # Distribution / packaging
 7 | .Python
 8 | build/
 9 | develop-eggs/
10 | dist/
11 | downloads/
12 | eggs/
13 | .eggs/
14 | lib/
15 | lib64/
16 | parts/
17 | sdist/
18 | var/
19 | wheels/
20 | share/python-wheels/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | MANIFEST
25 | 
26 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
27 | __pypackages__/
28 | 
29 | # Celery stuff
30 | celerybeat-schedule
31 | celerybeat.pid
32 | 
33 | # Environments
34 | .env
35 | .venv
36 | env/
37 | 
38 | 
39 | # Ignore any generated sample files
40 | **/*samples*.jsonl
41 | **/*sample*.jsonl
42 | 
43 | # Ignore the generated results file
44 | **/*results.jsonl


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing
 2 | 
 3 | ## Development
 4 | 
 5 | To develop this library futher, it is recommended that you use Poetry for dependency management, virtualization and testing. You can read about how to install Poetry [here](https://python-poetry.org/docs/). One way for MacOS / Linux systems is
 6 | 
 7 | ```bash
 8 | pip install pipx
 9 | pipx install poetry
10 | pipx ensurepath # simply ensures that ~/.local/bin is added to $PATH
11 | ```
12 | 
13 | We also recommend creating the virtualenvs in the current directory itself
14 | 
15 | ```bash
16 | poetry config virtualenvs.in-project true
17 | ```
18 | 
19 | To install the dependencies and the project
20 | 
21 | ```bash
22 | poetry shell # starts a new shell inside the virtual environment
23 | poetry install
24 | ```
25 | 
26 | `poetry install` by default installs the package in editable mode.
27 | 
28 | Create a .env file in the `compute-eval` directory.
29 | 
30 | ```env
31 | NEMO_API_KEY="<PUT-YOUR-KEY-HERE>"
32 | ```
33 | 
34 | or
35 | 
36 | ```
37 | API_KEY="<PUT-YOUR-KEY-HERE>"
38 | ```
39 | 
40 | if using a custom model.
41 | 
42 | ### Linting
43 | 
44 | You will need to install the Black Python formatter and the Isort Formatter and lint on save. To do this in VSCode is simple, get [Black Python Formatter](https://marketplace.visualstudio.com/items?itemName=ms-python.black-formatter) and [Isort Formatter](https://marketplace.visualstudio.com/items?itemName=ms-python.isort) from the Marketplace
45 | and then add these lines to either your workspace settings.json or your global settings.json
46 | 
47 | ```json
48 | "[python]": {
49 |     "editor.defaultFormatter": "ms-python.black-formatter",
50 |     "editor.formatOnSave": true,
51 |     "editor.codeActionsOnSave": {
52 |         "source.organizeImports": "always"
53 |     },
54 | },
55 | "isort.args":["--profile", "black"],
56 | ```
57 | 
58 | Everytime you save the files, the linter will automatically lint for you. Depending on your workflow, you might want to have it check and report and then ask for permission to format the files.
59 | 
60 | ## Sharing your contributions
61 | 
62 | For any additonal contributions that are made, please include a DCO in your commit message: https://wiki.linuxfoundation.org/dco
63 | 


--------------------------------------------------------------------------------
/DATASET_CARD.md:
--------------------------------------------------------------------------------
 1 | # Dataset Card for ComputeEval
 2 | 
 3 | This dataset is designed for **CUDA code generation and evaluation** tasks, where each data entry provides a self-contained CUDA programming challenge. The data highlights various aspects of CUDA programming, such as kernel launches, thread-block manipulation, shared memory usage, CCCL: Thrust/CUB, and more.
 4 | 
 5 | Homepage: [Github](https://github.com/NVIDIA/compute-eval)
 6 | 
 7 | ## Format
 8 | 
 9 | JSON Lines (`.jsonl`), one task per line.
10 | 
11 | ## Data Fields
12 | 
13 | Each task entry includes:
14 | 
15 | - `task_id`: Uniquely identifies the task, e.g., `"CUDA/0"`.
16 | - `prompt`: A natural language description or instructions on what CUDA code to write.
17 | - `cc_flags`, `ld_flags`: Suggests compiler/linker arguments that may be used during compilation.
18 | - `declaration`: Preliminary code (e.g., includes, macros, or device stubs) required before the solution kernel.
19 | - `test`: A C++ snippet (often containing a `main()` function) that checks correctness at runtime.
20 | - `example_test`: Ancillary snippet or annotation (may be empty or contain additional tests).
21 | - `cuda_toolkit`: String indicating the CUDA version or requirements, e.g., `"12.0"`.
22 | - `solution`: An example or reference kernel that solves the prompt.
23 | 
24 | ## License
25 | 
26 | SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
27 | SPDX-License-Identifier: CC-BY-4.0
28 | 
29 | This work is licensed under a Creative Commons Attribution 4.0 International License. https://creativecommons.org/licenses/by/4.0/
30 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.SPDX-License-Identifier: Apache-2.0
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 | http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | 
15 | NOTE: 
16 | All the data associated with this software is Licensed under CC BY 4.0 
17 | 
18 | SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
19 | SPDX-License-Identifier: CC-BY-4.0
20 | 
21 | This work is licensed under a Creative Commons Attribution 4.0 International License. https://creativecommons.org/licenses/by/4.0/


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # compute-eval
  2 | 
  3 | ComputeEval: Evaluating Large Language Models for CUDA Code Generation
  4 | 
  5 | ComputeEval is a framework designed to generate and evaluate CUDA code from Large Language Models.
  6 | It features:
  7 | 
  8 | - A set of handcrafted CUDA programming challenges ("problem set") designed to evaluate an LLM's capability at writing reliable CUDA code
  9 | - Utilities for generating multiple solutions to each challenge ("samples")
 10 | - Utilities for functional correctness of generated CUDA code
 11 | 
 12 | ComputeEval is currently in Alpha. We plan to refine the evaluation framework
 13 | and make frequent updates to the dataset with additional problems spanning all
 14 | aspects of CUDA development.
 15 | 
 16 | ## Setup
 17 | 
 18 | ### Prerequisites
 19 | 
 20 | - Python 3.10+ or above
 21 | - NVIDIA GPU with CUDA Toolkit 12 or greater (for evaluation)
 22 | 
 23 | ### Installation
 24 | 
 25 | Install the package:
 26 | 
 27 | ```
 28 | # pip
 29 | pip install .
 30 | 
 31 | # Poetry
 32 | poetry install
 33 | ```
 34 | 
 35 | Note: If you use Poetry, version 2.0 or later is recommended.
 36 | 
 37 | ### API Keys
 38 | 
 39 | To query an LLM, you must first obtain an API key from the respective service.
 40 | 
 41 | #### NVIDIA NEMO (default)
 42 | 
 43 | To use ComputeEval with NVIDIA-hosted models, you need a key from
 44 | [build.nvidia.com](https://build.nvidia.com).
 45 | 
 46 | 1. Go to [build.nvidia.com](https://build.nvidia.com)
 47 | 1. Sign in with your account
 48 | 1. Verify that you have sufficient credits to call hosted models
 49 | 1. Navigate to the desired model and click on it
 50 | 1. Click on `Get API Key`
 51 | 1. Copy the generated API key
 52 | 1. Export it as an environment variable:
 53 | 
 54 | ```bash
 55 | export NEMO_API_KEY="<your-nvidia-key>"
 56 | ```
 57 | 
 58 | #### OpenAI
 59 | 
 60 | Follow the instructions in the [OpenAI docs](https://openai.com/index/openai-api),
 61 | then:
 62 | 
 63 | ```bash
 64 | export OPENAI_API_KEY="<your-openai-key>"
 65 | ```
 66 | 
 67 | #### Anthorpic (Claude)
 68 | 
 69 | Follow instruction on [Anthropic docs](https://www.anthropic.com/api), then:
 70 | 
 71 | ```bash
 72 | export ANTHROPIC_API_KEY="<your-anthropic-key>"
 73 | ```
 74 | 
 75 | ## Usage
 76 | 
 77 | **Note:** This repository executes machine-generated CUDA code.
 78 | While it's unlikely that the code is malicious, it could still pose potential risks.
 79 | Therefore, all code execution requires the `--allow-execution` flag.
 80 | We strongly recommend using a sandbox environment (e.g., a Docker container or virtual machine) when running generated code to minimize security risks.
 81 | 
 82 | ComputeEval can be configured using a YAML file that defines the parameters to the program.
 83 | For example `example_config_gen_samples.yaml`:
 84 | 
 85 | ```yaml
 86 | problem_file: data/cuda_problems_121924.jsonl # Input problems
 87 | sample_file: data/samples.jsonl # Generated samples
 88 | 
 89 | model: llama-3.1-nemotron-70b-instruct # Model to use
 90 | num_samples_per_problem: 3 # Samples to generate per problem
 91 | ```
 92 | 
 93 | Note: Please set NEMO_API_KEY when using a preset NIM model.
 94 | 
 95 | - Read the problem_file: `data/cuda_problems_121924.jsonl`
 96 | - Generate 3 completions per problem using the `llama-3.1-nemotron-70b-instruct` model
 97 | - Write all completions to the output samples file: `data/samples.jsonl`
 98 | 
 99 | To use a custom model:
100 | 
101 | ```yaml
102 | problem_file: data/problems.jsonl
103 | sample_file: data/samples.jsonl
104 | 
105 | num_samples_per_problem: 3
106 | 
107 | custom_model:
108 |   api_endpoint: https://integrate.api.nvidia.com/v1
109 |   model_id: nvidia/llama-3.1-nemotron-70b-instruct
110 | ```
111 | 
112 | Note: Please set OPENAI_API_KEY when using a custom model.
113 | 
114 | ### Models Available
115 | 
116 | The models available for completions are listed below:
117 | 
118 | - "mixtral-8x22b" => mistralai/mixtral-8x22b-instruct-v0.1
119 | - "gemma-2b" => google/gemma-2b
120 | - "llama3.1-8b" => meta/llama-3.1-8b-instruct
121 | - "llama3.1-70b" => meta/llama-3.1-70b-instruct
122 | - "llama3.1-405b" => meta/llama-3.1-405b-instruct
123 | - "llama3.2-1b" => meta/llama-3.2-1b-instruct
124 | - "llama3.2-3b" => meta/llama-3.2-3b-instruct
125 | - "llama3.2-90b" => meta/llama-3.2-90b-vision-instruct
126 | - "llama3.1-nemotron-70b" => nvidia/llama-3.1-nemotron-70b-instruct
127 | - "nemotron-mini-4b" => nvidia/nemotron-mini-4b-instruct
128 | - "starcoder2-7b" => bigcode/starcoder2-7b
129 | - "mistral-nemo-12b" => nv-mistralai/mistral-nemo-12b-instruct
130 | - "openai-" => nv-mistralai/mistral-nemo-12b-instruct
131 | 
132 | By default, NVIDIA hosted `llama-3.1-70b-instruct` is used.
133 | 
134 | ### Generate samples and evaluate
135 | 
136 | Generate samples based on the config file:
137 | 
138 | ```bash
139 | compute_eval generate_samples -config_file=example_config_gen_samples.yaml
140 | ```
141 | 
142 | Now you have a `data/samples.jsonl`.
143 | 
144 | To launch an evaluation on the generated samples create a config file
145 | where the content of `example_config_evalcorrectness.yaml`:
146 | 
147 | ```
148 | sample_file: data/samples.jsonl
149 | problem_file: data/cuda_problems_121924.jsonl
150 | 
151 | k: [1, 3]
152 | ```
153 | 
154 | ```bash
155 | compute_eval evaluate_functional_correctness -config_file=example_config_evalcorrectness.yaml
156 | ```
157 | 
158 | Note: the program will ask you to allow code execution by adding the `--allow-execution` flag.
159 | 
160 | - This will read the problems and the sample file
161 | - It will run each of the samples through a functional correctness testing suite
162 | - It will output a `pass@k` dictionary with 2 `pass@k` values for k = 1 nand k = 3
163 | 
164 | Caveats:
165 | 
166 | - The `k` argument for `evaluate_functional_correctness` should be a comma-separated e.g., `[1,10]`.
167 | - Note that if you have a list of `k` that you want used in evaluation, then `max(k) <= num_samples_per_problem` else that `k` value will not show up in the pass@k dict generated.
168 | 
169 | ## Command docs
170 | 
171 | ### `generate_samples`
172 | 
173 | This command generates samples for given problems using a specified model and writes them to the specified sample_file.
174 | 
175 | #### Arguments
176 | 
177 | - `problem_file` (str): The path to the file containing the problems to generate samples for.
178 | - `sample_file` (str, optional): The path to the file where the generated samples will be written. (default: `generated_samples.jsonl`).
179 | - `num_samples_per_problem` (int, optional): The number of samples to generate per problem (default: 100).
180 | - `n_workers` (int, optional): The number of worker threads to use (default: 20).
181 | - `system_prompt` (str, optional): The system prompt to use (default: a predefined CUDA programming prompt).
182 | - `max_tokens` (int, optional): The maximum number of tokens for the model to generate (default: 1024).
183 | - `print_completions` (bool, optional): Flag to specify if you want the completions printed to stdout. (default: False)
184 | - `model` (str, optional): The model to use for generating samples (default: "llama3.1-70b").
185 | - `model_type` (str, optional): The type of model (default: "instruct").
186 | - `custom_model`(dict, optional): api_endpoint (base url) and model_id (model name) for any model that uses the OpenAI API. Please use the OPENAI_API_KEY to set your credentials when using a custom model.
187 | - `params` (dict, optional): parameters for the chat completions request - temperature, top_p, max_tokens.
188 | 
189 | ### `evaluate_functional_correctness`
190 | 
191 | This command evaluates the functional correctness of generated samples and outputs a `pass@k` dictionary
192 | 
193 | #### Arguments
194 | 
195 | - `sample_file` (str): The path to the file containing the samples to be evaluated.
196 | - `problem_file` (str): The path to the file containing the problems to evaluate against.
197 | - `k` (str, optional): The list of values for k, as a comma-separated string (default: "1,10,100").
198 | - `n_workers` (int, optional): The number of worker threads to use (default: 4).
199 | - `timeout` (float, optional): The timeout for each evaluation in seconds (default: 3.0).
200 | - `save_completions_dir` (str, optional): Directory path where the samples will be stored as .cu files (default: "" i.e not saved)
201 | 
202 | ## Dataset
203 | 
204 | For more information about the dataset see `DATASET_CARD.md`.
205 | 
206 | ## Contributing
207 | 
208 | See `contributing.md` for development instructions.
209 | 


--------------------------------------------------------------------------------
/compute_eval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NVIDIA/compute-eval/4f5518a121350637c12e5238fd183fd47c5de8d4/compute_eval/__init__.py


--------------------------------------------------------------------------------
/compute_eval/data.py:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2 | # SPDX-License-Identifier: Apache-2.0
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | # http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | # Portions of this file from human-eval (https://github.com/openai/human-eval/).
 17 | #
 18 | # The MIT License
 19 | #
 20 | # Copyright (c) OpenAI (https://openai.com)
 21 | #
 22 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 23 | # of this software and associated documentation files (the "Software"), to deal
 24 | # in the Software without restriction, including without limitation the rights
 25 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 26 | # copies of the Software, and to permit persons to whom the Software is
 27 | # furnished to do so, subject to the following conditions:
 28 | #
 29 | # The above copyright notice and this permission notice shall be included in
 30 | # all copies or substantial portions of the Software.
 31 | #
 32 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 33 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 34 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 35 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 36 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 37 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 38 | # THE SOFTWARE.
 39 | 
 40 | import gzip
 41 | import json
 42 | import os
 43 | from typing import Dict, Iterable
 44 | 
 45 | ROOT = os.path.dirname(os.path.abspath(__file__))
 46 | 
 47 | 
 48 | def read_problems(evalset_file: str) -> Dict[str, Dict]:
 49 |     return {task["task_id"]: task for task in stream_jsonl(evalset_file)}
 50 | 
 51 | 
 52 | def stream_jsonl(filename: str) -> Iterable[Dict]:
 53 |     """
 54 |     Parses each jsonl line and yields it as a dictionary
 55 |     """
 56 |     if filename.endswith(".gz"):
 57 |         with open(filename, "rb") as gzfp:
 58 |             with gzip.open(gzfp, "rt") as fp:
 59 |                 for line in fp:
 60 |                     if any(not x.isspace() for x in line):
 61 |                         yield json.loads(line, strict=False)
 62 |     else:
 63 |         with open(filename, "r") as fp:
 64 |             for line in fp:
 65 |                 if any(not x.isspace() for x in line):
 66 |                     yield json.loads(line, strict=False)
 67 | 
 68 | 
 69 | def write_completions_to_dir(dir_path: str, data: Iterable[Dict]):
 70 |     """
 71 |     Writes the prompt, pass / fail and the completion of each provided sample
 72 |     to a specific directory.
 73 | 
 74 |     REQUIRES: Each dict in the data iterable must have the keys "prompt",
 75 |     "completion", "completion_id" and "passed".
 76 |     """
 77 | 
 78 |     full_dir_path = os.path.abspath(dir_path)
 79 |     for sample in data:
 80 |         passed_string = "// " + ("PASSED" if sample["passed"] else "FAILED")
 81 |         prompt = sample["prompt"]
 82 |         file_path = os.path.join(
 83 |             full_dir_path,
 84 |             f"{sample['task_id'].replace('/', '_')}__{sample['completion_id']}.cu",
 85 |         )
 86 |         with open(file_path, "w+") as outfile:
 87 |             outfile.write(passed_string + "\n")
 88 |             outfile.write("/*\n" + prompt + "\n*/\n")
 89 |             outfile.write(sample["compilable_code"])
 90 | 
 91 | 
 92 | def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
 93 |     """
 94 |     Writes an iterable of dictionaries to jsonl
 95 |     """
 96 |     if append:
 97 |         mode = "ab"
 98 |     else:
 99 |         mode = "wb"
100 |     filename = os.path.expanduser(filename)
101 |     if filename.endswith(".gz"):
102 |         with open(filename, mode) as fp:
103 |             with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp:
104 |                 for x in data:
105 |                     if x:
106 |                         gzfp.write((json.dumps(x) + "\n").encode("utf-8"))
107 |     else:
108 |         with open(filename, mode) as fp:
109 |             for x in data:
110 |                 if x:
111 |                     fp.write((json.dumps(x) + "\n").encode("utf-8"))
112 | 


--------------------------------------------------------------------------------
/compute_eval/evaluation.py:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2 | # SPDX-License-Identifier: Apache-2.0
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | # http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | # Portions of this file from human-eval (https://github.com/openai/human-eval/).
 17 | #
 18 | # The MIT License
 19 | #
 20 | # Copyright (c) OpenAI (https://openai.com)
 21 | #
 22 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 23 | # of this software and associated documentation files (the "Software"), to deal
 24 | # in the Software without restriction, including without limitation the rights
 25 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 26 | # copies of the Software, and to permit persons to whom the Software is
 27 | # furnished to do so, subject to the following conditions:
 28 | #
 29 | # The above copyright notice and this permission notice shall be included in
 30 | # all copies or substantial portions of the Software.
 31 | #
 32 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 33 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 34 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 35 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 36 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 37 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 38 | # THE SOFTWARE.
 39 | 
 40 | import itertools
 41 | import os
 42 | import sys
 43 | from collections import Counter, defaultdict
 44 | from concurrent.futures import ThreadPoolExecutor, as_completed
 45 | from typing import Dict, List, Tuple, Union
 46 | 
 47 | import numpy as np
 48 | import tqdm
 49 | from tabulate import tabulate
 50 | 
 51 | from compute_eval.data import (
 52 |     read_problems,
 53 |     stream_jsonl,
 54 |     write_completions_to_dir,
 55 |     write_jsonl,
 56 | )
 57 | from compute_eval.execution import check_correctness
 58 | 
 59 | WARNING_MSG = """===================
 60 |      WARNING
 61 | ===================
 62 | 
 63 | Evaluation of correctness or performance will execute untrusted model-generated
 64 | code.
 65 | 
 66 | Although it is highly unlikely that model-generated code will do something
 67 | overtly malicious in response to this test suite, model-generated code may act
 68 | destructively due to a lack of model capability or alignment.
 69 | 
 70 | Users are strongly encouraged to sandbox this evaluation suite so that it does
 71 | not perform destructive actions on their host or network.
 72 | 
 73 | In order to execute this code you must explicitly pass the --allow-execution flag.
 74 | """
 75 | 
 76 | 
 77 | def estimate_pass_at_k(
 78 |     num_samples: Union[int, List[int], np.ndarray],
 79 |     num_correct: Union[List[int], np.ndarray],
 80 |     k: int,
 81 | ) -> np.ndarray:
 82 |     """
 83 |     Estimates pass@k of each problem and returns them in an array.
 84 |     """
 85 | 
 86 |     def estimator(n: int, c: int, k: int) -> float:
 87 |         """
 88 |         Calculates 1 - comb(n - c, k) / comb(n, k).
 89 |         """
 90 |         if n - c < k:
 91 |             return 1.0
 92 |         return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
 93 | 
 94 |     if isinstance(num_samples, int):
 95 |         num_samples_it = itertools.repeat(num_samples, len(num_correct))
 96 |     else:
 97 |         assert len(num_samples) == len(num_correct)
 98 |         num_samples_it = iter(num_samples)
 99 | 
100 |     return np.array(
101 |         [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]
102 |     )
103 | 
104 | 
105 | def get_cli_args(problem: Dict[str, Dict]):
106 |     cc_flags = problem.get("cc_flags")
107 |     ld_flags = problem.get("ld_flags")
108 | 
109 |     cli_args = ""
110 |     if cc_flags is not None:
111 |         cli_args += " " + cc_flags
112 |     if ld_flags is not None:
113 |         cli_args += " " + ld_flags
114 | 
115 |     return cli_args
116 | 
117 | 
118 | def evaluate_functional_correctness(
119 |     sample_file: str,
120 |     problem_file: str,
121 |     allow_execution: bool = False,
122 |     k: Tuple[int] = (1, 10, 100),
123 |     n_workers: int = 4,
124 |     timeout: float = 60.0,
125 |     save_completions_dir: str = "",
126 | ):
127 |     """
128 |     Evaluates the functional correctness of generated samples, and writes
129 |     results to f"{sample_file}_correctness_results.jsonl".
130 |     """
131 | 
132 |     if not allow_execution:
133 |         print(WARNING_MSG)
134 |         sys.exit(1)
135 | 
136 |     # Check if only one k value was passed in (as an integer)
137 |     if isinstance(k, int):
138 |         k_vals = [k]
139 |     else:
140 |         # Multiple k values (tuple) is converted to a list of int
141 |         k_vals = list(k)
142 | 
143 |     # If the user wants to save completions, check that the directory exists
144 |     if save_completions_dir != "":
145 |         assert os.path.exists(
146 |             os.path.abspath(save_completions_dir)
147 |         ), "You must have created the directory where the temporary completions will go"
148 | 
149 |     problems = read_problems(problem_file)
150 | 
151 |     # Check the generated samples against test suites.
152 |     with ThreadPoolExecutor(max_workers=n_workers) as executor:
153 |         futures = []
154 |         completion_id = Counter()
155 |         n_samples = 0
156 |         results = defaultdict(list)
157 | 
158 |         print("Reading samples...")
159 |         for sample in tqdm.tqdm(stream_jsonl(sample_file)):
160 |             task_id = sample["task_id"]
161 |             compilable_code = sample["compilable_code"]
162 | 
163 |             problem = problems[task_id]
164 | 
165 |             cli_args = get_cli_args(problem)
166 | 
167 |             cuda_version = problem.get("cuda_version")
168 | 
169 |             args = (
170 |                 problem,
171 |                 compilable_code,
172 |                 timeout,
173 |                 completion_id[task_id],
174 |                 cli_args,
175 |                 cuda_version,
176 |             )
177 |             future = executor.submit(check_correctness, *args)
178 |             futures.append(future)
179 |             completion_id[task_id] += 1
180 |             n_samples += 1
181 | 
182 |         # make sure that solved all the problems (at least once)
183 |         assert len(completion_id) == len(problems), "Some problems are not attempted."
184 | 
185 |         for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
186 |             result = future.result()
187 |             results[result["task_id"]].append((result["completion_id"], result))
188 | 
189 |     # Calculate pass@k.
190 |     total, correct = [], []
191 |     for result in results.values():
192 |         result.sort()
193 |         passed = [r[1]["passed"] for r in result if not r[1]["skipped"]]
194 | 
195 |         # If all test cases are skipped, we skip the problem.
196 |         if len(passed) == 0:
197 |             print(
198 |                 f"Skipping problem {result[0][1]['task_id']}, it would be ignored while calculating pass@k. Possible reasons maybe incompatible GPU architecture."
199 |             )
200 |             continue
201 |         total.append(len(passed))
202 |         correct.append(sum(passed))
203 |     total = np.array(total)
204 |     correct = np.array(correct)
205 | 
206 |     pass_at_k = {
207 |         f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
208 |         for k in k_vals
209 |         if (total >= k).all()
210 |     }
211 | 
212 |     # Finally, save the results in one file:
213 |     sample_results = []
214 |     for sample in stream_jsonl(sample_file):
215 |         task_id = sample["task_id"]
216 |         result = results[task_id].pop(0)
217 |         sample["result"] = result[1]["result"]
218 |         sample["skipped"] = result[1]["skipped"]
219 |         sample["passed"] = result[1]["passed"]
220 |         sample["completion_id"] = result[1]["completion_id"]
221 |         sample_results.append(sample)
222 | 
223 |     out_file = (
224 |         os.path.splitext(os.path.basename(sample_file))[0]
225 |         + "_correctness_results.jsonl"
226 |     )
227 |     print(f"Writing results to {out_file}...")
228 |     write_jsonl(out_file, sample_results)
229 | 
230 |     if save_completions_dir != "":
231 |         print(f"Saving the completions to {os.path.abspath(save_completions_dir)}...")
232 |         write_completions_to_dir(save_completions_dir, sample_results)
233 |     print(pass_at_k)
234 | 


--------------------------------------------------------------------------------
/compute_eval/execution.py:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2 | # SPDX-License-Identifier: Apache-2.0
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | # http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | # Portions of this file from human-eval (https://github.com/openai/human-eval/).
 17 | #
 18 | # The MIT License
 19 | #
 20 | # Copyright (c) OpenAI (https://openai.com)
 21 | #
 22 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 23 | # of this software and associated documentation files (the "Software"), to deal
 24 | # in the Software without restriction, including without limitation the rights
 25 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 26 | # copies of the Software, and to permit persons to whom the Software is
 27 | # furnished to do so, subject to the following conditions:
 28 | #
 29 | # The above copyright notice and this permission notice shall be included in
 30 | # all copies or substantial portions of the Software.
 31 | #
 32 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 33 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 34 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 35 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 36 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 37 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 38 | # THE SOFTWARE.
 39 | 
 40 | import contextlib
 41 | import faulthandler
 42 | import io
 43 | import multiprocessing
 44 | import os
 45 | import platform
 46 | import re
 47 | import signal
 48 | import subprocess
 49 | import tempfile
 50 | import threading
 51 | import time
 52 | from typing import Callable, Dict, Iterable, Optional, Tuple
 53 | 
 54 | import psutil
 55 | 
 56 | 
 57 | def compile_cuda_file(
 58 |     cuda_file_path: str,
 59 |     output_binary_path: str,
 60 |     timeout: int,
 61 |     cli_args: Optional[str] = None,
 62 | ) -> Tuple[str, str]:
 63 |     """Compiles the given CUDA file into the given output binary location.
 64 | 
 65 |     Args:
 66 |         cuda_file_path (str): the path to the CUDA file to be compiled
 67 |         output_binary_path (str): the path of the binary file that will be generated
 68 |         timeout (int): max time in seconds to let the compilation take place for.
 69 | 
 70 |     Returns:
 71 |         Tuple[str, str]: 0th index is passed / failed, 1st index is more information.
 72 |     """
 73 | 
 74 |     try:
 75 |         compile_args = "-o" if not cli_args else cli_args + " -o "
 76 |         proc = subprocess.run(
 77 |             f"nvcc {cuda_file_path} {compile_args} {output_binary_path}",
 78 |             shell=True,
 79 |             stderr=subprocess.PIPE,
 80 |             timeout=timeout,
 81 |         )
 82 | 
 83 |         if proc.returncode == 0:
 84 |             return ("passed", output_binary_path)
 85 |         else:
 86 |             # Return code 127 means command being executed was not found
 87 |             if proc.returncode == 127:
 88 |                 print("nvcc not found. Please check CUDA installation & PATH.")
 89 |             return ("failed", proc.stderr.decode())
 90 |     except Exception as e:
 91 |         print("failed")
 92 |         return ("failed", str(e))
 93 | 
 94 | 
 95 | def execute_binary(binary_file_path: str, cuda_file_path: str, timeout: float) -> str:
 96 |     """Executes the given binary file.
 97 | 
 98 |     Args:
 99 |         binary_file_path (str): the binary file to be executed
100 |         cuda_file_path (str): the path of the CUDA file that was used to create the binary
101 |         timeout (float): max time in seconds to let the program run.
102 | 
103 |     Returns:
104 |         str:
105 |     """
106 | 
107 |     eval_env = os.environ.copy()
108 |     try:
109 |         eval_env["COMPUTE_EVAL_SRC_FILE"] = cuda_file_path
110 |         proc_run = subprocess.run(
111 |             [binary_file_path],
112 |             shell=True,
113 |             stderr=subprocess.PIPE,
114 |             timeout=timeout,
115 |             env=eval_env,
116 |         )
117 |         if proc_run.returncode == 0:
118 |             return "passed"
119 |         # If the program returns 200, it means the program couldn't be executed due to GPU architecture
120 |         elif proc_run.returncode == 200:
121 |             return "skipped"
122 |         else:
123 |             return f"Failed to run! Error: {proc_run.stderr.decode()}"
124 |     except subprocess.TimeoutExpired:
125 |         return "Timed out of CUDA program"
126 |     except Exception as e:
127 |         return f"failed {e}"
128 | 
129 | 
130 | @contextlib.contextmanager
131 | def safe_execution_environment(timeout: float):
132 |     with create_tempdir():
133 |         import os
134 |         import shutil
135 | 
136 |         # These system calls are needed when cleaning up tempdir.
137 |         rmtree = shutil.rmtree
138 |         rmdir = os.rmdir
139 |         chdir = os.chdir
140 | 
141 |         # Disable functionalities that can make destructive changes to the test.
142 |         reliability_guard()
143 | 
144 |         try:
145 |             # with swallow_io():
146 |             with time_limit(timeout):
147 |                 yield
148 |         finally:
149 |             # Restore the system calls
150 |             shutil.rmtree = rmtree
151 |             os.rmdir = rmdir
152 |             os.chdir = chdir
153 | 
154 | 
155 | def unsafe_execute(
156 |     timeout: float,
157 |     result: Iterable,
158 |     completion: str,
159 |     cli_args: str,
160 |     problem: Dict,
161 |     completion_id: int,
162 | ):
163 |     with safe_execution_environment(timeout):
164 |         res = None
165 |         try:
166 |             # Create temporary directories for the intermediate files
167 |             temp_dir = tempfile.mkdtemp(prefix="compute_eval_")
168 |             binary_dir = os.path.join(temp_dir, "bin")
169 |             os.makedirs(temp_dir, exist_ok=True)
170 |             os.makedirs(binary_dir, exist_ok=True)
171 | 
172 |             # Construct file name
173 |             file_name = f"{problem['task_id'].replace('/', '_')}-{completion_id}"
174 | 
175 |             # Create a temporary CUDA file path and the binary file path
176 |             cuda_file_path = os.path.join(temp_dir, f"{file_name}.cu")
177 |             binary_file_path = os.path.join(binary_dir, f"{file_name}")
178 | 
179 |             # Write the program
180 |             with open(cuda_file_path, "w") as cuda_file:
181 |                 cuda_file.write(completion)
182 | 
183 |             res = compile_cuda_file(cuda_file_path, binary_file_path, timeout, cli_args)
184 |             if res[0] != "passed":
185 |                 result.append(f"Failed to compile! Error: {res[1]}")
186 |                 return
187 |             else:
188 |                 binary_file_path = res[1]
189 | 
190 |             # Execute the binary
191 |             res = execute_binary(binary_file_path, cuda_file_path, timeout)
192 |             result.append(res)
193 |         except TimeoutException:
194 |             result.append("Timed out of CUDA program")
195 |         except BaseException as e:
196 |             result.append(f"failed: {e} CUDA program")
197 | 
198 | 
199 | @contextlib.contextmanager
200 | def catchtime() -> Callable[[], float]:
201 |     t1 = t2 = time.perf_counter()
202 |     yield lambda: t2 - t1
203 |     t2 = time.perf_counter()
204 | 
205 | 
206 | def check_correctness(
207 |     problem: Dict,
208 |     completion: str,
209 |     timeout: float,
210 |     completion_id: Optional[int] = None,
211 |     cli_args: Optional[str] = None,
212 |     cuda_version: Optional[str] = None,
213 | ) -> Dict:
214 |     """Evaluates the functional correctness of a completion by running the test
215 |         suite provided in the problem.
216 | 
217 |     Args:
218 |         problem (Dict): the problem dictionary
219 |         completion (str): the entire generated code
220 |         timeout (float): max time in seconds to let the program run for
221 | 
222 |     Returns:
223 |         Dict: the output dictionary with the result of the
224 |     """
225 | 
226 |     manager = multiprocessing.Manager()
227 |     result = manager.list()
228 | 
229 |     args = [timeout, result, completion, cli_args, problem, completion_id]
230 |     p = multiprocessing.Process(target=unsafe_execute, args=args)
231 |     p.start()
232 |     p.join(timeout=timeout + 1)
233 |     if p.is_alive():
234 |         # Kill all child processes
235 |         kill_descendants(p.pid)
236 |         # Kill the parent process
237 |         p.kill()
238 |     if not result:
239 |         result.append("Timed out of CUDA program")
240 |     result_status = result[0]
241 |     return dict(
242 |         task_id=problem["task_id"],
243 |         passed=result_status == "passed",
244 |         skipped=result_status == "skipped",
245 |         result=result[0],
246 |         completion_id=completion_id,
247 |     )
248 | 
249 | 
250 | @contextlib.contextmanager
251 | def time_limit(seconds: float):
252 |     if platform.system() == "Windows":
253 | 
254 |         def timeout_handler():
255 |             raise TimeoutException("Timed out!")
256 | 
257 |         timer = threading.Timer(seconds, timeout_handler)
258 |         timer.start()
259 |         try:
260 |             yield
261 |         finally:
262 |             timer.cancel()
263 |     else:
264 | 
265 |         def signal_handler(signum, frame):
266 |             raise TimeoutException("Timed out!")
267 | 
268 |         signal.setitimer(signal.ITIMER_REAL, seconds)
269 |         signal.signal(signal.SIGALRM, signal_handler)
270 |         try:
271 |             yield
272 |         finally:
273 |             signal.setitimer(signal.ITIMER_REAL, 0)
274 | 
275 | 
276 | @contextlib.contextmanager
277 | def swallow_io():
278 |     stream = WriteOnlyStringIO()
279 |     with contextlib.redirect_stdout(stream):
280 |         with contextlib.redirect_stderr(stream):
281 |             with redirect_stdin(stream):
282 |                 yield
283 | 
284 | 
285 | @contextlib.contextmanager
286 | def create_tempdir():
287 |     with tempfile.TemporaryDirectory() as dirname:
288 |         with chdir(dirname):
289 |             yield dirname
290 | 
291 | 
292 | class TimeoutException(Exception):
293 |     pass
294 | 
295 | 
296 | class WriteOnlyStringIO(io.StringIO):
297 |     """StringIO that throws an exception when it's read from"""
298 | 
299 |     def read(self, *args, **kwargs):
300 |         raise IOError
301 | 
302 |     def readline(self, *args, **kwargs):
303 |         raise IOError
304 | 
305 |     def readlines(self, *args, **kwargs):
306 |         raise IOError
307 | 
308 |     def readable(self, *args, **kwargs):
309 |         """Returns True if the IO object can be read."""
310 |         return False
311 | 
312 | 
313 | class redirect_stdin(contextlib._RedirectStream):  # type: ignore
314 |     _stream = "stdin"
315 | 
316 | 
317 | @contextlib.contextmanager
318 | def chdir(root):
319 |     if root == ".":
320 |         yield
321 |         return
322 |     cwd = os.getcwd()
323 |     os.chdir(root)
324 |     try:
325 |         yield
326 |     except BaseException as exc:
327 |         raise exc
328 |     finally:
329 |         os.chdir(cwd)
330 | 
331 | 
332 | def reliability_guard(maximum_memory_bytes: Optional[int] = None):
333 |     """
334 |     This disables various destructive functions and prevents the generated code
335 |     from interfering with the test (e.g. fork bomb, killing other processes,
336 |     removing filesystem files, etc.)
337 | 
338 |     WARNING
339 |     This function is NOT a security sandbox. Untrusted code, including, model-
340 |     generated code, should not be blindly executed outside of one. See the
341 |     Codex paper for more information about OpenAI's code sandbox, and proceed
342 |     with caution.
343 |     """
344 | 
345 |     if maximum_memory_bytes is not None:
346 |         import resource
347 | 
348 |         resource.setrlimit(
349 |             resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
350 |         )
351 |         resource.setrlimit(
352 |             resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
353 |         )
354 |         if not platform.uname().system == "Darwin":
355 |             resource.setrlimit(
356 |                 resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
357 |             )
358 | 
359 |     faulthandler.disable()
360 | 
361 |     import builtins
362 | 
363 |     builtins.exit = None
364 |     builtins.quit = None
365 | 
366 |     import os
367 | 
368 |     os.environ["OMP_NUM_THREADS"] = "1"
369 | 
370 |     os.kill = None
371 |     os.system = None
372 |     os.putenv = None
373 |     os.remove = None
374 |     os.removedirs = None
375 |     os.rmdir = None
376 |     os.fchdir = None
377 |     os.setuid = None
378 |     os.fork = None
379 |     os.forkpty = None
380 |     os.killpg = None
381 |     os.rename = None
382 |     os.renames = None
383 |     os.truncate = None
384 |     os.replace = None
385 |     os.unlink = None
386 |     os.fchmod = None
387 |     os.fchown = None
388 |     os.chmod = None
389 |     os.chown = None
390 |     os.chroot = None
391 |     os.fchdir = None
392 |     os.lchflags = None
393 |     os.lchmod = None
394 |     os.lchown = None
395 |     os.getcwd = None
396 |     os.chdir = None
397 | 
398 |     import shutil
399 | 
400 |     shutil.rmtree = None
401 |     shutil.move = None
402 |     shutil.chown = None
403 | 
404 |     # import subprocess
405 |     # subprocess.Popen = None  # type: ignore
406 | 
407 |     __builtins__["help"] = None
408 | 
409 |     import sys
410 | 
411 |     sys.modules["ipdb"] = None
412 |     sys.modules["joblib"] = None
413 |     sys.modules["resource"] = None
414 |     sys.modules["psutil"] = None
415 |     sys.modules["tkinter"] = None
416 | 
417 | 
418 | def kill_descendants(pid):
419 |     try:
420 |         parent = psutil.Process(pid)
421 |     except psutil.NoSuchProcess:
422 |         # The process might have already been terminated
423 |         return
424 | 
425 |     # Find all children recursively
426 |     children = parent.children(recursive=True)
427 | 
428 |     # Terminate all child processes
429 |     for child in children:
430 |         try:
431 |             child.kill()
432 |         except psutil.NoSuchProcess:
433 |             pass  # The process might have already been terminated
434 | 


--------------------------------------------------------------------------------
/compute_eval/generate_completions.py:
--------------------------------------------------------------------------------
  1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  2 | # SPDX-License-Identifier: Apache-2.0
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | # http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | from concurrent.futures import ThreadPoolExecutor, as_completed
 17 | from typing import Optional
 18 | 
 19 | import tqdm
 20 | 
 21 | from compute_eval.models.nim_model import NimModel
 22 | from compute_eval.models.openAI_model import OpenAIModel
 23 | from compute_eval.models.claude import ClaudeModel
 24 | 
 25 | from .data import read_problems, write_jsonl
 26 | from .prompts import SYSTEM_PROMPT, generate_user_prompt
 27 | 
 28 | 
 29 | def generate_model_completions(
 30 |     task_id,
 31 |     system_prompt,
 32 |     problem,
 33 |     print_completions,
 34 |     include_header_files,
 35 |     model: Optional[str],
 36 |     model_type: Optional[str],
 37 |     custom_model: Optional[dict] = None,
 38 |     params: Optional[dict] = None,
 39 | ):
 40 |     """
 41 |     Orchestrate the generation of code completions using the specified model.
 42 | 
 43 |     Args:
 44 |         system_prompt (str, optional): The system prompt to use for generating completions.
 45 |         problem (dict): The dictionary containing the problem prompt.
 46 |         model (str): The name of the model to use for generating completions.
 47 |         model_type (str): The type of the model ("instruct" or "base").
 48 |         print_completions (bool): Whether to print the completions.
 49 |         include_header_files (bool): Whether to include header files in the prompt.
 50 |         custom_model (dict, optional): Custom model object to use for generating completions.
 51 |         params (dict, optional): Additional parameters to pass to the model.
 52 | 
 53 |     Returns:
 54 |         str: runnable code completion, including declaration, completion, and test code.
 55 |     """
 56 | 
 57 |     # Means we are invoking a model from the preset list of models
 58 | 
 59 |     if custom_model is not None:
 60 |         model_instance = OpenAIModel(
 61 |             base_url=custom_model["api_endpoint"], model_name=custom_model["model_id"]
 62 |         )
 63 |     else:
 64 |         model_map = {
 65 |             "mixtral-8x22b-v0.1": lambda: NimModel(
 66 |                 "mistralai/mixtral-8x22b-instruct-v0.1"
 67 |             ),
 68 |             "gemma-2-2b-it": lambda: NimModel("google/gemma-2-2b-it"),
 69 |             "llama-3.1-8b-instruct": lambda: NimModel("meta/llama-3.1-8b-instruct"),
 70 |             "llama-3.1-70b-instruct": lambda: NimModel("meta/llama-3.1-70b-instruct"),
 71 |             "llama-3.1-405b-instruct": lambda: NimModel("meta/llama-3.1-405b-instruct"),
 72 |             "llama-3.2-1b-instruct": lambda: NimModel("meta/llama-3.2-1b-instruct"),
 73 |             "llama-3.2-3b-instruct": lambda: NimModel("meta/llama-3.2-3b-instruct"),
 74 |             "llama-3.1-nemotron-70b-instruct": lambda: NimModel(
 75 |                 "nvidia/llama-3.1-nemotron-70b-instruct"
 76 |             ),
 77 |             "nemotron-mini-4b-instruct": lambda: NimModel(
 78 |                 "nvidia/nemotron-mini-4b-instruct"
 79 |             ),
 80 |             "starcoder2-7b": lambda: NimModel("bigcode/starcoder2-7b"),
 81 |             "mistral-nemo-12b-instruct": lambda: NimModel(
 82 |                 "nv-mistralai/mistral-nemo-12b-instruct"
 83 |             ),
 84 |             "claude-sonnet-3.5": lambda: ClaudeModel("claude-3-5-sonnet-20241022"),
 85 |         }
 86 | 
 87 |         assert model in model_map, f"Unsupported model: {model}"
 88 | 
 89 |         model_instance_factory = model_map.get(model)
 90 |         if model_instance_factory is None:
 91 |             raise ValueError(f"Unsupported model: {model}")
 92 | 
 93 |         model_instance = model_instance_factory()
 94 | 
 95 |     prompt = generate_user_prompt(problem, include_header_files=include_header_files)
 96 |     completion = model_instance.generate_response(system_prompt, prompt, params)
 97 | 
 98 |     cuda_version = problem.get("cuda_version")
 99 | 
100 |     if print_completions:
101 |         if cuda_version is not None:
102 |             print("CUDA version: " + cuda_version)
103 |         print("=" * 30)
104 | 
105 |         print(problem["task_id"] + "\n")
106 |         print(f"=== Prompt ===\n{prompt}\n")
107 | 
108 |     if model_type == "instruct":
109 |         # we need to parse the completion to get the code
110 |         # first, check whether the declaration provides the function signature
111 |         drop_signature = False
112 |         declaration = problem.get("declaration", "")
113 |         if declaration.strip().endswith("{"):
114 |             drop_signature = True
115 | 
116 |         completion = parse_function_body(completion, drop_signature=drop_signature)
117 | 
118 |     if print_completions:
119 |         print(f"=== Completion ===\n{completion}\n")
120 | 
121 |     result = problem["declaration"] + "\n\n"
122 |     result = result + "// completion-begin \n"
123 |     result = result + completion + "\n"
124 |     result = result + "// completion-end \n\n"
125 |     result = result + problem["test"]
126 | 
127 |     return (task_id, result, completion, prompt)
128 | 
129 | 
130 | def parse_function_body(input_string, drop_signature: bool = True):
131 |     """
132 |     Extract function body from the response of the model.
133 | 
134 |     Args:
135 |         input_string (str): The response string from the model.
136 |         signature_provided (bool): Whether the function signature is provided in the response.
137 | 
138 |     Returns:
139 |         str: The extracted code lines.
140 |     """
141 |     lines = input_string.splitlines()
142 |     start_index = None
143 |     end_index = None
144 | 
145 |     # Find the indices for start and end of code block
146 |     for i, line in enumerate(lines):
147 |         if "```" in line.strip():
148 |             if start_index is None:
149 |                 start_index = i + 1  # start index is the line after "```"
150 |             else:
151 |                 end_index = i
152 |                 break
153 | 
154 |     if start_index is None or end_index is None or start_index >= end_index:
155 |         return input_string.strip()  # No code block found or empty code block
156 | 
157 |     # Extract the code between the markers
158 |     code = lines[start_index:end_index]
159 | 
160 |     final_start_index = 0
161 | 
162 |     # if the signature is provided, remove it
163 |     if drop_signature:
164 |         # Handle special keywords
165 |         special_keywords = ("__global__", "__device__", "void")
166 |         for i, line in enumerate(code):
167 |             if any(keyword in line for keyword in special_keywords):
168 |                 final_start_index = i + 1
169 |                 break
170 | 
171 |         # Remove opening brace line if present
172 |         while final_start_index < len(code) and code[final_start_index].strip() in (
173 |             "{",
174 |             "",
175 |         ):
176 |             final_start_index += 1
177 | 
178 |     # Extract the function body lines
179 |     function_body_lines = code[final_start_index:]
180 | 
181 |     return "\n".join(function_body_lines)
182 | 
183 | 
184 | def generate_samples(
185 |     problem_file: str,
186 |     sample_file: str = "generated_samples.jsonl",
187 |     num_samples_per_problem: int = 100,
188 |     n_workers: int = 20,
189 |     system_prompt: Optional[str] = SYSTEM_PROMPT,
190 |     print_completions: bool = False,
191 |     include_header_files: bool = False,
192 |     model: Optional[str] = "llama-3.1-70b-instruct",
193 |     model_type: Optional[str] = "instruct",
194 |     custom_model: Optional[dict] = None,
195 |     params: Optional[dict] = None,
196 | ):
197 |     """Generates `n_samples_per_problem` number of completions for each of the problems in the
198 |     problem file and then writes them out to the samples.jsonl file provided.
199 |     """
200 | 
201 |     # the number of samples generated per problem must be at least as much as the most k for pass k
202 |     problems = read_problems(problem_file)
203 | 
204 |     print("Started generating the model completions")
205 |     with ThreadPoolExecutor(max_workers=n_workers) as executor:
206 |         futures = []
207 |         # results is the list of dictionaries that will be serialized into the final JSONL file
208 |         results = []
209 | 
210 |         # for each problem, generate `num_samples` completions using the thread pool futures
211 |         for task_id, problem in problems.items():
212 |             for _ in range(num_samples_per_problem):
213 |                 args = (
214 |                     task_id,
215 |                     system_prompt,
216 |                     problem,
217 |                     print_completions,
218 |                     include_header_files,
219 |                     model,
220 |                     model_type,
221 |                     custom_model,
222 |                     params,
223 |                 )
224 |                 future = executor.submit(generate_model_completions, *args)
225 |                 futures.append(future)
226 | 
227 |         print("Waiting for all the model completions")
228 |         for future in tqdm.tqdm(as_completed(futures), total=len(futures)):
229 |             result = future.result()
230 |             results.append(
231 |                 {
232 |                     "task_id": result[0],
233 |                     "compilable_code": result[1],
234 |                     "generated_completion": result[2],
235 |                     "prompt": result[3],
236 |                 }
237 |             )
238 | 
239 |     results = sorted(results, key=lambda x: x["task_id"])
240 |     print("Writing the samples to the specified output JSONL file")
241 |     write_jsonl(sample_file, results)
242 |     print(
243 |         "Completed generating all the samples for the problems. Written to the samples JSONL file"
244 |     )
245 | 


--------------------------------------------------------------------------------
/compute_eval/main.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import fire
17 | import yaml
18 | 
19 | from compute_eval.evaluation import evaluate_functional_correctness
20 | from compute_eval.generate_completions import generate_samples
21 | 
22 | 
23 | def load_config(config_file):
24 |     with open(config_file, "r") as file:
25 |         return yaml.safe_load(file)
26 | 
27 | 
28 | def generate_samples_with_config(config_file=None, **kwargs):
29 |     if config_file:
30 |         config = load_config(config_file)
31 |         kwargs.update(config)
32 |     generate_samples(**kwargs)
33 | 
34 | 
35 | def evaluate_functional_correctness_with_config(config_file=None, **kwargs):
36 |     if config_file:
37 |         config = load_config(config_file)
38 |         kwargs.update(config)
39 |     evaluate_functional_correctness(**kwargs)
40 | 
41 | 
42 | def main():
43 |     fire.Fire(
44 |         {
45 |             "evaluate_functional_correctness": evaluate_functional_correctness_with_config,
46 |             "generate_samples": generate_samples_with_config,
47 |         }
48 |     )
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     main()
53 | 


--------------------------------------------------------------------------------
/compute_eval/models/claude.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import os
17 | import dotenv
18 | import anthropic
19 | 
20 | 
21 | from compute_eval.models.model_interface import ModelInterface, get_parameter_value
22 | 
23 | 
24 | class ClaudeModel(ModelInterface):
25 |     """
26 |     Generate code completions using Clade models.
27 | 
28 |     Args:
29 |         base_url (str): Base URL for the OpenAI API model.
30 |         model_name (str): Name of the model to use for generating completions.
31 |     """
32 | 
33 |     def __init__(self, model_name):
34 |         dotenv.load_dotenv()
35 |         self.api_key = os.getenv("ANTHROPIC_API_KEY")
36 |         if self.api_key is None:
37 |             raise Exception("ANTHROPIC_API_KEY is missing from the .env file.")
38 | 
39 |         self.model_name = model_name
40 | 
41 |     def generate_response(self, system_prompt, prompt, params):
42 |         """
43 |         Generate code completions by communicating with the Claude API.
44 | 
45 |         Args:
46 |             system_prompt (str, optional): The system prompt to use for generating completions.
47 |             problem (dict): The dictionary containing the problem prompt.
48 |             model_type (str): The type of the model ("instruct" or "base").
49 |             temperature (float): Temperature for sampling.
50 |             max_tokens (int): Maximum tokens to generate.
51 | 
52 |         Returns:
53 |             str: Generated code completion.
54 |         """
55 | 
56 |         messages = []
57 | 
58 |         messages.append({"role": "user", "content": prompt})
59 | 
60 |         client = anthropic.Anthropic()
61 | 
62 |         try:
63 |             response = client.messages.create(
64 |                 model=self.model_name,
65 |                 system=system_prompt,
66 |                 messages=messages,
67 |                 temperature=get_parameter_value("temperature", params, 0.2),
68 |                 top_p=get_parameter_value("top_p", params, 0.95),
69 |                 max_tokens=get_parameter_value("max_tokens", params, 2048),
70 |                 stream=False,
71 |             )
72 |         except response.exceptions.RequestException as e:
73 |             if response.status_code == 400:
74 |                 raise Exception(
75 |                     "Invalid request was made. Check the headers and payload"
76 |                 )
77 |             elif response.status_code == 401:
78 |                 raise Exception(
79 |                     "Unauthorized HTTP request. Check your headers and API key"
80 |                 )
81 |             elif response.status_code == 403:
82 |                 raise Exception("You are forbidden from accessing this resource")
83 |             elif response.status_code > 400:
84 |                 raise Exception(
85 |                     "An error occurred when accessing the model API. Check your headers and payload"
86 |                 )
87 | 
88 |         try:
89 |             completion = response.content
90 |         except AttributeError as e:
91 |             print(
92 |                 f"WARNING: The completion object is invalid. Could not access 'content' attribute: {str(e)}"
93 |             )
94 |             completion = ""
95 |         except Exception as e:
96 |             raise Exception(f"There was an error when accessing the completion")
97 | 
98 |         return completion[0].text
99 | 


--------------------------------------------------------------------------------
/compute_eval/models/model_interface.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | from openai import OpenAI
17 | 
18 | 
19 | class ModelInterface:
20 |     """
21 |     Base class for generating code completions.
22 |     """
23 | 
24 |     def generate_response(self, system_prompt, prompt, params):
25 |         """
26 |         Generate code completions by communicating with the OpenAI API.
27 | 
28 |         Args:
29 |             system_prompt (str, optional): The system prompt to use for generating completions.
30 |             problem (dict): The dictionary containing the problem prompt.
31 |             model_type (str): The type of the model ("instruct" or "base").
32 |             temperature (float): Temperature for sampling.
33 |             max_tokens (int): Maximum tokens to generate.
34 | 
35 |         Returns:
36 |             str: Generated code completion.
37 |         """
38 | 
39 |         messages = []
40 | 
41 |         if system_prompt is not None:
42 |             messages.append({"role": "system", "content": system_prompt})
43 | 
44 |         messages.append({"role": "user", "content": prompt})
45 | 
46 |         client = OpenAI(base_url=self.base_url, api_key=self.api_key)
47 | 
48 |         try:
49 |             response = client.chat.completions.create(
50 |                 model=self.model_name,
51 |                 messages=messages,
52 |                 temperature=get_parameter_value("temperature", params, 0.2),
53 |                 top_p=get_parameter_value("top_p", params, 0.95),
54 |                 max_tokens=get_parameter_value("max_tokens", params, 2048),
55 |                 stream=False,
56 |             )
57 |         except response.exceptions.RequestException as e:
58 |             if response.status_code == 400:
59 |                 raise Exception(
60 |                     "Invalid request was made. Check the headers and payload"
61 |                 )
62 |             elif response.status_code == 401:
63 |                 raise Exception(
64 |                     "Unauthorized HTTP request. Check your headers and API key"
65 |                 )
66 |             elif response.status_code == 403:
67 |                 raise Exception("You are forbidden from accessing this resource")
68 |             elif response.status_code > 400:
69 |                 raise Exception(
70 |                     "An error occurred when accessing the model API. Check your headers and payload"
71 |                 )
72 | 
73 |         try:
74 |             completion = response.choices[0].message.content
75 |         except KeyError as e:
76 |             print(
77 |                 f"WARNING: The completion object is invalid. Could not find the key {str(e)}"
78 |             )
79 |             completion = ""
80 |         except Exception as e:
81 |             raise Exception(f"There was an error when accessing the completion")
82 | 
83 |         return completion
84 | 
85 | 
86 | def get_parameter_value(parameter, parameters, default_value):
87 |     if parameters is not None and parameter in parameters:
88 |         return parameters[parameter]
89 |     else:
90 |         return default_value
91 | 


--------------------------------------------------------------------------------
/compute_eval/models/nim_model.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import os
17 | import dotenv
18 | 
19 | from compute_eval.models.model_interface import ModelInterface
20 | 
21 | 
22 | class NimModel(ModelInterface):
23 |     """
24 |     Generate code completions using NVIDIA models from NIM.
25 | 
26 |     Args:
27 |         model_name (str): Name of the NVIDIA model to use for generating completions.
28 |     """
29 | 
30 |     def __init__(self, model_name):
31 |         dotenv.load_dotenv()
32 |         self.model_name = model_name
33 |         self.base_url = "https://integrate.api.nvidia.com/v1"
34 |         self.api_key = os.getenv("NEMO_API_KEY")
35 | 
36 |         if self.api_key is None:
37 |             raise Exception("NEMO_API_KEY is missing from the .env file.")
38 | 
39 |     def generate_response(self, system_prompt, prompt, params):
40 |         """
41 |         Interact with the NVIDIA API to generate code completions.
42 |         """
43 | 
44 |         return super().generate_response(system_prompt, prompt, params)
45 | 


--------------------------------------------------------------------------------
/compute_eval/models/openAI_model.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import os
17 | import dotenv
18 | 
19 | from compute_eval.models.model_interface import ModelInterface
20 | 
21 | 
22 | class OpenAIModel(ModelInterface):
23 |     """
24 |     Generate code completions using OpenAI models.
25 | 
26 |     Args:
27 |         base_url (str): Base URL for the OpenAI API model.
28 |         model_name (str): Name of the model to use for generating completions.
29 |     """
30 | 
31 |     def __init__(self, base_url, model_name):
32 |         dotenv.load_dotenv()
33 |         self.api_key = os.getenv("OPENAI_API_KEY")
34 |         if self.api_key is None:
35 |             raise Exception("OPENAI_API_KEY is missing from the .env file.")
36 | 
37 |         self.model_name = model_name
38 |         self.base_url = base_url
39 |         self.model_name = model_name
40 | 
41 |     def generate_response(self, system_prompt, prompt, params):
42 |         """
43 |         Interact with the OpenAI API to generate code completions.
44 |         """
45 | 
46 |         return super().generate_response(system_prompt, prompt, params)
47 | 


--------------------------------------------------------------------------------
/compute_eval/prompts.py:
--------------------------------------------------------------------------------
 1 | # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 2 | # SPDX-License-Identifier: Apache-2.0
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | # http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | from typing import Dict
17 | 
18 | SYSTEM_PROMPT = """You are a CUDA programming expert capable of generating high-quality, efficient CUDA code with best practices, optimized for performance and clarity.
19 | Instructions:
20 | 1. Implement the body of the function(s). Do not include any additional code outside the function(s).
21 | 2. Wrap the completed function code, including the provided signatures, inside a single ```cuda markdown code block.
22 | 3. Make sure to use the precise function signature in your response if it's provided in the query.
23 | """
24 | 
25 | USER_PROMPT_TEMPLATE = """{user_prompt}\n{header_files_prompt}"""
26 | 
27 | HEADER_FILES_PROMPT_TEMPLATE = """
28 | The following headers are already defined and should not be included in the response:
29 | ```cuda
30 | {header_files}
31 | ```
32 | Please do not include any additional headers in your response.
33 | """
34 | 
35 | 
36 | def extract_header_files_from_problem(problem: Dict) -> str:
37 |     declaration = problem.get("declaration", "")
38 |     header_files = []
39 |     for header_file in declaration.split("\n"):
40 |         header_file = header_file.strip()
41 |         # header file starts with #include
42 |         if header_file.startswith("#include"):
43 |             header_files.append(header_file)
44 |     return "\n".join(header_files)
45 | 
46 | 
47 | def generate_user_prompt(problem: Dict, include_header_files: bool = False) -> str:
48 |     header_files = extract_header_files_from_problem(problem)
49 |     header_files_prompt = ""
50 |     if include_header_files and header_files:
51 |         header_files_prompt = HEADER_FILES_PROMPT_TEMPLATE.format(
52 |             header_files=header_files
53 |         )
54 | 
55 |     return USER_PROMPT_TEMPLATE.format(
56 |         user_prompt=problem["prompt"], header_files_prompt=header_files_prompt
57 |     )
58 | 


--------------------------------------------------------------------------------
/data/LICENSE:
--------------------------------------------------------------------------------
1 | SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | SPDX-License-Identifier: CC-BY-4.0
3 | 
4 | This work is licensed under a Creative Commons Attribution 4.0 International License. https://creativecommons.org/licenses/by/4.0/


--------------------------------------------------------------------------------
/data/example_test.jsonl:
--------------------------------------------------------------------------------
1 | {"task_id": "CUDA/0", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with the provided grid and block dimensions using triple chevrons. The x,y,z grid sizes and block sizes will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n", "cc_flags": "-arch=sm_90a -arch=sm_89 -arch=sm_80 -arch=sm_70 -arch=sm_60", "ld_flags": "", "declaration": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n\n__global__ void kernel(int *output, const int *input)\n{\n    int id     = threadIdx.x + blockIdx.x * blockDim.x;\n    output[id] = input[id];\n}\n", "test": "int main() {\nlaunch(4, 1024);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 32, 4, 32);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16, 4, 4);\ncudaCheckErrors(\"kernel launch failed\");\n\n}\n", "example_test": "", "cuda_toolkit": "12.0"}
2 | {"task_id": "CUDA/1", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with the provided grid and block dimensions using triple chevrons and also allocates dynamic shared memory. The x,y,z grid sizes and block sizes will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n", "cc_flags": "-arch=sm_90a -arch=sm_89 -arch=sm_80 -arch=sm_70 -arch=sm_60", "ld_flags": "", "declaration": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n\n__global__ void kernel(int *output, const int *input)\n{\n    int id     = threadIdx.x + blockIdx.x * blockDim.x;\n    output[id] = input[id];\n}\n\n", "test": "int main() {\nlaunch(4, 256);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16, 4, 1);\ncudaCheckErrors(\"kernel launch failed\");\n\n}\n", "example_test": "@example_test\n", "cuda_toolkit": "12.0"}
3 | {"task_id": "CUDA/2", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with the provided grid and block dimensions using triple chevrons, allocates dynamic shared memory and also uses cuda streams. The x,y,z grid sizes and block sizes will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n", "cc_flags": "-arch=sm_90a -arch=sm_89 -arch=sm_80 -arch=sm_70 -arch=sm_60", "ld_flags": "", "declaration": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n__global__ void kernel(int *output, const int *input)\n{\n}\n\n", "test": "int main() {\nlaunch(4, 256);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16, 4, 1);\ncudaCheckErrors(\"kernel launch failed\");\n}\n", "example_test": "@example_test\n", "cuda_toolkit": "12.0"}
4 | {"task_id": "CUDA/3", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` without using triple chevrons. The x,y,z grid and dimensions will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n", "cc_flags": "-arch=sm_90a -arch=sm_89 -arch=sm_80 -arch=sm_70 -arch=sm_60", "ld_flags": "", "declaration": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <fstream>\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n__global__ void kernel(int *output, const int *input)\n{\n    int id     = threadIdx.x + blockIdx.x * blockDim.x;\n    output[id] = input[id];\n}\n\nstd::string trim(const std::string& str) {\n    size_t first = str.find_first_not_of(' ');\n    if (std::string::npos == first) {\n        return str;\n    }\n    size_t last = str.find_last_not_of(' ');\n    return str.substr(first, last - first + 1);\n}\n", "test": "int main() {\n    auto static_test = [] () {\n        const char* path = std::getenv(\"COMPUTE_EVAL_SRC_FILE\");\n        if (path == nullptr) {\n            std::cerr << \"Environment variable not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::ifstream file(path);\n        if (!file.is_open()) {\n            std::cerr << \"File not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::string line;\n\n        // Skip until the beginning of the completion block\n        while (std::getline(file, line)) {\n\n            if (line.find(\"completion-begin\") != std::string::npos && \n                line.find(\"std::string::npos\") == std::string::npos) {\n                break;\n            }\n        }\n\n        // Search for the CUDA kernel launch API call\n        bool found = false;\n        while (std::getline(file, line)) {\n\n            std::string trimmedLine = trim(line);\n\n            // If the line contains the completion-end marker, stop searching\n            if (trimmedLine.find(\"completion-end\") != std::string::npos) {\n                break;\n            }\n\n            // ignore commented lines\n            if (trimmedLine.find(\"//\") == 0) continue;\n            \n            if (trimmedLine.find(\"cudaLaunchKernelEx\") != std::string::npos) {\n                found = true;\n                break;\n            }\n\n        }\n\n        if (!found) {\n            std::cerr << \"Test failed because the generated code doesn't use CUDA kernel launch API!\" << std::endl;\n            std::exit(1);\n        }\n    };\n\n    auto dynamic_test = [] () {\n        int *output, *input;\n        launch(4, 256);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16, 4, 1);\n        cudaCheckErrors(\"kernel launch failed\");\n    };\n\n    static_test();\n    dynamic_test();\n}\n", "example_test": "@example_test\n", "cuda_toolkit": "12.0"}
5 | {"task_id": "CUDA/4", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with thread block clusters and wihout using triple chevrons. The x,y,z grid and dimensions will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n", "cc_flags": "-arch=sm_90a", "ld_flags": "", "declaration": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n#include <fstream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                      \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err == cudaErrorInvalidKernelImage || __err == cudaErrorNoKernelImageForDevice) \\\n        {                                                                                     \\\n            fprintf(stderr, \"Invalid GPU architecture: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(200);                                                                       \\\n        }                                                                                       \\\n        else if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n__global__ void kernel(int *output, const int *input)\n{\n\n}\n\nstd::string trim(const std::string& str) {\n    size_t first = str.find_first_not_of(' ');\n    if (std::string::npos == first) {\n        return str;\n    }\n    size_t last = str.find_last_not_of(' ');\n    return str.substr(first, last - first + 1);\n}\n", "test": "int main() {\nauto static_test = [] () {\n        const char* path = std::getenv(\"COMPUTE_EVAL_SRC_FILE\");\n        if (path == nullptr) {\n            std::cerr << \"Environment variable not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::ifstream file(path);\n        if (!file.is_open()) {\n            std::cerr << \"File not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::string line;\n\n        // Skip until the beginning of the completion block\n        while (std::getline(file, line)) {\n\n            if (line.find(\"completion-begin\") != std::string::npos && \n                line.find(\"std::string::npos\") == std::string::npos) {\n                break;\n            }\n        }\n\n        // Search for the CUDA kernel launch API call\n        bool foundKernelLaunch = false, foundClusterDim = false;\n        while (std::getline(file, line)) {\n\n            std::string trimmedLine = trim(line);\n\n            // If the line contains the completion-end marker, stop searching\n            if (trimmedLine.find(\"completion-end\") != std::string::npos) {\n                break;\n            }\n\n            // ignore commented lines\n            if (trimmedLine.find(\"//\") == 0) continue;\n            \n            if (trimmedLine.find(\"cudaLaunchKernelEx\") != std::string::npos) foundKernelLaunch = true;\n            if (trimmedLine.find(\"cudaLaunchAttributeClusterDimension\") != std::string::npos) foundClusterDim = true;\n\n            if (foundKernelLaunch && foundClusterDim) break;\n\n        }\n\n        if (!foundKernelLaunch) {\n            std::cerr << \"Test failed because the generated code doesn't use CUDA kernel launch API!\" << std::endl;\n            std::exit(1);\n        }\n\n        if (!foundClusterDim) {\n            std::cerr << \"Test failed because the generated code doesn't use cluster dimension attribute!\" << std::endl;\n            std::exit(1);\n        }\n    };\n\n    auto dynamic_test = [] () {\n        int *output, *input;\n        launch(4, 256);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16, 4, 1);\n        cudaCheckErrors(\"kernel launch failed\");\n    };\n\n    static_test();\n    dynamic_test();\n}\n", "example_test": "@example_test\n", "cuda_toolkit": "12.0"}


--------------------------------------------------------------------------------
/data/example_test_generation.jsonl:
--------------------------------------------------------------------------------
 1 | {"task_id": "CUDA/0", "compilable_code": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n\n__global__ void kernel(int *output, const int *input)\n{\n    int id     = threadIdx.x + blockIdx.x * blockDim.x;\n    output[id] = input[id];\n}\n\n\n// completion-begin \nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); \n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);\n\n    // Assuming 'output' and 'input' are already defined and allocated on the device\n    int *output; \n    const int *input;\n\n    // Launch the kernel with the provided grid and block dimensions\n    kernel<<<gridSize, blockSize>>>(output, input);\n}\n// completion-end \n\nint main() {\nlaunch(4, 1024);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 32, 4, 32);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16, 4, 4);\ncudaCheckErrors(\"kernel launch failed\");\n\n}\n", "generated_completion": "void launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); \n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);\n\n    // Assuming 'output' and 'input' are already defined and allocated on the device\n    int *output; \n    const int *input;\n\n    // Launch the kernel with the provided grid and block dimensions\n    kernel<<<gridSize, blockSize>>>(output, input);\n}", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with the provided grid and block dimensions using triple chevrons. The x,y,z grid sizes and block sizes will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n\n"}
 2 | {"task_id": "CUDA/0", "compilable_code": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n\n__global__ void kernel(int *output, const int *input)\n{\n    int id     = threadIdx.x + blockIdx.x * blockDim.x;\n    output[id] = input[id];\n}\n\n\n// completion-begin \nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); \n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);\n\n    // Assuming 'output' and 'input' are already defined and allocated on the device\n    int *output; \n    const int *input;\n\n    // Launch the kernel with the provided grid and block dimensions\n    kernel<<<gridSize, blockSize>>>(output, input);\n}\n// completion-end \n\nint main() {\nlaunch(4, 1024);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 32, 4, 32);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16, 4, 4);\ncudaCheckErrors(\"kernel launch failed\");\n\n}\n", "generated_completion": "void launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); \n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);\n\n    // Assuming 'output' and 'input' are already defined and allocated on the device\n    int *output; \n    const int *input;\n\n    // Launch the kernel with the provided grid and block dimensions\n    kernel<<<gridSize, blockSize>>>(output, input);\n}", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with the provided grid and block dimensions using triple chevrons. The x,y,z grid sizes and block sizes will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n\n"}
 3 | {"task_id": "CUDA/0", "compilable_code": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n\n__global__ void kernel(int *output, const int *input)\n{\n    int id     = threadIdx.x + blockIdx.x * blockDim.x;\n    output[id] = input[id];\n}\n\n\n// completion-begin \nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); \n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);\n    \n    // Assuming 'output' and 'input' are already defined and allocated on the device\n    int *output; \n    const int *input;\n    \n    // Launch the kernel function with the provided grid and block dimensions\n    kernel<<<gridSize, blockSize>>>(output, input);\n}\n// completion-end \n\nint main() {\nlaunch(4, 1024);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 32, 4, 32);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16, 4, 4);\ncudaCheckErrors(\"kernel launch failed\");\n\n}\n", "generated_completion": "void launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); \n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);\n    \n    // Assuming 'output' and 'input' are already defined and allocated on the device\n    int *output; \n    const int *input;\n    \n    // Launch the kernel function with the provided grid and block dimensions\n    kernel<<<gridSize, blockSize>>>(output, input);\n}", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with the provided grid and block dimensions using triple chevrons. The x,y,z grid sizes and block sizes will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n\n"}
 4 | {"task_id": "CUDA/1", "compilable_code": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n\n__global__ void kernel(int *output, const int *input)\n{\n    int id     = threadIdx.x + blockIdx.x * blockDim.x;\n    output[id] = input[id];\n}\n\n\n\n// completion-begin \nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Assuming 'output' and 'input' are already allocated and initialized on the device\n    int *output, *input; // Replace with actual device pointers\n    \n    // Define dynamic shared memory size (example: 1024 bytes, adjust according to kernel needs)\n    int sharedMemSize = 1024;\n    \n    // Launch kernel with specified grid and block dimensions, and dynamic shared memory allocation\n    kernel<<<dim3(gridSizeX, gridSizeY, gridSizeZ), dim3(blockSizeX, blockSizeY, blockSizeZ), sharedMemSize>>>\n         (output, input);\n}\n// completion-end \n\nint main() {\nlaunch(4, 256);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16, 4, 1);\ncudaCheckErrors(\"kernel launch failed\");\n\n}\n", "generated_completion": "void launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Assuming 'output' and 'input' are already allocated and initialized on the device\n    int *output, *input; // Replace with actual device pointers\n    \n    // Define dynamic shared memory size (example: 1024 bytes, adjust according to kernel needs)\n    int sharedMemSize = 1024;\n    \n    // Launch kernel with specified grid and block dimensions, and dynamic shared memory allocation\n    kernel<<<dim3(gridSizeX, gridSizeY, gridSizeZ), dim3(blockSizeX, blockSizeY, blockSizeZ), sharedMemSize>>>\n         (output, input);\n}", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with the provided grid and block dimensions using triple chevrons and also allocates dynamic shared memory. The x,y,z grid sizes and block sizes will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n\n"}
 5 | {"task_id": "CUDA/1", "compilable_code": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n\n__global__ void kernel(int *output, const int *input)\n{\n    int id     = threadIdx.x + blockIdx.x * blockDim.x;\n    output[id] = input[id];\n}\n\n\n\n// completion-begin \nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Calculate the total number of threads per block\n    int blockSize = blockSizeX * blockSizeY * blockSizeZ;\n    \n    // Calculate the total number of blocks per grid\n    int gridSize = gridSizeX * gridSizeY * gridSizeZ;\n    \n    // Assuming dynamic shared memory size is known within the kernel, \n    // we directly pass the size as a template parameter to the <<<>>> syntax.\n    // For demonstration, let's assume the dynamic shared memory size is 'sharedMemSize' bytes.\n    int sharedMemSize = 1024; // Replace with actual shared memory size required by the kernel\n    \n    // Launch the kernel with the provided grid and block dimensions, and dynamic shared memory\n    kernel<<<dim3(gridSizeX, gridSizeY, gridSizeZ), dim3(blockSizeX, blockSizeY, blockSizeZ), sharedMemSize>>>(\n        // Assuming output and input pointers are already set and available in this scope\n        output, \n        input\n    );\n}\n// completion-end \n\nint main() {\nlaunch(4, 256);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16, 4, 1);\ncudaCheckErrors(\"kernel launch failed\");\n\n}\n", "generated_completion": "void launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Calculate the total number of threads per block\n    int blockSize = blockSizeX * blockSizeY * blockSizeZ;\n    \n    // Calculate the total number of blocks per grid\n    int gridSize = gridSizeX * gridSizeY * gridSizeZ;\n    \n    // Assuming dynamic shared memory size is known within the kernel, \n    // we directly pass the size as a template parameter to the <<<>>> syntax.\n    // For demonstration, let's assume the dynamic shared memory size is 'sharedMemSize' bytes.\n    int sharedMemSize = 1024; // Replace with actual shared memory size required by the kernel\n    \n    // Launch the kernel with the provided grid and block dimensions, and dynamic shared memory\n    kernel<<<dim3(gridSizeX, gridSizeY, gridSizeZ), dim3(blockSizeX, blockSizeY, blockSizeZ), sharedMemSize>>>(\n        // Assuming output and input pointers are already set and available in this scope\n        output, \n        input\n    );\n}", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with the provided grid and block dimensions using triple chevrons and also allocates dynamic shared memory. The x,y,z grid sizes and block sizes will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n\n"}
 6 | {"task_id": "CUDA/1", "compilable_code": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n\n__global__ void kernel(int *output, const int *input)\n{\n    int id     = threadIdx.x + blockIdx.x * blockDim.x;\n    output[id] = input[id];\n}\n\n\n\n// completion-begin \nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Calculate the total number of threads per block\n    int blockSize = blockSizeX * blockSizeY * blockSizeZ;\n    \n    // Calculate the total number of blocks per grid\n    int gridSize = gridSizeX * gridSizeY * gridSizeZ;\n    \n    // Assuming dynamic shared memory size is known within the kernel, \n    // we directly pass the size as a template parameter to the <<<>>> syntax.\n    // For demonstration, let's assume the dynamic shared memory size is 'sharedMemSize' bytes.\n    int sharedMemSize = 1024; // Replace with actual shared memory size required by the kernel\n    \n    // Launch the kernel with the specified grid and block dimensions, and dynamic shared memory\n    kernel<<<dim3(gridSizeX, gridSizeY, gridSizeZ), dim3(blockSizeX, blockSizeY, blockSizeZ), sharedMemSize>>>(\n        // Assuming output and input pointers are already set up and accessible here\n        output, \n        input\n    );\n}\n// completion-end \n\nint main() {\nlaunch(4, 256);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16, 4, 1);\ncudaCheckErrors(\"kernel launch failed\");\n\n}\n", "generated_completion": "void launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Calculate the total number of threads per block\n    int blockSize = blockSizeX * blockSizeY * blockSizeZ;\n    \n    // Calculate the total number of blocks per grid\n    int gridSize = gridSizeX * gridSizeY * gridSizeZ;\n    \n    // Assuming dynamic shared memory size is known within the kernel, \n    // we directly pass the size as a template parameter to the <<<>>> syntax.\n    // For demonstration, let's assume the dynamic shared memory size is 'sharedMemSize' bytes.\n    int sharedMemSize = 1024; // Replace with actual shared memory size required by the kernel\n    \n    // Launch the kernel with the specified grid and block dimensions, and dynamic shared memory\n    kernel<<<dim3(gridSizeX, gridSizeY, gridSizeZ), dim3(blockSizeX, blockSizeY, blockSizeZ), sharedMemSize>>>(\n        // Assuming output and input pointers are already set up and accessible here\n        output, \n        input\n    );\n}", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with the provided grid and block dimensions using triple chevrons and also allocates dynamic shared memory. The x,y,z grid sizes and block sizes will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n\n"}
 7 | {"task_id": "CUDA/2", "compilable_code": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n__global__ void kernel(int *output, const int *input)\n{\n}\n\n\n\n// completion-begin \nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the grid and block dimensions\n    dim3 gridDim(gridSizeX, gridSizeY, gridSizeZ);\n    dim3 blockDim(blockSizeX, blockSizeY, blockSizeZ);\n\n    // Allocate dynamic shared memory (assuming a size of 'sharedMemBytes' which should be defined based on the kernel's needs)\n    int sharedMemBytes = 1024; // Replace with the actual shared memory size required by the kernel\n    cudaStream_t stream;        // CUDA stream for concurrent execution\n\n    // Create a CUDA stream\n    cudaStreamCreate(&stream);\n\n    // Launch the kernel with the specified grid, block, and dynamic shared memory, using the created stream\n    kernel<<<gridDim, blockDim, sharedMemBytes, stream>>>(/* output, input */); \n    // NOTE: You need to pass the actual 'output' and 'input' pointers as arguments to the kernel function\n\n    // Synchronize the stream to ensure the kernel completes before proceeding\n    cudaStreamSynchronize(stream);\n\n    // Cleanup: Destroy the CUDA stream\n    cudaStreamDestroy(stream);\n}\n// completion-end \n\nint main() {\nlaunch(4, 256);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16, 4, 1);\ncudaCheckErrors(\"kernel launch failed\");\n}\n", "generated_completion": "void launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the grid and block dimensions\n    dim3 gridDim(gridSizeX, gridSizeY, gridSizeZ);\n    dim3 blockDim(blockSizeX, blockSizeY, blockSizeZ);\n\n    // Allocate dynamic shared memory (assuming a size of 'sharedMemBytes' which should be defined based on the kernel's needs)\n    int sharedMemBytes = 1024; // Replace with the actual shared memory size required by the kernel\n    cudaStream_t stream;        // CUDA stream for concurrent execution\n\n    // Create a CUDA stream\n    cudaStreamCreate(&stream);\n\n    // Launch the kernel with the specified grid, block, and dynamic shared memory, using the created stream\n    kernel<<<gridDim, blockDim, sharedMemBytes, stream>>>(/* output, input */); \n    // NOTE: You need to pass the actual 'output' and 'input' pointers as arguments to the kernel function\n\n    // Synchronize the stream to ensure the kernel completes before proceeding\n    cudaStreamSynchronize(stream);\n\n    // Cleanup: Destroy the CUDA stream\n    cudaStreamDestroy(stream);\n}", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with the provided grid and block dimensions using triple chevrons, allocates dynamic shared memory and also uses cuda streams. The x,y,z grid sizes and block sizes will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n\n"}
 8 | {"task_id": "CUDA/2", "compilable_code": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n__global__ void kernel(int *output, const int *input)\n{\n}\n\n\n\n// completion-begin \nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the grid and block dimensions\n    dim3 gridDim(gridSizeX, gridSizeY, gridSizeZ);\n    dim3 blockDim(blockSizeX, blockSizeY, blockSizeZ);\n\n    // Allocate dynamic shared memory (assuming a size of 'sharedMemBytes' which should be defined based on the kernel's needs)\n    int sharedMemBytes = 1024; // Replace with the actual shared memory size required by the kernel\n    cudaStream_t stream;        // CUDA stream for concurrent execution\n\n    // Create a CUDA stream\n    cudaStreamCreate(&stream);\n\n    // Launch the kernel with the specified grid, block, and dynamic shared memory, using the created stream\n    kernel<<<gridDim, blockDim, sharedMemBytes, stream>>>(/* output, input */); \n    // NOTE: You need to pass the actual 'output' and 'input' pointers as arguments to the kernel function\n\n    // Synchronize the stream to ensure the kernel completes before proceeding\n    cudaStreamSynchronize(stream);\n\n    // Cleanup: Destroy the CUDA stream\n    cudaStreamDestroy(stream);\n}\n// completion-end \n\nint main() {\nlaunch(4, 256);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16, 4, 1);\ncudaCheckErrors(\"kernel launch failed\");\n}\n", "generated_completion": "void launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the grid and block dimensions\n    dim3 gridDim(gridSizeX, gridSizeY, gridSizeZ);\n    dim3 blockDim(blockSizeX, blockSizeY, blockSizeZ);\n\n    // Allocate dynamic shared memory (assuming a size of 'sharedMemBytes' which should be defined based on the kernel's needs)\n    int sharedMemBytes = 1024; // Replace with the actual shared memory size required by the kernel\n    cudaStream_t stream;        // CUDA stream for concurrent execution\n\n    // Create a CUDA stream\n    cudaStreamCreate(&stream);\n\n    // Launch the kernel with the specified grid, block, and dynamic shared memory, using the created stream\n    kernel<<<gridDim, blockDim, sharedMemBytes, stream>>>(/* output, input */); \n    // NOTE: You need to pass the actual 'output' and 'input' pointers as arguments to the kernel function\n\n    // Synchronize the stream to ensure the kernel completes before proceeding\n    cudaStreamSynchronize(stream);\n\n    // Cleanup: Destroy the CUDA stream\n    cudaStreamDestroy(stream);\n}", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with the provided grid and block dimensions using triple chevrons, allocates dynamic shared memory and also uses cuda streams. The x,y,z grid sizes and block sizes will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n\n"}
 9 | {"task_id": "CUDA/2", "compilable_code": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n__global__ void kernel(int *output, const int *input)\n{\n}\n\n\n\n// completion-begin \nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the dynamic shared memory size (assuming it's a multiple of the block size for simplicity)\n    int dynamicSharedMemSize = blockSizeX * blockSizeY * blockSizeZ * sizeof(int);\n\n    // Check if CUDA streams are already created, for this example, we'll create one if not\n    cudaStream_t stream;\n    cudaStreamQuery(0); // Query default stream to ensure CUDA context is initialized\n    cudaStreamCreate(&stream);\n\n    // Launch the kernel with the provided grid and block dimensions, dynamic shared memory, and using the created stream\n    kernel<<<dim3(gridSizeX, gridSizeY, gridSizeZ), dim3(blockSizeX, blockSizeY, blockSizeZ), dynamicSharedMemSize, stream>>>\n        (/* output pointer */, /* input pointer */); // Replace /* output pointer */ and /* input pointer */ with actual pointers\n\n    // Synchronize the stream to ensure the kernel launch is completed\n    cudaStreamSynchronize(stream);\n\n    // Optionally, destroy the stream if not needed anymore to free resources\n    // cudaStreamDestroy(stream);\n}\n// completion-end \n\nint main() {\nlaunch(4, 256);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16);\ncudaCheckErrors(\"kernel launch failed\");\nlaunch(4, 16, 4, 16, 4, 1);\ncudaCheckErrors(\"kernel launch failed\");\n}\n", "generated_completion": "void launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the dynamic shared memory size (assuming it's a multiple of the block size for simplicity)\n    int dynamicSharedMemSize = blockSizeX * blockSizeY * blockSizeZ * sizeof(int);\n\n    // Check if CUDA streams are already created, for this example, we'll create one if not\n    cudaStream_t stream;\n    cudaStreamQuery(0); // Query default stream to ensure CUDA context is initialized\n    cudaStreamCreate(&stream);\n\n    // Launch the kernel with the provided grid and block dimensions, dynamic shared memory, and using the created stream\n    kernel<<<dim3(gridSizeX, gridSizeY, gridSizeZ), dim3(blockSizeX, blockSizeY, blockSizeZ), dynamicSharedMemSize, stream>>>\n        (/* output pointer */, /* input pointer */); // Replace /* output pointer */ and /* input pointer */ with actual pointers\n\n    // Synchronize the stream to ensure the kernel launch is completed\n    cudaStreamSynchronize(stream);\n\n    // Optionally, destroy the stream if not needed anymore to free resources\n    // cudaStreamDestroy(stream);\n}", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with the provided grid and block dimensions using triple chevrons, allocates dynamic shared memory and also uses cuda streams. The x,y,z grid sizes and block sizes will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n\n"}
10 | {"task_id": "CUDA/3", "compilable_code": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <fstream>\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n__global__ void kernel(int *output, const int *input)\n{\n    int id     = threadIdx.x + blockIdx.x * blockDim.x;\n    output[id] = input[id];\n}\n\nstd::string trim(const std::string& str) {\n    size_t first = str.find_first_not_of(' ');\n    if (std::string::npos == first) {\n        return str;\n    }\n    size_t last = str.find_last_not_of(' ');\n    return str.substr(first, last - first + 1);\n}\n\n\n// completion-begin \nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the kernel launch parameters\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); \n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);\n\n    // Launch the kernel function with the specified grid and block dimensions\n    // Assuming 'output' and 'input' are already defined and allocated on the device\n    kernel<<<gridSize, blockSize>>>(output, input);\n}\n// completion-end \n\nint main() {\n    auto static_test = [] () {\n        const char* path = std::getenv(\"COMPUTE_EVAL_SRC_FILE\");\n        if (path == nullptr) {\n            std::cerr << \"Environment variable not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::ifstream file(path);\n        if (!file.is_open()) {\n            std::cerr << \"File not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::string line;\n\n        // Skip until the beginning of the completion block\n        while (std::getline(file, line)) {\n\n            if (line.find(\"completion-begin\") != std::string::npos && \n                line.find(\"std::string::npos\") == std::string::npos) {\n                break;\n            }\n        }\n\n        // Search for the CUDA kernel launch API call\n        bool found = false;\n        while (std::getline(file, line)) {\n\n            std::string trimmedLine = trim(line);\n\n            // If the line contains the completion-end marker, stop searching\n            if (trimmedLine.find(\"completion-end\") != std::string::npos) {\n                break;\n            }\n\n            // ignore commented lines\n            if (trimmedLine.find(\"//\") == 0) continue;\n            \n            if (trimmedLine.find(\"cudaLaunchKernelEx\") != std::string::npos) {\n                found = true;\n                break;\n            }\n\n        }\n\n        if (!found) {\n            std::cerr << \"Test failed because the generated code doesn't use CUDA kernel launch API!\" << std::endl;\n            std::exit(1);\n        }\n    };\n\n    auto dynamic_test = [] () {\n        int *output, *input;\n        launch(4, 256);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16, 4, 1);\n        cudaCheckErrors(\"kernel launch failed\");\n    };\n\n    static_test();\n    dynamic_test();\n}\n", "generated_completion": "void launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the kernel launch parameters\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); \n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);\n\n    // Launch the kernel function with the specified grid and block dimensions\n    // Assuming 'output' and 'input' are already defined and allocated on the device\n    kernel<<<gridSize, blockSize>>>(output, input);\n}", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` without using triple chevrons. The x,y,z grid and dimensions will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n\n"}
11 | {"task_id": "CUDA/3", "compilable_code": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <fstream>\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n__global__ void kernel(int *output, const int *input)\n{\n    int id     = threadIdx.x + blockIdx.x * blockDim.x;\n    output[id] = input[id];\n}\n\nstd::string trim(const std::string& str) {\n    size_t first = str.find_first_not_of(' ');\n    if (std::string::npos == first) {\n        return str;\n    }\n    size_t last = str.find_last_not_of(' ');\n    return str.substr(first, last - first + 1);\n}\n\n\n// completion-begin \nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the kernel launch parameters\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); \n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);\n\n    // Launch the kernel function with the specified grid and block dimensions\n    // Assuming 'output' and 'input' are already defined and allocated on the device\n    kernel<<<gridSize, blockSize>>>(output, input);\n}\n// completion-end \n\nint main() {\n    auto static_test = [] () {\n        const char* path = std::getenv(\"COMPUTE_EVAL_SRC_FILE\");\n        if (path == nullptr) {\n            std::cerr << \"Environment variable not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::ifstream file(path);\n        if (!file.is_open()) {\n            std::cerr << \"File not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::string line;\n\n        // Skip until the beginning of the completion block\n        while (std::getline(file, line)) {\n\n            if (line.find(\"completion-begin\") != std::string::npos && \n                line.find(\"std::string::npos\") == std::string::npos) {\n                break;\n            }\n        }\n\n        // Search for the CUDA kernel launch API call\n        bool found = false;\n        while (std::getline(file, line)) {\n\n            std::string trimmedLine = trim(line);\n\n            // If the line contains the completion-end marker, stop searching\n            if (trimmedLine.find(\"completion-end\") != std::string::npos) {\n                break;\n            }\n\n            // ignore commented lines\n            if (trimmedLine.find(\"//\") == 0) continue;\n            \n            if (trimmedLine.find(\"cudaLaunchKernelEx\") != std::string::npos) {\n                found = true;\n                break;\n            }\n\n        }\n\n        if (!found) {\n            std::cerr << \"Test failed because the generated code doesn't use CUDA kernel launch API!\" << std::endl;\n            std::exit(1);\n        }\n    };\n\n    auto dynamic_test = [] () {\n        int *output, *input;\n        launch(4, 256);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16, 4, 1);\n        cudaCheckErrors(\"kernel launch failed\");\n    };\n\n    static_test();\n    dynamic_test();\n}\n", "generated_completion": "void launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the kernel launch parameters\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); \n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);\n\n    // Launch the kernel function with the specified grid and block dimensions\n    // Assuming 'output' and 'input' are already defined and allocated on the device\n    kernel<<<gridSize, blockSize>>>(output, input);\n}", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` without using triple chevrons. The x,y,z grid and dimensions will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n\n"}
12 | {"task_id": "CUDA/3", "compilable_code": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <fstream>\n#include <iostream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                       \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n__global__ void kernel(int *output, const int *input)\n{\n    int id     = threadIdx.x + blockIdx.x * blockDim.x;\n    output[id] = input[id];\n}\n\nstd::string trim(const std::string& str) {\n    size_t first = str.find_first_not_of(' ');\n    if (std::string::npos == first) {\n        return str;\n    }\n    size_t last = str.find_last_not_of(' ');\n    return str.substr(first, last - first + 1);\n}\n\n\n// completion-begin \nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the kernel launch parameters\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); \n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);\n\n    // Launch the kernel without using triple chevrons\n    kernel<<<gridSize, blockSize>>>(/* pass required kernel arguments here, \n                                       e.g., output and input pointers */);\n}\n// completion-end \n\nint main() {\n    auto static_test = [] () {\n        const char* path = std::getenv(\"COMPUTE_EVAL_SRC_FILE\");\n        if (path == nullptr) {\n            std::cerr << \"Environment variable not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::ifstream file(path);\n        if (!file.is_open()) {\n            std::cerr << \"File not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::string line;\n\n        // Skip until the beginning of the completion block\n        while (std::getline(file, line)) {\n\n            if (line.find(\"completion-begin\") != std::string::npos && \n                line.find(\"std::string::npos\") == std::string::npos) {\n                break;\n            }\n        }\n\n        // Search for the CUDA kernel launch API call\n        bool found = false;\n        while (std::getline(file, line)) {\n\n            std::string trimmedLine = trim(line);\n\n            // If the line contains the completion-end marker, stop searching\n            if (trimmedLine.find(\"completion-end\") != std::string::npos) {\n                break;\n            }\n\n            // ignore commented lines\n            if (trimmedLine.find(\"//\") == 0) continue;\n            \n            if (trimmedLine.find(\"cudaLaunchKernelEx\") != std::string::npos) {\n                found = true;\n                break;\n            }\n\n        }\n\n        if (!found) {\n            std::cerr << \"Test failed because the generated code doesn't use CUDA kernel launch API!\" << std::endl;\n            std::exit(1);\n        }\n    };\n\n    auto dynamic_test = [] () {\n        int *output, *input;\n        launch(4, 256);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16, 4, 1);\n        cudaCheckErrors(\"kernel launch failed\");\n    };\n\n    static_test();\n    dynamic_test();\n}\n", "generated_completion": "void launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the kernel launch parameters\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); \n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);\n\n    // Launch the kernel without using triple chevrons\n    kernel<<<gridSize, blockSize>>>(/* pass required kernel arguments here, \n                                       e.g., output and input pointers */);\n}", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` without using triple chevrons. The x,y,z grid and dimensions will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n\n"}
13 | {"task_id": "CUDA/4", "compilable_code": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n#include <fstream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                      \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err == cudaErrorInvalidKernelImage || __err == cudaErrorNoKernelImageForDevice) \\\n        {                                                                                     \\\n            fprintf(stderr, \"Invalid GPU architecture: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(200);                                                                       \\\n        }                                                                                       \\\n        else if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n__global__ void kernel(int *output, const int *input)\n{\n\n}\n\nstd::string trim(const std::string& str) {\n    size_t first = str.find_first_not_of(' ');\n    if (std::string::npos == first) {\n        return str;\n    }\n    size_t last = str.find_last_not_of(' ');\n    return str.substr(first, last - first + 1);\n}\n\n\n// completion-begin \nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Calculate the total number of threads in a block\n    int blockSize = blockSizeX * blockSizeY * blockSizeZ;\n    \n    // Check if the block size is valid (must be a power of 2 and not exceed the maximum allowed)\n    if ((blockSize & (blockSize - 1)) != 0 || blockSize > 1024) {\n        printf(\"Error: Invalid block size. Must be a power of 2 and not exceed 1024.\\n\");\n        return;\n    }\n    \n    // Define the grid dimensions\n    dim3 gridDim(gridSizeX, gridSizeY, gridSizeZ);\n    \n    // Define the block dimensions\n    dim3 blockDim(blockSizeX, blockSizeY, blockSizeZ);\n    \n    // Launch the kernel\n    kernel<<<gridDim, blockDim>>>(output, input);\n}\n// completion-end \n\nint main() {\nauto static_test = [] () {\n        const char* path = std::getenv(\"COMPUTE_EVAL_SRC_FILE\");\n        if (path == nullptr) {\n            std::cerr << \"Environment variable not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::ifstream file(path);\n        if (!file.is_open()) {\n            std::cerr << \"File not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::string line;\n\n        // Skip until the beginning of the completion block\n        while (std::getline(file, line)) {\n\n            if (line.find(\"completion-begin\") != std::string::npos && \n                line.find(\"std::string::npos\") == std::string::npos) {\n                break;\n            }\n        }\n\n        // Search for the CUDA kernel launch API call\n        bool foundKernelLaunch = false, foundClusterDim = false;\n        while (std::getline(file, line)) {\n\n            std::string trimmedLine = trim(line);\n\n            // If the line contains the completion-end marker, stop searching\n            if (trimmedLine.find(\"completion-end\") != std::string::npos) {\n                break;\n            }\n\n            // ignore commented lines\n            if (trimmedLine.find(\"//\") == 0) continue;\n            \n            if (trimmedLine.find(\"cudaLaunchKernelEx\") != std::string::npos) foundKernelLaunch = true;\n            if (trimmedLine.find(\"cudaLaunchAttributeClusterDimension\") != std::string::npos) foundClusterDim = true;\n\n            if (foundKernelLaunch && foundClusterDim) break;\n\n        }\n\n        if (!foundKernelLaunch) {\n            std::cerr << \"Test failed because the generated code doesn't use CUDA kernel launch API!\" << std::endl;\n            std::exit(1);\n        }\n\n        if (!foundClusterDim) {\n            std::cerr << \"Test failed because the generated code doesn't use cluster dimension attribute!\" << std::endl;\n            std::exit(1);\n        }\n    };\n\n    auto dynamic_test = [] () {\n        int *output, *input;\n        launch(4, 256);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16, 4, 1);\n        cudaCheckErrors(\"kernel launch failed\");\n    };\n\n    static_test();\n    dynamic_test();\n}\n", "generated_completion": "void launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Calculate the total number of threads in a block\n    int blockSize = blockSizeX * blockSizeY * blockSizeZ;\n    \n    // Check if the block size is valid (must be a power of 2 and not exceed the maximum allowed)\n    if ((blockSize & (blockSize - 1)) != 0 || blockSize > 1024) {\n        printf(\"Error: Invalid block size. Must be a power of 2 and not exceed 1024.\\n\");\n        return;\n    }\n    \n    // Define the grid dimensions\n    dim3 gridDim(gridSizeX, gridSizeY, gridSizeZ);\n    \n    // Define the block dimensions\n    dim3 blockDim(blockSizeX, blockSizeY, blockSizeZ);\n    \n    // Launch the kernel\n    kernel<<<gridDim, blockDim>>>(output, input);\n}", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with thread block clusters and wihout using triple chevrons. The x,y,z grid and dimensions will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n\n"}
14 | {"task_id": "CUDA/4", "compilable_code": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n#include <fstream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                      \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err == cudaErrorInvalidKernelImage || __err == cudaErrorNoKernelImageForDevice) \\\n        {                                                                                     \\\n            fprintf(stderr, \"Invalid GPU architecture: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(200);                                                                       \\\n        }                                                                                       \\\n        else if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n__global__ void kernel(int *output, const int *input)\n{\n\n}\n\nstd::string trim(const std::string& str) {\n    size_t first = str.find_first_not_of(' ');\n    if (std::string::npos == first) {\n        return str;\n    }\n    size_t last = str.find_last_not_of(' ');\n    return str.substr(first, last - first + 1);\n}\n\n\n// completion-begin \nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the kernel launch parameters\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ);  // The number of threads in each thread block\n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);      // The number of thread blocks in the grid\n\n    // Launch the kernel function with the specified grid and block dimensions\n    kernel<<<gridSize, blockSize>>>(/* output and input pointers must be passed here, \n                                       assuming they are defined in the scope of the launch function */ \n                                     output, input);\n}\n// completion-end \n\nint main() {\nauto static_test = [] () {\n        const char* path = std::getenv(\"COMPUTE_EVAL_SRC_FILE\");\n        if (path == nullptr) {\n            std::cerr << \"Environment variable not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::ifstream file(path);\n        if (!file.is_open()) {\n            std::cerr << \"File not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::string line;\n\n        // Skip until the beginning of the completion block\n        while (std::getline(file, line)) {\n\n            if (line.find(\"completion-begin\") != std::string::npos && \n                line.find(\"std::string::npos\") == std::string::npos) {\n                break;\n            }\n        }\n\n        // Search for the CUDA kernel launch API call\n        bool foundKernelLaunch = false, foundClusterDim = false;\n        while (std::getline(file, line)) {\n\n            std::string trimmedLine = trim(line);\n\n            // If the line contains the completion-end marker, stop searching\n            if (trimmedLine.find(\"completion-end\") != std::string::npos) {\n                break;\n            }\n\n            // ignore commented lines\n            if (trimmedLine.find(\"//\") == 0) continue;\n            \n            if (trimmedLine.find(\"cudaLaunchKernelEx\") != std::string::npos) foundKernelLaunch = true;\n            if (trimmedLine.find(\"cudaLaunchAttributeClusterDimension\") != std::string::npos) foundClusterDim = true;\n\n            if (foundKernelLaunch && foundClusterDim) break;\n\n        }\n\n        if (!foundKernelLaunch) {\n            std::cerr << \"Test failed because the generated code doesn't use CUDA kernel launch API!\" << std::endl;\n            std::exit(1);\n        }\n\n        if (!foundClusterDim) {\n            std::cerr << \"Test failed because the generated code doesn't use cluster dimension attribute!\" << std::endl;\n            std::exit(1);\n        }\n    };\n\n    auto dynamic_test = [] () {\n        int *output, *input;\n        launch(4, 256);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16, 4, 1);\n        cudaCheckErrors(\"kernel launch failed\");\n    };\n\n    static_test();\n    dynamic_test();\n}\n", "generated_completion": "void launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the kernel launch parameters\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ);  // The number of threads in each thread block\n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);      // The number of thread blocks in the grid\n\n    // Launch the kernel function with the specified grid and block dimensions\n    kernel<<<gridSize, blockSize>>>(/* output and input pointers must be passed here, \n                                       assuming they are defined in the scope of the launch function */ \n                                     output, input);\n}", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with thread block clusters and wihout using triple chevrons. The x,y,z grid and dimensions will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n\n"}
15 | {"task_id": "CUDA/4", "compilable_code": "#include <cuda.h>\n#include \"cuda_runtime.h\"\n#include <iostream>\n#include <fstream>\n\nusing namespace std;\n\n#define cudaCheckErrors(msg)                                                                 \\\n    do                                                                                      \\\n    {                                                                                        \\\n        cudaError_t __err = cudaGetLastError();                                              \\\n        if (__err == cudaErrorInvalidKernelImage || __err == cudaErrorNoKernelImageForDevice) \\\n        {                                                                                     \\\n            fprintf(stderr, \"Invalid GPU architecture: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(200);                                                                       \\\n        }                                                                                       \\\n        else if (__err != cudaSuccess)                                                            \\\n        {                                                                                    \\\n            fprintf(stderr, \"Fatal error: %s (%s at %s:%d)\", msg, cudaGetErrorString(__err), \\\n                    __FILE__, __LINE__);                                                     \\\n            fprintf(stderr, \"*** FAILED - ABORTING\");                                        \\\n            exit(1);                                                                         \\\n        }                                                                                    \\\n    }                                                                                        \\\n    while (0)\n\n__global__ void kernel(int *output, const int *input)\n{\n\n}\n\nstd::string trim(const std::string& str) {\n    size_t first = str.find_first_not_of(' ');\n    if (std::string::npos == first) {\n        return str;\n    }\n    size_t last = str.find_last_not_of(' ');\n    return str.substr(first, last - first + 1);\n}\n\n\n// completion-begin \nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the kernel launch parameters\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); // The number of threads in each thread block\n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);    // The number of thread blocks in each dimension\n\n    // Launch the kernel function with the specified grid and block dimensions\n    kernel<<<gridSize, blockSize>>>(output, input);\n}\n// completion-end \n\nint main() {\nauto static_test = [] () {\n        const char* path = std::getenv(\"COMPUTE_EVAL_SRC_FILE\");\n        if (path == nullptr) {\n            std::cerr << \"Environment variable not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::ifstream file(path);\n        if (!file.is_open()) {\n            std::cerr << \"File not found!\" << std::endl;\n            std::exit(1);\n        }\n\n        std::string line;\n\n        // Skip until the beginning of the completion block\n        while (std::getline(file, line)) {\n\n            if (line.find(\"completion-begin\") != std::string::npos && \n                line.find(\"std::string::npos\") == std::string::npos) {\n                break;\n            }\n        }\n\n        // Search for the CUDA kernel launch API call\n        bool foundKernelLaunch = false, foundClusterDim = false;\n        while (std::getline(file, line)) {\n\n            std::string trimmedLine = trim(line);\n\n            // If the line contains the completion-end marker, stop searching\n            if (trimmedLine.find(\"completion-end\") != std::string::npos) {\n                break;\n            }\n\n            // ignore commented lines\n            if (trimmedLine.find(\"//\") == 0) continue;\n            \n            if (trimmedLine.find(\"cudaLaunchKernelEx\") != std::string::npos) foundKernelLaunch = true;\n            if (trimmedLine.find(\"cudaLaunchAttributeClusterDimension\") != std::string::npos) foundClusterDim = true;\n\n            if (foundKernelLaunch && foundClusterDim) break;\n\n        }\n\n        if (!foundKernelLaunch) {\n            std::cerr << \"Test failed because the generated code doesn't use CUDA kernel launch API!\" << std::endl;\n            std::exit(1);\n        }\n\n        if (!foundClusterDim) {\n            std::cerr << \"Test failed because the generated code doesn't use cluster dimension attribute!\" << std::endl;\n            std::exit(1);\n        }\n    };\n\n    auto dynamic_test = [] () {\n        int *output, *input;\n        launch(4, 256);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16);\n        cudaCheckErrors(\"kernel launch failed\");\n        launch(4, 16, 4, 16, 4, 1);\n        cudaCheckErrors(\"kernel launch failed\");\n    };\n\n    static_test();\n    dynamic_test();\n}\n", "generated_completion": "void launch(int gridSizeX, int blockSizeX, int gridSizeY, int blockSizeY, int gridSizeZ, int blockSizeZ) {\n    // Define the kernel launch parameters\n    dim3 blockSize(blockSizeX, blockSizeY, blockSizeZ); // The number of threads in each thread block\n    dim3 gridSize(gridSizeX, gridSizeY, gridSizeZ);    // The number of thread blocks in each dimension\n\n    // Launch the kernel function with the specified grid and block dimensions\n    kernel<<<gridSize, blockSize>>>(output, input);\n}", "prompt": "Implement a function called `launch` that launches a kernel function named `kernel` with thread block clusters and wihout using triple chevrons. The x,y,z grid and dimensions will be provided as parameters\nto the `launch` function. Assume that the `kernel` function is already defined. \n\nThe signature of the `kernel` function is\n```cuda\n__global__ void kernel(int *output, const int *input) \n```\n\nThe function signature is \n```cuda\nvoid launch(int gridSizeX, int blockSizeX, int gridSizeY = 1, int blockSizeY = 1, int gridSizeZ = 1, int blockSizeZ = 1)\n\n"}
16 | 


--------------------------------------------------------------------------------
/example_config_evalcorrectness.yaml:
--------------------------------------------------------------------------------
1 | sample_file: data/samples.jsonl
2 | problem_file: data/cuda_problems_121924.jsonl
3 | 
4 | k: [1, 3]
5 | 


--------------------------------------------------------------------------------
/example_config_gen_samples.yaml:
--------------------------------------------------------------------------------
1 | problem_file: data/cuda_problems_121924.jsonl   # Input problems
2 | sample_file: data/samples.jsonl          # Generated samples
3 | 
4 | model: llama-3.1-nemotron-70b-instruct   # Model to use
5 | num_samples_per_problem: 3               # Samples to generate per problem
6 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "compute-eval"
 3 | version = "0.1.0"
 4 | description = "Library for evaluating Large Language Models on CUDA code"
 5 | readme = "README.md"
 6 | authors = [
 7 |     { name = "NVIDIA" }
 8 | ]
 9 | requires-python = ">=3.10"
10 | dependencies = [
11 |     "anthropic>=0.42.0",
12 |     "fire>=0.7.0",
13 |     "numpy>=2.2.0",
14 |     "openai>=1.58.1",
15 |     "pre-commit>=4.0.1",
16 |     "psutil>=6.1.1",
17 |     "python-dotenv>=1.0.1",
18 |     "requests>=2.32.3",
19 |     "tabulate>=0.9.0",
20 |     "tqdm>=4.67.1",
21 | ]
22 | 
23 | [dependency-groups]
24 | dev = [
25 |     "ruff>=0.8.4",
26 | ]
27 | 
28 | [tool.poetry]
29 | name = "compute-eval"
30 | version = "0.1.0"
31 | description = "Library for evaluating Large Language Models on CUDA code"
32 | authors = ["NVIDIA"]
33 | readme = "README.md"
34 | 
35 | [tool.poetry.group.dev.dependencies]
36 | black = "^24.4.2"
37 | isort = "^5.13.2"
38 | 
39 | [build-system]
40 | requires = ["poetry-core"]
41 | build-backend = "poetry.core.masonry.api"
42 | 
43 | [tool.poetry.scripts]
44 | compute_eval = "compute_eval.main:main"
45 | 


--------------------------------------------------------------------------------