├── .gitignore
├── LICENSE
├── README.md
├── assets
├── gpu_config_comparison.png
├── header.png
├── ip_title.png
├── ip_v2.png
├── pipeline.png
└── speedup_and_example.png
├── eval
└── eval_cuda.py
├── more_baselines
├── cuda_graph.json
└── cudnn.json
└── optimized_cuda_code
├── 3090.json
├── a100.json
├── codes
├── 3090.json
├── a100.json
├── h100.json
├── h20.json
└── l40.json
├── h100.json
├── h20.json
└── l40.json
/.gitignore:
--------------------------------------------------------------------------------
1 | # Xcode
2 | *.DS_Store
3 | .idea
4 | .idea/*
5 |
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[codz]
9 | *$py.class
10 |
11 | # C extensions
12 | *.so
13 |
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | share/python-wheels/
29 | *.egg-info/
30 | .installed.cfg
31 | *.egg
32 | MANIFEST
33 |
34 | # PyInstaller
35 | # Usually these files are written by a python script from a template
36 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
37 | *.manifest
38 | *.spec
39 |
40 | # Installer logs
41 | pip-log.txt
42 | pip-delete-this-directory.txt
43 |
44 | # Unit test / coverage reports
45 | htmlcov/
46 | .tox/
47 | .nox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *.cover
54 | *.py.cover
55 | .hypothesis/
56 | .pytest_cache/
57 | cover/
58 |
59 | # Translations
60 | *.mo
61 | *.pot
62 |
63 | # Django stuff:
64 | *.log
65 | local_settings.py
66 | db.sqlite3
67 | db.sqlite3-journal
68 |
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 |
73 | # Scrapy stuff:
74 | .scrapy
75 |
76 | # Sphinx documentation
77 | docs/_build/
78 |
79 | # PyBuilder
80 | .pybuilder/
81 | target/
82 |
83 | # Jupyter Notebook
84 | .ipynb_checkpoints
85 |
86 | # IPython
87 | profile_default/
88 | ipython_config.py
89 |
90 | # pyenv
91 | # For a library or package, you might want to ignore these files since the code is
92 | # intended to run in multiple environments; otherwise, check them in:
93 | # .python-version
94 |
95 | # pipenv
96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
99 | # install all needed dependencies.
100 | #Pipfile.lock
101 |
102 | # UV
103 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
104 | # This is especially recommended for binary packages to ensure reproducibility, and is more
105 | # commonly ignored for libraries.
106 | #uv.lock
107 |
108 | # poetry
109 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
110 | # This is especially recommended for binary packages to ensure reproducibility, and is more
111 | # commonly ignored for libraries.
112 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
113 | #poetry.lock
114 | #poetry.toml
115 |
116 | # pdm
117 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
118 | # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
119 | # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
120 | #pdm.lock
121 | #pdm.toml
122 | .pdm-python
123 | .pdm-build/
124 |
125 | # pixi
126 | # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
127 | #pixi.lock
128 | # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
129 | # in the .venv directory. It is recommended not to include this directory in version control.
130 | .pixi
131 |
132 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
133 | __pypackages__/
134 |
135 | # Celery stuff
136 | celerybeat-schedule
137 | celerybeat.pid
138 |
139 | # SageMath parsed files
140 | *.sage.py
141 |
142 | # Environments
143 | .env
144 | .envrc
145 | .venv
146 | env/
147 | venv/
148 | ENV/
149 | env.bak/
150 | venv.bak/
151 |
152 | # Spyder project settings
153 | .spyderproject
154 | .spyproject
155 |
156 | # Rope project settings
157 | .ropeproject
158 |
159 | # mkdocs documentation
160 | /site
161 |
162 | # mypy
163 | .mypy_cache/
164 | .dmypy.json
165 | dmypy.json
166 |
167 | # Pyre type checker
168 | .pyre/
169 |
170 | # pytype static type analyzer
171 | .pytype/
172 |
173 | # Cython debug symbols
174 | cython_debug/
175 |
176 | # PyCharm
177 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
178 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
179 | # and can be added to the global gitignore or merged into this file. For a more nuclear
180 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
181 | #.idea/
182 |
183 | # Abstra
184 | # Abstra is an AI-powered process automation framework.
185 | # Ignore directories containing user credentials, local state, and settings.
186 | # Learn more at https://abstra.io/docs
187 | .abstra/
188 |
189 | # Visual Studio Code
190 | # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
191 | # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
192 | # and can be added to the global gitignore or merged into this file. However, if you prefer,
193 | # you could uncomment the following to ignore the entire vscode folder
194 | # .vscode/
195 |
196 | # Ruff stuff:
197 | .ruff_cache/
198 |
199 | # PyPI configuration file
200 | .pypirc
201 |
202 | # Cursor
203 | # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
204 | # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
205 | # refer to https://docs.cursor.com/context/ignore-files
206 | .cursorignore
207 | .cursorindexingignore
208 |
209 | # Marimo
210 | marimo/_static/
211 | marimo/_lsp/
212 | __marimo__/
213 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 ShannonAI
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
| 🏠 Project Page | 📄 Paper
12 |
13 |
14 |
15 | CUDA-L1: Improving CUDA Optimization via Contrastive Reinforcement Learning
16 |
17 |
18 |
19 | ## 🥳 Introduction
20 |
21 | In this paper, we introduce CUDA-L1, an automated reinforcement learning (RL) framework for CUDA optimization. The core of CUDA-L1 is a contrastive RL model, a newly-designed RL system to enhance optimization through comparative learning.
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 | Fig:Average speedup across different architectures on KernelBench over baselines.
30 |
31 |
32 |
33 |
34 | ## 🗒️ To-do List
35 | - [x] Fix KernelBench evaluations with proper stream timing synchronization ✅
36 | - [x] Remove caching ✅
37 | - [x] Compare with torch.compile ✅
38 | - [x] Compare with pytorch eager + cuda graph ✅
39 | - [x] Compare with custom torch CUDA/cuDNN backend flags ✅
40 | - [ ] 5090/4090
41 |
42 |
43 | ## 🩺 Evaluation Results
44 |
45 | Our evaluation is conducted on the KernelBench [dataset](https://github.com/ScalingIntelligence/KernelBench), a collection of 250 PyTorch workloads designed to evaluate language models' ability to generate efficient GPU kernels.
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 | **Table: Performance comparison across different configurations on KernelBench on A100.**
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 | | Configuration |
62 |
63 | Method |
64 |
65 | Mean |
66 |
67 | Max |
68 |
69 | 75% |
70 |
71 | 50% |
72 |
73 | 25% |
74 |
75 | Success↑ # out of total |
76 |
77 | Speedup↑ >1.01x out of total |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 | | Default |
88 |
89 | All |
90 |
91 | 3.12× |
92 |
93 | 120× |
94 |
95 | 2.25× |
96 |
97 | 1.42× |
98 |
99 | 1.17× |
100 |
101 | 249/250 |
102 |
103 | 226/250 |
104 |
105 |
106 |
107 |
108 |
109 | | Torch Compile |
110 |
111 | All |
112 |
113 | 2.77× |
114 |
115 | 69.0× |
116 |
117 | 2.55× |
118 |
119 | 1.72× |
120 |
121 | 1.14× |
122 |
123 | 249/250 |
124 |
125 | 203/250 |
126 |
127 |
128 |
129 |
130 |
131 | | Torch Compile RO |
132 |
133 | All |
134 |
135 | 2.88× |
136 |
137 | 80.1× |
138 |
139 | 2.48× |
140 |
141 | 1.67× |
142 |
143 | 1.13× |
144 |
145 | 249/250 |
146 |
147 | 200/250 |
148 |
149 |
150 |
151 |
152 |
153 | | CUDA Graph |
154 |
155 | All |
156 |
157 | 2.81× |
158 |
159 | 97.9× |
160 |
161 | 1.83× |
162 |
163 | 1.20× |
164 |
165 | 0.954× |
166 |
167 | 249/250 |
168 |
169 | 147/229 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 | • RO = Reduce Overhead
181 |
182 | • Success and Speedup indicate the number of successful benchmarks out of the total for each level
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 | **Table: Mean speedup across different configurations and GPU devices.**
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 | | Configuration |
200 |
201 | A100 |
202 |
203 | 3090 |
204 |
205 | H100 |
206 |
207 | H20 |
208 |
209 | L40 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 | | Default |
220 |
221 | 3.12× |
222 |
223 | 2.51× |
224 |
225 | 3.85× |
226 |
227 | 2.38× |
228 |
229 | 3.13× |
230 |
231 |
232 |
233 |
234 |
235 | | Torch Compile |
236 |
237 | 2.77× |
238 |
239 | 2.58× |
240 |
241 | 2.74× |
242 |
243 | 2.89× |
244 |
245 | 2.85× |
246 |
247 |
248 |
249 |
250 |
251 | | Torch Compile RO |
252 |
253 | 2.88× |
254 |
255 | 2.61× |
256 |
257 | 2.77× |
258 |
259 | 2.82× |
260 |
261 | 2.89× |
262 |
263 |
264 |
265 |
266 |
267 | | CUDA Graph |
268 |
269 | 2.81× |
270 |
271 | 3.34× |
272 |
273 | 2.23× |
274 |
275 | 2.20× |
276 |
277 | 3.98× |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 | ## ❓ How to reproduce the results?
290 |
291 | We provide CUDA code snippets optimized by CUDA-L1 in the `optimized_cuda_code` folder, with separate versions for each GPU device. For example, to reproduce our results on H100 XSM, download `./optimized_cuda_code/h100.json` and run each code snippet on your H100 device.
292 |
293 |
294 | ## 📁 Structure of Release Code
295 |
296 | Each line in the release file contains a JSON object with the following fields:
297 |
298 | | Field | Description |
299 | |-------|-------------|
300 | | `level_id` | Level index in KernelBench (values: 1, 2, 3) |
301 | | `task_id` | Task index for that level |
302 | | `ref_code` | Reference CUDA code provided by KernelBench |
303 | | `custom_code` | Optimized code generated by CUDA-L1 |
304 | | `cuda_graph_code` | KernelBench reference code with CUDA Graph modifications |
305 | | `score_default` | Execution time ratio: ref_code / custom_code |
306 | | `score_torch_compile_default` | Execution time ratio: ref_code / custom_code (with torch.compile) |
307 | | `score_torch_compile_reduce_overhead` | Execution time ratio: ref_code / custom_code (with torch.compile reduce_overhead mode) |
308 | | `score_cuda_graph` | Execution time ratio: cuda_graph_code / custom_code |
309 |
310 | **Note:** If `custom_code` is None, it means the RL either failed to generate code faster than the reference code or simply copied the reference code during generation.
311 |
312 | ### Example Entry Structure
313 | ```json
314 | {
315 | "level_id": 1,
316 | "task_id": 1,
317 | "ref_code": "import torch...",
318 | "custom_code": "import torch...",
319 | "cuda_graph_code": "import torch...",
320 | "score_default": 1.762,
321 | "score_torch_compile_default": 1.958,
322 | "score_torch_compile_reduce_overhead": 2.118,
323 | "score_cuda_graph": 1.566,
324 | }
325 | ```
326 |
327 | ## 🔭 Limitations and Challenges
328 |
329 | During the training process, we found that RL is particularly susceptible to reward hacking. We've already identified quite a few hacking cases (e.g., exploiting timing measurements & caching results). If you identify any additional reward hacks in the code, we would greatly appreciate you letting us know.
330 |
331 |
332 | ## 📇 Citation
333 | ```latex
334 | @article{deepreinforce2025cudal1,
335 | title={CUDA-L1: Improving CUDA Optimization via Contrastive Reinforcement Learning},
336 | author={Li, Xiaoya and Sun, Xiaofei and Wang, Albert and Li, Jiwei and Chris, Shum},
337 | journal={arXiv preprint arXiv:2507.14111},
338 | year={2025}
339 | }
340 | ```
341 |
342 | ## ✉️ Contact
343 | If you have any questions, please reach out to us at **research@deep-reinforce.com**.
344 |
--------------------------------------------------------------------------------
/assets/gpu_config_comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepreinforce-ai/CUDA-L1/517b76d33c8ca9f36925246aa33a4d441657a581/assets/gpu_config_comparison.png
--------------------------------------------------------------------------------
/assets/header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepreinforce-ai/CUDA-L1/517b76d33c8ca9f36925246aa33a4d441657a581/assets/header.png
--------------------------------------------------------------------------------
/assets/ip_title.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepreinforce-ai/CUDA-L1/517b76d33c8ca9f36925246aa33a4d441657a581/assets/ip_title.png
--------------------------------------------------------------------------------
/assets/ip_v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepreinforce-ai/CUDA-L1/517b76d33c8ca9f36925246aa33a4d441657a581/assets/ip_v2.png
--------------------------------------------------------------------------------
/assets/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepreinforce-ai/CUDA-L1/517b76d33c8ca9f36925246aa33a4d441657a581/assets/pipeline.png
--------------------------------------------------------------------------------
/assets/speedup_and_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deepreinforce-ai/CUDA-L1/517b76d33c8ca9f36925246aa33a4d441657a581/assets/speedup_and_example.png
--------------------------------------------------------------------------------
/eval/eval_cuda.py:
--------------------------------------------------------------------------------
1 | """
2 | Helpers for Evaluations
3 | Copied and then Adapted from the KernelBench evaluation code
4 |
5 | Key Enhancement for Multi-Stream CUDA Models:
6 |
7 | In eval_kernel_against_ref(), we modified the timing logic to properly handle models
8 | that create their own CUDA streams. This ensures accurate performance measurements
9 | for models with complex stream management.
10 |
11 | Original timing code:
12 | with torch.cuda.stream(custom_model_stream):
13 | start_event.record(custom_model_stream)
14 | custom_model(*inputs)
15 | end_event.record(custom_model_stream)
16 |
17 | Enhanced timing code:
18 | with torch.cuda.stream(custom_model_stream):
19 | start_event.record(custom_model_stream)
20 | custom_model(*inputs)
21 |
22 | # Wait for all model streams to complete before recording end event
23 | if custom_model_streams:
24 | for stream in custom_model_streams:
25 | custom_model_stream.wait_stream(stream)
26 |
27 | end_event.record(custom_model_stream)
28 |
29 | This enhancement prevents timing inaccuracies when models use internal streams
30 | for operations like CUDA graphs, asynchronous kernels, or parallel execution.
31 | Without this synchronization, timing measurements could complete before the
32 | model's actual GPU work finishes, leading to artificially fast results.
33 |
34 | """
35 | import os
36 | os.environ["MKL_THREADING_LAYER"] = "GNU"
37 | import numpy as np
38 | import torch
39 | import torch.nn as nn
40 | import subprocess
41 | import random
42 | import json
43 | from contextlib import redirect_stdout, redirect_stderr
44 | from io import StringIO
45 | import multiprocessing as mp
46 | from concurrent.futures import ThreadPoolExecutor, TimeoutError
47 | import time
48 | from datetime import datetime, timezone, timedelta
49 | from typing import Tuple, List, Dict, Union, Optional, Callable
50 |
51 |
52 |
53 |
54 | pst_tz = timezone(timedelta(hours=-8))
55 |
56 | REPO_TOP_PATH = os.path.abspath(
57 | os.path.join(
58 | os.path.dirname(__file__),
59 | "..",
60 | )
61 | )
62 | KERNEL_BENCH_PATH = os.path.join(REPO_TOP_PATH, "KernelBench")
63 |
64 |
65 | def execute_model_with_timeout(
66 | model_src: str,
67 | context: Dict,
68 | timeout: float = 300.0,
69 | build_directory: Optional[str] = None,
70 | use_process_isolation: bool = False,
71 | info_string: str = ""
72 | ) -> Tuple[bool, str, Optional[float]]:
73 | """
74 | Execute model source code with a time limit.
75 |
76 | Args:
77 | model_src: Source code to execute (can be original_model_src or custom_model_src)
78 | context: Dictionary to execute the code in
79 | timeout: Maximum time in seconds to allow for execution (default: 300s = 5 minutes)
80 | build_directory: Optional build directory for CUDA extensions
81 | use_process_isolation: Use multiprocessing instead of threading (slower but more robust)
82 |
83 | Returns:
84 | Tuple[bool, str, Optional[float]]: (success, error_message, execution_time)
85 | - success: True if execution completed within timeout, False otherwise
86 | - error_message: Error details if execution failed, empty string if successful
87 | - execution_time: Time taken for execution in seconds, None if failed
88 |
89 | Note:
90 | ThreadPoolExecutor cannot interrupt blocking operations like time.sleep(),
91 | network requests, or infinite loops. The timeout detection works correctly,
92 | but background threads may continue running until the blocking operation completes.
93 | For CUDA code, this is usually not an issue as compilation errors are detected quickly.
94 | """
95 | # Format info_string for consistent display
96 | info_prefix = f"[{info_string}] " if info_string else ""
97 |
98 | # Prepare source code with build directory if provided
99 | if build_directory:
100 | context["BUILD_DIRECTORY"] = build_directory
101 | model_src = (
102 | "import os\n"
103 | f"os.environ['TORCH_EXTENSIONS_DIR'] = '{build_directory}'\n"
104 | ) + model_src
105 |
106 | # Static analysis for potentially problematic patterns
107 | potentially_hanging_patterns = [
108 | ('time.sleep(', 'time.sleep() calls'),
109 | ('requests.get(', 'network requests'),
110 | ('urllib.request.', 'URL requests'),
111 | ('input(', 'user input'),
112 | ('while True:', 'infinite loops'),
113 | ('subprocess.', 'subprocess calls'),
114 | ]
115 |
116 | detected_patterns = []
117 | for pattern, description in potentially_hanging_patterns:
118 | if pattern in model_src:
119 | detected_patterns.append(description)
120 |
121 | if detected_patterns:
122 | print(f"{info_prefix}[execute_model_with_timeout] WARNING: Detected potentially blocking operations:")
123 | for pattern in detected_patterns:
124 | print(f"{info_prefix} - {pattern}")
125 | print(f"{info_prefix}[execute_model_with_timeout] These may cause hanging if they block indefinitely.")
126 | print(f"{info_prefix}[execute_model_with_timeout] Consider using use_process_isolation=True for risky code.")
127 |
128 | # Check for extremely problematic patterns that should be blocked
129 | blocking_patterns = ['time.sleep(', 'input(', 'while True:']
130 | should_block = any(pattern in model_src for pattern, _ in potentially_hanging_patterns
131 | if pattern in blocking_patterns)
132 |
133 | if should_block and not use_process_isolation:
134 | error_msg = f"Code contains blocking patterns that may cause indefinite hanging: {detected_patterns}"
135 | print(f"{info_prefix}[execute_model_with_timeout] BLOCKING EXECUTION: {error_msg}")
136 | print(f"{info_prefix}[execute_model_with_timeout] Use use_process_isolation=True to override")
137 | return False, error_msg, None
138 |
139 | def _execute_code():
140 | """Helper function to execute the code in a separate thread"""
141 | try:
142 | compile(model_src, "", "exec")
143 | exec(model_src, context)
144 | return True
145 | except Exception as e:
146 | raise e
147 |
148 | try:
149 | isolation_method = "process isolation" if use_process_isolation else "thread isolation"
150 | print(f"{info_prefix}Executing model code with {timeout}s timeout using {isolation_method}...")
151 |
152 | if use_process_isolation:
153 | # Use multiprocessing (more robust but has limitations with CUDA)
154 | import multiprocessing as mp
155 | print(f"{info_prefix}[execute_model_with_timeout] WARNING: Process isolation may not work well with CUDA contexts")
156 |
157 | def _execute_in_process():
158 | try:
159 | compile(model_src, "", "exec")
160 | local_context = {}
161 | exec(model_src, local_context)
162 | return True
163 | except Exception as e:
164 | raise e
165 |
166 | process = mp.Process(target=_execute_in_process)
167 | t1 = time.time()
168 | process.start()
169 | process.join(timeout=timeout)
170 | t2 = time.time()
171 | execution_time = t2 - t1
172 |
173 | if process.is_alive():
174 | print(f"{info_prefix}[execute_model_with_timeout] Process timeout - terminating")
175 | process.terminate()
176 | process.join(timeout=5.0)
177 | if process.is_alive():
178 | process.kill()
179 | process.join()
180 |
181 | error_msg = f"Execution timeout after {execution_time:.6f} seconds"
182 | print(f"{info_prefix}[execute_model_with_timeout] {error_msg}")
183 | return False, error_msg, None
184 |
185 | if process.exitcode == 0:
186 | print(f"{info_prefix}Model code execution completed successfully")
187 | # Note: Process isolation doesn't update the context
188 | print(f"{info_prefix}[execute_model_with_timeout] Note: Context not updated due to process isolation")
189 | return True, "", execution_time
190 | else:
191 | error_msg = f"Process exited with code {process.exitcode}"
192 | return False, error_msg, None
193 |
194 | else:
195 | # Use threading (faster, works with CUDA, but can't interrupt blocking operations)
196 | with ThreadPoolExecutor(max_workers=1) as executor:
197 | future = executor.submit(_execute_code)
198 | try:
199 | t1 = time.time()
200 | future.result(timeout=timeout)
201 | t2 = time.time()
202 | execution_time = t2 - t1
203 | print(f"{info_prefix}Model code execution completed successfully")
204 | return True, "", execution_time
205 |
206 | except TimeoutError:
207 | future.cancel() # This won't stop blocking operations
208 | elapsed_time = time.time() - t1
209 | error_msg = f"Execution timeout after {elapsed_time:.6f} seconds"
210 | print(f"{info_prefix}[execute_model_with_timeout] {error_msg}")
211 | print(f"{info_prefix}[execute_model_with_timeout] Source code length: {len(model_src)} chars")
212 | print(f"{info_prefix}[execute_model_with_timeout] First 200 chars: {model_src[:200]}...")
213 | if detected_patterns:
214 | print(f"{info_prefix}[execute_model_with_timeout] Note: Background thread may still be running due to blocking operations")
215 | return False, error_msg, None
216 |
217 | except SyntaxError as e:
218 | error_msg = f"Syntax Error: {e}"
219 | print(f"{info_prefix}[execute_model_with_timeout] {error_msg}")
220 | print(f"{info_prefix}[execute_model_with_timeout] Source code length: {len(model_src)} chars")
221 | print(f"{info_prefix}[execute_model_with_timeout] First 200 chars: {model_src[:200]}...")
222 | return False, error_msg, None
223 |
224 | except Exception as e:
225 | error_msg = f"Runtime Error: {e}"
226 | print(f"{info_prefix}[execute_model_with_timeout] {error_msg}")
227 | print(f"{info_prefix}[execute_model_with_timeout] Source code length: {len(model_src)} chars")
228 | print(f"{info_prefix}[execute_model_with_timeout] First 200 chars: {model_src[:200]}...")
229 | return False, error_msg, None
230 |
231 |
232 |
233 | def set_seed(seed: int):
234 | torch.manual_seed(seed)
235 | # NOTE: this only sets on current cuda device
236 | torch.cuda.manual_seed(seed)
237 |
238 |
239 | def load_original_model_and_inputs(
240 | model_original_src: str, context: Dict, timeout: float = 300.0, info_string: str = ""
241 | ) -> Tuple[nn.Module, Callable, Callable]:
242 | """
243 | Load class from original NN.module pytorch code
244 | this is pytorch reference and we feed that to model to see if there will be any improvement
245 |
246 | Args:
247 | model_original_src: Source code for the original model
248 | context: Dictionary to execute the code in
249 | timeout: Maximum time in seconds to allow for code execution (default: 300s = 5 minutes)
250 | info_string: Information string for consistent logging
251 | """
252 | # Format info_string for consistent display
253 | info_prefix = f"[{info_string}] " if info_string else ""
254 |
255 | # Execute the model source code with timeout
256 | success, error_msg, execution_time = execute_model_with_timeout(
257 | model_src=model_original_src,
258 | context=context,
259 | timeout=timeout,
260 | build_directory=None, # Original models typically don't need CUDA extensions
261 | info_string=info_string
262 | )
263 |
264 | if not success:
265 | print(f"{info_prefix}[load_original_model_and_inputs] Failed to execute original model code: {error_msg}")
266 | return None
267 |
268 | # these should be defined in the original model code and present in the context
269 | get_init_inputs_fn = context.get("get_init_inputs")
270 | get_inputs_fn = context.get("get_inputs")
271 | Model = context.get("Model")
272 | return (Model, get_init_inputs_fn, get_inputs_fn)
273 |
274 |
275 | def load_custom_model(
276 | model_custom_src: str, context: Dict, build_directory: Optional[str] = None, timeout: float = 300.0, info_string: str = ""
277 | ) -> Optional[nn.Module]:
278 | """
279 | Load class from custom NN.module pytorch code
280 | this is the code output by LLM with calls to custom cuda kernels
281 |
282 | Args:
283 | model_custom_src: Source code for the custom model
284 | context: Dictionary to execute the code in
285 | build_directory: Directory for CUDA extensions
286 | timeout: Maximum time in seconds to allow for code execution (default: 300s = 5 minutes)
287 | info_string: Information string for consistent logging
288 | """
289 | # Format info_string for consistent display
290 | info_prefix = f"[{info_string}] " if info_string else ""
291 |
292 | # Execute the model source code with timeout
293 | success, error_msg, execution_time = execute_model_with_timeout(
294 | model_src=model_custom_src,
295 | context=context,
296 | timeout=timeout,
297 | build_directory=build_directory,
298 | info_string=info_string
299 | )
300 |
301 | if not success:
302 | print(f"{info_prefix}[load_custom_model] Failed to execute custom model code: {error_msg}")
303 | return None
304 |
305 | if execution_time is not None:
306 | print(f"{info_prefix}[load_custom_model] Model loaded successfully in {execution_time:.2f}s")
307 |
308 | ModelNew = context.get("ModelNew")
309 |
310 | # Debug: Show what's in the context
311 | print(f"{info_prefix}[load_custom_model] Context keys: {list(context.keys())}")
312 | print(f"{info_prefix}[load_custom_model] ModelNew from context: {ModelNew}")
313 |
314 | # Validate that ModelNew was properly defined
315 | if ModelNew is None:
316 | print(f"{info_prefix}[load_custom_model] Error: ModelNew was not defined in the custom model source code")
317 | print(f"{info_prefix}[load_custom_model] Make sure your custom model source includes: ModelNew = YourModelClass")
318 | print(f"{info_prefix}[load_custom_model] Available in context: {[k for k in context.keys() if not k.startswith('__')]}")
319 | return None
320 |
321 | if not callable(ModelNew):
322 | print(f"{info_prefix}Error: ModelNew is not callable (got {type(ModelNew)})")
323 | print(f"{info_prefix}Make sure ModelNew is a class that can be instantiated")
324 | return None
325 |
326 | # Additional validation - check if it's a class
327 | if not isinstance(ModelNew, type):
328 | print(f"{info_prefix}Error: ModelNew should be a class, got {type(ModelNew)}")
329 | print(f"{info_prefix}Example: class MyModel(nn.Module): ... then ModelNew = MyModel")
330 | return None
331 |
332 | return ModelNew
333 |
334 |
335 | def graceful_eval_cleanup(curr_context: Dict, device: torch.device):
336 | """
337 | Clean up env, gpu cache, and compiled CUDA extensions after evaluation
338 | """ # delete ran-specific function definitions before next eval run
339 | del curr_context
340 | # Clear CUDA cache and reset GPU state
341 | with torch.cuda.device(device):
342 | torch.cuda.empty_cache()
343 |
344 | # does this help?
345 | torch.cuda.reset_peak_memory_stats(device=device)
346 |
347 | torch.cuda.synchronize(
348 | device=device
349 | ) # Wait for all CUDA operations to complete
350 |
351 | # _cleanup_cuda_extensions() # SIMON NOTE: is this necessary?
352 |
353 |
354 | def check_kernel_correctness(
355 | warmup_src:str,
356 | original_model_src: str,
357 | custom_model_src: str,
358 | seed_num: int = 42,
359 | num_correct_trials: int = 5,
360 | verbose: bool = False,
361 | build_dir: os.PathLike = None,
362 | device: torch.device = None,
363 | timeout: float = 300.0,
364 | info_string: str = ""
365 | ) -> tuple[bool, str, dict]:
366 | """
367 | Check correctness of custom CUDA kernel against reference implementation.
368 |
369 | Args:
370 | original_model_src: Source code for the original/reference model
371 | custom_model_src: Source code for the custom CUDA kernel model
372 | seed_num: Base seed for reproducible testing
373 | num_correct_trials: Number of trials with different inputs to test
374 | verbose: Whether to print detailed progress
375 | build_dir: Directory for CUDA extensions
376 | device: CUDA device to run on (defaults to current device)
377 | timeout: Timeout for model loading in seconds
378 |
379 | Returns:
380 | tuple[bool, str, dict]: (success, error_message, metadata)
381 | - success: True if all correctness trials pass
382 | - error_message: Error details if failed, empty string if successful
383 | - metadata: Dictionary with trial details and statistics
384 | """
385 | if device is None:
386 | raise Exception("Device is not set for check_kernel_correctness")
387 |
388 | if not torch.cuda.is_available():
389 | return False, "CUDA is not available", {}
390 |
391 | # Define beijing_tz at the beginning of the function
392 | beijing_tz = timezone(timedelta(hours=8))
393 |
394 | # Format info_string for consistent display
395 | info_prefix = f"[{info_string}] " if info_string else ""
396 |
397 | # Set CUDA device
398 | torch.cuda.set_device(device)
399 |
400 | metadata = {
401 | "device": str(device),
402 | "hardware": torch.cuda.get_device_name(device=device),
403 | "num_trials": num_correct_trials,
404 | "trials_passed": 0,
405 | "trials_failed": 0,
406 | "max_difference": [],
407 | "avg_difference": []
408 | }
409 |
410 | if verbose:
411 | print(f"{info_prefix}[Correctness] Starting correctness check on device: {device}")
412 | print(f"{info_prefix}[Correctness] Running {num_correct_trials} trials")
413 |
414 | # Load original model
415 | context_warmup = {}
416 | if verbose:
417 | print(f"{info_prefix}[Correctness] Loading original model...")
418 | WarmupModel, get_init_inputs, get_inputs = load_original_model_and_inputs(
419 | warmup_src, context_warmup
420 | )
421 |
422 | set_seed(seed_num)
423 | init_inputs = get_init_inputs()
424 | init_inputs = [
425 | x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in init_inputs
426 | ]
427 |
428 | with torch.no_grad():
429 | set_seed(seed_num)
430 | warmup_model = WarmupModel(*init_inputs).to(device)
431 | inputs = get_inputs()
432 | inputs = [x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in inputs]
433 | warmup_model(*inputs)
434 | torch.cuda.synchronize(device=device)
435 |
436 |
437 | try:
438 | context_original = {}
439 | Model, get_init_inputs, get_inputs = load_original_model_and_inputs(
440 | original_model_src, context_original, timeout=timeout, info_string=info_string
441 | )
442 | if Model is None:
443 | return False, "Failed to load original model", metadata
444 |
445 | # Initialize original model
446 | set_seed(seed_num)
447 | init_inputs = get_init_inputs()
448 | init_inputs = [
449 | x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in init_inputs
450 | ]
451 |
452 | with torch.no_grad():
453 | set_seed(seed_num)
454 | original_model = Model(*init_inputs).to(device)
455 |
456 | except Exception as e:
457 | return False, f"Failed to initialize original model: {e}", metadata
458 |
459 | # Load custom model
460 | context_custom = {}
461 | if verbose:
462 | print(f"{info_prefix}[Correctness] Loading custom model...")
463 |
464 | try:
465 | os.environ["TORCH_USE_CUDA_DSA"] = "1" # Enable device-side assertions
466 | ModelNew = load_custom_model(custom_model_src, context_custom, build_dir, timeout=timeout, info_string=info_string)
467 | if ModelNew is None:
468 | return False, "Failed to load custom model", metadata
469 |
470 | # Initialize custom model
471 | with torch.no_grad():
472 | set_seed(seed_num)
473 | custom_model = ModelNew(*init_inputs).to(device)
474 |
475 | torch.cuda.synchronize(device=device)
476 |
477 | except Exception as e:
478 | return False, f"Failed to initialize custom model: {e}", metadata
479 |
480 | # Run correctness trials
481 | if verbose:
482 | print(f"{info_prefix}[Correctness] Running {num_correct_trials} correctness trials...")
483 |
484 | # Generate trial seeds deterministically
485 | torch.manual_seed(seed_num)
486 | trial_seeds = [torch.randint(0, 2**32 - 1, (1,)).item() for _ in range(num_correct_trials)]
487 |
488 | pass_count = 0
489 |
490 | with torch.no_grad():
491 | for trial in range(num_correct_trials):
492 | trial_seed = trial_seeds[trial]
493 |
494 | # if verbose:
495 | # print(f"{info_prefix}[Correctness {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] Trial {trial + 1}/{num_correct_trials} (seed: {trial_seed})")
496 |
497 | try:
498 | # Generate inputs for this trial
499 | set_seed(trial_seed)
500 | inputs = get_inputs()
501 | inputs = [x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in inputs]
502 |
503 | # Run original model
504 | set_seed(trial_seed)
505 | original_model.eval()
506 | original_output = original_model(*inputs)
507 | torch.cuda.synchronize(device=device)
508 |
509 | # Run custom model
510 | set_seed(trial_seed)
511 | custom_model.eval()
512 | custom_output = custom_model(*inputs)
513 | torch.cuda.synchronize(device=device)
514 |
515 | # Check output shapes
516 | if original_output.shape != custom_output.shape:
517 | error_msg = f"Shape mismatch to the original model"
518 | metadata["trials_failed"] += 1
519 | # if verbose:
520 | # print(f"{info_prefix}[Correctness] ❌ {error_msg}")
521 | return False, error_msg, metadata
522 |
523 | # Check output values
524 | if not torch.allclose(original_output, custom_output, atol=1e-02, rtol=1e-02):
525 | max_diff = torch.max(torch.abs(original_output - custom_output)).item()
526 | avg_diff = torch.mean(torch.abs(original_output - custom_output)).item()
527 |
528 | metadata["max_difference"].append(f"{max_diff:.6f}")
529 | metadata["avg_difference"].append(f"{avg_diff:.6f}")
530 | metadata["trials_failed"] += 1
531 | print(metadata)
532 | error_msg = f"Value mismatch to the original model"
533 | # if verbose:
534 | # print(f"{info_prefix}[Correctness] ❌ {error_msg}")
535 | return False, error_msg, metadata
536 | else:
537 | # Trial passed
538 | pass_count += 1
539 | metadata["trials_passed"] += 1
540 | # if verbose:
541 | # print(f"{info_prefix}[Correctness] ✅ Trial {trial + 1} passed")
542 |
543 | except Exception as e:
544 | metadata["trials_failed"] += 1
545 | error_msg = f"Runtime error in trial {trial + 1}: {e}"
546 | if verbose:
547 | print(f"{info_prefix}[Correctness {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] ❌ {error_msg}")
548 | return False, error_msg, metadata
549 |
550 | # Final validation
551 | if pass_count == num_correct_trials:
552 | if verbose:
553 | print(f"{info_prefix}[Correctness {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] ✅ All {pass_count}/{num_correct_trials} trials passed!")
554 |
555 | # Cleanup
556 | graceful_eval_cleanup(context_original, device)
557 | graceful_eval_cleanup(context_custom, device)
558 |
559 | return True, "", metadata
560 | else:
561 | error_msg = f"Only {pass_count}/{num_correct_trials} trials passed"
562 | return False, error_msg, metadata
563 |
564 |
565 | def eval_kernel_against_ref(
566 | warmup_src: str,
567 | original_model_src: str,
568 | custom_model_src: str,
569 | seed_num: int = 42,
570 | num_perf_trials: int = 10,
571 | verbose: bool = False,
572 | build_dir: os.PathLike = None,
573 | device: torch.device = None, # have to run on GPU
574 | info_string: str = "",
575 | original_eval_setup: str = "vanilla"
576 | ) -> tuple[float | None, float | None, str]:
577 | """
578 | Evaluate the custom kernel against the original model (vanilla or torch.compile)
579 |
580 | Args:
581 | original_model_src: Source code for the original model
582 | custom_model_src: Source code for the custom model
583 | seed_num: Seed for reproducible results
584 | num_perf_trials: Number of performance trials to run
585 | verbose: Whether to print detailed progress
586 | build_dir: Directory for CUDA extensions
587 | device: GPU device to run evaluation on
588 | info_string: Information string for consistent logging
589 | original_eval_setup: Evaluation setup for original model ("vanilla", "torch_compile", or "CUDA_graphs")
590 |
591 |
592 | Returns:
593 | tuple[float | None, float | None, str]: (score, total_elapsed_time, message) where:
594 | - score: reference_model_time / custom_model_time (higher is better, >1.0 means speedup), or None if failed
595 | - total_elapsed_time: Total time taken for evaluation in seconds, or None if failed
596 | - message: Success message or error description
597 | """
598 | # Validate original_eval_setup parameter
599 | if original_eval_setup not in ["vanilla", "torch_compile", "torch_compile_reduce_overhead","cudnn","cuda_graph"]:
600 | raise ValueError(f"original_eval_setup must be 'vanilla', 'torch_compile', 'torch_compile_reduce_overhead', 'cudnn', or 'cuda_graph', got '{original_eval_setup}'")
601 |
602 | # TODO: check device is busy
603 | assert torch.cuda.is_available(), "CUDA is not available, cannot run Eval"
604 | torch.set_printoptions(
605 | precision=4, # Decimal places
606 | threshold=10, # Total number of elements before truncating
607 | edgeitems=3, # Number of elem xents at beginning and end of dimensions
608 | linewidth=80, # Maximum width before wrapping
609 | )
610 |
611 | custom_contain_new_streams = False
612 | if custom_model_src.find("cuda.Stream")!=-1:
613 | custom_contain_new_streams = True
614 |
615 | # Define beijing_tz at the beginning of the function
616 | beijing_tz = timezone(timedelta(hours=8))
617 |
618 | # Format info_string for consistent display
619 | info_prefix = f"[{info_string}] " if info_string else ""
620 |
621 | # set CUDA device
622 | torch.cuda.set_device(device)
623 |
624 | context = {}
625 |
626 | if verbose:
627 | print(f"{info_prefix}[Eval] Start Evaluation ({original_eval_setup})! on device: {device}")
628 | if original_eval_setup == "torch_compile":
629 | print(f"{info_prefix}[Eval] Compile mode: default")
630 | print(f"{info_prefix}[Eval] Loading Original Model")
631 |
632 | WarmupModel, get_init_inputs, get_inputs = load_original_model_and_inputs(
633 | warmup_src, context, info_string=info_string
634 | )
635 | set_seed(seed_num) # set seed for reproducible input
636 | init_inputs = get_init_inputs()
637 | init_inputs = [
638 | x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in init_inputs
639 | ]
640 | with torch.no_grad():
641 | set_seed(seed_num) # set seed for reproducible weights
642 | warmup_model = WarmupModel(*init_inputs)
643 | warmup_model = warmup_model.to(device)
644 | inputs = get_inputs()
645 | inputs = [x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in inputs]
646 | warmup_model(*inputs)
647 | torch.cuda.synchronize(device=device)
648 |
649 | Model, get_init_inputs, get_inputs = load_original_model_and_inputs(
650 | original_model_src, context, info_string=info_string
651 | )
652 | set_seed(seed_num) # set seed for reproducible input
653 | init_inputs = get_init_inputs()
654 | init_inputs = [
655 | x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in init_inputs
656 | ]
657 |
658 | with torch.no_grad():
659 | set_seed(seed_num) # set seed for reproducible weights
660 | original_model = Model(*init_inputs)
661 | assert hasattr(original_model, "forward")
662 |
663 | # Conditionally apply torch.compile to the original model
664 | if original_eval_setup == "torch_compile":
665 | if verbose:
666 | print(f"{info_prefix}[Eval] Applying torch.compile to original model (mode: default)")
667 |
668 | try:
669 | # Apply torch.compile with default mode
670 | original_model = torch.compile(original_model, mode="default")
671 | if verbose:
672 | print(f"{info_prefix}[Eval] Original Model compiled (warmup will happen later)")
673 |
674 | except Exception as e:
675 | print(f"{info_prefix}Failed to compile original model with torch.compile: {e}")
676 | return None, None, f"Failed to compile original model with torch.compile: {e}"
677 | elif original_eval_setup == "torch_compile_reduce_overhead":
678 | if verbose:
679 | print(f"{info_prefix}[Eval] Applying torch.compile to original model (mode: reduce-overhead)")
680 |
681 | try:
682 | # Apply torch.compile with reduce-overhead mode for CUDA graphs
683 | original_model = torch.compile(original_model, mode="reduce-overhead")
684 |
685 | if verbose:
686 | print(f"{info_prefix}[Eval] Original Model compiled with reduce-overhead mode (warmup will happen later)")
687 |
688 | except Exception as e:
689 | print(f"{info_prefix}Failed to compile original model with torch.compile (reduce-overhead): {e}")
690 | return None, None, f"Failed to compile original model with torch.compile (reduce-overhead): {e}"
691 | else:
692 | if verbose:
693 | print(f"{info_prefix}[Eval] Original Model Loaded")
694 |
695 | if verbose:
696 | print(f"{info_prefix}[Eval] Loading and Compiling New Model with Custom CUDA Kernel")
697 |
698 | metadata = {} # for storing result metadata
699 | metadata["hardware"] = torch.cuda.get_device_name(device=device)
700 | metadata["device"] = str(device) # for debugging
701 | metadata["original_eval_setup"] = original_eval_setup
702 | if original_eval_setup == "torch_compile":
703 | metadata["compile_mode"] = "default"
704 | elif original_eval_setup == "torch_compile_reduce_overhead":
705 | metadata["compile_mode"] = "reduce-overhead"
706 |
707 | # this is where compilation happens
708 | try:
709 | os.environ["TORCH_USE_CUDA_DSA"] = "1" # compile with device side assertion
710 | # add hash for later to distinguish between multi-turn kernels
711 | ModelNew = load_custom_model(custom_model_src, context, build_dir, info_string=info_string)
712 |
713 | # Debug: Check what load_custom_model returned
714 | if verbose:
715 | print(f"{info_prefix}[DEBUG] load_custom_model returned: {ModelNew} (type: {type(ModelNew)})")
716 |
717 | # Validate ModelNew before proceeding
718 | if ModelNew is None:
719 | print(f"{info_prefix}ERROR: load_custom_model returned None - check the model source code")
720 | print(f"{info_prefix}The custom model source must define: ModelNew = YourModelClass")
721 | return None, None, "ModelNew is None"
722 |
723 | torch.cuda.synchronize(device=device) # not sure if this is too much
724 | except Exception as e:
725 | print(
726 | f"{info_prefix}Failed to compile custom CUDA kernel: Record as compilation failure. \nError: {e}"
727 | )
728 | return None, None, "Failed to compile custom CUDA kernel"
729 |
730 | # at this point we passed compilation
731 | try:
732 | with torch.no_grad():
733 | set_seed(seed_num) # set seed for reproducible weights
734 | custom_model = ModelNew(*init_inputs)
735 | assert hasattr(custom_model, "forward")
736 | torch.cuda.synchronize(device=device)
737 | if verbose:
738 | print(f"{info_prefix}[Eval] New Model with Custom CUDA Kernel Loaded")
739 | except RuntimeError as e:
740 | print(
741 | f"{info_prefix}Failed to load custom CUDA kernel; Compiled but not able to run, count as runtime error. \nError: {e}"
742 | )
743 | return None, None, "Failed to load custom CUDA kernel with New Model"
744 |
745 | # Handle case where num_correct_trials is 0 (skip correctness check)
746 |
747 | if verbose:
748 | reference_type = "Compiled (torch.compile)" if original_eval_setup == "torch_compile" else "Original (vanilla)"
749 | print(f"{info_prefix}[Eval] Measuring Performance ({reference_type} vs Custom)")
750 |
751 | # Move models to the correct device for performance measurement
752 | original_model = original_model.to(device)
753 | custom_model = custom_model.to(device)
754 |
755 | reference_times = [] # Will store either vanilla or compiled times
756 | custom_times = []
757 |
758 | # === WARMUP PHASE ===
759 | if verbose:
760 | print(f"{info_prefix}[Eval] Starting warmup phase for both models...")
761 |
762 | try:
763 | warmup_inputs = get_inputs()
764 | warmup_inputs = [x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in warmup_inputs]
765 |
766 | # Warm up both models (3 iterations each)
767 | for i in range(3):
768 | with torch.no_grad():
769 | # Warmup original model (especially important for torch.compile)
770 | _ = original_model(*warmup_inputs)
771 |
772 | # Warmup custom model (eliminates CUDA kernel initialization overhead)
773 | _ = custom_model(*warmup_inputs)
774 |
775 | torch.cuda.synchronize(device=device)
776 |
777 | if verbose:
778 | model_types = f"original ({original_eval_setup}) and custom"
779 | print(f"{info_prefix}[Eval] Warmup completed for {model_types} models")
780 |
781 | except Exception as e:
782 | print(f"{info_prefix}Warning: Model warmup failed: {e}")
783 | # Continue anyway - warmup failure shouldn't block evaluation
784 |
785 | if verbose:
786 | print(f"{info_prefix}[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, trials: {num_perf_trials}")
787 |
788 | t1 = time.time()
789 | with torch.no_grad():
790 | for trial in range(num_perf_trials):
791 | # Generate one random input for this trial - SAME input will be used for both models
792 | inputs = get_inputs()
793 | inputs = [x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in inputs]
794 | # Randomize execution order to eliminate systematic bias
795 | run_reference_first = random.choice([True, False])
796 |
797 | # IMPORTANT: Detect model streams to ensure accurate timing
798 | current_stream = torch.cuda.current_stream(device=device)
799 |
800 | # Comprehensive stream detection - find any CUDA streams the models use
801 | def find_model_streams(model):
802 | """Find all CUDA streams in a model, regardless of attribute names"""
803 | streams = []
804 |
805 | # Check all attributes of the model
806 | for attr_name in dir(model):
807 | try:
808 | attr_value = getattr(model, attr_name)
809 |
810 | # Check if it's a single CUDA stream
811 | if isinstance(attr_value, torch.cuda.Stream):
812 | streams.append(attr_value)
813 |
814 | # Check if it's a list/tuple of CUDA streams
815 | elif isinstance(attr_value, (list, tuple)):
816 | for item in attr_value:
817 | if isinstance(item, torch.cuda.Stream):
818 | streams.append(item)
819 |
820 | # Check if it's a dict containing CUDA streams
821 | elif isinstance(attr_value, dict):
822 | for item in attr_value.values():
823 | if isinstance(item, torch.cuda.Stream):
824 | streams.append(item)
825 |
826 | except (AttributeError, RuntimeError):
827 | # Some attributes might not be accessible or might raise errors
828 | continue
829 |
830 | return streams
831 |
832 | # Find streams for both models
833 | custom_model_streams = find_model_streams(custom_model)
834 | # Use current stream for timing, but track all model streams for synchronization
835 | # This ensures we capture all work regardless of which streams the model uses
836 | reference_model_stream = current_stream
837 | custom_model_stream = current_stream
838 |
839 | # Debug info for stream detection
840 | if verbose and custom_model_streams:
841 | print(f"{info_prefix}[Stream Detection] Found {len(custom_model_streams)} CUDA streams in custom model")
842 |
843 | if run_reference_first:
844 | # Time reference model first
845 | start_event = torch.cuda.Event(enable_timing=True)
846 | end_event = torch.cuda.Event(enable_timing=True)
847 |
848 | with torch.cuda.stream(reference_model_stream):
849 | start_event.record(reference_model_stream)
850 | original_model(*inputs)
851 |
852 | # Wait for all model streams to complete before recording end event
853 | end_event.record(reference_model_stream)
854 |
855 | torch.cuda.synchronize(device=device)
856 | reference_time = start_event.elapsed_time(end_event)
857 |
858 | # Time custom model second
859 | start_event = torch.cuda.Event(enable_timing=True)
860 | end_event = torch.cuda.Event(enable_timing=True)
861 |
862 | with torch.cuda.stream(custom_model_stream):
863 | start_event.record(custom_model_stream)
864 | custom_model(*inputs)
865 |
866 | # Wait for all model streams to complete before recording end event
867 | if custom_contain_new_streams:
868 | for stream in custom_model_streams:
869 | custom_model_stream.wait_stream(stream)
870 |
871 | end_event.record(custom_model_stream)
872 |
873 | torch.cuda.synchronize(device=device)
874 | custom_time = start_event.elapsed_time(end_event)
875 | else:
876 | # Time custom model first
877 | start_event = torch.cuda.Event(enable_timing=True)
878 | end_event = torch.cuda.Event(enable_timing=True)
879 |
880 | with torch.cuda.stream(custom_model_stream):
881 | start_event.record(custom_model_stream)
882 | custom_model(*inputs)
883 |
884 | # Wait for all model streams to complete before recording end event
885 | if custom_contain_new_streams:
886 | for stream in custom_model_streams:
887 | custom_model_stream.wait_stream(stream)
888 |
889 | end_event.record(custom_model_stream)
890 |
891 | torch.cuda.synchronize(device=device)
892 | custom_time = start_event.elapsed_time(end_event)
893 |
894 | start_event = torch.cuda.Event(enable_timing=True)
895 | end_event = torch.cuda.Event(enable_timing=True)
896 |
897 | with torch.cuda.stream(reference_model_stream):
898 | start_event.record(reference_model_stream)
899 | original_model(*inputs)
900 |
901 | # Wait for all model streams to complete before recording end event
902 | end_event.record(reference_model_stream)
903 |
904 | torch.cuda.synchronize(device=device)
905 | reference_time = start_event.elapsed_time(end_event)
906 |
907 | reference_times.append(reference_time)
908 | custom_times.append(custom_time)
909 | t2 = time.time()
910 |
911 | # Calculate averages and score
912 | avg_reference_time = sum(reference_times) / len(reference_times)
913 | avg_custom_time = sum(custom_times) / len(custom_times)
914 | score = avg_reference_time / avg_custom_time
915 | total_elapsed_time = (sum(reference_times) + sum(custom_times)) / 1000.0 # Convert from milliseconds to seconds
916 |
917 | if verbose:
918 | reference_type = "Compiled (torch.compile)" if original_eval_setup == "torch_compile" else "Original (vanilla)"
919 | print(f"{info_prefix}[Results {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] {reference_type} avg: {avg_reference_time:.3f}ms")
920 | print(f"{info_prefix}[Results {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] Custom avg: {avg_custom_time:.3f}ms")
921 | print(f"{info_prefix}[Results {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] Score (reference/custom): {score:.3f}")
922 |
923 | if score > 1.0:
924 | speedup = score
925 | vs_type = "torch.compile" if original_eval_setup == "torch_compile" else "original model"
926 | print(f"{info_prefix}[Results {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] Custom kernel is {speedup:.2f}x faster than {vs_type}")
927 | elif score < 1.0:
928 | slowdown = 1.0 / score
929 | vs_type = "torch.compile" if original_eval_setup == "torch_compile" else "original model"
930 | print(f"{info_prefix}[Results {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] Custom kernel is {slowdown:.2f}x slower than {vs_type}")
931 | else:
932 | vs_type = "torch.compile" if original_eval_setup == "torch_compile" else "original model"
933 | print(f"{info_prefix}[Results] Same performance as {vs_type}")
934 |
935 | graceful_eval_cleanup(context, device)
936 | return score, total_elapsed_time, "Success"
937 |
938 |
939 |
940 |
941 |
942 | ################################################################################
943 | # Performance Eval
944 | ################################################################################
945 |
946 |
947 | def fetch_baseline_time(
948 | level_name: str, problem_id: int, dataset: List[str], baseline_time_filepath: str
949 | ) -> Dict:
950 | """
951 | Fetch the baseline time from the time
952 | """
953 | if not os.path.exists(baseline_time_filepath):
954 | raise FileNotFoundError(
955 | f"Baseline time file not found at {baseline_time_filepath}"
956 | )
957 |
958 | with open(baseline_time_filepath, "r") as f:
959 | baseline_json = json.load(f)
960 |
961 | problem_name = dataset[problem_id].split("/")[-1]
962 | baseline_time = baseline_json[level_name].get(problem_name, None)
963 | return baseline_time
964 |
965 |
966 | def get_timing_stats(elapsed_times: List[float], device: Optional[torch.device] = None) -> Dict:
967 | """Get timing statistics from a list of elapsed times.
968 |
969 | Args:
970 | elapsed_times: List of elapsed times in milliseconds
971 | device: CUDA device, record device info
972 | Returns:
973 | Dict containing mean, std, min, max and num_trials
974 | all timing are in ms
975 | """
976 |
977 | stats = {
978 | "mean": float(f"{np.mean(elapsed_times):.3g}"),
979 | "std": float(f"{np.std(elapsed_times):.3g}"),
980 | "min": float(f"{np.min(elapsed_times):.3g}"),
981 | "max": float(f"{np.max(elapsed_times):.3g}"),
982 | "num_trials": len(elapsed_times),
983 | }
984 |
985 | if device:
986 | stats["hardware"] = torch.cuda.get_device_name(device=device)
987 | stats["device"] = str(device) # for debugging
988 |
989 | return stats
990 |
991 |
992 | def get_available_gpus():
993 | """Get list of available GPU device IDs"""
994 | if not torch.cuda.is_available():
995 | return []
996 | return list(range(torch.cuda.device_count()))
997 |
998 |
999 |
1000 |
1001 | def eval_pipeline(
1002 | warmup_src: str,
1003 | original_model_src: str,
1004 | custom_model_src: str,
1005 | num_correct_trials: int,
1006 | num_perf_trials: int,
1007 | global_n_trials: int,
1008 | gpu_index: int,
1009 | verbose: bool = False,
1010 | log_path: str = None,
1011 | max_time: float = None,
1012 | use_process_isolation: bool = False,
1013 | info_string = "",
1014 | original_eval_setup="",
1015 | ):
1016 | assert original_eval_setup!=""
1017 | tz = timezone(timedelta(hours=0))
1018 |
1019 | # Format info_string for consistent display
1020 | info_prefix = f"[{info_string}] " if info_string else ""
1021 |
1022 | print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] median_comparison_pipeline start")
1023 | if log_path is not None:
1024 | log_dir = os.path.dirname(log_path)
1025 | os.makedirs(log_dir, exist_ok=True)
1026 | print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] Writing log to {log_path}")
1027 | current_time = datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')
1028 |
1029 | with open(log_path, "w") as write_log:
1030 | print(f"in log_path open and write {log_path}")
1031 | write_log.write(json.dumps({"info_string": info_string, "start_time": current_time, "code": custom_model_src}) + "\n")
1032 | # write_log.write(json.dumps({"info_string": info_string, "start_time": current_time, "custom_model_src": custom_model_src}) + "\n")
1033 | write_log.flush()
1034 |
1035 | # step 1: check whether the model can be executed and compiled
1036 | print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] step 1: check whether the model can be executed and compiled")
1037 | context = {}
1038 | success_original, error_msg, execution_time = execute_model_with_timeout(
1039 | model_src=original_model_src,
1040 | context=context,
1041 | timeout=30.0, # 30 seconds should be enough
1042 | use_process_isolation=use_process_isolation,
1043 | info_string=info_string
1044 | )
1045 | if not success_original:
1046 | log_dict_ = {
1047 | "info_string": info_string,
1048 | "error_msg": f"Original model compilation failed: {error_msg}",
1049 | "error": True,
1050 | "done": True
1051 | }
1052 | with open(log_path, "a") as write_log:
1053 | write_log.write(json.dumps(log_dict_) + "\n")
1054 | write_log.flush()
1055 | return None, f"Original model compilation failed: {error_msg}"
1056 |
1057 | success_custom, error_msg, execution_time = execute_model_with_timeout(
1058 | model_src=custom_model_src,
1059 | context={}, # Use fresh context for custom model
1060 | timeout=100, # Give enough time for CUDA compilation with minimum 30s
1061 | use_process_isolation=use_process_isolation,
1062 | info_string=info_string
1063 | )
1064 | if not success_custom:
1065 | log_dict_ = {
1066 | "info_string": info_string,
1067 | "error_msg": "fail to compile or execute",
1068 | "error": True,
1069 | "done": True
1070 | }
1071 | with open(log_path, "a") as write_log:
1072 | write_log.write(json.dumps(log_dict_) + "\n")
1073 | write_log.flush()
1074 | return None, "Custom model compilation failed"
1075 | else:
1076 | log_dict_ = {
1077 | "info_string": info_string,
1078 | "info": "stage1:Compile Success",
1079 | "time": datetime.now(timezone(timedelta(hours=8))).strftime('%Y-%m-%d %H:%M:%S'),
1080 | "error": False,
1081 | "done": False
1082 | }
1083 | with open(log_path, "a") as write_log:
1084 | write_log.write(json.dumps(log_dict_) + "\n")
1085 | write_log.flush()
1086 | print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] step 2: preliminary speed check")
1087 | device = torch.device(f'cuda:{gpu_index}')
1088 | time1 = time.time()
1089 |
1090 | # step 3: correctness check
1091 | print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] step 3: correctness check")
1092 | time1 = time.time()
1093 | correctness_passed, error_msg, correctness_metadata = check_kernel_correctness(
1094 | warmup_src=warmup_src,
1095 | original_model_src=original_model_src,
1096 | custom_model_src=custom_model_src,
1097 | num_correct_trials=num_correct_trials,
1098 | verbose=verbose,
1099 | device=device,
1100 | info_string=info_string
1101 | )
1102 | time2 = time.time()
1103 | if not correctness_passed:
1104 | log_dict_ = {
1105 | "info_string": info_string,
1106 | "error_msg": error_msg,
1107 | "error": True,
1108 | "done": True
1109 | }
1110 | with open(log_path, "a") as write_log:
1111 | write_log.write(json.dumps(log_dict_) + "\n")
1112 | write_log.flush()
1113 | return None, error_msg
1114 | else:
1115 | log_dict_ = {
1116 | "info_string": info_string,
1117 | "info": "stage3:Correctness Check Success",
1118 | "time": datetime.now(timezone(timedelta(hours=8))).strftime('%Y-%m-%d %H:%M:%S'),
1119 | "error": False,
1120 | "done": False,
1121 | "duration": time2 - time1,
1122 | }
1123 | with open(log_path, "a") as write_log:
1124 | write_log.write(json.dumps(log_dict_) + "\n")
1125 | write_log.flush()
1126 |
1127 | log_dict_ = {
1128 | "info_string": info_string,
1129 | "info": "stage4:Performance Evaluation",
1130 | "time": datetime.now(timezone(timedelta(hours=8))).strftime('%Y-%m-%d %H:%M:%S'),
1131 | "error": False,
1132 | "done": False
1133 | }
1134 | with open(log_path, "a") as write_log:
1135 | write_log.write(json.dumps(log_dict_) + "\n")
1136 | write_log.flush()
1137 | scores = []
1138 | list_gpu_execution_time = []
1139 | # Run global_n_trials sequential evaluations
1140 | start_time = time.time()
1141 | print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] step 4: performance evaluation")
1142 | for trial in range(global_n_trials):
1143 | print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] step 4: performance evaluation, trial {trial + 1}/{global_n_trials}")
1144 | # Run single evaluation
1145 | time1 = time.time()
1146 | score, gpu_execution_time, error_msg = eval_kernel_against_ref(
1147 | warmup_src=warmup_src,
1148 | original_model_src=original_model_src,
1149 | custom_model_src=custom_model_src,
1150 | seed_num=42 + trial, # Different seed for each trial
1151 | num_perf_trials=num_perf_trials,
1152 | verbose=False, # Keep individual trials quiet unless overall verbose
1153 | build_dir=None,
1154 | device=device,
1155 | info_string=info_string,
1156 | original_eval_setup=original_eval_setup
1157 | )
1158 | list_gpu_execution_time.append(gpu_execution_time)
1159 | if score is None:
1160 | error_msg = f"fail to inference"
1161 | log_dict_ = {
1162 | "info_string": info_string,
1163 | "trial": trial,
1164 | "gpu_index": gpu_index,
1165 | "score": score,
1166 | "error_msg": error_msg,
1167 | "error": True,
1168 | "done": True
1169 | }
1170 | with open(log_path, "a") as write_log:
1171 | write_log.write(json.dumps(log_dict_) + "\n")
1172 | write_log.flush()
1173 | return None, error_msg
1174 | time2 = time.time()
1175 | log_dict_ = {
1176 | "info_string": info_string,
1177 | "n_trial": num_perf_trials,
1178 | "trial": trial,
1179 | "gpu_index": gpu_index,
1180 | "score": score,
1181 | "time": datetime.now(timezone(timedelta(hours=8))).strftime('%Y-%m-%d %H:%M:%S'),
1182 | "gpu_execution_time": gpu_execution_time,
1183 | "ave_gpu_execution_time": gpu_execution_time / num_perf_trials,
1184 | "done": False,
1185 | "duration": time2 - time1,
1186 | "error": False
1187 | }
1188 | with open(log_path, "a") as write_log:
1189 | write_log.write(json.dumps(log_dict_) + "\n")
1190 | write_log.flush()
1191 | scores.append(score)
1192 | if score is not None and score < 0.3:
1193 | break
1194 |
1195 | # if verbose:
1196 | print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] Trial {trial + 1}: {score:.4f} at gpu {gpu_index}")
1197 |
1198 | if len(scores) == 0:
1199 | print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] ❌ No trials completed successfully")
1200 | log_dict_empty = {
1201 | "info_string": info_string,
1202 | "error": True,
1203 | "error_msg": "No trials completed successfully",
1204 | "completed_trials": 0,
1205 | "done": True
1206 | }
1207 | with open(log_path, "a") as write_log:
1208 | write_log.write(json.dumps(log_dict_empty) + "\n")
1209 | write_log.flush()
1210 | return None, "No trials completed successfully"
1211 |
1212 | mean_score = float(np.mean(scores))
1213 |
1214 | std = float(np.std(scores))
1215 |
1216 | # Round all scores in the list to 4 decimal places for consistency
1217 | rounded_scores = [round(score, 4) for score in scores]
1218 |
1219 | # Record final elapsed time
1220 | total_elapsed_time = time.time() - start_time
1221 | n_all_trials = num_perf_trials*global_n_trials
1222 | log_dict_ = {
1223 | "info_string": info_string,
1224 | "score": mean_score,
1225 | "rounded_scores": rounded_scores,
1226 | "scores_sorted": sorted(scores),
1227 | "completed_trials": len(scores),
1228 | "total_trials": global_n_trials,
1229 | "n_all_trials_trials": n_all_trials,
1230 | "total_elapsed_time": total_elapsed_time,
1231 | "total_gpu_execution_time": sum(list_gpu_execution_time),
1232 | "ave_gpu_execution_time": sum(list_gpu_execution_time)/n_all_trials,
1233 | "error": False,
1234 | "done": True,
1235 | "scores": [round(ss, 4) for ss in scores],
1236 | "std": std,
1237 | }
1238 | with open(log_path, "a") as write_log:
1239 | write_log.write(json.dumps(log_dict_) + "\n")
1240 | write_log.flush()
1241 |
1242 | return 0
1243 |
1244 |
1245 | def load_cuda_file(PATH_TO_CUDA_FILE):
1246 | if not os.path.exists(PATH_TO_CUDA_FILE):
1247 | raise Exception(f"{PATH_TO_CUDA_FILE} not found")
1248 | cuda_dict_= {}
1249 | with open(PATH_TO_CUDA_FILE, "r") as f:
1250 | for line in f:
1251 | dict_ = json.loads(line)
1252 | level_id = dict_["level_id"]
1253 | task_id = dict_["task_id"]
1254 | ref_code = dict_["ref_code"]
1255 | custom_code = dict_["custom_code"]
1256 | cuda_graph_code = dict_["cuda_graph_code"]
1257 | cudnn_code = dict_["cudnn_code"]
1258 | if level_id not in cuda_dict_:
1259 | cuda_dict_[level_id] = {}
1260 | cuda_dict_[level_id][task_id] = {
1261 | "ref_code": ref_code,
1262 | "custom_code": custom_code,
1263 | "cuda_graph_code": cuda_graph_code,
1264 | "cudnn_code": cudnn_code
1265 | }
1266 | return cuda_dict_
1267 |
1268 | def eval():
1269 | YOUR_HOME_FOLDER = "/data2/jiwei/cuda_results/CUDA-L1/optimized_cuda_code"
1270 | PATH_TO_CUDA_FILE = os.path.join(YOUR_HOME_FOLDER, "3090.json")
1271 | output_path = os.path.join(YOUR_HOME_FOLDER, "log.json")
1272 |
1273 | cuda_dict_ = load_cuda_file(PATH_TO_CUDA_FILE)
1274 | level_id = 3
1275 | task_id = 35
1276 | current_dict = cuda_dict_[level_id][task_id]
1277 | original_eval_setup="vanilla"
1278 | # original_eval_setup="torch_compile"
1279 | # original_eval_setup="torch_compile_reduce_overhead"
1280 | # original_eval_setup = "cuda_graph"
1281 | # original_eval_setup = "cudnn"
1282 |
1283 | # original_eval_setup can take value of
1284 | # 1. vanilla, which compares the speed with custom_code with ref_code
1285 | # 2. torch_compile, which compares the speed with custom_code with ref_code compiled with torch.compile
1286 | # 3. torch_compile_reduce_overhead, which compares the speed with custom_code with ref_code compiled with torch.compile and reduce overhead
1287 | # 4. cuda_graph, which compares the speed with custom_code with ref_code with cuda_graph modification
1288 | # 5. cudnn, which compares the speed with custom_code with ref_code with cudnn modification
1289 | warmup_code, ref_code, custom_code, cuda_graph_code, cudnn_code = current_dict["ref_code"], current_dict["ref_code"], current_dict["custom_code"], current_dict["cuda_graph_code"], current_dict["cudnn_code"]
1290 | # for whatever eval_setup, we use ref_code for warmup
1291 | if custom_code == None:
1292 | print(f"CUDA-L1 does not yield performance boost on this task on this gpu architecture.")
1293 | return 0
1294 | if original_eval_setup == "cuda_graph" and cuda_graph_code == None:
1295 | print(f"We were unable to generate valid CUDA graph code for this task. Your request will be skipped.")
1296 | return 0
1297 | if original_eval_setup == "cudnn" and cudnn_code == None:
1298 | print(f"We were unable to generate valid CUDNN code for this task. Your request will be skipped.")
1299 | return 0
1300 | if original_eval_setup == "cuda_graph":
1301 | original_model_src = cuda_graph_code
1302 | elif original_eval_setup == "cudnn":
1303 | original_model_src = cudnn_code
1304 | else:
1305 | original_model_src = ref_code
1306 |
1307 | eval_pipeline(
1308 | warmup_src=warmup_code,
1309 | original_model_src=original_model_src,
1310 | custom_model_src=custom_code,
1311 | num_correct_trials=10,
1312 | num_perf_trials=10,
1313 | global_n_trials=7,
1314 | gpu_index=0,
1315 | verbose=False,
1316 | log_path=output_path,
1317 | max_time=1800,
1318 | original_eval_setup=original_eval_setup
1319 | )
1320 | #original_eval_setup should take vanilla, torch_compile, torch_compile_reduce_overhead
1321 | print(f"log_path: {output_path}")
1322 | print(f"log_path: {output_path}")
1323 |
1324 |
1325 | if __name__ == "__main__":
1326 | eval()
--------------------------------------------------------------------------------