├── .gitignore ├── LICENSE ├── README.md ├── assets ├── gpu_config_comparison.png ├── header.png ├── ip_title.png ├── ip_v2.png ├── pipeline.png └── speedup_and_example.png ├── eval └── eval_cuda.py ├── more_baselines ├── cuda_graph.json └── cudnn.json └── optimized_cuda_code ├── 3090.json ├── a100.json ├── codes ├── 3090.json ├── a100.json ├── h100.json ├── h20.json └── l40.json ├── h100.json ├── h20.json └── l40.json /.gitignore: -------------------------------------------------------------------------------- 1 | # Xcode 2 | *.DS_Store 3 | .idea 4 | .idea/* 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[codz] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py.cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | .pybuilder/ 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | # For a library or package, you might want to ignore these files since the code is 92 | # intended to run in multiple environments; otherwise, check them in: 93 | # .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # UV 103 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 104 | # This is especially recommended for binary packages to ensure reproducibility, and is more 105 | # commonly ignored for libraries. 106 | #uv.lock 107 | 108 | # poetry 109 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 110 | # This is especially recommended for binary packages to ensure reproducibility, and is more 111 | # commonly ignored for libraries. 112 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 113 | #poetry.lock 114 | #poetry.toml 115 | 116 | # pdm 117 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 118 | # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. 
119 | # https://pdm-project.org/en/latest/usage/project/#working-with-version-control 120 | #pdm.lock 121 | #pdm.toml 122 | .pdm-python 123 | .pdm-build/ 124 | 125 | # pixi 126 | # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. 127 | #pixi.lock 128 | # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one 129 | # in the .venv directory. It is recommended not to include this directory in version control. 130 | .pixi 131 | 132 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 133 | __pypackages__/ 134 | 135 | # Celery stuff 136 | celerybeat-schedule 137 | celerybeat.pid 138 | 139 | # SageMath parsed files 140 | *.sage.py 141 | 142 | # Environments 143 | .env 144 | .envrc 145 | .venv 146 | env/ 147 | venv/ 148 | ENV/ 149 | env.bak/ 150 | venv.bak/ 151 | 152 | # Spyder project settings 153 | .spyderproject 154 | .spyproject 155 | 156 | # Rope project settings 157 | .ropeproject 158 | 159 | # mkdocs documentation 160 | /site 161 | 162 | # mypy 163 | .mypy_cache/ 164 | .dmypy.json 165 | dmypy.json 166 | 167 | # Pyre type checker 168 | .pyre/ 169 | 170 | # pytype static type analyzer 171 | .pytype/ 172 | 173 | # Cython debug symbols 174 | cython_debug/ 175 | 176 | # PyCharm 177 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 178 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 179 | # and can be added to the global gitignore or merged into this file. For a more nuclear 180 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 181 | #.idea/ 182 | 183 | # Abstra 184 | # Abstra is an AI-powered process automation framework. 185 | # Ignore directories containing user credentials, local state, and settings. 186 | # Learn more at https://abstra.io/docs 187 | .abstra/ 188 | 189 | # Visual Studio Code 190 | # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 191 | # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore 192 | # and can be added to the global gitignore or merged into this file. However, if you prefer, 193 | # you could uncomment the following to ignore the entire vscode folder 194 | # .vscode/ 195 | 196 | # Ruff stuff: 197 | .ruff_cache/ 198 | 199 | # PyPI configuration file 200 | .pypirc 201 | 202 | # Cursor 203 | # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to 204 | # exclude from AI features like autocomplete and code analysis. 
Recommended for sensitive data 205 | # refer to https://docs.cursor.com/context/ignore-files 206 | .cursorignore 207 | .cursorindexingignore 208 | 209 | # Marimo 210 | marimo/_static/ 211 | marimo/_lsp/ 212 | __marimo__/ 213 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 ShannonAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | CUDA-L1: Improving CUDA Optimization via Contrastive Reinforcement Learning 4 | 5 |
6 | 7 |
8 | 9 | 10 |

11 | License     |     🏠  Project Page     |     📄  Paper     12 |

13 | 14 |

15 | CUDA-L1: Improving CUDA Optimization via Contrastive Reinforcement Learning 16 |

## 🥳 Introduction

In this paper, we introduce CUDA-L1, an automated reinforcement learning (RL) framework for CUDA optimization. The core of CUDA-L1 is a contrastive RL model, a newly designed RL system that enhances optimization through comparative learning.
24 | 25 | Evaluation Results 26 | 27 |
28 |

Fig: Average speedup over baselines across different GPU architectures on KernelBench.

31 |
32 | 33 | 34 | ## 🗒️ To-do List 35 | - [x] Fix KernelBench evaluations with proper stream timing synchronization ✅ 36 | - [x] Remove caching ✅ 37 | - [x] Compare with torch.compile ✅ 38 | - [x] Compare with pytorch eager + cuda graph ✅ 39 | - [x] Compare with custom torch CUDA/cuDNN backend flags ✅ 40 | - [ ] 5090/4090 41 | 42 | 43 | ## 🩺 Evaluation Results 44 | 45 | Our evaluation is conducted on the KernelBench [dataset](https://github.com/ScalingIntelligence/KernelBench), a collection of 250 PyTorch workloads designed to evaluate language models' ability to generate efficient GPU kernels. 46 | 47 | 48 | 49 | 50 | 51 |
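Each KernelBench workload defines a `Model` class (the reference PyTorch implementation) together with `get_init_inputs` and `get_inputs` helpers; optimized submissions provide a `ModelNew` class. This is the interface consumed by `eval/eval_cuda.py`. The snippet below is a minimal illustrative sketch of that task format, not an actual benchmark entry:

```python
import torch
import torch.nn as nn

class Model(nn.Module):
    """Reference PyTorch implementation to be optimized with custom CUDA."""
    def forward(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        return torch.matmul(a, b)

def get_init_inputs():
    # Arguments passed to Model's constructor.
    return []

def get_inputs():
    # Arguments passed to Model.forward at evaluation time
    # (the evaluation harness moves tensors to the target GPU).
    return [torch.randn(1024, 1024), torch.randn(1024, 1024)]
```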
**Table: Performance comparison across different configurations on KernelBench on A100.**

| Configuration | Method | Mean | Max | 75% | 50% | 25% | Success↑ (# out of total) | Speedup↑ (>1.01x out of total) |
|---------------|--------|------|-----|-----|-----|-----|---------------------------|--------------------------------|
| Default | All | 3.12× | 120× | 2.25× | 1.42× | 1.17× | 249/250 | 226/250 |
| Torch Compile | All | 2.77× | 69.0× | 2.55× | 1.72× | 1.14× | 249/250 | 203/250 |
| Torch Compile RO | All | 2.88× | 80.1× | 2.48× | 1.67× | 1.13× | 249/250 | 200/250 |
| CUDA Graph | All | 2.81× | 97.9× | 1.83× | 1.20× | 0.954× | 249/250 | 147/229 |

• RO = Reduce Overhead
• Success and Speedup indicate the number of successful benchmarks out of the total for each level

**Table: Mean speedup across different configurations and GPU devices.**

| Configuration | A100 | 3090 | H100 | H20 | L40 |
|---------------|------|------|------|-----|-----|
| Default | 3.12× | 2.51× | 3.85× | 2.38× | 3.13× |
| Torch Compile | 2.77× | 2.58× | 2.74× | 2.89× | 2.85× |
| Torch Compile RO | 2.88× | 2.61× | 2.77× | 2.82× | 2.89× |
| CUDA Graph | 2.81× | 3.34× | 2.23× | 2.20× | 3.98× |

## ❓ How to reproduce the results?

We provide CUDA code snippets optimized by CUDA-L1 in the `optimized_cuda_code` folder, with separate versions for each GPU device. For example, to reproduce our results on H100 SXM, download `./optimized_cuda_code/h100.json` and run each code snippet on your H100 device.

## 📁 Structure of Release Code

Each line in the release file contains a JSON object with the following fields:

| Field | Description |
|-------|-------------|
| `level_id` | Level index in KernelBench (values: 1, 2, 3) |
| `task_id` | Task index for that level |
| `ref_code` | Reference CUDA code provided by KernelBench |
| `custom_code` | Optimized code generated by CUDA-L1 |
| `cuda_graph_code` | KernelBench reference code with CUDA Graph modifications |
| `score_default` | Execution time ratio: ref_code / custom_code |
| `score_torch_compile_default` | Execution time ratio: ref_code / custom_code (with torch.compile) |
| `score_torch_compile_reduce_overhead` | Execution time ratio: ref_code / custom_code (with torch.compile reduce_overhead mode) |
| `score_cuda_graph` | Execution time ratio: cuda_graph_code / custom_code |

**Note:** If `custom_code` is None, it means the RL either failed to generate code faster than the reference code or simply copied the reference code during generation. A minimal loading sketch is provided at the end of this README.

### Example Entry Structure
```json
{
  "level_id": 1,
  "task_id": 1,
  "ref_code": "import torch...",
  "custom_code": "import torch...",
  "cuda_graph_code": "import torch...",
  "score_default": 1.762,
  "score_torch_compile_default": 1.958,
  "score_torch_compile_reduce_overhead": 2.118,
  "score_cuda_graph": 1.566
}
```

## 🔭 Limitations and Challenges

During the training process, we found that RL is particularly susceptible to reward hacking. We've already identified quite a few hacking cases (e.g., exploiting timing measurements & caching results). If you identify any additional reward hacks in the code, we would greatly appreciate you letting us know.

## 📇 Citation
```latex
@article{deepreinforce2025cudal1,
  title={CUDA-L1: Improving CUDA Optimization via Contrastive Reinforcement Learning},
  author={Li, Xiaoya and Sun, Xiaofei and Wang, Albert and Li, Jiwei and Chris, Shum},
  journal={arXiv preprint arXiv:2507.14111},
  year={2025}
}
```

## ✉️ Contact
If you have any questions, please reach out to us at **research@deep-reinforce.com**.
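## 🧪 Loading the Release Files

The sketch below shows one way to read a per-GPU release file and list the recorded speedups, following the JSON-lines layout described in "Structure of Release Code" above. It is a minimal illustration rather than part of the evaluation pipeline; the file path and printed summary are placeholders to adapt to your setup.

```python
import json

# Each release file (e.g., optimized_cuda_code/h100.json) stores one JSON object per line.
with open("optimized_cuda_code/h100.json") as f:
    entries = [json.loads(line) for line in f if line.strip()]

for entry in entries:
    if entry["custom_code"] is None:
        # CUDA-L1 did not produce code faster than the reference for this task.
        continue
    print(f"Level {entry['level_id']}, Task {entry['task_id']}: "
          f"{entry['score_default']:.2f}x vs. reference")
```

To reproduce the reported numbers, run each entry's `ref_code` and `custom_code` on the matching GPU (for example via the helpers in `eval/eval_cuda.py`) and compare execution times.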
344 | -------------------------------------------------------------------------------- /assets/gpu_config_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepreinforce-ai/CUDA-L1/517b76d33c8ca9f36925246aa33a4d441657a581/assets/gpu_config_comparison.png -------------------------------------------------------------------------------- /assets/header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepreinforce-ai/CUDA-L1/517b76d33c8ca9f36925246aa33a4d441657a581/assets/header.png -------------------------------------------------------------------------------- /assets/ip_title.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepreinforce-ai/CUDA-L1/517b76d33c8ca9f36925246aa33a4d441657a581/assets/ip_title.png -------------------------------------------------------------------------------- /assets/ip_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepreinforce-ai/CUDA-L1/517b76d33c8ca9f36925246aa33a4d441657a581/assets/ip_v2.png -------------------------------------------------------------------------------- /assets/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepreinforce-ai/CUDA-L1/517b76d33c8ca9f36925246aa33a4d441657a581/assets/pipeline.png -------------------------------------------------------------------------------- /assets/speedup_and_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepreinforce-ai/CUDA-L1/517b76d33c8ca9f36925246aa33a4d441657a581/assets/speedup_and_example.png -------------------------------------------------------------------------------- /eval/eval_cuda.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for Evaluations 3 | Copied and then Adapted from the KernelBench evaluation code 4 | 5 | Key Enhancement for Multi-Stream CUDA Models: 6 | 7 | In eval_kernel_against_ref(), we modified the timing logic to properly handle models 8 | that create their own CUDA streams. This ensures accurate performance measurements 9 | for models with complex stream management. 10 | 11 | Original timing code: 12 | with torch.cuda.stream(custom_model_stream): 13 | start_event.record(custom_model_stream) 14 | custom_model(*inputs) 15 | end_event.record(custom_model_stream) 16 | 17 | Enhanced timing code: 18 | with torch.cuda.stream(custom_model_stream): 19 | start_event.record(custom_model_stream) 20 | custom_model(*inputs) 21 | 22 | # Wait for all model streams to complete before recording end event 23 | if custom_model_streams: 24 | for stream in custom_model_streams: 25 | custom_model_stream.wait_stream(stream) 26 | 27 | end_event.record(custom_model_stream) 28 | 29 | This enhancement prevents timing inaccuracies when models use internal streams 30 | for operations like CUDA graphs, asynchronous kernels, or parallel execution. 31 | Without this synchronization, timing measurements could complete before the 32 | model's actual GPU work finishes, leading to artificially fast results. 
33 | 34 | """ 35 | import os 36 | os.environ["MKL_THREADING_LAYER"] = "GNU" 37 | import numpy as np 38 | import torch 39 | import torch.nn as nn 40 | import subprocess 41 | import random 42 | import json 43 | from contextlib import redirect_stdout, redirect_stderr 44 | from io import StringIO 45 | import multiprocessing as mp 46 | from concurrent.futures import ThreadPoolExecutor, TimeoutError 47 | import time 48 | from datetime import datetime, timezone, timedelta 49 | from typing import Tuple, List, Dict, Union, Optional, Callable 50 | 51 | 52 | 53 | 54 | pst_tz = timezone(timedelta(hours=-8)) 55 | 56 | REPO_TOP_PATH = os.path.abspath( 57 | os.path.join( 58 | os.path.dirname(__file__), 59 | "..", 60 | ) 61 | ) 62 | KERNEL_BENCH_PATH = os.path.join(REPO_TOP_PATH, "KernelBench") 63 | 64 | 65 | def execute_model_with_timeout( 66 | model_src: str, 67 | context: Dict, 68 | timeout: float = 300.0, 69 | build_directory: Optional[str] = None, 70 | use_process_isolation: bool = False, 71 | info_string: str = "" 72 | ) -> Tuple[bool, str, Optional[float]]: 73 | """ 74 | Execute model source code with a time limit. 75 | 76 | Args: 77 | model_src: Source code to execute (can be original_model_src or custom_model_src) 78 | context: Dictionary to execute the code in 79 | timeout: Maximum time in seconds to allow for execution (default: 300s = 5 minutes) 80 | build_directory: Optional build directory for CUDA extensions 81 | use_process_isolation: Use multiprocessing instead of threading (slower but more robust) 82 | 83 | Returns: 84 | Tuple[bool, str, Optional[float]]: (success, error_message, execution_time) 85 | - success: True if execution completed within timeout, False otherwise 86 | - error_message: Error details if execution failed, empty string if successful 87 | - execution_time: Time taken for execution in seconds, None if failed 88 | 89 | Note: 90 | ThreadPoolExecutor cannot interrupt blocking operations like time.sleep(), 91 | network requests, or infinite loops. The timeout detection works correctly, 92 | but background threads may continue running until the blocking operation completes. 93 | For CUDA code, this is usually not an issue as compilation errors are detected quickly. 
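    Example (illustrative sketch; `src` is assumed to hold valid model source code):
        ok, err, elapsed = execute_model_with_timeout(src, context={}, timeout=60.0)
        if not ok:
            print(f"Execution failed: {err}")
        else:
            print(f"Executed in {elapsed:.2f}s")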
94 | """ 95 | # Format info_string for consistent display 96 | info_prefix = f"[{info_string}] " if info_string else "" 97 | 98 | # Prepare source code with build directory if provided 99 | if build_directory: 100 | context["BUILD_DIRECTORY"] = build_directory 101 | model_src = ( 102 | "import os\n" 103 | f"os.environ['TORCH_EXTENSIONS_DIR'] = '{build_directory}'\n" 104 | ) + model_src 105 | 106 | # Static analysis for potentially problematic patterns 107 | potentially_hanging_patterns = [ 108 | ('time.sleep(', 'time.sleep() calls'), 109 | ('requests.get(', 'network requests'), 110 | ('urllib.request.', 'URL requests'), 111 | ('input(', 'user input'), 112 | ('while True:', 'infinite loops'), 113 | ('subprocess.', 'subprocess calls'), 114 | ] 115 | 116 | detected_patterns = [] 117 | for pattern, description in potentially_hanging_patterns: 118 | if pattern in model_src: 119 | detected_patterns.append(description) 120 | 121 | if detected_patterns: 122 | print(f"{info_prefix}[execute_model_with_timeout] WARNING: Detected potentially blocking operations:") 123 | for pattern in detected_patterns: 124 | print(f"{info_prefix} - {pattern}") 125 | print(f"{info_prefix}[execute_model_with_timeout] These may cause hanging if they block indefinitely.") 126 | print(f"{info_prefix}[execute_model_with_timeout] Consider using use_process_isolation=True for risky code.") 127 | 128 | # Check for extremely problematic patterns that should be blocked 129 | blocking_patterns = ['time.sleep(', 'input(', 'while True:'] 130 | should_block = any(pattern in model_src for pattern, _ in potentially_hanging_patterns 131 | if pattern in blocking_patterns) 132 | 133 | if should_block and not use_process_isolation: 134 | error_msg = f"Code contains blocking patterns that may cause indefinite hanging: {detected_patterns}" 135 | print(f"{info_prefix}[execute_model_with_timeout] BLOCKING EXECUTION: {error_msg}") 136 | print(f"{info_prefix}[execute_model_with_timeout] Use use_process_isolation=True to override") 137 | return False, error_msg, None 138 | 139 | def _execute_code(): 140 | """Helper function to execute the code in a separate thread""" 141 | try: 142 | compile(model_src, "", "exec") 143 | exec(model_src, context) 144 | return True 145 | except Exception as e: 146 | raise e 147 | 148 | try: 149 | isolation_method = "process isolation" if use_process_isolation else "thread isolation" 150 | print(f"{info_prefix}Executing model code with {timeout}s timeout using {isolation_method}...") 151 | 152 | if use_process_isolation: 153 | # Use multiprocessing (more robust but has limitations with CUDA) 154 | import multiprocessing as mp 155 | print(f"{info_prefix}[execute_model_with_timeout] WARNING: Process isolation may not work well with CUDA contexts") 156 | 157 | def _execute_in_process(): 158 | try: 159 | compile(model_src, "", "exec") 160 | local_context = {} 161 | exec(model_src, local_context) 162 | return True 163 | except Exception as e: 164 | raise e 165 | 166 | process = mp.Process(target=_execute_in_process) 167 | t1 = time.time() 168 | process.start() 169 | process.join(timeout=timeout) 170 | t2 = time.time() 171 | execution_time = t2 - t1 172 | 173 | if process.is_alive(): 174 | print(f"{info_prefix}[execute_model_with_timeout] Process timeout - terminating") 175 | process.terminate() 176 | process.join(timeout=5.0) 177 | if process.is_alive(): 178 | process.kill() 179 | process.join() 180 | 181 | error_msg = f"Execution timeout after {execution_time:.6f} seconds" 182 | 
print(f"{info_prefix}[execute_model_with_timeout] {error_msg}") 183 | return False, error_msg, None 184 | 185 | if process.exitcode == 0: 186 | print(f"{info_prefix}Model code execution completed successfully") 187 | # Note: Process isolation doesn't update the context 188 | print(f"{info_prefix}[execute_model_with_timeout] Note: Context not updated due to process isolation") 189 | return True, "", execution_time 190 | else: 191 | error_msg = f"Process exited with code {process.exitcode}" 192 | return False, error_msg, None 193 | 194 | else: 195 | # Use threading (faster, works with CUDA, but can't interrupt blocking operations) 196 | with ThreadPoolExecutor(max_workers=1) as executor: 197 | future = executor.submit(_execute_code) 198 | try: 199 | t1 = time.time() 200 | future.result(timeout=timeout) 201 | t2 = time.time() 202 | execution_time = t2 - t1 203 | print(f"{info_prefix}Model code execution completed successfully") 204 | return True, "", execution_time 205 | 206 | except TimeoutError: 207 | future.cancel() # This won't stop blocking operations 208 | elapsed_time = time.time() - t1 209 | error_msg = f"Execution timeout after {elapsed_time:.6f} seconds" 210 | print(f"{info_prefix}[execute_model_with_timeout] {error_msg}") 211 | print(f"{info_prefix}[execute_model_with_timeout] Source code length: {len(model_src)} chars") 212 | print(f"{info_prefix}[execute_model_with_timeout] First 200 chars: {model_src[:200]}...") 213 | if detected_patterns: 214 | print(f"{info_prefix}[execute_model_with_timeout] Note: Background thread may still be running due to blocking operations") 215 | return False, error_msg, None 216 | 217 | except SyntaxError as e: 218 | error_msg = f"Syntax Error: {e}" 219 | print(f"{info_prefix}[execute_model_with_timeout] {error_msg}") 220 | print(f"{info_prefix}[execute_model_with_timeout] Source code length: {len(model_src)} chars") 221 | print(f"{info_prefix}[execute_model_with_timeout] First 200 chars: {model_src[:200]}...") 222 | return False, error_msg, None 223 | 224 | except Exception as e: 225 | error_msg = f"Runtime Error: {e}" 226 | print(f"{info_prefix}[execute_model_with_timeout] {error_msg}") 227 | print(f"{info_prefix}[execute_model_with_timeout] Source code length: {len(model_src)} chars") 228 | print(f"{info_prefix}[execute_model_with_timeout] First 200 chars: {model_src[:200]}...") 229 | return False, error_msg, None 230 | 231 | 232 | 233 | def set_seed(seed: int): 234 | torch.manual_seed(seed) 235 | # NOTE: this only sets on current cuda device 236 | torch.cuda.manual_seed(seed) 237 | 238 | 239 | def load_original_model_and_inputs( 240 | model_original_src: str, context: Dict, timeout: float = 300.0, info_string: str = "" 241 | ) -> Tuple[nn.Module, Callable, Callable]: 242 | """ 243 | Load class from original NN.module pytorch code 244 | this is pytorch reference and we feed that to model to see if there will be any improvement 245 | 246 | Args: 247 | model_original_src: Source code for the original model 248 | context: Dictionary to execute the code in 249 | timeout: Maximum time in seconds to allow for code execution (default: 300s = 5 minutes) 250 | info_string: Information string for consistent logging 251 | """ 252 | # Format info_string for consistent display 253 | info_prefix = f"[{info_string}] " if info_string else "" 254 | 255 | # Execute the model source code with timeout 256 | success, error_msg, execution_time = execute_model_with_timeout( 257 | model_src=model_original_src, 258 | context=context, 259 | timeout=timeout, 260 | 
build_directory=None, # Original models typically don't need CUDA extensions 261 | info_string=info_string 262 | ) 263 | 264 | if not success: 265 | print(f"{info_prefix}[load_original_model_and_inputs] Failed to execute original model code: {error_msg}") 266 | return None 267 | 268 | # these should be defined in the original model code and present in the context 269 | get_init_inputs_fn = context.get("get_init_inputs") 270 | get_inputs_fn = context.get("get_inputs") 271 | Model = context.get("Model") 272 | return (Model, get_init_inputs_fn, get_inputs_fn) 273 | 274 | 275 | def load_custom_model( 276 | model_custom_src: str, context: Dict, build_directory: Optional[str] = None, timeout: float = 300.0, info_string: str = "" 277 | ) -> Optional[nn.Module]: 278 | """ 279 | Load class from custom NN.module pytorch code 280 | this is the code output by LLM with calls to custom cuda kernels 281 | 282 | Args: 283 | model_custom_src: Source code for the custom model 284 | context: Dictionary to execute the code in 285 | build_directory: Directory for CUDA extensions 286 | timeout: Maximum time in seconds to allow for code execution (default: 300s = 5 minutes) 287 | info_string: Information string for consistent logging 288 | """ 289 | # Format info_string for consistent display 290 | info_prefix = f"[{info_string}] " if info_string else "" 291 | 292 | # Execute the model source code with timeout 293 | success, error_msg, execution_time = execute_model_with_timeout( 294 | model_src=model_custom_src, 295 | context=context, 296 | timeout=timeout, 297 | build_directory=build_directory, 298 | info_string=info_string 299 | ) 300 | 301 | if not success: 302 | print(f"{info_prefix}[load_custom_model] Failed to execute custom model code: {error_msg}") 303 | return None 304 | 305 | if execution_time is not None: 306 | print(f"{info_prefix}[load_custom_model] Model loaded successfully in {execution_time:.2f}s") 307 | 308 | ModelNew = context.get("ModelNew") 309 | 310 | # Debug: Show what's in the context 311 | print(f"{info_prefix}[load_custom_model] Context keys: {list(context.keys())}") 312 | print(f"{info_prefix}[load_custom_model] ModelNew from context: {ModelNew}") 313 | 314 | # Validate that ModelNew was properly defined 315 | if ModelNew is None: 316 | print(f"{info_prefix}[load_custom_model] Error: ModelNew was not defined in the custom model source code") 317 | print(f"{info_prefix}[load_custom_model] Make sure your custom model source includes: ModelNew = YourModelClass") 318 | print(f"{info_prefix}[load_custom_model] Available in context: {[k for k in context.keys() if not k.startswith('__')]}") 319 | return None 320 | 321 | if not callable(ModelNew): 322 | print(f"{info_prefix}Error: ModelNew is not callable (got {type(ModelNew)})") 323 | print(f"{info_prefix}Make sure ModelNew is a class that can be instantiated") 324 | return None 325 | 326 | # Additional validation - check if it's a class 327 | if not isinstance(ModelNew, type): 328 | print(f"{info_prefix}Error: ModelNew should be a class, got {type(ModelNew)}") 329 | print(f"{info_prefix}Example: class MyModel(nn.Module): ... 
then ModelNew = MyModel") 330 | return None 331 | 332 | return ModelNew 333 | 334 | 335 | def graceful_eval_cleanup(curr_context: Dict, device: torch.device): 336 | """ 337 | Clean up env, gpu cache, and compiled CUDA extensions after evaluation 338 | """ # delete ran-specific function definitions before next eval run 339 | del curr_context 340 | # Clear CUDA cache and reset GPU state 341 | with torch.cuda.device(device): 342 | torch.cuda.empty_cache() 343 | 344 | # does this help? 345 | torch.cuda.reset_peak_memory_stats(device=device) 346 | 347 | torch.cuda.synchronize( 348 | device=device 349 | ) # Wait for all CUDA operations to complete 350 | 351 | # _cleanup_cuda_extensions() # SIMON NOTE: is this necessary? 352 | 353 | 354 | def check_kernel_correctness( 355 | warmup_src:str, 356 | original_model_src: str, 357 | custom_model_src: str, 358 | seed_num: int = 42, 359 | num_correct_trials: int = 5, 360 | verbose: bool = False, 361 | build_dir: os.PathLike = None, 362 | device: torch.device = None, 363 | timeout: float = 300.0, 364 | info_string: str = "" 365 | ) -> tuple[bool, str, dict]: 366 | """ 367 | Check correctness of custom CUDA kernel against reference implementation. 368 | 369 | Args: 370 | original_model_src: Source code for the original/reference model 371 | custom_model_src: Source code for the custom CUDA kernel model 372 | seed_num: Base seed for reproducible testing 373 | num_correct_trials: Number of trials with different inputs to test 374 | verbose: Whether to print detailed progress 375 | build_dir: Directory for CUDA extensions 376 | device: CUDA device to run on (defaults to current device) 377 | timeout: Timeout for model loading in seconds 378 | 379 | Returns: 380 | tuple[bool, str, dict]: (success, error_message, metadata) 381 | - success: True if all correctness trials pass 382 | - error_message: Error details if failed, empty string if successful 383 | - metadata: Dictionary with trial details and statistics 384 | """ 385 | if device is None: 386 | raise Exception("Device is not set for check_kernel_correctness") 387 | 388 | if not torch.cuda.is_available(): 389 | return False, "CUDA is not available", {} 390 | 391 | # Define beijing_tz at the beginning of the function 392 | beijing_tz = timezone(timedelta(hours=8)) 393 | 394 | # Format info_string for consistent display 395 | info_prefix = f"[{info_string}] " if info_string else "" 396 | 397 | # Set CUDA device 398 | torch.cuda.set_device(device) 399 | 400 | metadata = { 401 | "device": str(device), 402 | "hardware": torch.cuda.get_device_name(device=device), 403 | "num_trials": num_correct_trials, 404 | "trials_passed": 0, 405 | "trials_failed": 0, 406 | "max_difference": [], 407 | "avg_difference": [] 408 | } 409 | 410 | if verbose: 411 | print(f"{info_prefix}[Correctness] Starting correctness check on device: {device}") 412 | print(f"{info_prefix}[Correctness] Running {num_correct_trials} trials") 413 | 414 | # Load original model 415 | context_warmup = {} 416 | if verbose: 417 | print(f"{info_prefix}[Correctness] Loading original model...") 418 | WarmupModel, get_init_inputs, get_inputs = load_original_model_and_inputs( 419 | warmup_src, context_warmup 420 | ) 421 | 422 | set_seed(seed_num) 423 | init_inputs = get_init_inputs() 424 | init_inputs = [ 425 | x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in init_inputs 426 | ] 427 | 428 | with torch.no_grad(): 429 | set_seed(seed_num) 430 | warmup_model = WarmupModel(*init_inputs).to(device) 431 | inputs = get_inputs() 432 | inputs = 
[x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in inputs] 433 | warmup_model(*inputs) 434 | torch.cuda.synchronize(device=device) 435 | 436 | 437 | try: 438 | context_original = {} 439 | Model, get_init_inputs, get_inputs = load_original_model_and_inputs( 440 | original_model_src, context_original, timeout=timeout, info_string=info_string 441 | ) 442 | if Model is None: 443 | return False, "Failed to load original model", metadata 444 | 445 | # Initialize original model 446 | set_seed(seed_num) 447 | init_inputs = get_init_inputs() 448 | init_inputs = [ 449 | x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in init_inputs 450 | ] 451 | 452 | with torch.no_grad(): 453 | set_seed(seed_num) 454 | original_model = Model(*init_inputs).to(device) 455 | 456 | except Exception as e: 457 | return False, f"Failed to initialize original model: {e}", metadata 458 | 459 | # Load custom model 460 | context_custom = {} 461 | if verbose: 462 | print(f"{info_prefix}[Correctness] Loading custom model...") 463 | 464 | try: 465 | os.environ["TORCH_USE_CUDA_DSA"] = "1" # Enable device-side assertions 466 | ModelNew = load_custom_model(custom_model_src, context_custom, build_dir, timeout=timeout, info_string=info_string) 467 | if ModelNew is None: 468 | return False, "Failed to load custom model", metadata 469 | 470 | # Initialize custom model 471 | with torch.no_grad(): 472 | set_seed(seed_num) 473 | custom_model = ModelNew(*init_inputs).to(device) 474 | 475 | torch.cuda.synchronize(device=device) 476 | 477 | except Exception as e: 478 | return False, f"Failed to initialize custom model: {e}", metadata 479 | 480 | # Run correctness trials 481 | if verbose: 482 | print(f"{info_prefix}[Correctness] Running {num_correct_trials} correctness trials...") 483 | 484 | # Generate trial seeds deterministically 485 | torch.manual_seed(seed_num) 486 | trial_seeds = [torch.randint(0, 2**32 - 1, (1,)).item() for _ in range(num_correct_trials)] 487 | 488 | pass_count = 0 489 | 490 | with torch.no_grad(): 491 | for trial in range(num_correct_trials): 492 | trial_seed = trial_seeds[trial] 493 | 494 | # if verbose: 495 | # print(f"{info_prefix}[Correctness {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] Trial {trial + 1}/{num_correct_trials} (seed: {trial_seed})") 496 | 497 | try: 498 | # Generate inputs for this trial 499 | set_seed(trial_seed) 500 | inputs = get_inputs() 501 | inputs = [x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in inputs] 502 | 503 | # Run original model 504 | set_seed(trial_seed) 505 | original_model.eval() 506 | original_output = original_model(*inputs) 507 | torch.cuda.synchronize(device=device) 508 | 509 | # Run custom model 510 | set_seed(trial_seed) 511 | custom_model.eval() 512 | custom_output = custom_model(*inputs) 513 | torch.cuda.synchronize(device=device) 514 | 515 | # Check output shapes 516 | if original_output.shape != custom_output.shape: 517 | error_msg = f"Shape mismatch to the original model" 518 | metadata["trials_failed"] += 1 519 | # if verbose: 520 | # print(f"{info_prefix}[Correctness] ❌ {error_msg}") 521 | return False, error_msg, metadata 522 | 523 | # Check output values 524 | if not torch.allclose(original_output, custom_output, atol=1e-02, rtol=1e-02): 525 | max_diff = torch.max(torch.abs(original_output - custom_output)).item() 526 | avg_diff = torch.mean(torch.abs(original_output - custom_output)).item() 527 | 528 | metadata["max_difference"].append(f"{max_diff:.6f}") 529 | 
metadata["avg_difference"].append(f"{avg_diff:.6f}") 530 | metadata["trials_failed"] += 1 531 | print(metadata) 532 | error_msg = f"Value mismatch to the original model" 533 | # if verbose: 534 | # print(f"{info_prefix}[Correctness] ❌ {error_msg}") 535 | return False, error_msg, metadata 536 | else: 537 | # Trial passed 538 | pass_count += 1 539 | metadata["trials_passed"] += 1 540 | # if verbose: 541 | # print(f"{info_prefix}[Correctness] ✅ Trial {trial + 1} passed") 542 | 543 | except Exception as e: 544 | metadata["trials_failed"] += 1 545 | error_msg = f"Runtime error in trial {trial + 1}: {e}" 546 | if verbose: 547 | print(f"{info_prefix}[Correctness {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] ❌ {error_msg}") 548 | return False, error_msg, metadata 549 | 550 | # Final validation 551 | if pass_count == num_correct_trials: 552 | if verbose: 553 | print(f"{info_prefix}[Correctness {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] ✅ All {pass_count}/{num_correct_trials} trials passed!") 554 | 555 | # Cleanup 556 | graceful_eval_cleanup(context_original, device) 557 | graceful_eval_cleanup(context_custom, device) 558 | 559 | return True, "", metadata 560 | else: 561 | error_msg = f"Only {pass_count}/{num_correct_trials} trials passed" 562 | return False, error_msg, metadata 563 | 564 | 565 | def eval_kernel_against_ref( 566 | warmup_src: str, 567 | original_model_src: str, 568 | custom_model_src: str, 569 | seed_num: int = 42, 570 | num_perf_trials: int = 10, 571 | verbose: bool = False, 572 | build_dir: os.PathLike = None, 573 | device: torch.device = None, # have to run on GPU 574 | info_string: str = "", 575 | original_eval_setup: str = "vanilla" 576 | ) -> tuple[float | None, float | None, str]: 577 | """ 578 | Evaluate the custom kernel against the original model (vanilla or torch.compile) 579 | 580 | Args: 581 | original_model_src: Source code for the original model 582 | custom_model_src: Source code for the custom model 583 | seed_num: Seed for reproducible results 584 | num_perf_trials: Number of performance trials to run 585 | verbose: Whether to print detailed progress 586 | build_dir: Directory for CUDA extensions 587 | device: GPU device to run evaluation on 588 | info_string: Information string for consistent logging 589 | original_eval_setup: Evaluation setup for original model ("vanilla", "torch_compile", or "CUDA_graphs") 590 | 591 | 592 | Returns: 593 | tuple[float | None, float | None, str]: (score, total_elapsed_time, message) where: 594 | - score: reference_model_time / custom_model_time (higher is better, >1.0 means speedup), or None if failed 595 | - total_elapsed_time: Total time taken for evaluation in seconds, or None if failed 596 | - message: Success message or error description 597 | """ 598 | # Validate original_eval_setup parameter 599 | if original_eval_setup not in ["vanilla", "torch_compile", "torch_compile_reduce_overhead","cudnn","cuda_graph"]: 600 | raise ValueError(f"original_eval_setup must be 'vanilla', 'torch_compile', 'torch_compile_reduce_overhead', 'cudnn', or 'cuda_graph', got '{original_eval_setup}'") 601 | 602 | # TODO: check device is busy 603 | assert torch.cuda.is_available(), "CUDA is not available, cannot run Eval" 604 | torch.set_printoptions( 605 | precision=4, # Decimal places 606 | threshold=10, # Total number of elements before truncating 607 | edgeitems=3, # Number of elem xents at beginning and end of dimensions 608 | linewidth=80, # Maximum width before wrapping 609 | ) 610 | 611 | custom_contain_new_streams 
= False 612 | if custom_model_src.find("cuda.Stream")!=-1: 613 | custom_contain_new_streams = True 614 | 615 | # Define beijing_tz at the beginning of the function 616 | beijing_tz = timezone(timedelta(hours=8)) 617 | 618 | # Format info_string for consistent display 619 | info_prefix = f"[{info_string}] " if info_string else "" 620 | 621 | # set CUDA device 622 | torch.cuda.set_device(device) 623 | 624 | context = {} 625 | 626 | if verbose: 627 | print(f"{info_prefix}[Eval] Start Evaluation ({original_eval_setup})! on device: {device}") 628 | if original_eval_setup == "torch_compile": 629 | print(f"{info_prefix}[Eval] Compile mode: default") 630 | print(f"{info_prefix}[Eval] Loading Original Model") 631 | 632 | WarmupModel, get_init_inputs, get_inputs = load_original_model_and_inputs( 633 | warmup_src, context, info_string=info_string 634 | ) 635 | set_seed(seed_num) # set seed for reproducible input 636 | init_inputs = get_init_inputs() 637 | init_inputs = [ 638 | x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in init_inputs 639 | ] 640 | with torch.no_grad(): 641 | set_seed(seed_num) # set seed for reproducible weights 642 | warmup_model = WarmupModel(*init_inputs) 643 | warmup_model = warmup_model.to(device) 644 | inputs = get_inputs() 645 | inputs = [x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in inputs] 646 | warmup_model(*inputs) 647 | torch.cuda.synchronize(device=device) 648 | 649 | Model, get_init_inputs, get_inputs = load_original_model_and_inputs( 650 | original_model_src, context, info_string=info_string 651 | ) 652 | set_seed(seed_num) # set seed for reproducible input 653 | init_inputs = get_init_inputs() 654 | init_inputs = [ 655 | x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in init_inputs 656 | ] 657 | 658 | with torch.no_grad(): 659 | set_seed(seed_num) # set seed for reproducible weights 660 | original_model = Model(*init_inputs) 661 | assert hasattr(original_model, "forward") 662 | 663 | # Conditionally apply torch.compile to the original model 664 | if original_eval_setup == "torch_compile": 665 | if verbose: 666 | print(f"{info_prefix}[Eval] Applying torch.compile to original model (mode: default)") 667 | 668 | try: 669 | # Apply torch.compile with default mode 670 | original_model = torch.compile(original_model, mode="default") 671 | if verbose: 672 | print(f"{info_prefix}[Eval] Original Model compiled (warmup will happen later)") 673 | 674 | except Exception as e: 675 | print(f"{info_prefix}Failed to compile original model with torch.compile: {e}") 676 | return None, None, f"Failed to compile original model with torch.compile: {e}" 677 | elif original_eval_setup == "torch_compile_reduce_overhead": 678 | if verbose: 679 | print(f"{info_prefix}[Eval] Applying torch.compile to original model (mode: reduce-overhead)") 680 | 681 | try: 682 | # Apply torch.compile with reduce-overhead mode for CUDA graphs 683 | original_model = torch.compile(original_model, mode="reduce-overhead") 684 | 685 | if verbose: 686 | print(f"{info_prefix}[Eval] Original Model compiled with reduce-overhead mode (warmup will happen later)") 687 | 688 | except Exception as e: 689 | print(f"{info_prefix}Failed to compile original model with torch.compile (reduce-overhead): {e}") 690 | return None, None, f"Failed to compile original model with torch.compile (reduce-overhead): {e}" 691 | else: 692 | if verbose: 693 | print(f"{info_prefix}[Eval] Original Model Loaded") 694 | 695 | if verbose: 696 | print(f"{info_prefix}[Eval] Loading and 
Compiling New Model with Custom CUDA Kernel") 697 | 698 | metadata = {} # for storing result metadata 699 | metadata["hardware"] = torch.cuda.get_device_name(device=device) 700 | metadata["device"] = str(device) # for debugging 701 | metadata["original_eval_setup"] = original_eval_setup 702 | if original_eval_setup == "torch_compile": 703 | metadata["compile_mode"] = "default" 704 | elif original_eval_setup == "torch_compile_reduce_overhead": 705 | metadata["compile_mode"] = "reduce-overhead" 706 | 707 | # this is where compilation happens 708 | try: 709 | os.environ["TORCH_USE_CUDA_DSA"] = "1" # compile with device side assertion 710 | # add hash for later to distinguish between multi-turn kernels 711 | ModelNew = load_custom_model(custom_model_src, context, build_dir, info_string=info_string) 712 | 713 | # Debug: Check what load_custom_model returned 714 | if verbose: 715 | print(f"{info_prefix}[DEBUG] load_custom_model returned: {ModelNew} (type: {type(ModelNew)})") 716 | 717 | # Validate ModelNew before proceeding 718 | if ModelNew is None: 719 | print(f"{info_prefix}ERROR: load_custom_model returned None - check the model source code") 720 | print(f"{info_prefix}The custom model source must define: ModelNew = YourModelClass") 721 | return None, None, "ModelNew is None" 722 | 723 | torch.cuda.synchronize(device=device) # not sure if this is too much 724 | except Exception as e: 725 | print( 726 | f"{info_prefix}Failed to compile custom CUDA kernel: Record as compilation failure. \nError: {e}" 727 | ) 728 | return None, None, "Failed to compile custom CUDA kernel" 729 | 730 | # at this point we passed compilation 731 | try: 732 | with torch.no_grad(): 733 | set_seed(seed_num) # set seed for reproducible weights 734 | custom_model = ModelNew(*init_inputs) 735 | assert hasattr(custom_model, "forward") 736 | torch.cuda.synchronize(device=device) 737 | if verbose: 738 | print(f"{info_prefix}[Eval] New Model with Custom CUDA Kernel Loaded") 739 | except RuntimeError as e: 740 | print( 741 | f"{info_prefix}Failed to load custom CUDA kernel; Compiled but not able to run, count as runtime error. 
\nError: {e}" 742 | ) 743 | return None, None, "Failed to load custom CUDA kernel with New Model" 744 | 745 | # Handle case where num_correct_trials is 0 (skip correctness check) 746 | 747 | if verbose: 748 | reference_type = "Compiled (torch.compile)" if original_eval_setup == "torch_compile" else "Original (vanilla)" 749 | print(f"{info_prefix}[Eval] Measuring Performance ({reference_type} vs Custom)") 750 | 751 | # Move models to the correct device for performance measurement 752 | original_model = original_model.to(device) 753 | custom_model = custom_model.to(device) 754 | 755 | reference_times = [] # Will store either vanilla or compiled times 756 | custom_times = [] 757 | 758 | # === WARMUP PHASE === 759 | if verbose: 760 | print(f"{info_prefix}[Eval] Starting warmup phase for both models...") 761 | 762 | try: 763 | warmup_inputs = get_inputs() 764 | warmup_inputs = [x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in warmup_inputs] 765 | 766 | # Warm up both models (3 iterations each) 767 | for i in range(3): 768 | with torch.no_grad(): 769 | # Warmup original model (especially important for torch.compile) 770 | _ = original_model(*warmup_inputs) 771 | 772 | # Warmup custom model (eliminates CUDA kernel initialization overhead) 773 | _ = custom_model(*warmup_inputs) 774 | 775 | torch.cuda.synchronize(device=device) 776 | 777 | if verbose: 778 | model_types = f"original ({original_eval_setup}) and custom" 779 | print(f"{info_prefix}[Eval] Warmup completed for {model_types} models") 780 | 781 | except Exception as e: 782 | print(f"{info_prefix}Warning: Model warmup failed: {e}") 783 | # Continue anyway - warmup failure shouldn't block evaluation 784 | 785 | if verbose: 786 | print(f"{info_prefix}[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, trials: {num_perf_trials}") 787 | 788 | t1 = time.time() 789 | with torch.no_grad(): 790 | for trial in range(num_perf_trials): 791 | # Generate one random input for this trial - SAME input will be used for both models 792 | inputs = get_inputs() 793 | inputs = [x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in inputs] 794 | # Randomize execution order to eliminate systematic bias 795 | run_reference_first = random.choice([True, False]) 796 | 797 | # IMPORTANT: Detect model streams to ensure accurate timing 798 | current_stream = torch.cuda.current_stream(device=device) 799 | 800 | # Comprehensive stream detection - find any CUDA streams the models use 801 | def find_model_streams(model): 802 | """Find all CUDA streams in a model, regardless of attribute names""" 803 | streams = [] 804 | 805 | # Check all attributes of the model 806 | for attr_name in dir(model): 807 | try: 808 | attr_value = getattr(model, attr_name) 809 | 810 | # Check if it's a single CUDA stream 811 | if isinstance(attr_value, torch.cuda.Stream): 812 | streams.append(attr_value) 813 | 814 | # Check if it's a list/tuple of CUDA streams 815 | elif isinstance(attr_value, (list, tuple)): 816 | for item in attr_value: 817 | if isinstance(item, torch.cuda.Stream): 818 | streams.append(item) 819 | 820 | # Check if it's a dict containing CUDA streams 821 | elif isinstance(attr_value, dict): 822 | for item in attr_value.values(): 823 | if isinstance(item, torch.cuda.Stream): 824 | streams.append(item) 825 | 826 | except (AttributeError, RuntimeError): 827 | # Some attributes might not be accessible or might raise errors 828 | continue 829 | 830 | return streams 831 | 832 | # Find streams for both models 833 | 
custom_model_streams = find_model_streams(custom_model) 834 | # Use current stream for timing, but track all model streams for synchronization 835 | # This ensures we capture all work regardless of which streams the model uses 836 | reference_model_stream = current_stream 837 | custom_model_stream = current_stream 838 | 839 | # Debug info for stream detection 840 | if verbose and custom_model_streams: 841 | print(f"{info_prefix}[Stream Detection] Found {len(custom_model_streams)} CUDA streams in custom model") 842 | 843 | if run_reference_first: 844 | # Time reference model first 845 | start_event = torch.cuda.Event(enable_timing=True) 846 | end_event = torch.cuda.Event(enable_timing=True) 847 | 848 | with torch.cuda.stream(reference_model_stream): 849 | start_event.record(reference_model_stream) 850 | original_model(*inputs) 851 | 852 | # Wait for all model streams to complete before recording end event 853 | end_event.record(reference_model_stream) 854 | 855 | torch.cuda.synchronize(device=device) 856 | reference_time = start_event.elapsed_time(end_event) 857 | 858 | # Time custom model second 859 | start_event = torch.cuda.Event(enable_timing=True) 860 | end_event = torch.cuda.Event(enable_timing=True) 861 | 862 | with torch.cuda.stream(custom_model_stream): 863 | start_event.record(custom_model_stream) 864 | custom_model(*inputs) 865 | 866 | # Wait for all model streams to complete before recording end event 867 | if custom_contain_new_streams: 868 | for stream in custom_model_streams: 869 | custom_model_stream.wait_stream(stream) 870 | 871 | end_event.record(custom_model_stream) 872 | 873 | torch.cuda.synchronize(device=device) 874 | custom_time = start_event.elapsed_time(end_event) 875 | else: 876 | # Time custom model first 877 | start_event = torch.cuda.Event(enable_timing=True) 878 | end_event = torch.cuda.Event(enable_timing=True) 879 | 880 | with torch.cuda.stream(custom_model_stream): 881 | start_event.record(custom_model_stream) 882 | custom_model(*inputs) 883 | 884 | # Wait for all model streams to complete before recording end event 885 | if custom_contain_new_streams: 886 | for stream in custom_model_streams: 887 | custom_model_stream.wait_stream(stream) 888 | 889 | end_event.record(custom_model_stream) 890 | 891 | torch.cuda.synchronize(device=device) 892 | custom_time = start_event.elapsed_time(end_event) 893 | 894 | start_event = torch.cuda.Event(enable_timing=True) 895 | end_event = torch.cuda.Event(enable_timing=True) 896 | 897 | with torch.cuda.stream(reference_model_stream): 898 | start_event.record(reference_model_stream) 899 | original_model(*inputs) 900 | 901 | # Wait for all model streams to complete before recording end event 902 | end_event.record(reference_model_stream) 903 | 904 | torch.cuda.synchronize(device=device) 905 | reference_time = start_event.elapsed_time(end_event) 906 | 907 | reference_times.append(reference_time) 908 | custom_times.append(custom_time) 909 | t2 = time.time() 910 | 911 | # Calculate averages and score 912 | avg_reference_time = sum(reference_times) / len(reference_times) 913 | avg_custom_time = sum(custom_times) / len(custom_times) 914 | score = avg_reference_time / avg_custom_time 915 | total_elapsed_time = (sum(reference_times) + sum(custom_times)) / 1000.0 # Convert from milliseconds to seconds 916 | 917 | if verbose: 918 | reference_type = "Compiled (torch.compile)" if original_eval_setup == "torch_compile" else "Original (vanilla)" 919 | print(f"{info_prefix}[Results {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] 
{reference_type} avg: {avg_reference_time:.3f}ms") 920 | print(f"{info_prefix}[Results {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] Custom avg: {avg_custom_time:.3f}ms") 921 | print(f"{info_prefix}[Results {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] Score (reference/custom): {score:.3f}") 922 | 923 | if score > 1.0: 924 | speedup = score 925 | vs_type = "torch.compile" if original_eval_setup == "torch_compile" else "original model" 926 | print(f"{info_prefix}[Results {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] Custom kernel is {speedup:.2f}x faster than {vs_type}") 927 | elif score < 1.0: 928 | slowdown = 1.0 / score 929 | vs_type = "torch.compile" if original_eval_setup == "torch_compile" else "original model" 930 | print(f"{info_prefix}[Results {datetime.now(beijing_tz).strftime('%Y-%m-%d %H:%M:%S')}] Custom kernel is {slowdown:.2f}x slower than {vs_type}") 931 | else: 932 | vs_type = "torch.compile" if original_eval_setup == "torch_compile" else "original model" 933 | print(f"{info_prefix}[Results] Same performance as {vs_type}") 934 | 935 | graceful_eval_cleanup(context, device) 936 | return score, total_elapsed_time, "Success" 937 | 938 | 939 | 940 | 941 | 942 | ################################################################################ 943 | # Performance Eval 944 | ################################################################################ 945 | 946 | 947 | def fetch_baseline_time( 948 | level_name: str, problem_id: int, dataset: List[str], baseline_time_filepath: str 949 | ) -> Dict: 950 | """ 951 | Fetch the baseline time from the time 952 | """ 953 | if not os.path.exists(baseline_time_filepath): 954 | raise FileNotFoundError( 955 | f"Baseline time file not found at {baseline_time_filepath}" 956 | ) 957 | 958 | with open(baseline_time_filepath, "r") as f: 959 | baseline_json = json.load(f) 960 | 961 | problem_name = dataset[problem_id].split("/")[-1] 962 | baseline_time = baseline_json[level_name].get(problem_name, None) 963 | return baseline_time 964 | 965 | 966 | def get_timing_stats(elapsed_times: List[float], device: Optional[torch.device] = None) -> Dict: 967 | """Get timing statistics from a list of elapsed times. 
968 | 969 | Args: 970 | elapsed_times: List of elapsed times in milliseconds 971 | device: CUDA device, record device info 972 | Returns: 973 | Dict containing mean, std, min, max and num_trials 974 | all timing are in ms 975 | """ 976 | 977 | stats = { 978 | "mean": float(f"{np.mean(elapsed_times):.3g}"), 979 | "std": float(f"{np.std(elapsed_times):.3g}"), 980 | "min": float(f"{np.min(elapsed_times):.3g}"), 981 | "max": float(f"{np.max(elapsed_times):.3g}"), 982 | "num_trials": len(elapsed_times), 983 | } 984 | 985 | if device: 986 | stats["hardware"] = torch.cuda.get_device_name(device=device) 987 | stats["device"] = str(device) # for debugging 988 | 989 | return stats 990 | 991 | 992 | def get_available_gpus(): 993 | """Get list of available GPU device IDs""" 994 | if not torch.cuda.is_available(): 995 | return [] 996 | return list(range(torch.cuda.device_count())) 997 | 998 | 999 | 1000 | 1001 | def eval_pipeline( 1002 | warmup_src: str, 1003 | original_model_src: str, 1004 | custom_model_src: str, 1005 | num_correct_trials: int, 1006 | num_perf_trials: int, 1007 | global_n_trials: int, 1008 | gpu_index: int, 1009 | verbose: bool = False, 1010 | log_path: str = None, 1011 | max_time: float = None, 1012 | use_process_isolation: bool = False, 1013 | info_string = "", 1014 | original_eval_setup="", 1015 | ): 1016 | assert original_eval_setup!="" 1017 | tz = timezone(timedelta(hours=0)) 1018 | 1019 | # Format info_string for consistent display 1020 | info_prefix = f"[{info_string}] " if info_string else "" 1021 | 1022 | print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] median_comparison_pipeline start") 1023 | if log_path is not None: 1024 | log_dir = os.path.dirname(log_path) 1025 | os.makedirs(log_dir, exist_ok=True) 1026 | print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] Writing log to {log_path}") 1027 | current_time = datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S') 1028 | 1029 | with open(log_path, "w") as write_log: 1030 | print(f"in log_path open and write {log_path}") 1031 | write_log.write(json.dumps({"info_string": info_string, "start_time": current_time, "code": custom_model_src}) + "\n") 1032 | # write_log.write(json.dumps({"info_string": info_string, "start_time": current_time, "custom_model_src": custom_model_src}) + "\n") 1033 | write_log.flush() 1034 | 1035 | # step 1: check whether the model can be executed and compiled 1036 | print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] step 1: check whether the model can be executed and compiled") 1037 | context = {} 1038 | success_original, error_msg, execution_time = execute_model_with_timeout( 1039 | model_src=original_model_src, 1040 | context=context, 1041 | timeout=30.0, # 30 seconds should be enough 1042 | use_process_isolation=use_process_isolation, 1043 | info_string=info_string 1044 | ) 1045 | if not success_original: 1046 | log_dict_ = { 1047 | "info_string": info_string, 1048 | "error_msg": f"Original model compilation failed: {error_msg}", 1049 | "error": True, 1050 | "done": True 1051 | } 1052 | with open(log_path, "a") as write_log: 1053 | write_log.write(json.dumps(log_dict_) + "\n") 1054 | write_log.flush() 1055 | return None, f"Original model compilation failed: {error_msg}" 1056 | 1057 | success_custom, error_msg, execution_time = execute_model_with_timeout( 1058 | model_src=custom_model_src, 1059 | context={}, # Use fresh context for custom model 1060 | timeout=100, # Give enough time for CUDA compilation with minimum 30s 1061 | 
991 | 
992 | def get_available_gpus():
993 |     """Get list of available GPU device IDs"""
994 |     if not torch.cuda.is_available():
995 |         return []
996 |     return list(range(torch.cuda.device_count()))
997 | 
998 | 
999 | 
1000 | 
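# Annotation (not part of eval_cuda.py): eval_pipeline below proceeds in stages and
# appends one JSON line to log_path after each stage; every line carries "error" and
# "done" flags, so the last line with "done": true holds the final result.
#   stage 1 - compile/execute check of the reference and the custom model
#             (execute_model_with_timeout); logs "stage1:Compile Success"
#   step 2  - "preliminary speed check" currently only prints a message and selects the device
#   stage 3 - correctness check (check_kernel_correctness with num_correct_trials trials)
#   stage 4 - performance evaluation: global_n_trials calls to eval_kernel_against_ref,
#             each timing num_perf_trials runs; the loop exits early if a trial scores
#             below 0.3, and the final line records the mean score, per-trial scores,
#             std, and timing totals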
1001 | def eval_pipeline(
1002 |     warmup_src: str,
1003 |     original_model_src: str,
1004 |     custom_model_src: str,
1005 |     num_correct_trials: int,
1006 |     num_perf_trials: int,
1007 |     global_n_trials: int,
1008 |     gpu_index: int,
1009 |     verbose: bool = False,
1010 |     log_path: Optional[str] = None,
1011 |     max_time: Optional[float] = None,
1012 |     use_process_isolation: bool = False,
1013 |     info_string="",
1014 |     original_eval_setup="",
1015 | ):
1016 |     assert original_eval_setup != ""
1017 |     tz = timezone(timedelta(hours=0))
1018 | 
1019 |     # Format info_string for consistent display
1020 |     info_prefix = f"[{info_string}] " if info_string else ""
1021 | 
1022 |     print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] median_comparison_pipeline start")
1023 |     if log_path is not None:
1024 |         log_dir = os.path.dirname(log_path)
1025 |         os.makedirs(log_dir, exist_ok=True)
1026 |         print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] Writing log to {log_path}")
1027 |         current_time = datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')
1028 | 
1029 |         with open(log_path, "w") as write_log:
1030 |             print(f"Opening log file for writing: {log_path}")
1031 |             write_log.write(json.dumps({"info_string": info_string, "start_time": current_time, "code": custom_model_src}) + "\n")
1032 |             # write_log.write(json.dumps({"info_string": info_string, "start_time": current_time, "custom_model_src": custom_model_src}) + "\n")
1033 |             write_log.flush()
1034 | 
1035 |     # step 1: check whether the model can be executed and compiled
1036 |     print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] step 1: check whether the model can be executed and compiled")
1037 |     context = {}
1038 |     success_original, error_msg, execution_time = execute_model_with_timeout(
1039 |         model_src=original_model_src,
1040 |         context=context,
1041 |         timeout=30.0,  # 30 seconds should be enough
1042 |         use_process_isolation=use_process_isolation,
1043 |         info_string=info_string
1044 |     )
1045 |     if not success_original:
1046 |         log_dict_ = {
1047 |             "info_string": info_string,
1048 |             "error_msg": f"Original model compilation failed: {error_msg}",
1049 |             "error": True,
1050 |             "done": True
1051 |         }
1052 |         with open(log_path, "a") as write_log:
1053 |             write_log.write(json.dumps(log_dict_) + "\n")
1054 |             write_log.flush()
1055 |         return None, f"Original model compilation failed: {error_msg}"
1056 | 
1057 |     success_custom, error_msg, execution_time = execute_model_with_timeout(
1058 |         model_src=custom_model_src,
1059 |         context={},  # Use fresh context for custom model
1060 |         timeout=100,  # Give enough time for CUDA compilation (at least 30s)
1061 |         use_process_isolation=use_process_isolation,
1062 |         info_string=info_string
1063 |     )
1064 |     if not success_custom:
1065 |         log_dict_ = {
1066 |             "info_string": info_string,
1067 |             "error_msg": "failed to compile or execute",
1068 |             "error": True,
1069 |             "done": True
1070 |         }
1071 |         with open(log_path, "a") as write_log:
1072 |             write_log.write(json.dumps(log_dict_) + "\n")
1073 |             write_log.flush()
1074 |         return None, "Custom model compilation failed"
1075 |     else:
1076 |         log_dict_ = {
1077 |             "info_string": info_string,
1078 |             "info": "stage1:Compile Success",
1079 |             "time": datetime.now(timezone(timedelta(hours=8))).strftime('%Y-%m-%d %H:%M:%S'),
1080 |             "error": False,
1081 |             "done": False
1082 |         }
1083 |         with open(log_path, "a") as write_log:
1084 |             write_log.write(json.dumps(log_dict_) + "\n")
1085 |             write_log.flush()
1086 |     print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] step 2: preliminary speed check")
1087 |     device = torch.device(f'cuda:{gpu_index}')
1088 |     time1 = time.time()
1089 | 
1090 |     # step 3: correctness check
1091 |     print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] step 3: correctness check")
1092 |     time1 = time.time()
1093 |     correctness_passed, error_msg, correctness_metadata = check_kernel_correctness(
1094 |         warmup_src=warmup_src,
1095 |         original_model_src=original_model_src,
1096 |         custom_model_src=custom_model_src,
1097 |         num_correct_trials=num_correct_trials,
1098 |         verbose=verbose,
1099 |         device=device,
1100 |         info_string=info_string
1101 |     )
1102 |     time2 = time.time()
1103 |     if not correctness_passed:
1104 |         log_dict_ = {
1105 |             "info_string": info_string,
1106 |             "error_msg": error_msg,
1107 |             "error": True,
1108 |             "done": True
1109 |         }
1110 |         with open(log_path, "a") as write_log:
1111 |             write_log.write(json.dumps(log_dict_) + "\n")
1112 |             write_log.flush()
1113 |         return None, error_msg
1114 |     else:
1115 |         log_dict_ = {
1116 |             "info_string": info_string,
1117 |             "info": "stage3:Correctness Check Success",
1118 |             "time": datetime.now(timezone(timedelta(hours=8))).strftime('%Y-%m-%d %H:%M:%S'),
1119 |             "error": False,
1120 |             "done": False,
1121 |             "duration": time2 - time1,
1122 |         }
1123 |         with open(log_path, "a") as write_log:
1124 |             write_log.write(json.dumps(log_dict_) + "\n")
1125 |             write_log.flush()
1126 | 
1127 |     log_dict_ = {
1128 |         "info_string": info_string,
1129 |         "info": "stage4:Performance Evaluation",
1130 |         "time": datetime.now(timezone(timedelta(hours=8))).strftime('%Y-%m-%d %H:%M:%S'),
1131 |         "error": False,
1132 |         "done": False
1133 |     }
1134 |     with open(log_path, "a") as write_log:
1135 |         write_log.write(json.dumps(log_dict_) + "\n")
1136 |         write_log.flush()
1137 |     scores = []
1138 |     list_gpu_execution_time = []
1139 |     # Run global_n_trials sequential evaluations
1140 |     start_time = time.time()
1141 |     print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] step 4: performance evaluation")
1142 |     for trial in range(global_n_trials):
1143 |         print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] step 4: performance evaluation, trial {trial + 1}/{global_n_trials}")
1144 |         # Run single evaluation
1145 |         time1 = time.time()
1146 |         score, gpu_execution_time, error_msg = eval_kernel_against_ref(
1147 |             warmup_src=warmup_src,
1148 |             original_model_src=original_model_src,
1149 |             custom_model_src=custom_model_src,
1150 |             seed_num=42 + trial,  # Different seed for each trial
1151 |             num_perf_trials=num_perf_trials,
1152 |             verbose=False,  # Keep individual trials quiet unless overall verbose
1153 |             build_dir=None,
1154 |             device=device,
1155 |             info_string=info_string,
1156 |             original_eval_setup=original_eval_setup
1157 |         )
1158 |         list_gpu_execution_time.append(gpu_execution_time)
1159 |         if score is None:
1160 |             error_msg = "failed to run inference"
1161 |             log_dict_ = {
1162 |                 "info_string": info_string,
1163 |                 "trial": trial,
1164 |                 "gpu_index": gpu_index,
1165 |                 "score": score,
1166 |                 "error_msg": error_msg,
1167 |                 "error": True,
1168 |                 "done": True
1169 |             }
1170 |             with open(log_path, "a") as write_log:
1171 |                 write_log.write(json.dumps(log_dict_) + "\n")
1172 |                 write_log.flush()
1173 |             return None, error_msg
1174 |         time2 = time.time()
1175 |         log_dict_ = {
1176 |             "info_string": info_string,
1177 |             "n_trial": num_perf_trials,
1178 |             "trial": trial,
1179 |             "gpu_index": gpu_index,
1180 |             "score": score,
1181 |             "time": datetime.now(timezone(timedelta(hours=8))).strftime('%Y-%m-%d %H:%M:%S'),
1182 |             "gpu_execution_time": gpu_execution_time,
1183 |             "ave_gpu_execution_time": gpu_execution_time / num_perf_trials,
1184 |             "done": False,
1185 |             "duration": time2 - time1,
1186 |             "error": False
1187 |         }
1188 |         with open(log_path, "a") as write_log:
1189 |             write_log.write(json.dumps(log_dict_) + "\n")
1190 |             write_log.flush()
1191 |         scores.append(score)
1192 |         if score is not None and score < 0.3:
1193 |             break
1194 | 
1195 |         # if verbose:
1196 |         print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] Trial {trial + 1}: {score:.4f} at gpu {gpu_index}")
1197 | 
1198 |     if len(scores) == 0:
1199 |         print(f"{info_prefix}[Score {datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S')}] ❌ No trials completed successfully")
1200 |         log_dict_empty = {
1201 |             "info_string": info_string,
1202 |             "error": True,
1203 |             "error_msg": "No trials completed successfully",
1204 |             "completed_trials": 0,
1205 |             "done": True
1206 |         }
1207 |         with open(log_path, "a") as write_log:
1208 |             write_log.write(json.dumps(log_dict_empty) + "\n")
1209 |             write_log.flush()
1210 |         return None, "No trials completed successfully"
1211 | 
1212 |     mean_score = float(np.mean(scores))
1213 | 
1214 |     std = float(np.std(scores))
1215 | 
1216 |     # Round all scores in the list to 4 decimal places for consistency
1217 |     rounded_scores = [round(score, 4) for score in scores]
1218 | 
1219 |     # Record final elapsed time
1220 |     total_elapsed_time = time.time() - start_time
1221 |     n_all_trials = num_perf_trials * global_n_trials
1222 |     log_dict_ = {
1223 |         "info_string": info_string,
1224 |         "score": mean_score,
1225 |         "rounded_scores": rounded_scores,
1226 |         "scores_sorted": sorted(scores),
1227 |         "completed_trials": len(scores),
1228 |         "total_trials": global_n_trials,
1229 |         "n_all_trials_trials": n_all_trials,
1230 |         "total_elapsed_time": total_elapsed_time,
1231 |         "total_gpu_execution_time": sum(list_gpu_execution_time),
1232 |         "ave_gpu_execution_time": sum(list_gpu_execution_time) / n_all_trials,
1233 |         "error": False,
1234 |         "done": True,
1235 |         "scores": [round(ss, 4) for ss in scores],
1236 |         "std": std,
1237 |     }
1238 |     with open(log_path, "a") as write_log:
1239 |         write_log.write(json.dumps(log_dict_) + "\n")
1240 |         write_log.flush()
1241 | 
1242 |     return mean_score, "Success"  # return (score, message), matching the error paths above
1243 | 
1244 | 
1245 | def load_cuda_file(PATH_TO_CUDA_FILE):
1246 |     if not os.path.exists(PATH_TO_CUDA_FILE):
1247 |         raise FileNotFoundError(f"{PATH_TO_CUDA_FILE} not found")
1248 |     cuda_dict_ = {}
1249 |     with open(PATH_TO_CUDA_FILE, "r") as f:
1250 |         for line in f:
1251 |             dict_ = json.loads(line)
1252 |             level_id = dict_["level_id"]
1253 |             task_id = dict_["task_id"]
1254 |             ref_code = dict_["ref_code"]
1255 |             custom_code = dict_["custom_code"]
1256 |             cuda_graph_code = dict_["cuda_graph_code"]
1257 |             cudnn_code = dict_["cudnn_code"]
1258 |             if level_id not in cuda_dict_:
1259 |                 cuda_dict_[level_id] = {}
1260 |             cuda_dict_[level_id][task_id] = {
1261 |                 "ref_code": ref_code,
1262 |                 "custom_code": custom_code,
1263 |                 "cuda_graph_code": cuda_graph_code,
1264 |                 "cudnn_code": cudnn_code
1265 |             }
1266 |     return cuda_dict_
1267 | 
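# Illustrative annotation (not part of eval_cuda.py): load_cuda_file expects one JSON
# object per line (JSONL), e.g. (hypothetical, abbreviated record):
#
#   {"level_id": 1, "task_id": 1, "ref_code": "import torch\n...", "custom_code": "...",
#    "cuda_graph_code": null, "cudnn_code": null}
#
# custom_code / cuda_graph_code / cudnn_code may be null when no such variant exists for
# a task, which eval() below checks before running the pipeline.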
1268 | def eval():
1269 |     YOUR_HOME_FOLDER = "/data2/jiwei/cuda_results/CUDA-L1/optimized_cuda_code"
1270 |     PATH_TO_CUDA_FILE = os.path.join(YOUR_HOME_FOLDER, "3090.json")
1271 |     output_path = os.path.join(YOUR_HOME_FOLDER, "log.json")
1272 | 
1273 |     cuda_dict_ = load_cuda_file(PATH_TO_CUDA_FILE)
1274 |     level_id = 3
1275 |     task_id = 35
1276 |     current_dict = cuda_dict_[level_id][task_id]
1277 |     original_eval_setup = "vanilla"
1278 |     # original_eval_setup = "torch_compile"
1279 |     # original_eval_setup = "torch_compile_reduce_overhead"
1280 |     # original_eval_setup = "cuda_graph"
1281 |     # original_eval_setup = "cudnn"
1282 | 
1283 |     # original_eval_setup can take one of the following values:
1284 |     # 1. vanilla: compares the speed of custom_code with ref_code
1285 |     # 2. torch_compile: compares the speed of custom_code with ref_code compiled with torch.compile
1286 |     # 3. torch_compile_reduce_overhead: compares the speed of custom_code with ref_code compiled with torch.compile in reduce-overhead mode
1287 |     # 4. cuda_graph: compares the speed of custom_code with ref_code modified to use CUDA graphs
1288 |     # 5. cudnn: compares the speed of custom_code with ref_code modified to use cuDNN
1289 |     warmup_code, ref_code, custom_code, cuda_graph_code, cudnn_code = current_dict["ref_code"], current_dict["ref_code"], current_dict["custom_code"], current_dict["cuda_graph_code"], current_dict["cudnn_code"]
1290 |     # regardless of eval_setup, ref_code is used for warmup
1291 |     if custom_code is None:
1292 |         print("CUDA-L1 does not yield a performance boost on this task for this GPU architecture.")
1293 |         return 0
1294 |     if original_eval_setup == "cuda_graph" and cuda_graph_code is None:
1295 |         print("We were unable to generate valid CUDA graph code for this task. Your request will be skipped.")
1296 |         return 0
1297 |     if original_eval_setup == "cudnn" and cudnn_code is None:
1298 |         print("We were unable to generate valid cuDNN code for this task. Your request will be skipped.")
1299 |         return 0
1300 |     if original_eval_setup == "cuda_graph":
1301 |         original_model_src = cuda_graph_code
1302 |     elif original_eval_setup == "cudnn":
1303 |         original_model_src = cudnn_code
1304 |     else:
1305 |         original_model_src = ref_code
1306 | 
1307 |     eval_pipeline(
1308 |         warmup_src=warmup_code,
1309 |         original_model_src=original_model_src,
1310 |         custom_model_src=custom_code,
1311 |         num_correct_trials=10,
1312 |         num_perf_trials=10,
1313 |         global_n_trials=7,
1314 |         gpu_index=0,
1315 |         verbose=False,
1316 |         log_path=output_path,
1317 |         max_time=1800,
1318 |         original_eval_setup=original_eval_setup
1319 |     )
1320 |     # original_eval_setup can be any of the five values listed above
1321 |     print(f"log_path: {output_path}")
1322 | 
1323 | 
1324 | 
1325 | if __name__ == "__main__":
1326 |     eval()
--------------------------------------------------------------------------------
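A possible extension of eval() above, offered only as a sketch rather than part of the repository code: iterate over every task in a loaded file instead of the single hardcoded (level_id=3, task_id=35), writing one log file per task. The folder path and file name below are reused from eval() and should be adjusted to your setup; eval_all is a hypothetical helper name.

    def eval_all(cuda_file="3090.json", gpu_index=0):
        # Hypothetical helper built on load_cuda_file / eval_pipeline above.
        folder = "/data2/jiwei/cuda_results/CUDA-L1/optimized_cuda_code"  # adjust to your checkout
        cuda_dict_ = load_cuda_file(os.path.join(folder, cuda_file))
        for level_id, tasks in cuda_dict_.items():
            for task_id, entry in tasks.items():
                if entry["custom_code"] is None:
                    continue  # no optimized kernel for this task on this GPU
                eval_pipeline(
                    warmup_src=entry["ref_code"],
                    original_model_src=entry["ref_code"],  # vanilla comparison
                    custom_model_src=entry["custom_code"],
                    num_correct_trials=10,
                    num_perf_trials=10,
                    global_n_trials=7,
                    gpu_index=gpu_index,
                    log_path=os.path.join(folder, f"log_level{level_id}_task{task_id}.json"),
                    max_time=1800,
                    original_eval_setup="vanilla",
                )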