├── 2.0
├── git
├── LICENSE
├── master
├── FETCH_HEAD
├── cli
│   ├── __init__.py
│   └── cli.py
├── docs
│   ├── user_guide.md
│   ├── api_reference.md
│   └── developer_guide.md
├── tests
│   ├── __init__.py
│   ├── test_cli.py
│   ├── test_code_optimizer.py
│   ├── test_cuda_parser.py
│   ├── test_cudnn_mapper.py
│   ├── test_host_adapter.py
│   ├── unit
│   │   ├── test_generator.py
│   │   ├── test_parser.py
│   │   └── test_translator.py
│   ├── test_kernel_translator.py
│   ├── integration_tests
│   │   ├── __init__.py
│   │   └── test_end_to_end.py
│   └── integration
│       ├── test_basic_kernels.py
│       └── test_complex_kernels.py
├── utils
│   ├── __init__.py
│   ├── metal_equivalents.py
│   ├── cuda_to_metal_type_mapping.py
│   ├── error_handler.py
│   ├── cuda_builtin_functions.py
│   ├── logger.py
│   └── file_utils.py
├── generator
│   ├── __init__.py
│   ├── swift_generator.py
│   └── msl_generator.py
├── templates
│   ├── unifier.py
│   ├── objc
│   │   ├── metal_setup.m
│   │   ├── cudnn_wrapper.h
│   │   ├── metal_manager.h
│   │   ├── cudnn_wrapper.m
│   │   ├── main.m
│   │   ├── metal_manager.m
│   │   └── kernel_wrapper.m
│   ├── swift
│   │   ├── metal_setup.swift
│   │   ├── cudnn_wrapper.swift
│   │   ├── main.swift
│   │   ├── metal_manager.swift
│   │   └── kernel_wrapper.swift
│   ├── msl
│   │   ├── device_functions.metal
│   │   └── kernel_template.metal
│   └── metal
│       ├── header_template.h
│       └── kernel_template.metal
├── optimization
│   ├── barrier_optimizer.py
│   ├── kernel_optimizer.py
│   └── memory_optimizer.py
├── requirements.txt
├── setup.py
├── assets
│   └── cudam_logo.png
├── translator
│   ├── __init__.py
│   ├── host_adapter.py
│   ├── thread_hierarchy_mapper.py
│   ├── cudnn_mapper.py
│   └── intrinsic_function_mapper.py
├── .idea
│   ├── .gitignore
│   ├── misc.xml
│   ├── inspectionProfiles
│   │   ├── profiles_settings.xml
│   │   └── Project_Default.xml
│   ├── vcs.xml
│   ├── modules.xml
│   └── CUDAM.iml
├── __init__.py
├── .gitignore
├── core
│   ├── parser
│   │   ├── __init__.py
│   │   └── clang_integration.py
│   └── translator
│       └── host_translator.py
├── examples
│   └── simple_vector_add
│       └── vector_add.py
├── parser
│   └── __init__.py
├── problems.py
├── testdata.py
├── native
│   └── metal_interop.mm
├── README.md
├── LICENSE.md
└── optimizer
    └── unified_optimizer_metal.py

/2.0:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/git:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/master:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/FETCH_HEAD:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/cli/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/docs/user_guide.md:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/docs/api_reference.md:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/docs/developer_guide.md:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/generator/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/templates/unifier.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/templates/objc/metal_setup.m:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/test_code_optimizer.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/test_cuda_parser.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/test_cudnn_mapper.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/test_host_adapter.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/unit/test_generator.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/unit/test_parser.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/optimization/barrier_optimizer.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/optimization/kernel_optimizer.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/templates/swift/metal_setup.swift:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/test_kernel_translator.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/unit/test_translator.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/integration_tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/integration/test_basic_kernels.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/integration/test_complex_kernels.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/integration_tests/test_end_to_end.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # `logging` ships with the Python standard library and PyPI's `utils` package
2 | # is unrelated to this project, so neither belongs here.
3 | clang  # libclang Python bindings used by core/parser/clang_integration.py
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # CUDAM/setup.py
2 | 
3 | from setuptools import setup, find_packages
4 | 
5 | # Minimal call so `pip install .` works; version mirrors __init__.py.
6 | setup(name='CUDAM', version='1.0.0', packages=find_packages())
--------------------------------------------------------------------------------
/assets/cudam_logo.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/MEHDI342/CUDAM/HEAD/assets/cudam_logo.png
--------------------------------------------------------------------------------
/translator/__init__.py:
--------------------------------------------------------------------------------
1 | from ..core import CudaTranslator  # assumes the core package re-exports CudaTranslator
2 | from .kernel_translator import KernelTranslator
3 | from .host_adapter import HostAdapter
4 | 
5 | __all__ = ['CudaTranslator', 'KernelTranslator', 'HostAdapter']
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
--------------------------------------------------------------------------------
/templates/msl/device_functions.metal:
--------------------------------------------------------------------------------
1 | #include <metal_stdlib>
2 | using namespace metal;
3 | 
4 | // Helper function that can be used by kernels
5 | float compute_something(float value) {
6 |     return value * 2.0f;
7 | }
8 | 
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
6 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | from .translator import CudaTranslator
2 | from .optimizer import MetalOptimizer
3 | from .parser import CudaParser, ast_nodes
4 | from .utils import logger
5 | 
6 | __version__ = '1.0.0'
7 | __all__ = ['CudaTranslator', 'MetalOptimizer', 'CudaParser']
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
--------------------------------------------------------------------------------
/templates/objc/cudnn_wrapper.h:
--------------------------------------------------------------------------------
1 | #import <Foundation/Foundation.h>
2 | #import <MetalPerformanceShaders/MetalPerformanceShaders.h>
3 | 
4 | @interface CUDNNWrapper : NSObject
5 | 
6 | - (instancetype)initWithDevice:(id<MTLDevice>)device;
7 | - (void)performConvolutionWithInput:(MPSImage *)input
8 |                              output:(MPSImage *)output
9 |                       commandBuffer:(id<MTLCommandBuffer>)commandBuffer;
10 | 
11 | @end
12 | 
--------------------------------------------------------------------------------
/templates/msl/kernel_template.metal:
--------------------------------------------------------------------------------
1 | #include <metal_stdlib>
2 | #include "device_functions.metal"
3 | using namespace metal;
4 | 
5 | kernel void example_kernel(const device float* input [[buffer(0)]],
6 |                            device float* output [[buffer(1)]],
7 |                            uint id [[thread_position_in_grid]]) {
8 |     output[id] = compute_something(input[id]);
9 | }
10 | 
--------------------------------------------------------------------------------
/templates/objc/metal_manager.h:
--------------------------------------------------------------------------------
1 | #import <Foundation/Foundation.h>
2 | #import <Metal/Metal.h>
3 | 
4 | @interface MetalManager : NSObject
5 | 
6 | - (instancetype)initWithDevice:(id<MTLDevice>)device;
7 | - (void)executeKernelWithName:(NSString *)kernelName
8 |                     withInput:(id<MTLBuffer>)inputBuffer
9 |                  outputBuffer:(id<MTLBuffer>)outputBuffer;
10 | 
11 | @end
12 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python cache
2 | __pycache__/
3 | *.py[cod]
4 | *.pyo
5 | *.pyd
6 | 
7 | # Ignore the files generated by IDEs and environments
8 | .idea/
9 | .vscode/
10 | env/
11 | venv/
12 | *.log
13 | 
14 | # Project-specific files you might not want in version control
15 | pylint_errors.txt
16 | projett_content.txt
17 | 
18 | # Ignoring test and problem scripts during early development
19 | CUDAM/testdata.py
20 | CUDAM/problems.py
21 | 
22 | # Avoid pylint error outputs and logs
23 | *.pylint.log
24 | babouchka.txt
25 | 
--------------------------------------------------------------------------------
/.idea/CUDAM.iml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
8 | 
9 | 
12 | 
13 | 
18 | 
19 | 
--------------------------------------------------------------------------------
/templates/swift/cudnn_wrapper.swift:
--------------------------------------------------------------------------------
1 | import MetalPerformanceShaders
2 | 
3 | class CUDNNWrapper {
4 |     private let device: MTLDevice
5 |     private var convolution: MPSCNNConvolution
6 | 
7 |     init(device: MTLDevice) {
8 |         self.device = device
9 | 
10 |         let convDesc = MPSCNNConvolutionDescriptor(kernelWidth: 3, kernelHeight: 3,
11 |                                                    inputFeatureChannels: 1, outputFeatureChannels: 1)
12 | 
13 |         // NOTE: placeholder zero weights; a real kernel must supply trained
14 |         // weights (and, optionally, bias terms).
15 |         let weights = [Float](repeating: 0, count: 9)
16 |         convolution = MPSCNNConvolution(device: device, convolutionDescriptor: convDesc,
17 |                                         kernelWeights: weights, biasTerms: nil, flags: .none)
18 |     }
19 | 
20 |     func performConvolution(input: MPSImage, output: MPSImage, commandBuffer: MTLCommandBuffer) {
21 |         convolution.encode(commandBuffer: commandBuffer, sourceImage: input, destinationImage: output)
22 |     }
23 | }
24 | 
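// Usage sketch (illustrative, not part of the original template): how this
// wrapper might be driven from host code. The 64x64 single-channel MPSImage
// descriptors are assumptions for the example.
//
//   guard let device = MTLCreateSystemDefaultDevice(),
//         let queue = device.makeCommandQueue(),
//         let commandBuffer = queue.makeCommandBuffer() else {
//       fatalError("Metal is not available")
//   }
//   let desc = MPSImageDescriptor(channelFormat: .float32,
//                                 width: 64, height: 64, featureChannels: 1)
//   let wrapper = CUDNNWrapper(device: device)
//   wrapper.performConvolution(input: MPSImage(device: device, imageDescriptor: desc),
//                              output: MPSImage(device: device, imageDescriptor: desc),
//                              commandBuffer: commandBuffer)
//   commandBuffer.commit()
//   commandBuffer.waitUntilCompleted()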
--------------------------------------------------------------------------------
/core/parser/__init__.py:
--------------------------------------------------------------------------------
1 | # CUDAM/core/parser/__init__.py
2 | 
3 | # Optionally, import classes from ast_nodes.py for easier access
4 | from .ast_nodes import (
5 |     CUDANode,
6 |     CUDAKernel,
7 |     CUDAParameter,
8 |     CUDAType,
9 |     CUDAQualifier,
10 |     CUDASharedMemory,
11 |     CUDAThreadIdx,
12 |     CUDABarrier,
13 |     CUDACompoundStmt,
14 |     CUDAExpressionNode,
15 |     CUDAStatement,
16 |     FunctionNode,
17 |     KernelNode,
18 |     VariableNode,
19 |     StructNode,
20 |     EnumNode,
21 |     TypedefNode,
22 |     ClassNode,
23 |     NamespaceNode,
24 |     TemplateNode,
25 |     CudaASTNode,
26 |     CudaTranslationContext
27 | )
28 | 
29 | __all__ = [
30 |     "CUDANode",
31 |     "CUDAKernel",
32 |     "CUDAParameter",
33 |     "CUDAType",
34 |     "CUDAQualifier",
35 |     "CUDASharedMemory",
36 |     "CUDAThreadIdx",
37 |     "CUDABarrier",
38 |     "CUDACompoundStmt",
39 |     "CUDAExpressionNode",
40 |     "CUDAStatement",
41 |     "FunctionNode",
42 |     "KernelNode",
43 |     "VariableNode",
44 |     "StructNode",
45 |     "EnumNode",
46 |     "TypedefNode",
47 |     "ClassNode",
48 |     "NamespaceNode",
49 |     "TemplateNode",
50 |     "CudaASTNode",
51 |     "CudaTranslationContext"
52 | ]
53 | 
--------------------------------------------------------------------------------
/templates/swift/main.swift:
--------------------------------------------------------------------------------
1 | import Metal
2 | import MetalKit
3 | 
4 | // Entry point for the application using Metal
5 | class MetalApp {
6 |     private let device: MTLDevice
7 |     private let metalManager: MetalManager
8 | 
9 |     init() {
10 |         guard let device = MTLCreateSystemDefaultDevice() else {
11 |             fatalError("Metal is not supported on this device.")
12 |         }
13 |         self.device = device
14 |         self.metalManager = MetalManager(device: device)
15 |     }
16 | 
17 |     func run() {
18 |         // Input and output buffers setup
19 |         let inputBuffer = device.makeBuffer(length: MemoryLayout<Float>.size * 256, options: [])
20 |         let outputBuffer = device.makeBuffer(length: MemoryLayout<Float>.size * 256, options: [])
21 | 
22 |         // Fill the input buffer with data
23 |         let inputPointer = inputBuffer?.contents().bindMemory(to: Float.self, capacity: 256)
24 |         for i in 0..<256 {
25 |             inputPointer?[i] = Float(i)
26 |         }
27 | 
28 |         // Execute kernel
29 |         metalManager.executeKernel(functionName: "example_kernel", inputBuffer: inputBuffer!, outputBuffer: outputBuffer!)
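        // Illustrative follow-up (not in the original template): read results
        // back on the CPU. Buffers made with empty options use shared storage,
        // so contents() is directly CPU-visible here.
        if let outputPointer = outputBuffer?.contents().bindMemory(to: Float.self, capacity: 256) {
            for i in 0..<4 {
                print("output[\(i)] = \(outputPointer[i])")
            }
        }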
30 |     }
31 | }
32 | 
33 | // Running the Metal app
34 | let app = MetalApp()
35 | app.run()
36 | 
--------------------------------------------------------------------------------
/templates/objc/cudnn_wrapper.m:
--------------------------------------------------------------------------------
1 | #import "cudnn_wrapper.h"
2 | 
3 | @implementation CUDNNWrapper {
4 |     id<MTLDevice> _device;
5 |     MPSCNNConvolution *convolution;
6 | }
7 | 
8 | - (instancetype)initWithDevice:(id<MTLDevice>)device {
9 |     self = [super init];
10 |     if (self) {
11 |         _device = device;
12 |         // Setup Metal Performance Shaders convolution kernel
13 |         MPSCNNConvolutionDescriptor *convDesc =
14 |             [MPSCNNConvolutionDescriptor cnnConvolutionDescriptorWithKernelWidth:3
15 |                                                                     kernelHeight:3
16 |                                                             inputFeatureChannels:1
17 |                                                            outputFeatureChannels:1];
18 |         // NOTE: placeholder zero weights; real weights must be supplied.
19 |         float weights[9] = {0};
20 |         convolution = [[MPSCNNConvolution alloc] initWithDevice:_device
21 |                                           convolutionDescriptor:convDesc
22 |                                                   kernelWeights:weights
23 |                                                       biasTerms:NULL
24 |                                                           flags:MPSCNNConvolutionFlagsNone];
25 |     }
26 |     return self;
27 | }
28 | 
29 | - (void)performConvolutionWithInput:(MPSImage *)input
30 |                              output:(MPSImage *)output
31 |                       commandBuffer:(id<MTLCommandBuffer>)commandBuffer {
32 |     // Encode the convolution onto the caller-supplied command buffer.
33 |     [convolution encodeToCommandBuffer:commandBuffer
34 |                            sourceImage:input
35 |                       destinationImage:output];
36 | }
37 | 
38 | @end
39 | 
--------------------------------------------------------------------------------
/templates/objc/main.m:
--------------------------------------------------------------------------------
1 | #import <Foundation/Foundation.h>
2 | #import <Metal/Metal.h>
3 | #import "metal_manager.h"
4 | 
5 | int main(int argc, const char * argv[]) {
6 |     @autoreleasepool {
7 |         // Check if Metal is supported
8 |         id<MTLDevice> device = MTLCreateSystemDefaultDevice();
9 |         if (!device) {
10 |             NSLog(@"Metal is not supported on this device.");
11 |             return -1;
12 |         }
13 | 
14 |         // Initialize Metal manager
15 |         MetalManager *metalManager = [[MetalManager alloc] initWithDevice:device];
16 | 
17 |         // Create input and output buffers
18 |         id<MTLBuffer> inputBuffer = [device newBufferWithLength:sizeof(float) * 256 options:MTLResourceStorageModeShared];
19 |         id<MTLBuffer> outputBuffer = [device newBufferWithLength:sizeof(float) * 256 options:MTLResourceStorageModeShared];
20 | 
21 |         // Fill input buffer with data
22 |         float *inputPointer = (float *)[inputBuffer contents];
23 |         for (int i = 0; i < 256; i++) {
24 |             inputPointer[i] = (float)i;
25 |         }
26 | 
27 |         // Execute the kernel
28 |         [metalManager executeKernelWithName:@"example_kernel" withInput:inputBuffer outputBuffer:outputBuffer];
29 | 
30 |         // Output the results
31 |         float *outputPointer = (float *)[outputBuffer contents];
32 |         for (int i = 0; i < 256; i++) {
33 |             NSLog(@"Output[%d]: %f", i, outputPointer[i]);
34 |         }
35 |     }
36 |     return 0;
37 | }
38 | 
--------------------------------------------------------------------------------
/examples/simple_vector_add/vector_add.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from CUDAM.core.parser.clang_integration import ClangParser
3 | from CUDAM.core.translator.host_translator import CUDAHostTranslator
4 | from CUDAM.generator.metal_generator import MetalGenerator
5 | 
6 | def translate_cuda_to_metal(cuda_file: str):
7 |     # Initialize components
8 |     parser = ClangParser()
9 |     host_translator = CUDAHostTranslator()
10 |     metal_generator = MetalGenerator()
11 | 
12 |     # Parse CUDA file
13 |     cuda_ast = parser.parse_file(cuda_file)
14 |     if not cuda_ast:
15 |         print("Failed to parse CUDA file")
16 |         return
17 | 
18 |     # Find kernel functions
19 |     kernels = []
20 |     def find_kernels(node):
21 |         if hasattr(node, 'is_kernel') and node.is_kernel():
22 |             kernels.append(node)
23 |     cuda_ast.traverse(find_kernels)
24 | 
25 |     # Generate Metal code
26 |     output_dir = Path('metal_output')
27 |     output_dir.mkdir(exist_ok=True)
28 | 
29 |     # Generate kernel code
30 |     for kernel in kernels:
31 |         metal_code = metal_generator.generate_metal_code(kernel)
32 |         kernel_file = output_dir / f"{kernel.name}.metal"
33 |         kernel_file.write_text(metal_code)
34 | 
35 |     # Translate host code
36 |     with open(cuda_file) as f:
37 |         cuda_host_code = f.read()
38 |     metal_host_code = host_translator.translate_host_code(cuda_host_code, target_lang='swift')
39 |     host_file = output_dir / "host.swift"
40 |     host_file.write_text(metal_host_code)
41 | 
42 | if __name__ == "__main__":
43 |     cuda_file = "vector_add.cu"
44 |     translate_cuda_to_metal(cuda_file)
--------------------------------------------------------------------------------
/templates/swift/metal_manager.swift:
--------------------------------------------------------------------------------
1 | import Metal
2 | import Foundation
3 | 
4 | class MetalManager {
5 |     private let device: MTLDevice
6 |     private let commandQueue: MTLCommandQueue
7 | 
8 |     init(device: MTLDevice) {
9 |         self.device = device
10 |         self.commandQueue = device.makeCommandQueue()!
11 |     }
12 | 
13 |     func executeKernel(functionName: String, inputBuffer: MTLBuffer, outputBuffer: MTLBuffer) {
14 |         guard let library = device.makeDefaultLibrary(),
15 |               let function = library.makeFunction(name: functionName) else {
16 |             print("Failed to find the function \(functionName)")
17 |             return
18 |         }
19 | 
20 |         do {
21 |             let pipelineState = try device.makeComputePipelineState(function: function)
22 |             guard let commandBuffer = commandQueue.makeCommandBuffer(),
23 |                   let commandEncoder = commandBuffer.makeComputeCommandEncoder() else {
24 |                 print("Failed to create command encoder")
25 |                 return
26 |             }
27 | 
28 |             commandEncoder.setComputePipelineState(pipelineState)
29 |             commandEncoder.setBuffer(inputBuffer, offset: 0, index: 0)
30 |             commandEncoder.setBuffer(outputBuffer, offset: 0, index: 1)
31 | 
32 |             let gridSize = MTLSize(width: 256, height: 1, depth: 1)
33 |             let threadGroupSize = MTLSize(width: 16, height: 1, depth: 1)
34 |             commandEncoder.dispatchThreads(gridSize, threadsPerThreadgroup: threadGroupSize)
35 | 
36 |             commandEncoder.endEncoding()
37 |             commandBuffer.commit()
38 |             commandBuffer.waitUntilCompleted()
39 | 
40 |             print("Kernel execution completed")
41 |         } catch {
42 |             print("Error creating pipeline state: \(error)")
43 |         }
44 |     }
45 | }
46 | 
--------------------------------------------------------------------------------
/templates/objc/metal_manager.m:
--------------------------------------------------------------------------------
1 | #import "metal_manager.h"
2 | 
3 | @implementation MetalManager {
4 |     id<MTLDevice> _device;
5 |     id<MTLCommandQueue> _commandQueue;
6 | }
7 | 
8 | - (instancetype)initWithDevice:(id<MTLDevice>)device {
9 |     self = [super init];
10 |     if (self) {
11 |         _device = device;
12 |         _commandQueue = [_device newCommandQueue];
13 |     }
14 |     return self;
15 | }
16 | 
17 | - (void)executeKernelWithName:(NSString *)kernelName
18 |                     withInput:(id<MTLBuffer>)inputBuffer
19 |                  outputBuffer:(id<MTLBuffer>)outputBuffer {
20 |     NSError *error = nil;
21 |     id<MTLLibrary> library = [_device newDefaultLibrary];
22 |     id<MTLFunction> function = [library newFunctionWithName:kernelName];
23 | 
24 |     if (!function) {
25 |         NSLog(@"Failed to load kernel function: %@", kernelName);
26 |         return;
27 |     }
28 | 
29 |     id<MTLComputePipelineState> pipelineState = [_device newComputePipelineStateWithFunction:function error:&error];
30 |     if (error) {
NSLog(@"Error creating pipeline state: %@", error.localizedDescription); 32 | return; 33 | } 34 | 35 | id commandBuffer = [_commandQueue commandBuffer]; 36 | id commandEncoder = [commandBuffer computeCommandEncoder]; 37 | 38 | [commandEncoder setComputePipelineState:pipelineState]; 39 | [commandEncoder setBuffer:inputBuffer offset:0 atIndex:0]; 40 | [commandEncoder setBuffer:outputBuffer offset:0 atIndex:1]; 41 | 42 | MTLSize gridSize = MTLSizeMake(256, 1, 1); 43 | MTLSize threadGroupSize = MTLSizeMake(16, 1, 1); 44 | [commandEncoder dispatchThreads:gridSize threadsPerThreadgroup:threadGroupSize]; 45 | 46 | [commandEncoder endEncoding]; 47 | [commandBuffer commit]; 48 | [commandBuffer waitUntilCompleted]; 49 | 50 | NSLog(@"Kernel execution complete."); 51 | } 52 | 53 | @end 54 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 34 | -------------------------------------------------------------------------------- /parser/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | CUDA Parser Module Initialization 3 | Provides complete type system and node hierarchy for CUDA to Metal translation. 4 | 5 | Usage: 6 | from CUDAM.parser import CUDAKernel, CUDAType, CUDAQualifier 7 | """ 8 | 9 | # Core node system imports using absolute imports 10 | from core.parser.ast_nodes import ( 11 | # Core node types and enums 12 | CUDANode, 13 | CUDAKernel, 14 | CUDAParameter, 15 | CUDAType, 16 | CUDAQualifier, 17 | CUDASharedMemory, 18 | CUDAThreadIdx, 19 | CUDABarrier, 20 | CUDACompoundStmt, 21 | CUDAExpressionNode, 22 | CUDAStatement, 23 | FunctionNode, 24 | KernelNode, 25 | VariableNode, 26 | StructNode, 27 | EnumNode, 28 | TypedefNode, 29 | ClassNode, 30 | NamespaceNode, 31 | TemplateNode, 32 | CudaASTNode, 33 | CudaTranslationContext 34 | ) 35 | 36 | # Core configuration 37 | VERSION = "1.0.0" 38 | METAL_TARGET = "2.4" 39 | OPTIMIZATION_LEVEL = 2 40 | 41 | # Public API - Defines exactly what gets exported 42 | __all__ = [ 43 | "CUDANode", 44 | "CUDAKernel", 45 | "CUDAParameter", 46 | "CUDAType", 47 | "CUDAQualifier", 48 | "CUDASharedMemory", 49 | "CUDAThreadIdx", 50 | "CUDABarrier", 51 | "CUDACompoundStmt", 52 | "CUDAExpressionNode", 53 | "CUDAStatement", 54 | "FunctionNode", 55 | "KernelNode", 56 | "VariableNode", 57 | "StructNode", 58 | "EnumNode", 59 | "TypedefNode", 60 | "ClassNode", 61 | "NamespaceNode", 62 | "TemplateNode", 63 | "CudaASTNode", 64 | "CudaTranslationContext" 65 | ] 66 | 67 | # Convenience aliases 68 | KernelNode = CUDAKernel 69 | ParameterNode = CUDAParameter 70 | CompoundStmtNode = CUDACompoundStmt 71 | 72 | # Initialize configuration 73 | def init_translation( 74 | source_file: str, 75 | metal_target: str = METAL_TARGET, 76 | optimization_level: int = OPTIMIZATION_LEVEL 77 | ) -> CudaTranslationContext: 78 | """Initialize AST translation context with specified parameters.""" 79 | return CudaTranslationContext( 80 | source_file=source_file, 81 | metal_target=metal_target, 82 | optimization_level=optimization_level 83 | ) 84 | 85 | # Error checking and validation 86 | def validate_ast(node: CUDANode) -> bool: 87 | """Validate AST node and its children for Metal compatibility.""" 88 | if not isinstance(node, CUDANode): 89 | return False 90 | return all(validate_ast(child) for child in node.children) 91 | 
--------------------------------------------------------------------------------
/templates/metal/header_template.h:
--------------------------------------------------------------------------------
1 | #ifndef CUDAMetalKernel_h
2 | #define CUDAMetalKernel_h
3 | 
4 | #include <metal_stdlib>
5 | #include <metal_atomic>
6 | #include <metal_math>
7 | #include <metal_simdgroup>
8 | 
9 | using namespace metal;
10 | 
11 | // CUDA-style vector types: <metal_stdlib> already provides int2/3/4,
12 | // uint2/3/4 and float2/3/4 with CUDA-compatible .x/.y/.z/.w members, so
13 | // they are used directly; redefining them here would clash with the
14 | // metal namespace pulled in above.
15 | 
16 | // Thread indexing
17 | #define threadIdx_x (thread_position_in_threadgroup.x)
18 | #define threadIdx_y (thread_position_in_threadgroup.y)
19 | #define threadIdx_z (thread_position_in_threadgroup.z)
20 | #define blockIdx_x (threadgroup_position_in_grid.x)
21 | #define blockIdx_y (threadgroup_position_in_grid.y)
22 | #define blockIdx_z (threadgroup_position_in_grid.z)
23 | #define blockDim_x (threads_per_threadgroup.x)
24 | #define blockDim_y (threads_per_threadgroup.y)
25 | #define blockDim_z (threads_per_threadgroup.z)
26 | #define gridDim_x (threadgroups_per_grid.x)
27 | #define gridDim_y (threadgroups_per_grid.y)
28 | #define gridDim_z (threadgroups_per_grid.z)
29 | 
30 | // Common kernel parameters structure
31 | struct KernelParameters {
32 |     uint problemSize;
33 |     uint batchSize;
34 |     float learningRate;
35 |     float4 reserved; // For alignment
36 | };
37 | 
38 | // CUDA synchronization primitives
39 | #define __syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup)
40 | #define __threadfence() threadgroup_barrier(mem_flags::mem_device)
41 | #define __threadfence_block() threadgroup_barrier(mem_flags::mem_threadgroup)
42 | 
43 | // CUDA atomic operations
44 | template <typename T>
45 | METAL_FUNC T atomicAdd(device atomic_uint* addr, T val) {
46 |     return atomic_fetch_add_explicit(addr, val, memory_order_relaxed);
47 | }
48 | 
49 | template <typename T>
50 | METAL_FUNC T atomicMax(device atomic_uint* addr, T val) {
51 |     return atomic_fetch_max_explicit(addr, val, memory_order_relaxed);
52 | }
53 | 
54 | // CUDA math functions
55 | #define __fdividef(x, y) ((x) / (y))
56 | #define __expf(x) metal::exp(x)
57 | #define __logf(x) metal::log(x)
58 | #define __powf(x, y) metal::pow(x, y)
59 | 
60 | // SIMD group operations
61 | #define METAL_WARP_SIZE 32
62 | #define warpSize METAL_WARP_SIZE
63 | 
64 | METAL_FUNC uint get_lane_id() {
65 |     return threadIdx_x & (METAL_WARP_SIZE - 1);
66 | }
67 | 
68 | METAL_FUNC uint get_warp_id() {
69 |     return threadIdx_x >> 5;
70 | }
71 | 
72 | // Memory space qualifiers
73 | #define __shared__ threadgroup
74 | #define __constant__ constant
75 | #define __device__ device
76 | 
77 | #endif /* CUDAMetalKernel_h */
--------------------------------------------------------------------------------
/problems.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import json
4 | from pathlib import Path
5 | 
6 | def run_pylint(project_dir):
7 |     """
8 |     Runs pylint on the specified project directory and returns the JSON output.
9 | """ 10 | try: 11 | # Run pylint with JSON output 12 | result = subprocess.run( 13 | ['pylint', project_dir, '--output-format=json'], 14 | stdout=subprocess.PIPE, 15 | stderr=subprocess.PIPE, 16 | text=True, 17 | check=False # Don't raise exception on non-zero exit 18 | ) 19 | 20 | if result.stderr: 21 | print("Pylint encountered an error:") 22 | print(result.stderr) 23 | # Continue processing even if pylint reports errors (like syntax errors) 24 | 25 | # Parse JSON output 26 | pylint_output = json.loads(result.stdout) 27 | return pylint_output 28 | 29 | except FileNotFoundError: 30 | print("Pylint is not installed or not found in the system PATH.") 31 | return None 32 | except json.JSONDecodeError: 33 | print("Failed to parse pylint output. Ensure pylint is producing valid JSON.") 34 | return None 35 | 36 | def extract_errors(pylint_output): 37 | """ 38 | Extracts only error and fatal issues from pylint output. 39 | 40 | Args: 41 | pylint_output (list): The JSON-parsed output from pylint. 42 | 43 | Returns: 44 | list: Filtered list of error issues. 45 | """ 46 | error_issues = [ 47 | { 48 | 'File': issue.get('path', ''), 49 | 'Line': issue.get('line', ''), 50 | 'Column': issue.get('column', ''), 51 | 'Symbol': issue.get('symbol', ''), 52 | 'Message': issue.get('message', ''), 53 | 'Type': issue.get('type', '') 54 | } 55 | for issue in pylint_output 56 | if issue.get('type', '').lower() in ['error', 'fatal'] and issue.get('message-id', '').startswith(('E', 'F')) 57 | ] 58 | 59 | return error_issues 60 | 61 | def main(): 62 | # Define your project directory 63 | project_dir = Path(r'C:\Users\PC\Desktop\Megie\CUDAM\CUDAM') 64 | 65 | if not project_dir.exists(): 66 | print(f"The directory {project_dir} does not exist.") 67 | return 68 | 69 | print(f"Running pylint on {project_dir}...") 70 | 71 | pylint_output = run_pylint(str(project_dir)) 72 | 73 | if pylint_output is None: 74 | print("No pylint output to process.") 75 | return 76 | 77 | relevant_errors = extract_errors(pylint_output) 78 | 79 | print("\n=== Pylint Errors ===") 80 | if relevant_errors: 81 | for issue in relevant_errors: 82 | print(f"{issue['File']}:{issue['Line']}:{issue['Column']} - {issue['Message']} [{issue['Symbol']}] ({issue['Type'].capitalize()})") 83 | else: 84 | print("No errors found.") 85 | 86 | # Optionally, save the results to a file 87 | save_results = True # Set to False if you don't want to save 88 | if save_results: 89 | errors_file = project_dir / 'pylint_errors.txt' 90 | 91 | with open(errors_file, 'w', encoding='utf-8') as f: 92 | for issue in relevant_errors: 93 | f.write(f"{issue['File']}:{issue['Line']}:{issue['Column']} - {issue['Message']} [{issue['Symbol']}] ({issue['Type'].capitalize()})\n") 94 | 95 | print(f"\nErrors saved to {errors_file}") 96 | 97 | if __name__ == "__main__": 98 | main() 99 | -------------------------------------------------------------------------------- /translator/host_adapter.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict, Any 3 | from ..utils.error_handler import CudaTranslationError 4 | from ..utils.logger import get_logger 5 | from ..translator.kernel_translator import KernelTranslator 6 | from ..translator.memory_model_translator import MemoryModelTranslator 7 | 8 | logger = get_logger(__name__) 9 | 10 | class HostAdapter: 11 | def __init__(self, kernel_translator: KernelTranslator, memory_translator: MemoryModelTranslator): 12 | self.kernel_translator = kernel_translator 13 | 
self.memory_translator = memory_translator 14 | self.cuda_to_metal_api = { 15 | 'cudaMalloc': 'newBufferWithLength', 16 | 'cudaFree': None, 17 | 'cudaMemcpy': 'contents', 18 | 'cudaStreamCreate': 'newCommandQueue', 19 | 'cudaStreamDestroy': None, 20 | 'cudaEventCreate': 'newEvent', 21 | 'cudaEventRecord': 'enqueue', 22 | 'cudaEventSynchronize': 'waitUntilCompleted', 23 | 'cudaDeviceSynchronize': 'commit' 24 | } 25 | 26 | def translate_host_code(self, cuda_code: str) -> str: 27 | metal_code = cuda_code 28 | 29 | for cuda_api, metal_api in self.cuda_to_metal_api.items(): 30 | if metal_api: 31 | metal_code = metal_code.replace(cuda_api, metal_api) 32 | else: 33 | metal_code = self.remove_unsupported_call(metal_code, cuda_api) 34 | 35 | metal_code = self.adapt_kernel_launches(metal_code) 36 | metal_code = self.translate_memory_management(metal_code) 37 | return metal_code 38 | 39 | def remove_unsupported_call(self, code: str, api_call: str) -> str: 40 | pattern = rf'{api_call}\s*\([^)]*\);' 41 | return re.sub(pattern, f'// Removed unsupported CUDA call: {api_call}', code) 42 | 43 | def adapt_kernel_launches(self, code: str) -> str: 44 | kernel_launch_pattern = r'(\w+)<<<(.+?)>>>(.+?);' 45 | 46 | def replace_kernel_launch(match): 47 | kernel_name = match.group(1) 48 | launch_params = match.group(2).split(',') 49 | kernel_args = match.group(3) 50 | 51 | grid_dim = launch_params[0].strip() 52 | block_dim = launch_params[1].strip() 53 | 54 | return f""" 55 | MTLSize gridSize = MTLSizeMake({grid_dim}, 1, 1); 56 | MTLSize threadGroupSize = MTLSizeMake({block_dim}, 1, 1); 57 | [commandEncoder setComputePipelineState:{kernel_name}PipelineState]; 58 | [commandEncoder dispatchThreadgroups:gridSize threadsPerThreadgroup:threadGroupSize]; 59 | {self.kernel_translator.translate_kernel(kernel_name)}{kernel_args}; 60 | """ 61 | 62 | return re.sub(kernel_launch_pattern, replace_kernel_launch, code) 63 | 64 | def translate_memory_management(self, code: str) -> str: 65 | malloc_pattern = r'cudaMalloc\(\(void\*\*\)&(\w+),\s*(.+?)\);' 66 | code = re.sub(malloc_pattern, lambda m: f"{m.group(1)} = [device newBufferWithLength:{m.group(2)} options:MTLResourceStorageModeShared];", code) 67 | 68 | memcpy_pattern = r'cudaMemcpy\((.+?),\s*(.+?),\s*(.+?),\s*cudaMemcpy(.+?)\);' 69 | code = re.sub(memcpy_pattern, lambda m: f"memcpy({m.group(1)}.contents, {m.group(2)}, {m.group(3)});", code) 70 | 71 | return code 72 | 73 | def generate_metal_setup(self) -> str: 74 | return """ 75 | id device = MTLCreateSystemDefaultDevice(); 76 | id commandQueue = [device newCommandQueue]; 77 | id commandBuffer = [commandQueue commandBuffer]; 78 | id commandEncoder = [commandBuffer computeCommandEncoder]; 79 | """ 80 | 81 | def generate_metal_cleanup(self) -> str: 82 | return """ 83 | [commandEncoder endEncoding]; 84 | [commandBuffer commit]; 85 | [commandBuffer waitUntilCompleted]; 86 | """ 87 | 88 | logger.info("HostAdapter initialized for CUDA to Metal host code translation.") -------------------------------------------------------------------------------- /testdata.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | def generate_project_structure(directory, indent_level=0): 5 | structure = "" 6 | for root, dirs, files in os.walk(directory): 7 | if any(ignored in root for ignored in ['venv', '.git', 'node_modules','public']): 8 | continue 9 | 10 | level = root.replace(directory, '').count(os.sep) 11 | indent = '│ ' * (level - indent_level) 12 | structure += f"{indent}├── 
{os.path.basename(root)}/\n" 13 | sub_indent = '│ ' * (level + 1 - indent_level) 14 | for file in files: 15 | structure += f"{sub_indent}├── {file}\n" 16 | dirs[:] = [d for d in dirs if d not in ['venv', '.git', 'node_modules','public']] # Skip these directories 17 | 18 | return structure 19 | 20 | def extract_classes_and_methods(content): 21 | class_regex = r'class\s+(\w+)\s*(\(.*?\))?:' 22 | frontend_method_regex = r'(?:render_template|get|post|route)\s*\(.*?\)' # Matches common Flask or Django view methods 23 | 24 | extracted_content = "" 25 | class_matches = re.findall(class_regex, content) 26 | 27 | for class_match in class_matches: 28 | class_name = class_match 29 | extracted_content += f"\nClass: {class_name}\n" 30 | extracted_content += "-" * 80 + "\n" 31 | 32 | method_matches = re.findall(frontend_method_regex, content) 33 | for method_match in method_matches: 34 | extracted_content += f" Method: {method_match}\n" 35 | 36 | return extracted_content 37 | 38 | def read_frontend_files(directory): 39 | content = "" 40 | for root, dirs, files in os.walk(directory): 41 | if any(ignored in root for ignored in ['venv', '.git', 'node_modules','public','build']): 42 | continue 43 | 44 | for file in files: 45 | if file.endswith(('.metal', '.h', '.m', '.swift', '.py', '.cu', '.cuh')): 46 | file_path = os.path.join(root, file) 47 | print(f"Processing file: {file_path}") 48 | content += f"File: {file_path}\n\n" 49 | try: 50 | with open(file_path, 'r', encoding='utf-8') as f: 51 | file_content = f.read() 52 | content += file_content 53 | 54 | # Extract classes and methods if it's a Python file for frontend views 55 | if file.endswith(('.metal', '.h', '.m', '.swift', '.py', '.cu', '.cuh')): 56 | extracted_classes_methods = extract_classes_and_methods(file_content) 57 | content += extracted_classes_methods 58 | 59 | except UnicodeDecodeError: 60 | try: 61 | with open(file_path, 'r', encoding='ISO-8859-1') as f: 62 | file_content = f.read() 63 | content += file_content 64 | except Exception as e: 65 | content += f"Error reading file: {e}" 66 | content += "\n\n" + "-"*80 + "\n\n" 67 | dirs[:] = [d for d in dirs if d not in ['venv', '.git', 'node_modules','public','build']] # Skip these directories 68 | return content 69 | 70 | def save_content_to_txt(directory, output_file): 71 | print("Starting the process...") 72 | project_structure = generate_project_structure(directory) 73 | frontend_content = read_frontend_files(directory) 74 | with open(output_file, 'w', encoding='utf-8') as f: 75 | f.write("Project Structure:\n\n") 76 | f.write(project_structure) 77 | f.write("\n\n" + "="*80 + "\n\n") 78 | f.write("Frontend File Contents:\n\n") 79 | f.write(frontend_content) 80 | print("Process completed successfully.") 81 | 82 | # Usage 83 | project_directory = r"C:\Users\PC\Desktop\Megie\CUDAM\CUDAM" 84 | output_file = r"C:\Users\PC\Desktop\Megie\CUDAM\CUDAM\babouchka.txt" 85 | 86 | try: 87 | save_content_to_txt(project_directory, output_file) 88 | except PermissionError: 89 | print("Permission denied. 
Please check your write permissions or choose a different output location.") 90 | except Exception as e: 91 | print(f"An error occurred: {e}") -------------------------------------------------------------------------------- /utils/metal_equivalents.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Callable, Any, List, Optional 2 | from .cuda_builtin_functions import CudaBuiltinFunction, CUDA_BUILTIN_FUNCTIONS 3 | from .cuda_to_metal_type_mapping import map_cuda_type_to_metal 4 | 5 | class MetalEquivalent: 6 | def __init__(self, cuda_function: str, metal_function: str, 7 | argument_transformer: Optional[Callable[[List[str]], List[str]]] = None, 8 | return_transformer: Optional[Callable[[str], str]] = None, 9 | requires_custom_implementation: bool = False): 10 | self.cuda_function = cuda_function 11 | self.metal_function = metal_function 12 | self.argument_transformer = argument_transformer 13 | self.return_transformer = return_transformer 14 | self.requires_custom_implementation = requires_custom_implementation 15 | 16 | def transform_arguments(self, args: List[str]) -> List[str]: 17 | if self.argument_transformer: 18 | return self.argument_transformer(args) 19 | return args 20 | 21 | def transform_return(self, return_value: str) -> str: 22 | if self.return_transformer: 23 | return self.return_transformer(return_value) 24 | return return_value 25 | 26 | def threadIdx_transformer(args: List[str]) -> List[str]: 27 | return ['thread_position_in_threadgroup'] 28 | 29 | def blockIdx_transformer(args: List[str]) -> List[str]: 30 | return ['threadgroup_position_in_grid'] 31 | 32 | def atomicAdd_transformer(args: List[str]) -> List[str]: 33 | return [f'atomic_fetch_add_explicit({args[0]}, {args[1]}, memory_order_relaxed)'] 34 | 35 | METAL_EQUIVALENTS: Dict[str, MetalEquivalent] = { 36 | 'threadIdx': MetalEquivalent('threadIdx', 'thread_position_in_threadgroup', threadIdx_transformer), 37 | 'blockIdx': MetalEquivalent('blockIdx', 'threadgroup_position_in_grid', blockIdx_transformer), 38 | 'blockDim': MetalEquivalent('blockDim', 'threadgroup_size'), 39 | 'gridDim': MetalEquivalent('gridDim', 'grid_size'), 40 | '__syncthreads': MetalEquivalent('__syncthreads', 'threadgroup_barrier(metal::mem_flags::mem_device)'), 41 | 'atomicAdd': MetalEquivalent('atomicAdd', 'atomic_fetch_add_explicit', atomicAdd_transformer), 42 | 'cudaMalloc': MetalEquivalent('cudaMalloc', 'device.makeBuffer', requires_custom_implementation=True), 43 | 'cudaFree': MetalEquivalent('cudaFree', '', requires_custom_implementation=True), # No direct equivalent, memory management is different 44 | 'cudaMemcpy': MetalEquivalent('cudaMemcpy', 'memcpy', requires_custom_implementation=True), 45 | } 46 | 47 | def get_metal_equivalent(cuda_function: str) -> MetalEquivalent: 48 | if cuda_function in METAL_EQUIVALENTS: 49 | return METAL_EQUIVALENTS[cuda_function] 50 | 51 | # For CUDA built-in functions not explicitly defined in METAL_EQUIVALENTS 52 | if cuda_function in CUDA_BUILTIN_FUNCTIONS: 53 | cuda_builtin = CUDA_BUILTIN_FUNCTIONS[cuda_function] 54 | return MetalEquivalent(cuda_function, cuda_builtin.metal_equivalent) 55 | 56 | # If no equivalent is found, return the original function name 57 | return MetalEquivalent(cuda_function, cuda_function) 58 | 59 | def translate_cuda_call_to_metal(cuda_function: str, args: List[str]) -> str: 60 | equivalent = get_metal_equivalent(cuda_function) 61 | transformed_args = equivalent.transform_arguments(args) 62 | 63 | if 
equivalent.requires_custom_implementation: 64 | return f"// TODO: Implement custom Metal equivalent for {cuda_function}\n" \ 65 | f"// {equivalent.metal_function}({', '.join(transformed_args)})" 66 | 67 | return f"{equivalent.metal_function}({', '.join(transformed_args)})" 68 | 69 | def get_metal_type(cuda_type: str) -> str: 70 | return map_cuda_type_to_metal(cuda_type) 71 | 72 | def generate_metal_kernel_signature(kernel_name: str, parameters: List[CudaBuiltinFunction]) -> str: 73 | metal_params = [] 74 | for i, param in enumerate(parameters): 75 | metal_type = get_metal_type(param.return_type) 76 | metal_params.append(f"{metal_type} {param.name} [[buffer({i})]]") 77 | 78 | return f"kernel void {kernel_name}({', '.join(metal_params)})" 79 | 80 | -------------------------------------------------------------------------------- /utils/cuda_to_metal_type_mapping.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | class TypeMapping: 4 | def __init__(self, cuda_type: str, metal_type: str, 5 | requires_header: bool = False, 6 | metal_header: Optional[str] = None): 7 | self.cuda_type = cuda_type 8 | self.metal_type = metal_type 9 | self.requires_header = requires_header 10 | self.metal_header = metal_header 11 | 12 | def __str__(self): 13 | return f"{self.cuda_type} -> {self.metal_type}" 14 | 15 | CUDA_TO_METAL_TYPE_MAP: Dict[str, TypeMapping] = { 16 | # Integer types 17 | 'char': TypeMapping('char', 'char'), 18 | 'signed char': TypeMapping('signed char', 'char'), 19 | 'unsigned char': TypeMapping('unsigned char', 'uchar'), 20 | 'short': TypeMapping('short', 'short'), 21 | 'unsigned short': TypeMapping('unsigned short', 'ushort'), 22 | 'int': TypeMapping('int', 'int'), 23 | 'unsigned int': TypeMapping('unsigned int', 'uint'), 24 | 'long': TypeMapping('long', 'int'), # In Metal, long is 32-bit 25 | 'unsigned long': TypeMapping('unsigned long', 'uint'), 26 | 'long long': TypeMapping('long long', 'long'), # In Metal, long long is 64-bit 27 | 'unsigned long long': TypeMapping('unsigned long long', 'ulong'), 28 | 29 | # Floating-point types 30 | 'float': TypeMapping('float', 'float'), 31 | 'double': TypeMapping('double', 'float'), # Metal doesn't support double, use float 32 | 33 | # Vector types 34 | 'char2': TypeMapping('char2', 'char2', True, ''), 35 | 'char3': TypeMapping('char3', 'char3', True, ''), 36 | 'char4': TypeMapping('char4', 'char4', True, ''), 37 | 'uchar2': TypeMapping('uchar2', 'uchar2', True, ''), 38 | 'uchar3': TypeMapping('uchar3', 'uchar3', True, ''), 39 | 'uchar4': TypeMapping('uchar4', 'uchar4', True, ''), 40 | 'short2': TypeMapping('short2', 'short2', True, ''), 41 | 'short3': TypeMapping('short3', 'short3', True, ''), 42 | 'short4': TypeMapping('short4', 'short4', True, ''), 43 | 'ushort2': TypeMapping('ushort2', 'ushort2', True, ''), 44 | 'ushort3': TypeMapping('ushort3', 'ushort3', True, ''), 45 | 'ushort4': TypeMapping('ushort4', 'ushort4', True, ''), 46 | 'int2': TypeMapping('int2', 'int2', True, ''), 47 | 'int3': TypeMapping('int3', 'int3', True, ''), 48 | 'int4': TypeMapping('int4', 'int4', True, ''), 49 | 'uint2': TypeMapping('uint2', 'uint2', True, ''), 50 | 'uint3': TypeMapping('uint3', 'uint3', True, ''), 51 | 'uint4': TypeMapping('uint4', 'uint4', True, ''), 52 | 'float2': TypeMapping('float2', 'float2', True, ''), 53 | 'float3': TypeMapping('float3', 'float3', True, ''), 54 | 'float4': TypeMapping('float4', 'float4', True, ''), 55 | 56 | # CUDA-specific types 57 | 'dim3': 
TypeMapping('dim3', 'uint3', True, ''), 58 | 'cudaError_t': TypeMapping('cudaError_t', 'int'), 59 | 'cudaStream_t': TypeMapping('cudaStream_t', 'metal::command_queue'), 60 | 'cudaEvent_t': TypeMapping('cudaEvent_t', 'metal::event'), 61 | } 62 | 63 | def map_cuda_type_to_metal(cuda_type: str) -> str: 64 | mapping = CUDA_TO_METAL_TYPE_MAP.get(cuda_type) 65 | return mapping.metal_type if mapping else cuda_type 66 | 67 | def requires_metal_header(cuda_type: str) -> bool: 68 | mapping = CUDA_TO_METAL_TYPE_MAP.get(cuda_type) 69 | return mapping.requires_header if mapping else False 70 | 71 | def get_metal_header(cuda_type: str) -> Optional[str]: 72 | mapping = CUDA_TO_METAL_TYPE_MAP.get(cuda_type) 73 | return mapping.metal_header if mapping else None 74 | 75 | def is_vector_type(type_name: str) -> bool: 76 | return type_name.lower() in [ 77 | 'char2', 'char3', 'char4', 78 | 'uchar2', 'uchar3', 'uchar4', 79 | 'short2', 'short3', 'short4', 80 | 'ushort2', 'ushort3', 'ushort4', 81 | 'int2', 'int3', 'int4', 82 | 'uint2', 'uint3', 'uint4', 83 | 'float2', 'float3', 'float4' 84 | ] 85 | 86 | def get_vector_component_type(vector_type: str) -> str: 87 | base_type = vector_type.rstrip('234') 88 | return map_cuda_type_to_metal(base_type) 89 | 90 | def get_vector_size(vector_type: str) -> int: 91 | return int(vector_type[-1]) if vector_type[-1].isdigit() else 0 -------------------------------------------------------------------------------- /utils/error_handler.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Dict, Any 2 | import traceback 3 | 4 | class CudaError(Exception): 5 | """Base class for CUDA-related errors.""" 6 | def __init__(self, message: str, error_code: Optional[int] = None, details: Optional[Dict[str, Any]] = None): 7 | self.message = message 8 | self.error_code = error_code 9 | self.details = details or {} 10 | super().__init__(self.message) 11 | 12 | def __str__(self): 13 | error_str = f"[Error {self.error_code}] " if self.error_code else "" 14 | error_str += self.message 15 | if self.details: 16 | error_str += "\nDetails:\n" + "\n".join(f" {k}: {v}" for k, v in self.details.items()) 17 | return error_str 18 | 19 | class CudaParseError(CudaError): 20 | """Exception raised for errors in parsing CUDA code.""" 21 | def __init__(self, message: str, line: Optional[int] = None, column: Optional[int] = None, filename: Optional[str] = None): 22 | details = {"line": line, "column": column, "filename": filename} 23 | super().__init__(message, error_code=1001, details=details) 24 | 25 | class CudaTranslationError(CudaError): 26 | """Exception raised for errors in translating CUDA code to Metal.""" 27 | def __init__(self, message: str, cuda_construct: Optional[str] = None, metal_equivalent: Optional[str] = None): 28 | details = {"cuda_construct": cuda_construct, "metal_equivalent": metal_equivalent} 29 | super().__init__(message, error_code=2001, details=details) 30 | 31 | class CudaTypeError(CudaError): 32 | """Exception raised for type-related errors in CUDA code.""" 33 | def __init__(self, message: str, expected_type: Optional[str] = None, actual_type: Optional[str] = None): 34 | details = {"expected_type": expected_type, "actual_type": actual_type} 35 | super().__init__(message, error_code=3001, details=details) 36 | 37 | class CudaNotSupportedError(CudaError): 38 | """Exception raised for CUDA features not supported in Metal.""" 39 | def __init__(self, message: str, cuda_feature: str): 40 | details = {"cuda_feature": 
cuda_feature} 41 | super().__init__(message, error_code=4001, details=details) 42 | 43 | class CudaWarning: 44 | """Warning class for non-critical issues in CUDA code parsing or translation.""" 45 | def __init__(self, message: str, warning_code: Optional[int] = None, details: Optional[Dict[str, Any]] = None): 46 | self.message = message 47 | self.warning_code = warning_code 48 | self.details = details or {} 49 | 50 | def __str__(self): 51 | warning_str = f"[Warning {self.warning_code}] " if self.warning_code else "" 52 | warning_str += self.message 53 | if self.details: 54 | warning_str += "\nDetails:\n" + "\n".join(f" {k}: {v}" for k, v in self.details.items()) 55 | return warning_str 56 | 57 | def handle_exception(e: Exception, logger): 58 | """ 59 | Handle exceptions, log them, and optionally perform additional actions. 60 | """ 61 | if isinstance(e, CudaError): 62 | logger.error(str(e)) 63 | else: 64 | logger.error(f"Unexpected error: {str(e)}") 65 | logger.debug(f"Stack trace:\n{''.join(traceback.format_tb(e.__traceback__))}") 66 | 67 | def raise_cuda_parse_error(message: str, line: Optional[int] = None, column: Optional[int] = None, filename: Optional[str] = None): 68 | """Convenience function to raise a CudaParseError.""" 69 | raise CudaParseError(message, line, column, filename) 70 | 71 | def raise_cuda_translation_error(message: str, cuda_construct: Optional[str] = None, metal_equivalent: Optional[str] = None): 72 | """Convenience function to raise a CudaTranslationError.""" 73 | raise CudaTranslationError(message, cuda_construct, metal_equivalent) 74 | 75 | def raise_cuda_type_error(message: str, expected_type: Optional[str] = None, actual_type: Optional[str] = None): 76 | """Convenience function to raise a CudaTypeError.""" 77 | raise CudaTypeError(message, expected_type, actual_type) 78 | 79 | def raise_cuda_not_supported_error(message: str, cuda_feature: str): 80 | """Convenience function to raise a CudaNotSupportedError.""" 81 | raise CudaNotSupportedError(message, cuda_feature) 82 | 83 | def issue_cuda_warning(message: str, warning_code: Optional[int] = None, details: Optional[Dict[str, Any]] = None, logger=None): 84 | """Issue a CudaWarning and optionally log it.""" 85 | warning = CudaWarning(message, warning_code, details) 86 | if logger: 87 | logger.warning(str(warning)) 88 | return warning -------------------------------------------------------------------------------- /utils/cuda_builtin_functions.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple 2 | 3 | class CudaBuiltinFunction: 4 | def __init__(self, name: str, return_type: str, parameters: List[Tuple[str, str]], 5 | is_device_function: bool, metal_equivalent: str): 6 | self.name = name 7 | self.return_type = return_type 8 | self.parameters = parameters 9 | self.is_device_function = is_device_function 10 | self.metal_equivalent = metal_equivalent 11 | 12 | def __str__(self): 13 | params_str = ', '.join([f'{param_type} {param_name}' for param_name, param_type in self.parameters]) 14 | return f'{self.return_type} {self.name}({params_str})' 15 | 16 | CUDA_BUILTIN_FUNCTIONS: Dict[str, CudaBuiltinFunction] = { 17 | # Thread Management 18 | 'threadIdx': CudaBuiltinFunction('threadIdx', 'uint3', [], True, 'thread_position_in_threadgroup'), 19 | 'blockIdx': CudaBuiltinFunction('blockIdx', 'uint3', [], True, 'threadgroup_position_in_grid'), 20 | 'blockDim': CudaBuiltinFunction('blockDim', 'uint3', [], True, 'threadgroup_size'), 21 | 'gridDim': 
CudaBuiltinFunction('gridDim', 'uint3', [], True, 'grid_size'), 22 | 'warpSize': CudaBuiltinFunction('warpSize', 'int', [], True, '32'), 23 | 24 | # Synchronization 25 | '__syncthreads': CudaBuiltinFunction('__syncthreads', 'void', [], True, 'threadgroup_barrier(mem_flags::mem_device)'), 26 | '__syncwarp': CudaBuiltinFunction('__syncwarp', 'void', [('mask', 'unsigned int')], True, 'simdgroup_barrier(mem_flags::mem_none)'), 27 | 28 | # Atomic Operations 29 | 'atomicAdd': CudaBuiltinFunction('atomicAdd', 'T', [('address', 'T*'), ('val', 'T')], True, 'atomic_fetch_add_explicit'), 30 | 'atomicSub': CudaBuiltinFunction('atomicSub', 'T', [('address', 'T*'), ('val', 'T')], True, 'atomic_fetch_sub_explicit'), 31 | 'atomicExch': CudaBuiltinFunction('atomicExch', 'T', [('address', 'T*'), ('val', 'T')], True, 'atomic_exchange_explicit'), 32 | 'atomicMin': CudaBuiltinFunction('atomicMin', 'T', [('address', 'T*'), ('val', 'T')], True, 'atomic_fetch_min_explicit'), 33 | 'atomicMax': CudaBuiltinFunction('atomicMax', 'T', [('address', 'T*'), ('val', 'T')], True, 'atomic_fetch_max_explicit'), 34 | 'atomicInc': CudaBuiltinFunction('atomicInc', 'unsigned int', [('address', 'unsigned int*'), ('val', 'unsigned int')], True, 'custom_atomic_inc'), 35 | 'atomicDec': CudaBuiltinFunction('atomicDec', 'unsigned int', [('address', 'unsigned int*'), ('val', 'unsigned int')], True, 'custom_atomic_dec'), 36 | 'atomicCAS': CudaBuiltinFunction('atomicCAS', 'T', [('address', 'T*'), ('compare', 'T'), ('val', 'T')], True, 'atomic_compare_exchange_weak_explicit'), 37 | 38 | # Math Functions (subset) 39 | 'sin': CudaBuiltinFunction('sin', 'float', [('x', 'float')], False, 'sin'), 40 | 'cos': CudaBuiltinFunction('cos', 'float', [('x', 'float')], False, 'cos'), 41 | 'exp': CudaBuiltinFunction('exp', 'float', [('x', 'float')], False, 'exp'), 42 | 'log': CudaBuiltinFunction('log', 'float', [('x', 'float')], False, 'log'), 43 | 'sqrt': CudaBuiltinFunction('sqrt', 'float', [('x', 'float')], False, 'sqrt'), 44 | 45 | # Vector Types 46 | 'make_int2': CudaBuiltinFunction('make_int2', 'int2', [('x', 'int'), ('y', 'int')], False, 'int2'), 47 | 'make_float2': CudaBuiltinFunction('make_float2', 'float2', [('x', 'float'), ('y', 'float')], False, 'float2'), 48 | 49 | # Texture Functions 50 | 'tex2D': CudaBuiltinFunction('tex2D', 'float4', [('texObj', 'texture'), ('x', 'float'), ('y', 'float')], True, 'sample'), 51 | 52 | # Memory Management 53 | 'cudaMalloc': CudaBuiltinFunction('cudaMalloc', 'cudaError_t', [('devPtr', 'void**'), ('size', 'size_t')], False, 'device.makeBuffer'), 54 | 'cudaFree': CudaBuiltinFunction('cudaFree', 'cudaError_t', [('devPtr', 'void*')], False, 'None'), 55 | 'cudaMemcpy': CudaBuiltinFunction('cudaMemcpy', 'cudaError_t', [('dst', 'void*'), ('src', 'const void*'), ('count', 'size_t'), ('kind', 'cudaMemcpyKind')], False, 'memcpy'), 56 | } 57 | 58 | def is_cuda_builtin(func_name: str) -> bool: 59 | return func_name in CUDA_BUILTIN_FUNCTIONS 60 | 61 | def get_cuda_builtin(func_name: str) -> CudaBuiltinFunction: 62 | return CUDA_BUILTIN_FUNCTIONS.get(func_name) 63 | 64 | def get_metal_equivalent(func_name: str) -> str: 65 | builtin = get_cuda_builtin(func_name) 66 | return builtin.metal_equivalent if builtin else None 67 | 68 | def is_device_function(func_name: str) -> bool: 69 | builtin = get_cuda_builtin(func_name) 70 | return builtin.is_device_function if builtin else False 71 | 72 | def get_return_type(func_name: str) -> str: 73 | builtin = get_cuda_builtin(func_name) 74 | return builtin.return_type if builtin else 
None 75 | 76 | def get_parameters(func_name: str) -> List[Tuple[str, str]]: 77 | builtin = get_cuda_builtin(func_name) 78 | return builtin.parameters if builtin else [] 79 | 80 | -------------------------------------------------------------------------------- /translator/thread_hierarchy_mapper.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Tuple, Any 2 | from ..utils.error_handler import CudaTranslationError 3 | from ..utils.logger import get_logger 4 | 5 | logger = get_logger(__name__) 6 | 7 | class ThreadHierarchyMapper: 8 | def __init__(self): 9 | self.cuda_to_metal_map = { 10 | 'threadIdx': 'thread_position_in_threadgroup', 11 | 'blockIdx': 'threadgroup_position_in_grid', 12 | 'blockDim': 'threadgroup_size', 13 | 'gridDim': 'grid_size' 14 | } 15 | self.max_threads_per_threadgroup = 1024 # This may vary depending on the Metal device 16 | 17 | def map_thread_id(self, cuda_expr: str) -> str: 18 | for cuda_var, metal_var in self.cuda_to_metal_map.items(): 19 | if cuda_var in cuda_expr: 20 | return cuda_expr.replace(cuda_var, metal_var) 21 | raise CudaTranslationError(f"Unsupported CUDA thread hierarchy expression: {cuda_expr}") 22 | 23 | def calculate_global_id(self, dim: str) -> str: 24 | return f"(thread_position_in_threadgroup.{dim} + (threadgroup_position_in_grid.{dim} * threadgroup_size.{dim}))" 25 | 26 | def translate_launch_parameters(self, grid_dim: Tuple[int, int, int], block_dim: Tuple[int, int, int]) -> Dict[str, Any]: 27 | optimized_grid_dim, optimized_block_dim = self.optimize_thread_hierarchy(grid_dim, block_dim) 28 | return { 29 | 'threads_per_threadgroup': self._create_metal_size(optimized_block_dim), 30 | 'threadgroups_per_grid': self._create_metal_size(optimized_grid_dim) 31 | } 32 | 33 | def _create_metal_size(self, dim: Tuple[int, int, int]) -> str: 34 | return f"MTLSizeMake({dim[0]}, {dim[1]}, {dim[2]})" 35 | 36 | def generate_metal_dispatch(self, kernel_name: str, grid_dim: Tuple[int, int, int], block_dim: Tuple[int, int, int]) -> str: 37 | launch_params = self.translate_launch_parameters(grid_dim, block_dim) 38 | return f""" 39 | [commandEncoder setComputePipelineState:{kernel_name}PipelineState]; 40 | [commandEncoder dispatchThreadgroups:{launch_params['threadgroups_per_grid']} 41 | threadsPerThreadgroup:{launch_params['threads_per_threadgroup']}]; 42 | """ 43 | 44 | def translate_shared_memory(self, cuda_shared_mem: str) -> str: 45 | return cuda_shared_mem.replace("__shared__", "threadgroup") 46 | 47 | def translate_syncthreads(self) -> str: 48 | return "threadgroup_barrier(metal::mem_flags::mem_threadgroup);" 49 | 50 | def translate_block_sync(self) -> str: 51 | return "threadgroup_barrier(metal::mem_flags::mem_device);" 52 | 53 | def translate_grid_sync(self) -> str: 54 | logger.warning("Grid-wide synchronization is not directly supported in Metal. 
Using device memory barrier.") 55 | return "threadgroup_barrier(metal::mem_flags::mem_device);" 56 | 57 | def optimize_thread_hierarchy(self, grid_dim: Tuple[int, int, int], block_dim: Tuple[int, int, int]) -> Tuple[Tuple[int, int, int], Tuple[int, int, int]]: 58 | total_threads = block_dim[0] * block_dim[1] * block_dim[2] 59 | if total_threads > self.max_threads_per_threadgroup: 60 | scale_factor = (self.max_threads_per_threadgroup / total_threads) ** (1/3) 61 | new_block_dim = tuple(int(dim * scale_factor) for dim in block_dim) 62 | new_grid_dim = tuple(int(grid_dim[i] * (block_dim[i] / new_block_dim[i])) for i in range(3)) 63 | return new_grid_dim, new_block_dim 64 | 65 | # Ensure block dimensions are multiples of the SIMD width (usually 32 for Metal GPUs) 66 | simd_width = 32 67 | optimized_block_dim = tuple(((dim + simd_width - 1) // simd_width) * simd_width for dim in block_dim) 68 | 69 | # Adjust grid dimensions to account for changes in block dimensions 70 | optimized_grid_dim = tuple((grid_dim[i] * block_dim[i] + optimized_block_dim[i] - 1) // optimized_block_dim[i] for i in range(3)) 71 | 72 | return optimized_grid_dim, optimized_block_dim 73 | 74 | def translate_warp_level_operations(self, cuda_expr: str) -> str: 75 | warp_ops = { 76 | '__shfl': 'simd_shuffle', 77 | '__shfl_up': 'simd_shuffle_up', 78 | '__shfl_down': 'simd_shuffle_down', 79 | '__shfl_xor': 'simd_shuffle_xor', 80 | '__all': 'simd_all', 81 | '__any': 'simd_any', 82 | '__ballot': 'simd_ballot' 83 | } 84 | for cuda_op, metal_op in warp_ops.items(): 85 | if cuda_op in cuda_expr: 86 | return cuda_expr.replace(cuda_op, metal_op) 87 | return cuda_expr 88 | 89 | def adjust_kernel_launch(self, kernel_name: str, grid_dim: Tuple[int, int, int], block_dim: Tuple[int, int, int]) -> str: 90 | optimized_grid_dim, optimized_block_dim = self.optimize_thread_hierarchy(grid_dim, block_dim) 91 | return self.generate_metal_dispatch(kernel_name, optimized_grid_dim, optimized_block_dim) 92 | 93 | logger.info("ThreadHierarchyMapper initialized for CUDA to Metal thread hierarchy translation.") -------------------------------------------------------------------------------- /core/parser/clang_integration.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Union, Tuple 2 | from pathlib import Path 3 | import logging 4 | import clang.cindex 5 | from clang.cindex import Index, TranslationUnit, Cursor, CursorKind, TypeKind 6 | 7 | from .ast_nodes import ( 8 | CUDAType, 9 | CUDAQualifier, 10 | CUDANode, 11 | CUDAKernel, 12 | CUDAParameter, 13 | CUDACompoundStmt, 14 | CUDAThreadIdx, 15 | CUDABlockIdx, 16 | CUDAGridDim, 17 | CUDAAtomicOperation, 18 | CUDASharedMemory, 19 | CUDATexture, 20 | CUDABarrier, 21 | SourceLocation, 22 | CUDANodeType 23 | ) 24 | 25 | class ClangParser: 26 | """CUDA parser using Clang's Python bindings""" 27 | 28 | def __init__(self, cuda_path: Optional[str] = None): 29 | self.index = Index.create() 30 | self.cuda_path = cuda_path or self._find_cuda_path() 31 | self.cuda_version = self._detect_cuda_version() 32 | self._init_compilation_args() 33 | 34 | def _find_cuda_path(self) -> str: 35 | """Find CUDA installation path""" 36 | common_paths = [ 37 | "/usr/local/cuda", 38 | "/usr/cuda", 39 | "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA", 40 | "C:/CUDA" 41 | ] 42 | 43 | for path in common_paths: 44 | if Path(path).exists(): 45 | return str(Path(path)) 46 | raise RuntimeError("CUDA installation not found") 47 | 48 | def 
_detect_cuda_version(self) -> str: 49 | """Detect CUDA version from installation""" 50 | version_file = Path(self.cuda_path) / "version.txt" 51 | if version_file.exists(): 52 | content = version_file.read_text() 53 | import re 54 | if match := re.search(r'V(\d+\.\d+\.\d+)', content): 55 | return match.group(1) 56 | return "unknown" 57 | 58 | def _init_compilation_args(self): 59 | """Initialize CUDA compilation arguments""" 60 | self.compilation_args = [ 61 | "-x", "cuda", 62 | "--cuda-gpu-arch=sm_75", 63 | "-std=c++14", 64 | f"-I{Path(self.cuda_path)/'include'}", 65 | "-D__CUDACC__", 66 | "-D__CUDA_ARCH__=750", 67 | "-DNDEBUG", 68 | ] 69 | 70 | def parse_file(self, cuda_file: Union[str, Path]) -> Optional[CUDANode]: 71 | """Parse CUDA source file into AST""" 72 | try: 73 | tu = self.index.parse( 74 | str(cuda_file), 75 | args=self.compilation_args, 76 | options=( 77 | TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD | 78 | TranslationUnit.PARSE_INCOMPLETE 79 | ) 80 | ) 81 | 82 | # Check for fatal errors 83 | if self._has_fatal_errors(tu): 84 | return None 85 | 86 | # Convert to CUDA AST 87 | return self._process_translation_unit(tu.cursor) 88 | 89 | except Exception as e: 90 | logging.error(f"Failed to parse {cuda_file}: {str(e)}") 91 | return None 92 | 93 | def _has_fatal_errors(self, tu: TranslationUnit) -> bool: 94 | """Check for fatal parsing errors""" 95 | has_fatal = False 96 | for diag in tu.diagnostics: 97 | if diag.severity >= diag.Error: 98 | logging.error( 99 | f"{diag.location.file}:{diag.location.line} - {diag.spelling}" 100 | ) 101 | has_fatal = True 102 | return has_fatal 103 | 104 | def _process_translation_unit(self, cursor: Cursor) -> CUDANode: 105 | """Process translation unit cursor""" 106 | root = CUDANode( 107 | line=cursor.location.line, 108 | column=cursor.location.column 109 | ) 110 | 111 | for child in cursor.get_children(): 112 | if node := self._process_cursor(child): 113 | root.add_child(node) 114 | 115 | return root 116 | 117 | def _process_cursor(self, cursor: Cursor) -> Optional[CUDANode]: 118 | """Process a single Clang cursor""" 119 | source_location = SourceLocation( 120 | file=str(cursor.location.file) if cursor.location.file else "", 121 | line=cursor.location.line, 122 | column=cursor.location.column, 123 | offset=cursor.location.offset 124 | ) 125 | 126 | # Handle different cursor kinds 127 | if cursor.kind == CursorKind.FUNCTION_DECL: 128 | return self._process_function(cursor, source_location) 129 | elif cursor.kind == CursorKind.VAR_DECL: 130 | return self._process_variable(cursor, source_location) 131 | elif cursor.kind == CursorKind.MEMBER_REF_EXPR: 132 | return self._process_member_ref(cursor, source_location) 133 | elif cursor.kind == CursorKind.CALL_EXPR: 134 | return self._process_call(cursor, source_location) 135 | 136 | return None 137 | 138 | # ... rest of the implementation remains the same ... 
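# --- Illustrative usage sketch (not part of the module above) ---
# Assumes libclang's Python bindings are configured and a CUDA toolkit is
# installed somewhere `_find_cuda_path` can discover it; "vector_add.cu"
# is a hypothetical input file.
if __name__ == "__main__":
    parser = ClangParser()  # raises RuntimeError if no CUDA installation is found
    ast_root = parser.parse_file("vector_add.cu")
    if ast_root is None:
        print("Parsing failed; see logged Clang diagnostics")
    else:
        print("Parsed CUDA source into a CUDAM AST")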
-------------------------------------------------------------------------------- /utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from typing import Dict, Optional 4 | from logging.handlers import RotatingFileHandler, TimedRotatingFileHandler 5 | 6 | class CudaLogger: 7 | _instance = None 8 | _loggers: Dict[str, logging.Logger] = {} 9 | 10 | def __new__(cls): 11 | if cls._instance is None: 12 | cls._instance = super(CudaLogger, cls).__new__(cls) 13 | cls._instance._configure_root_logger() 14 | return cls._instance 15 | 16 | def _configure_root_logger(self): 17 | root_logger = logging.getLogger() 18 | root_logger.setLevel(logging.DEBUG) 19 | 20 | # Console handler 21 | console_handler = logging.StreamHandler() 22 | console_handler.setLevel(logging.INFO) 23 | console_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 24 | console_handler.setFormatter(console_formatter) 25 | root_logger.addHandler(console_handler) 26 | 27 | # File handler 28 | log_dir = "logs" 29 | os.makedirs(log_dir, exist_ok=True) 30 | file_handler = RotatingFileHandler( 31 | filename=os.path.join(log_dir, "cuda_to_metal.log"), 32 | maxBytes=10 * 1024 * 1024, # 10 MB 33 | backupCount=5 34 | ) 35 | file_handler.setLevel(logging.DEBUG) 36 | file_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s') 37 | file_handler.setFormatter(file_formatter) 38 | root_logger.addHandler(file_handler) 39 | 40 | def get_logger(self, name: str) -> logging.Logger: 41 | if name not in self._loggers: 42 | logger = logging.getLogger(name) 43 | self._loggers[name] = logger 44 | return self._loggers[name] 45 | 46 | def set_log_level(self, level: int): 47 | for logger in self._loggers.values(): 48 | logger.setLevel(level) 49 | 50 | def add_file_handler(self, filename: str, level: int = logging.DEBUG, 51 | max_bytes: int = 10 * 1024 * 1024, backup_count: int = 5): 52 | file_handler = RotatingFileHandler( 53 | filename=filename, 54 | maxBytes=max_bytes, 55 | backupCount=backup_count 56 | ) 57 | file_handler.setLevel(level) 58 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s') 59 | file_handler.setFormatter(formatter) 60 | for logger in self._loggers.values(): 61 | logger.addHandler(file_handler) 62 | 63 | def add_timed_rotating_file_handler(self, filename: str, level: int = logging.DEBUG, 64 | when: str = 'midnight', interval: int = 1, backup_count: int = 7): 65 | file_handler = TimedRotatingFileHandler( 66 | filename=filename, 67 | when=when, 68 | interval=interval, 69 | backupCount=backup_count 70 | ) 71 | file_handler.setLevel(level) 72 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s') 73 | file_handler.setFormatter(formatter) 74 | for logger in self._loggers.values(): 75 | logger.addHandler(file_handler) 76 | 77 | def get_logger(name: str) -> logging.Logger: 78 | return CudaLogger().get_logger(name) 79 | 80 | # Convenience functions for different log levels 81 | def debug(logger: logging.Logger, message: str, *args, **kwargs): 82 | logger.debug(message, *args, **kwargs) 83 | 84 | def info(logger: logging.Logger, message: str, *args, **kwargs): 85 | logger.info(message, *args, **kwargs) 86 | 87 | def warning(logger: logging.Logger, message: str, *args, **kwargs): 88 | logger.warning(message, *args, **kwargs) 89 | 90 | def error(logger: 
logging.Logger, message: str, *args, **kwargs):
91 |     logger.error(message, *args, **kwargs)
92 | 
93 | def critical(logger: logging.Logger, message: str, *args, **kwargs):
94 |     logger.critical(message, *args, **kwargs)
95 | 
96 | def exception(logger: logging.Logger, message: str, *args, exc_info=True, **kwargs):
97 |     logger.exception(message, *args, exc_info=exc_info, **kwargs)
98 | 
99 | # Performance logging
100 | def log_performance(logger: logging.Logger, operation: str, execution_time: float):
101 |     logger.info(f"Performance: {operation} took {execution_time:.4f} seconds")
102 | 
103 | # Function entry/exit logging
104 | def log_function_entry(logger: logging.Logger, func_name: str, args: Optional[Dict] = None):
105 |     args_str = ", ".join(f"{k}={v}" for k, v in args.items()) if args else ""
106 |     logger.debug(f"Entering function: {func_name}({args_str})")
107 | 
108 | def log_function_exit(logger: logging.Logger, func_name: str, result=None):  # note: `Any` is not imported in this module, so the result parameter is left unannotated
109 |     logger.debug(f"Exiting function: {func_name} with result: {result}")
110 | 
111 | # Context manager for function logging
112 | class LogFunction:
113 |     def __init__(self, logger: logging.Logger, func_name: str):
114 |         self.logger = logger
115 |         self.func_name = func_name
116 | 
117 |     def __enter__(self):
118 |         log_function_entry(self.logger, self.func_name)
119 | 
120 |     def __exit__(self, exc_type, exc_value, traceback):
121 |         if exc_type:
122 |             self.logger.exception(f"Exception in function {self.func_name}: {exc_value}")
123 |         else:
124 |             log_function_exit(self.logger, self.func_name)
-------------------------------------------------------------------------------- /translator/cudnn_mapper.py: --------------------------------------------------------------------------------
1 | from typing import Dict, List, Any
2 | from ..utils.error_handler import CudaTranslationError
3 | from ..utils.logger import get_logger
4 | 
5 | logger = get_logger(__name__)
6 | 
7 | class CudnnMapper:
8 |     def __init__(self):
9 |         self.cudnn_to_mps_map: Dict[str, str] = {
10 |             'cudnnConvolutionForward': 'MPSCNNConvolution',
11 |             'cudnnPoolingForward': 'MPSCNNPooling',
12 |             'cudnnActivationForward': 'MPSCNNNeuron',
13 |             'cudnnSoftmaxForward': 'MPSCNNSoftMax',
14 |             'cudnnBatchNormalizationForward': 'MPSCNNBatchNormalization',
15 |             'cudnnRNNForward': 'MPSNNGRU',
16 |             'cudnnDropoutForward': 'MPSCNNDropout',
17 |             'cudnnOpTensor': 'MPSNNAdd',
18 |         }
19 | 
20 |     def map_function(self, cudnn_function: str, args: List[Any]) -> str:
21 |         if cudnn_function not in self.cudnn_to_mps_map:
22 |             raise CudaTranslationError(f"Unsupported cuDNN function: {cudnn_function}")
23 | 
24 |         mps_function = self.cudnn_to_mps_map[cudnn_function]
25 |         return self._generate_mps_call(mps_function, args)
26 | 
27 |     def _generate_mps_call(self, mps_function: str, args: List[Any]) -> str:
28 |         if mps_function == 'MPSCNNConvolution':
29 |             return self._generate_convolution_call(args)
30 |         elif mps_function == 'MPSCNNPooling':
31 |             return self._generate_pooling_call(args)
32 |         elif mps_function == 'MPSCNNNeuron':
33 |             return self._generate_activation_call(args)
34 |         elif mps_function == 'MPSCNNSoftMax':
35 |             return self._generate_softmax_call(args)
36 |         elif mps_function == 'MPSCNNBatchNormalization':
37 |             return self._generate_batchnorm_call(args)
38 |         else:
39 |             return f"{mps_function}({', '.join(map(str, args))})"
40 | 
41 |     def _generate_convolution_call(self, args: List[Any]) -> str:
42 |         return f"""
43 |         MPSCNNConvolution *convLayer = [[MPSCNNConvolution alloc]
44 |             initWithDevice:device
45 | 
kernelWidth:{args[0]} 46 | kernelHeight:{args[1]} 47 | inputFeatureChannels:{args[2]} 48 | outputFeatureChannels:{args[3]} 49 | neuronFilter:nil]; 50 | [convLayer encodeToCommandBuffer:commandBuffer 51 | sourceImage:sourceTexture 52 | destinationImage:destTexture]; 53 | """ 54 | 55 | def _generate_pooling_call(self, args: List[Any]) -> str: 56 | return f""" 57 | MPSCNNPooling *poolLayer = [[MPSCNNPooling alloc] 58 | initWithDevice:device 59 | kernelWidth:{args[0]} 60 | kernelHeight:{args[1]} 61 | strideInPixelsX:{args[2]} 62 | strideInPixelsY:{args[3]}]; 63 | [poolLayer encodeToCommandBuffer:commandBuffer 64 | sourceImage:sourceTexture 65 | destinationImage:destTexture]; 66 | """ 67 | 68 | def _generate_activation_call(self, args: List[Any]) -> str: 69 | return f""" 70 | MPSCNNNeuron *activationLayer = [MPSCNNNeuronReLU nodeWithSource:nil]; 71 | [activationLayer encodeToCommandBuffer:commandBuffer 72 | sourceImage:sourceTexture 73 | destinationImage:destTexture]; 74 | """ 75 | 76 | def _generate_softmax_call(self, args: List[Any]) -> str: 77 | return f""" 78 | MPSCNNSoftMax *softmaxLayer = [[MPSCNNSoftMax alloc] initWithDevice:device]; 79 | [softmaxLayer encodeToCommandBuffer:commandBuffer 80 | sourceImage:sourceTexture 81 | destinationImage:destTexture]; 82 | """ 83 | 84 | def _generate_batchnorm_call(self, args: List[Any]) -> str: 85 | return f""" 86 | MPSCNNBatchNormalization *batchNormLayer = [[MPSCNNBatchNormalization alloc] 87 | initWithDevice:device 88 | featureChannels:{args[0]}]; 89 | [batchNormLayer encodeToCommandBuffer:commandBuffer 90 | sourceImage:sourceTexture 91 | destinationImage:destTexture]; 92 | """ 93 | 94 | def translate_cudnn_descriptor(self, descriptor_type: str, params: Dict[str, Any]) -> str: 95 | if descriptor_type == 'cudnnTensorDescriptor': 96 | return self._translate_tensor_descriptor(params) 97 | elif descriptor_type == 'cudnnFilterDescriptor': 98 | return self._translate_filter_descriptor(params) 99 | elif descriptor_type == 'cudnnConvolutionDescriptor': 100 | return self._translate_convolution_descriptor(params) 101 | else: 102 | raise CudaTranslationError(f"Unsupported descriptor type: {descriptor_type}") 103 | 104 | def _translate_tensor_descriptor(self, params: Dict[str, Any]) -> str: 105 | return f""" 106 | MPSImageDescriptor *tensorDescriptor = [MPSImageDescriptor 107 | imageDescriptorWithChannelFormat:MPSImageFeatureChannelFormatFloat32 108 | width:{params['width']} 109 | height:{params['height']} 110 | featureChannels:{params['channels']}]; 111 | """ 112 | 113 | def _translate_filter_descriptor(self, params: Dict[str, Any]) -> str: 114 | return f""" 115 | MPSCNNConvolutionDescriptor *filterDescriptor = [MPSCNNConvolutionDescriptor 116 | cnnConvolutionDescriptorWithKernelWidth:{params['kernelWidth']} 117 | kernelHeight:{params['kernelHeight']} 118 | inputFeatureChannels:{params['inputChannels']} 119 | outputFeatureChannels:{params['outputChannels']}]; 120 | """ 121 | 122 | def _translate_convolution_descriptor(self, params: Dict[str, Any]) -> str: 123 | return f""" 124 | MPSNNDefaultPadding *convolutionDescriptor = [MPSNNDefaultPadding 125 | paddingWithMethod:MPSNNPaddingMethodSizeSame]; 126 | convolutionDescriptor.kernelOffsetX = {params['padWidth']}; 127 | convolutionDescriptor.kernelOffsetY = {params['padHeight']}; 128 | """ 129 | 130 | logger.info("CudnnMapper initialized for cuDNN to Metal Performance Shaders translation.") -------------------------------------------------------------------------------- /native/metal_interop.mm: 
--------------------------------------------------------------------------------
1 | // metal_interop.mm
2 | // (Continuing the implementation of all remaining functions)
3 | 
4 | void begin_compute_pass(MetalCommandObjects* cmd_objects) {
5 |     if (!cmd_objects || cmd_objects->compute_encoder) return;
6 | 
7 |     id<MTLCommandBuffer> cmdBuffer = (__bridge id<MTLCommandBuffer>)cmd_objects->command_buffer;
8 |     id<MTLComputeCommandEncoder> encoder = [cmdBuffer computeCommandEncoder];
9 |     cmd_objects->compute_encoder = (__bridge_retained void*)encoder;
10 | }
11 | 
12 | void end_compute_pass(MetalCommandObjects* cmd_objects) {
13 |     if (!cmd_objects || !cmd_objects->compute_encoder) return;
14 | 
15 |     id<MTLComputeCommandEncoder> encoder = (__bridge_transfer id<MTLComputeCommandEncoder>)cmd_objects->compute_encoder; // __bridge_transfer balances the __bridge_retained in begin_compute_pass
16 |     [encoder endEncoding];
17 | 
18 |     cmd_objects->compute_encoder = nil;
19 | }
20 | 
21 | void commit_commands(MetalCommandObjects* cmd_objects) {
22 |     if (!cmd_objects || !cmd_objects->command_buffer) return;
23 | 
24 |     id<MTLCommandBuffer> cmdBuffer = (__bridge id<MTLCommandBuffer>)cmd_objects->command_buffer;
25 |     [cmdBuffer commit];
26 | }
27 | 
28 | void wait_for_completion(MetalCommandObjects* cmd_objects) {
29 |     if (!cmd_objects || !cmd_objects->command_buffer) return;
30 | 
31 |     id<MTLCommandBuffer> cmdBuffer = (__bridge id<MTLCommandBuffer>)cmd_objects->command_buffer;
32 |     [cmdBuffer waitUntilCompleted];
33 | }
34 | 
35 | MetalPipelineConfig* create_pipeline_config(const char* kernel_name) {
36 |     if (!kernel_name) return NULL;
37 | 
38 |     MetalPipelineConfig* config = (MetalPipelineConfig*)malloc(sizeof(MetalPipelineConfig));
39 |     if (!config) return NULL;
40 | 
41 |     NSString* funcName = [NSString stringWithUTF8String:kernel_name];
42 |     id<MTLDevice> device = [MetalDeviceManager sharedDevice];
43 |     id<MTLLibrary> library = [device newDefaultLibrary];
44 |     id<MTLFunction> function = [library newFunctionWithName:funcName];
45 | 
46 |     NSError* error = nil;
47 |     id<MTLComputePipelineState> pipelineState =
48 |         [device newComputePipelineStateWithFunction:function error:&error];
49 | 
50 |     if (!pipelineState) {
51 |         NSLog(@"Failed to create pipeline state: %@", error);
52 |         free(config);
53 |         return NULL;
54 |     }
55 | 
56 |     config->pipeline_state = (__bridge_retained void*)pipelineState;
57 |     config->thread_group_size[0] = 1;
58 |     config->thread_group_size[1] = 1;
59 |     config->thread_group_size[2] = 1;
60 |     config->grid_size[0] = 1;
61 |     config->grid_size[1] = 1;
62 |     config->grid_size[2] = 1;
63 | 
64 |     return config;
65 | }
66 | 
67 | void destroy_pipeline_config(MetalPipelineConfig* config) {
68 |     if (!config) return;
69 | 
70 |     if (config->pipeline_state) {
71 |         id<MTLComputePipelineState> pipelineState =
72 |             (__bridge_transfer id<MTLComputePipelineState>)config->pipeline_state;
73 |         pipelineState = nil;
74 |     }
75 | 
76 |     free(config);
77 | }
78 | 
79 | void set_pipeline_thread_groups(MetalPipelineConfig* config,
80 |                                 uint32_t x, uint32_t y, uint32_t z) {
81 |     if (!config) return;
82 | 
83 |     config->thread_group_size[0] = x;
84 |     config->thread_group_size[1] = y;
85 |     config->thread_group_size[2] = z;
86 | }
87 | 
88 | void set_pipeline_grid_size(MetalPipelineConfig* config,
89 |                             uint32_t x, uint32_t y, uint32_t z) {
90 |     if (!config) return;
91 | 
92 |     config->grid_size[0] = x;
93 |     config->grid_size[1] = y;
94 |     config->grid_size[2] = z;
95 | }
96 | 
97 | @interface MetalCommandBufferWrapper : NSObject
98 | @property (nonatomic, strong) id<MTLCommandBuffer> commandBuffer;
99 | @property (nonatomic, strong) NSMutableArray<id<MTLBuffer>>* retainedBuffers;
100 | @end
101 | 
102 | @implementation MetalCommandBufferWrapper
103 | - (instancetype)initWithCommandBuffer:(id<MTLCommandBuffer>)commandBuffer {
104 |     if (self = [super init]) {
105 |         _commandBuffer = commandBuffer;
106 |         _retainedBuffers = [NSMutableArray array];
107 |     }
108 |     return self;
109 | }
110 | @end
111 | 
112 | // Thread-local storage for retained buffers
113 | static NSMutableDictionary* threadLocalBuffers = nil;
114 | static dispatch_once_t bufferOnceToken;
115 | 
116 | @interface MetalBufferManager : NSObject
117 | + (void)retainBuffer:(id<MTLBuffer>)buffer forThread:(NSThread*)thread;
118 | + (void)releaseBuffersForThread:(NSThread*)thread;
119 | @end
120 | 
121 | @implementation MetalBufferManager
122 | 
123 | + (void)initialize {
124 |     if (self == [MetalBufferManager class]) {
125 |         dispatch_once(&bufferOnceToken, ^{
126 |             threadLocalBuffers = [NSMutableDictionary dictionary];
127 |         });
128 |     }
129 | }
130 | 
131 | + (void)retainBuffer:(id<MTLBuffer>)buffer forThread:(NSThread*)thread {
132 |     if (!buffer || !thread) return;
133 | 
134 |     @synchronized(threadLocalBuffers) {
135 |         NSString* threadKey = [NSString stringWithFormat:@"%p", thread];
136 |         NSMutableArray* buffers = threadLocalBuffers[threadKey];
137 |         if (!buffers) {
138 |             buffers = [NSMutableArray array];
139 |             threadLocalBuffers[threadKey] = buffers;
140 |         }
141 |         [buffers addObject:buffer];
142 |     }
143 | }
144 | 
145 | + (void)releaseBuffersForThread:(NSThread*)thread {
146 |     if (!thread) return;
147 | 
148 |     @synchronized(threadLocalBuffers) {
149 |         NSString* threadKey = [NSString stringWithFormat:@"%p", thread];
150 |         [threadLocalBuffers removeObjectForKey:threadKey];
151 |     }
152 | }
153 | 
154 | @end
155 | 
156 | // Helper functions for error handling
157 | static void handleMetalError(NSError* error, const char* operation) {
158 |     if (error) {
159 |         NSLog(@"Metal error during %s: %@", operation, error);
160 |     }
161 | }
162 | 
163 | static BOOL validateDevice() {
164 |     id<MTLDevice> device = [MetalDeviceManager sharedDevice];
165 |     if (!device) {
166 |         NSLog(@"No Metal device available");
167 |         return NO;
168 |     }
169 |     return YES;
170 | }
171 | 
172 | static BOOL validatePipelineState(id<MTLComputePipelineState> pipelineState) {
173 |     if (!pipelineState) {
174 |         NSLog(@"Invalid compute pipeline state");
175 |         return NO;
176 |     }
177 |     return YES;
178 | }
-------------------------------------------------------------------------------- /translator/intrinsic_function_mapper.py: --------------------------------------------------------------------------------
1 | 
2 | from typing import Dict, Optional, List, Tuple, Union, Set
3 | from dataclasses import dataclass
4 | from enum import Enum
5 | import logging
6 | 
7 | from ..utils.error_handler import CudaTranslationError
8 | from ..utils.logger import get_logger
9 | 
10 | logger = get_logger(__name__)
11 | 
12 | class IntrinsicType(Enum):
13 |     MATH = "math"
14 |     ATOMIC = "atomic"
15 |     SYNC = "sync"
16 |     MEMORY = "memory"
17 |     THREAD = "thread"
18 |     WARP = "warp"
19 |     SPECIAL = "special"
20 | 
21 | @dataclass
22 | class IntrinsicFunction:
23 |     """Represents a CUDA intrinsic function with its Metal equivalent."""
24 |     cuda_name: str
25 |     metal_name: str
26 |     return_type: str
27 |     arg_types: List[str]
28 |     type: IntrinsicType
29 |     needs_wrapper: bool = False
30 |     has_metal_equivalent: bool = True
31 |     requires_memory_order: bool = False
32 |     requires_scope: bool = False
33 |     is_simd_function: bool = False
34 |     vectorizable: bool = False
35 |     custom_translation: Optional[str] = None
36 | 
37 | class IntrinsicFunctionMapper:
38 |     """Maps CUDA intrinsic functions to their Metal equivalents."""
39 | 
40 |     def __init__(self):
41 |         self.intrinsics: Dict[str, IntrinsicFunction] = self._init_intrinsics()
42 |         self.used_intrinsics: Set[str] = set()
43 |         self.required_headers: Set[str] = set()
44 | 
45 |     def _init_intrinsics(self) -> Dict[str, IntrinsicFunction]:
46 |         """Initialize all supported intrinsic functions."""
47 |         return {
48 |             # Math intrinsics
49 |             "__sinf": IntrinsicFunction(
50 |                 cuda_name="__sinf",
51 |                 metal_name="metal::fast::sin",
52 |                 return_type="float",
53 |                 arg_types=["float"],
54 |                 type=IntrinsicType.MATH,
55 |                 vectorizable=True
56 |             ),
57 |             "__cosf": IntrinsicFunction(
58 |                 cuda_name="__cosf",
59 |                 metal_name="metal::fast::cos",
60 |                 return_type="float",
61 |                 arg_types=["float"],
62 |                 type=IntrinsicType.MATH,
63 |                 vectorizable=True
64 |             ),
65 |             # ... other intrinsic definitions ...
66 |         }
67 | 
68 |     def map_intrinsic(self, node: dict) -> str:
69 |         """Map CUDA intrinsic function call to Metal equivalent."""
70 |         try:
71 |             func_name = node.get('function', {}).get('name')
72 |             if not func_name:
73 |                 raise CudaTranslationError(f"Invalid intrinsic function call: {node}")
74 | 
75 |             if func_name not in self.intrinsics:
76 |                 raise CudaTranslationError(f"Unknown intrinsic function: {func_name}")
77 | 
78 |             intrinsic = self.intrinsics[func_name]
79 |             self.used_intrinsics.add(func_name)
80 | 
81 |             # Handle custom translations
82 |             if intrinsic.custom_translation:
83 |                 return intrinsic.custom_translation
84 | 
85 |             # Generate Metal function call
86 |             args = self._translate_arguments(node.get('arguments', []), intrinsic)
87 | 
88 |             # Append memory order as a final argument if required (it belongs inside the call, not after it)
89 |             if intrinsic.requires_memory_order:
90 |                 args.append("memory_order_relaxed")
91 | 
92 |             # Append the threadgroup scope flag as an argument if required
93 |             if intrinsic.requires_scope:
94 |                 args.append("mem_flags::mem_threadgroup")
95 | 
96 |             metal_call = f"{intrinsic.metal_name}({', '.join(args)})"
97 |             return metal_call
98 | 
99 |         except Exception as e:
100 |             logger.error(f"Error mapping intrinsic function: {str(e)}")
101 |             raise CudaTranslationError(f"Failed to map intrinsic function: {str(e)}")
102 | 
103 |     def _translate_arguments(self, args: List[dict], intrinsic: IntrinsicFunction) -> List[str]:
104 |         """Translate function arguments to Metal."""
105 |         if len(args) != len(intrinsic.arg_types):
106 |             raise CudaTranslationError(
107 |                 f"Wrong number of arguments for {intrinsic.cuda_name}: "
108 |                 f"expected {len(intrinsic.arg_types)}, got {len(args)}"
109 |             )
110 | 
111 |         translated_args = []
112 |         for arg, expected_type in zip(args, intrinsic.arg_types):
113 |             arg_str = self._translate_argument(arg, expected_type)
114 |             translated_args.append(arg_str)
115 | 
116 |         return translated_args
117 | 
118 |     def _translate_argument(self, arg: dict, expected_type: str) -> str:
119 |         """Translate single argument with type checking."""
120 |         if 'value' in arg:
121 |             return str(arg['value'])
122 |         elif 'name' in arg:
123 |             return arg['name']
124 |         return str(arg)
125 | 
126 |     def get_required_headers(self) -> Set[str]:
127 |         """Get required Metal headers based on used intrinsics."""
128 |         headers = set()
129 |         for intrinsic_name in self.used_intrinsics:
130 |             intrinsic = self.intrinsics[intrinsic_name]
131 |             if intrinsic.type == IntrinsicType.MATH:
132 |                 headers.add("#include <metal_math>")
133 |             elif intrinsic.type == IntrinsicType.ATOMIC:
134 |                 headers.add("#include <metal_atomic>")
135 |             elif intrinsic.is_simd_function:
136 |                 headers.add("#include <metal_simdgroup>")
137 |         return headers
138 | 
139 |     def get_vectorizable_intrinsics(self) -> Set[str]:
140 |         """Get list of vectorizable intrinsic functions."""
141 |         return {name for name, func in self.intrinsics.items() if func.vectorizable}
142 | 
143 |     def get_simd_functions(self) -> Set[str]:
144 |         """Get list of SIMD-specific functions."""
145 |         return {name for name, func in
self.intrinsics.items() if func.is_simd_function} 146 | 147 | def validate_intrinsic_usage(self, node: dict) -> bool: 148 | """Validate intrinsic function usage.""" 149 | func_name = node.get('function', {}).get('name') 150 | if not func_name or func_name not in self.intrinsics: 151 | return False 152 | 153 | intrinsic = self.intrinsics[func_name] 154 | return len(node.get('arguments', [])) == len(intrinsic.arg_types) 155 | 156 | logger.info("IntrinsicFunctionMapper initialized with complete mappings") 157 | -------------------------------------------------------------------------------- /core/translator/host_translator.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any 2 | import re 3 | from pathlib import Path 4 | 5 | from ..utils.error_handler import CudaTranslationError 6 | from ..utils.logger import get_logger 7 | from ..core.parser.ast_nodes import ( 8 | CUDANode, CUDAKernel, CUDAParameter, CUDAType, 9 | CUDAQualifier, CUDASharedMemory, CUDAThreadIdx 10 | ) 11 | from ..generator.msl_generator import MetalShaderGenerator 12 | 13 | 14 | class CUDAHostTranslator: 15 | """ 16 | Translates CUDA host code to Metal host code following NVIDIA's host API patterns 17 | """ 18 | 19 | def __init__(self): 20 | self.metal_buffer_index = 0 21 | self.kernel_map: Dict[str, CUDAKernel] = {} 22 | 23 | def translate_host_code(self, cuda_code: str, target_lang: str = 'swift') -> str: 24 | """Translate CUDA host code to Metal""" 25 | if target_lang not in {'swift', 'objc'}: 26 | raise ValueError("Target language must be 'swift' or 'objc'") 27 | 28 | # Process CUDA API calls 29 | processed_code = self._translate_device_management(cuda_code) 30 | processed_code = self._translate_memory_management(processed_code) 31 | processed_code = self._translate_kernel_launch(processed_code) 32 | processed_code = self._translate_synchronization(processed_code) 33 | 34 | # Generate appropriate host code 35 | if target_lang == 'swift': 36 | return self._generate_swift_code(processed_code) 37 | else: 38 | return self._generate_objc_code(processed_code) 39 | 40 | def _translate_device_management(self, code: str) -> str: 41 | """Translate CUDA device management calls""" 42 | replacements = { 43 | r'cudaSetDevice\((\d+)\)': r'// Metal automatically manages devices', 44 | r'cudaGetDevice\(&dev\)': r'// Metal automatically manages devices', 45 | r'cudaGetDeviceCount\(&count\)': r'let count = MTLCopyAllDevices().count', 46 | r'cudaDeviceSynchronize\(\)': r'commandBuffer.waitUntilCompleted()' 47 | } 48 | 49 | result = code 50 | for cuda_pattern, metal_code in replacements.items(): 51 | result = re.sub(cuda_pattern, metal_code, result) 52 | 53 | return result 54 | 55 | def _translate_memory_management(self, code: str) -> str: 56 | """Translate CUDA memory management calls""" 57 | # Handle cudaMalloc 58 | code = re.sub( 59 | r'cudaMalloc\(\(void\*\*\)&(\w+),\s*(.+?)\)', 60 | lambda m: f'{m.group(1)} = device.makeBuffer(length: {m.group(2)}, ' 61 | f'options: .storageModeShared)', 62 | code 63 | ) 64 | 65 | # Handle cudaMemcpy 66 | code = re.sub( 67 | r'cudaMemcpy\((.+?),\s*(.+?),\s*(.+?),\s*cudaMemcpy(.+?)\)', 68 | self._translate_memcpy, 69 | code 70 | ) 71 | 72 | # Handle cudaFree 73 | code = re.sub( 74 | r'cudaFree\((\w+)\)', 75 | r'// Metal automatically manages memory', 76 | code 77 | ) 78 | 79 | return code 80 | 81 | def _translate_memcpy(self, match) -> str: 82 | """Translate cudaMemcpy calls""" 83 | dst, src, size, kind = match.groups() 84 | 85 | if kind == 
'HostToDevice': 86 | return f'memcpy({dst}.contents, {src}, {size})' 87 | elif kind == 'DeviceToHost': 88 | return f'memcpy({dst}, {src}.contents, {size})' 89 | elif kind == 'DeviceToDevice': 90 | return (f'let blitEncoder = commandBuffer.makeBlitCommandEncoder()\n' 91 | f'blitEncoder.copy(from: {src}, to: {dst}, size: {size})\n' 92 | f'blitEncoder.endEncoding()') 93 | 94 | return match.group(0) 95 | 96 | def _translate_kernel_launch(self, code: str) -> str: 97 | """Translate CUDA kernel launches""" 98 | # Match kernel launch syntax 99 | pattern = r'(\w+)<<<(.+?)>>>(.+?);' 100 | 101 | return re.sub(pattern, self._translate_launch_config, code) 102 | 103 | def _translate_launch_config(self, match) -> str: 104 | """Translate kernel launch configuration""" 105 | kernel_name, config, args = match.groups() 106 | 107 | # Parse grid and block dimensions 108 | grid_dim, block_dim = config.split(',', 1) 109 | 110 | return ( 111 | f'let commandEncoder = commandBuffer.makeComputeCommandEncoder()\n' 112 | f'commandEncoder.setComputePipelineState({kernel_name}PipelineState)\n' 113 | f'let gridSize = MTLSize(width: {grid_dim}, height: 1, depth: 1)\n' 114 | f'let blockSize = MTLSize(width: {block_dim}, height: 1, depth: 1)\n' 115 | f'commandEncoder.dispatchThreadgroups(gridSize, threadsPerThreadgroup: blockSize)\n' 116 | f'commandEncoder.endEncoding()' 117 | ) 118 | 119 | def _translate_synchronization(self, code: str) -> str: 120 | """Translate CUDA synchronization calls""" 121 | replacements = { 122 | r'cudaDeviceSynchronize\(\)': 'commandBuffer.waitUntilCompleted()', 123 | r'cudaStreamSynchronize\((\w+)\)': r'\1.waitUntilCompleted()', 124 | r'cudaEventSynchronize\((\w+)\)': r'\1.waitUntilCompleted()', 125 | } 126 | 127 | result = code 128 | for cuda_pattern, metal_code in replacements.items(): 129 | result = re.sub(cuda_pattern, metal_code, result) 130 | 131 | return result 132 | 133 | def _generate_swift_code(self, processed_code: str) -> str: 134 | """Generate Swift host code""" 135 | setup_code = """ 136 | import Metal 137 | import MetalKit 138 | 139 | guard let device = MTLCreateSystemDefaultDevice() else { 140 | fatalError("GPU not available") 141 | } 142 | 143 | let commandQueue = device.makeCommandQueue()! 144 | let commandBuffer = commandQueue.makeCommandBuffer()! 
145 | """
146 | 
147 |         return f"{setup_code}\n{processed_code}"
148 | 
149 |     def _generate_objc_code(self, processed_code: str) -> str:
150 |         """Generate Objective-C host code"""
151 |         setup_code = """
152 | #import <Metal/Metal.h>
153 | #import <MetalKit/MetalKit.h>
154 | 
155 | id<MTLDevice> device = MTLCreateSystemDefaultDevice();
156 | if (!device) {
157 |     NSLog(@"GPU not available");
158 |     return;
159 | }
160 | 
161 | id<MTLCommandQueue> commandQueue = [device newCommandQueue];
162 | id<MTLCommandBuffer> commandBuffer = [commandQueue commandBuffer];
163 | """
164 | 
165 |         return f"{setup_code}\n{processed_code}"
-------------------------------------------------------------------------------- /utils/file_utils.py: --------------------------------------------------------------------------------
1 | # utils/file_utils.py
2 | 
3 | import os
4 | import shutil
5 | import hashlib
6 | import tempfile
7 | from pathlib import Path
8 | from typing import List, Set, Dict, Optional, Generator
9 | from concurrent.futures import ThreadPoolExecutor
10 | from threading import Lock
11 | import logging
12 | 
13 | from .error_handler import CudaTranslationError
14 | from .logger import get_logger
15 | 
16 | logger = get_logger(__name__)
17 | 
18 | class FileCache:
19 |     """Thread-safe file cache manager."""
20 |     def __init__(self, cache_dir: Optional[str] = None):
21 |         self.cache_dir = Path(cache_dir) if cache_dir else Path(tempfile.gettempdir()) / "cuda_metal_cache"
22 |         self.cache_dir.mkdir(parents=True, exist_ok=True)
23 |         self._lock = Lock()
24 |         self._cache_index: Dict[str, Path] = {}
25 |         self._load_cache_index()
26 | 
27 |     def _load_cache_index(self):
28 |         """Load cache index from disk."""
29 |         with self._lock:
30 |             index_file = self.cache_dir / "index.json"
31 |             if index_file.exists():
32 |                 import json
33 |                 with open(index_file, 'r') as f:
34 |                     self._cache_index = {k: Path(v) for k, v in json.load(f).items()}
35 | 
36 |     def _save_cache_index(self):
37 |         """Save cache index to disk."""
38 |         with self._lock:
39 |             index_file = self.cache_dir / "index.json"
40 |             import json
41 |             with open(index_file, 'w') as f:
42 |                 json.dump({k: str(v) for k, v in self._cache_index.items()}, f)
43 | 
44 |     def get_cached_path(self, key: str) -> Optional[Path]:
45 |         """Get cached file path if exists."""
46 |         with self._lock:
47 |             return self._cache_index.get(key)
48 | 
49 |     def add_to_cache(self, key: str, file_path: Path):
50 |         """Add file to cache."""
51 |         with self._lock:
52 |             cache_path = self.cache_dir / hashlib.sha256(key.encode()).hexdigest()
53 |             shutil.copy2(file_path, cache_path)
54 |             self._cache_index[key] = cache_path
55 |             self._save_cache_index()
56 | 
57 | class FileTracker:
58 |     """Tracks file dependencies and modifications."""
59 |     def __init__(self):
60 |         self.dependencies: Dict[Path, Set[Path]] = {}
61 |         self._lock = Lock()
62 | 
63 |     def add_dependency(self, source: Path, dependency: Path):
64 |         """Add a dependency relationship."""
65 |         with self._lock:
66 |             if source not in self.dependencies:
67 |                 self.dependencies[source] = set()
68 |             self.dependencies[source].add(dependency)
69 | 
70 |     def get_dependencies(self, source: Path) -> Set[Path]:
71 |         """Get all dependencies for a file."""
72 |         with self._lock:
73 |             return self.dependencies.get(source, set())
74 | 
75 |     def is_modified(self, source: Path, dependency: Path) -> bool:
76 |         """Check if dependency is modified after source."""
77 |         try:
78 |             source_mtime = source.stat().st_mtime
79 |             dep_mtime = dependency.stat().st_mtime
80 |             return dep_mtime > source_mtime
81 |         except OSError:
82 |             return True
83 | 
84 | class FileUtils:
85 |     """Utility 
class for file operations with Metal-specific optimizations.""" 86 | 87 | def __init__(self): 88 | self.cache = FileCache() 89 | self.tracker = FileTracker() 90 | self.temp_dir = Path(tempfile.mkdtemp(prefix="cuda_metal_")) 91 | self._lock = Lock() 92 | 93 | def read_file(self, path: Path, encoding: str = 'utf-8') -> str: 94 | """Read file with caching and error handling.""" 95 | try: 96 | with open(path, 'r', encoding=encoding) as f: 97 | content = f.read() 98 | 99 | # Cache the content 100 | cache_key = f"{path}:{path.stat().st_mtime}" 101 | self.cache.add_to_cache(cache_key, path) 102 | 103 | return content 104 | 105 | except UnicodeDecodeError: 106 | logger.warning(f"Failed to read {path} with {encoding} encoding, trying alternate encodings") 107 | for alt_encoding in ['latin1', 'cp1252']: 108 | try: 109 | with open(path, 'r', encoding=alt_encoding) as f: 110 | return f.read() 111 | except UnicodeDecodeError: 112 | continue 113 | raise CudaTranslationError(f"Unable to read file {path} with any supported encoding") 114 | 115 | except OSError as e: 116 | raise CudaTranslationError(f"Failed to read file {path}: {str(e)}") 117 | 118 | def write_file(self, path: Path, content: str, encoding: str = 'utf-8', backup: bool = True): 119 | """Write file with backup and atomic operation.""" 120 | if backup and path.exists(): 121 | self._create_backup(path) 122 | 123 | # Write to temporary file first 124 | temp_path = self.temp_dir / f"{path.name}.tmp" 125 | try: 126 | with open(temp_path, 'w', encoding=encoding) as f: 127 | f.write(content) 128 | f.flush() 129 | os.fsync(f.fileno()) 130 | 131 | # Atomic move 132 | shutil.move(str(temp_path), str(path)) 133 | 134 | except OSError as e: 135 | raise CudaTranslationError(f"Failed to write file {path}: {str(e)}") 136 | finally: 137 | if temp_path.exists(): 138 | temp_path.unlink() 139 | 140 | def _create_backup(self, path: Path): 141 | """Create backup of existing file.""" 142 | backup_path = path.with_suffix(path.suffix + '.bak') 143 | try: 144 | shutil.copy2(path, backup_path) 145 | except OSError as e: 146 | logger.warning(f"Failed to create backup of {path}: {str(e)}") 147 | 148 | def process_directory(self, 149 | directory: Path, 150 | pattern: str = "*.cu", 151 | recursive: bool = True) -> Generator[Path, None, None]: 152 | """Process directory with parallel file scanning.""" 153 | try: 154 | if recursive: 155 | paths = directory.rglob(pattern) 156 | else: 157 | paths = directory.glob(pattern) 158 | 159 | with ThreadPoolExecutor() as executor: 160 | yield from executor.map(self._process_file, paths) 161 | 162 | except OSError as e: 163 | raise CudaTranslationError(f"Failed to process directory {directory}: {str(e)}") 164 | 165 | def _process_file(self, path: Path) -> Path: 166 | """Process individual file with validation.""" 167 | if not path.is_file(): 168 | logger.warning(f"Skipping non-file path: {path}") 169 | return None 170 | 171 | return path 172 | 173 | def ensure_directory(self, path: Path): 174 | """Ensure directory exists with proper permissions.""" 175 | try: 176 | path.mkdir(parents=True, exist_ok=True) 177 | 178 | # Set appropriate permissions 179 | if os.name == 'posix': 180 | os.chmod(path, 0o755) 181 | 182 | except OSError as e: 183 | raise CudaTranslationError(f"Failed to create directory {path}: {str(e)}") 184 | 185 | def copy_with_metadata(self, src: Path, dst: Path): 186 | """Copy file with all metadata preserved.""" 187 | try: 188 | shutil.copy2(src, dst) 189 | 190 | # Track dependency 191 | self.tracker.add_dependency(dst, 
src) 192 | 193 | except OSError as e: 194 | raise CudaTranslationError(f"Failed to copy {src} to {dst}: {str(e)}") 195 | 196 | def get_relative_path(self, path: Path, base: Path) -> Path: 197 | """Get relative path with validation.""" 198 | try: 199 | return path.relative_to(base) 200 | except ValueError: 201 | return path 202 | 203 | def cleanup(self): 204 | """Clean up temporary files.""" 205 | try: 206 | shutil.rmtree(self.temp_dir, ignore_errors=True) 207 | except OSError as e: 208 | logger.warning(f"Failed to clean up temporary files: {str(e)}") 209 | 210 | def __enter__(self): 211 | return self 212 | 213 | def __exit__(self, exc_type, exc_val, exc_tb): 214 | self.cleanup() 215 | 216 | logger.info("FileUtils initialized with Metal-specific optimizations.") -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CUDA-to-Metal MPS Translation Project 2 | 3 | ![CUDAM Logo](assets/cudam_logo.png) 4 | 5 | ## Introduction 6 | 7 | Hi there! This project is designed to tackle a pretty significant problem for developers who want to port their CUDA applications to Apple M1 devices. 8 | 9 | Apple's M1 chips are amazing,im serious really really great but one major drawback is the lack of support for NVIDIA’s CUDA and cuDNN libraries. That’s where this project comes in. It’s all about providing developers with a tool to **automatically convert CUDA code** into **Metal Shading Language (MSL)**, enabling GPU-accelerated computations on M1 devices without having to rewrite the entire codebase from scratch. 10 | 11 | Whether you're working with CUDA kernels or leveraging cuDNN for deep learning, this tool aims to make your life easier by automating the translation process, so you can focus on performance and results. Let's dive into the details! 12 | 13 | --- 14 | 15 | ## Why Does This Matter? 16 | 17 | If you’ve ever tried to port CUDA code to a non-NVIDIA device, you know how painful it can be. The goal of this project is simple but powerful: **port CUDA code, including code that uses cuDNN, to Apple M1 GPUs** using **Metal** and **Metal Performance Shaders (MPS)**. This module will: 18 | - **Translate CUDA kernels** to MSL. 19 | - **Map cuDNN functions** to MPS equivalents. 20 | - Provide an easy-to-use **CLI and Python API** to automate the entire process. 21 | 22 | It’s like giving you a bridge between two worlds that don’t normally talk to each other—NVIDIA’s CUDA and Apple’s Metal. 23 | 24 | --- 25 | 26 | ## Table of Contents 27 | 1. [Project Overview](#project-overview) 28 | 2. [Challenges & How We Solve Them](#challenges) 29 | 3. [How It Works (The Tech Behind It)](#how-it-works) 30 | 4. [Installation & Usage](#installation-usage) 31 | 5. [Testing & Validation](#testing-validation) 32 | 6. [Roadmap](#roadmap) 33 | 7. [Risks & How We’re Tackling Them](#risks) 34 | 8. [Contributing](#contributing) 35 | 9. [Closing Thoughts](#closing-thoughts) 36 | 37 | --- 38 | 39 | ## 1. Project Overview 40 | 41 | The **CUDA-to-Metal MPS Translation Project** is a PyPI module that automates the conversion of CUDA code into Metal code, specifically designed for Apple M1 devices. This includes translating CUDA kernels, mapping cuDNN functions to Metal Performance Shaders (MPS), and providing a simple way for developers to adapt their CUDA-based codebases to the M1 architecture. 42 | 43 | --- 44 | 45 | ## 2. 
45 | ## 2. Challenges & How We Solve Them
46 | 
47 | ### **Challenge 1**: CUDA and cuDNN are NVIDIA-specific and can't run on Apple M1.
48 | - **Solution**: We translate CUDA code into **Metal Shading Language (MSL)** and map cuDNN functions to **MPS** equivalents.
49 | 
50 | ### **Challenge 2**: The GPU architectures between NVIDIA and Apple are very different.
51 | - **Solution**: We build mapping layers that handle architectural differences, like memory models and threading paradigms.
52 | 
53 | ### **Challenge 3**: There are performance gaps between CUDA/cuDNN and Metal/MPS.
54 | - **Solution**: After translating, we **optimize** the code using Apple's GPU profiling tools and best practices to minimize these gaps.
55 | 
56 | ---
57 | 
58 | ## 3. How It Works (The Tech Behind It)
59 | 
60 | Here's a quick breakdown of how the project operates:
61 | 
62 | ### Core Components:
63 | - **CUDA Parser**: Reads and interprets CUDA code.
64 | - **Kernel Translator**: Converts CUDA kernels into **MSL**.
65 | - **cuDNN Mapper**: Maps cuDNN functions to **MPS** or other Metal-compatible equivalents.
66 | - **Host Code Adapter**: Translates the host-side CUDA runtime API into Metal's API (works with both Swift and Objective-C).
67 | - **CLI Tool & Python API**: A friendly interface to help you use these features without getting lost in the details.
68 | 
69 | ### Data Flow:
70 | 1. **Input**: Your CUDA source files.
71 | 2. **Process**: We parse the code, translate kernels to Metal, and map cuDNN functions to MPS.
72 | 3. **Output**: The result is Metal-compatible code that can run on Apple M1 devices.
73 | 
74 | ---
75 | 
76 | ## 4. Installation & Usage
77 | 
78 | ### Installation:
79 | 
80 | Get started by installing the package from PyPI:
81 | 
82 | ```bash
83 | pip install cuda_to_metal_mps
84 | ```
85 | 
86 | ### Usage:
87 | 
88 | The command-line interface makes it easy to use:
89 | 
90 | ```bash
91 | cuda_to_metal_mps translate --input my_cuda_project/ --output my_metal_project/ --language swift
92 | ```
93 | 
94 | ### Options:
95 | 
96 | - `--input`: Path to the CUDA source code.
97 | - `--output`: Where the translated Metal code should go.
98 | - `--language`: Choose between Swift or Objective-C for the host code.
99 | - `--config`: Optional config file to customize translations.
100 | 
101 | ### Example Workflow:
102 | 
103 | Translate the CUDA code:
104 | 
105 | ```bash
106 | cuda_to_metal_mps translate --input src/ --output metal_project/ --language swift
107 | ```
108 | 
109 | Then build the project in Xcode to run it on your Apple M1 device.
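Prefer to drive the same workflow from Python? The project also aims to expose a Python API. Its exact surface is still settling, so treat the following as a hedged sketch: the `translate_project` entry point and its parameter names are assumptions for illustration, not a documented interface.

```python
# Hypothetical Python API usage -- names and signatures are illustrative only.
from cuda_to_metal_mps import translate_project  # assumed entry point

translate_project(
    input_dir="my_cuda_project/",    # CUDA sources to convert
    output_dir="my_metal_project/",  # where the generated Metal code goes
    language="swift",                # host-code language: "swift" or "objc"
)
```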
110 | 
111 | ---
112 | 
113 | ## 5. Testing & Validation
114 | 
115 | Testing is crucial! Here's how we ensure the module works as expected:
116 | 
117 | - **Unit Tests**: We test individual components (like parsing and kernel translation).
118 | - **Integration Tests**: Run complete CUDA-to-Metal translations on sample projects.
119 | - **Performance Tests**: Compare the performance of translated Metal code with the original CUDA code.
120 | 
121 | ---
122 | 
123 | ## 6. Roadmap
124 | 
125 | Here's what the next few months look like for this project:
126 | 
127 | - **Weeks 1-5**: Set up project structure and identify core components.
128 | - **Weeks 5-10**: Develop the CUDA parser and kernel translator.
129 | - **Weeks 11-14**: Build the cuDNN-to-MPS mapper and host code adapter.
130 | - **Weeks 15-16**: Complete CLI tool, Python API, and start testing.
131 | - **Weeks 16+**: Optimize and release!
132 | 
133 | ---
134 | 
135 | ## 7. Risks & How We're Tackling Them
136 | 
137 | ### **Risk 1**: Not all cuDNN functions have a 1-to-1 MPS equivalent.
138 | - **Mitigation**: Focus on mapping the most commonly used functions first and document any gaps.
139 | 
140 | ### **Risk 2**: Translated code might not match CUDA's performance.
141 | - **Mitigation**: Use Apple's profiling tools to identify and fix bottlenecks.
142 | 
143 | ---
144 | 
145 | ## 8. Contributing
146 | 
147 | Want to help make this project better? Awesome! Here's how you can contribute:
148 | 
149 | 1. Fork the repository.
150 | 2. Create a new branch for your feature or fix.
151 | 3. Open a pull request with a description of what you've changed.
152 | 
153 | Whether it's adding new features, improving performance, or fixing bugs, every contribution is welcome! 💡
154 | 
155 | ---
156 | 
157 | ## 9. Closing Thoughts
158 | 
159 | This project is just getting started, but it already has the potential to make a big impact for developers working with Apple's M1 devices. By building a tool that automates the hard work of porting CUDA code to Metal, we're opening up new possibilities for GPU acceleration on non-NVIDIA hardware.
160 | 
161 | Feel free to dive in, give it a try, and let me know what you think! 🚀
162 | 
163 | ⚠️ **DISCLAIMER**: I'm working on this project and several others in my free time, alongside my studies and work, so be kind! This is still version 0.01, and many classes, files, and features are not yet implemented. Feel free to contribute by adding missing pieces, or by improving existing ones if you think yours offers a better solution.
164 | 
165 | ## Legal Disclaimer
166 | 
167 | This project is created for **educational purposes** only and is not intended for commercial use. I do not own or claim to own any rights to Apple's M1 architecture, Metal, Metal Performance Shaders (MPS), or NVIDIA's CUDA and cuDNN libraries. All trademarks, logos, and brand names are the property of their respective owners.
168 | 
169 | The purpose of this project is to provide developers with a tool to aid in learning about and experimenting with code translation between CUDA and Metal, and to explore the GPU capabilities of different architectures. **No warranties** are made regarding the completeness, reliability, or accuracy of the code and translations generated by this project. Use of this project is **at your own risk**.
170 | 
171 | By using or contributing to this project, you acknowledge and agree that this is an independent work and is not affiliated with, endorsed by, or associated with Apple, NVIDIA, or any other company mentioned.
172 | 
-------------------------------------------------------------------------------- /LICENSE.md: --------------------------------------------------------------------------------
1 | GNU AFFERO GENERAL PUBLIC LICENSE
2 | Version 3, 19 November 2007
3 | 
4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
6 | 7 | Preamble 8 | 9 | The GNU Affero General Public License is a free, copyleft license for software and other kinds of works, specifically designed to ensure cooperation with the community in the case of network server software. 10 | 11 | The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, our General Public Licenses are intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. 12 | 13 | When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. 14 | 15 | Developers that use our General Public Licenses protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License which gives you legal permission to copy, distribute and/or modify the software. 16 | 17 | A secondary benefit of defending all users' freedom is that improvements made in alternate versions of the program, if they receive widespread use, become available for other developers to incorporate. Many developers of free software are heartened and encouraged by the resulting cooperation. However, in the case of software used on network servers, this result may fail to come about. The GNU General Public License permits making a modified version and letting the public access it on a server without ever releasing its source code to the public. 18 | 19 | The GNU Affero General Public License is designed to fill this gap. It requires the operator of a network server to provide the source code of the modified version running there to the users of that server. Public use of a modified version, on a publicly accessible server, gives the public access to the source code of the modified version. 20 | 21 | An older license, called the Affero General Public License and published by Affero, was designed to accomplish similar goals. This is a different license, not a version of the Affero GPL, but Affero has contributed to its development by funding a project to update the GNU General Public License to address network server software. 22 | 23 | The precise terms and conditions for copying, distribution and modification follow. 24 | 25 | TERMS AND CONDITIONS 26 | 27 | ### 0. Definitions. 28 | 29 | “This License” refers to version 3 of the GNU Affero General Public License. 30 | 31 | “Copyright” also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. 32 | 33 | “The Program” refers to any copyrightable work licensed under this License. Each licensee is addressed as “you”. “Licensees” and “recipients” may be individuals or organizations. 34 | 35 | To “modify” a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a “modified version” of the earlier work or a work “based on” the earlier work. 36 | 37 | A “covered work” means either the unmodified Program or a work based on the Program. 
38 | 39 | To “propagate” a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. 40 | 41 | To “convey” a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. 42 | 43 | An interactive user interface displays “Appropriate Legal Notices” to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 44 | 45 | ### 1. Source Code. 46 | 47 | The “source code” for a work means the preferred form of the work for making modifications to it. “Object code” means any non-source form of a work. 48 | 49 | A “Standard Interface” means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. 50 | 51 | The “System Libraries” of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A “Major Component”, in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. 52 | 53 | The “Corresponding Source” for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. 54 | 55 | The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. 56 | 57 | The Corresponding Source for a work in source code form is that same work. 58 | 59 | ### 2. Basic Permissions. 60 | 61 | All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. 
This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. 62 | 63 | You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. 64 | 65 | Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 66 | 67 | ### 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 68 | 69 | No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. 70 | 71 | When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 72 | 73 | ### 4. Conveying Verbatim Copies. 74 | 75 | You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. 76 | 77 | You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 
78 | 79 | [For more information, visit the full AGPLv3.0 text](https://www.gnu.org/licenses/agpl-3.0.en.html) 80 | 81 | -------------------------------------------------------------------------------- /optimization/memory_optimizer.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Set, Tuple, Any 2 | from dataclasses import dataclass 3 | from enum import Enum 4 | import logging 5 | 6 | from ..parser.ast_nodes import ( 7 | CUDANode, CUDAType, CUDAKernel, CUDASharedMemory, 8 | CUDAThreadIdx, CUDABlockIdx 9 | ) 10 | 11 | class MemoryAccessPattern(Enum): 12 | COALESCED = "coalesced" 13 | STRIDED = "strided" 14 | RANDOM = "random" 15 | BROADCAST = "broadcast" 16 | SEQUENTIAL = "sequential" 17 | 18 | @dataclass 19 | class MemoryAccess: 20 | """Information about a memory access""" 21 | node: CUDANode 22 | type: MemoryAccessPattern 23 | stride: Optional[int] = None 24 | scope: str = "global" 25 | is_read: bool = True 26 | is_atomic: bool = False 27 | alignment: int = 16 28 | vector_width: Optional[int] = None 29 | 30 | class MemoryOptimizer: 31 | """ 32 | Optimizes memory access patterns for Metal GPU following NVIDIA best practices 33 | """ 34 | 35 | def __init__(self): 36 | self.simd_width = 32 # Metal SIMD width 37 | self.max_threads_per_group = 1024 38 | self.shared_memory_limit = 32768 # 32KB for Metal 39 | self.l1_cache_line_size = 128 # Metal cache line size 40 | self.vector_sizes = {2, 4, 8, 16} # Supported vector widths 41 | self.memory_accesses: List[MemoryAccess] = [] 42 | 43 | def optimize_kernel(self, kernel: CUDAKernel) -> CUDAKernel: 44 | """Apply memory optimizations to kernel""" 45 | # Analyze memory access patterns 46 | self._analyze_memory_accesses(kernel) 47 | 48 | # Apply optimizations 49 | kernel = self._optimize_global_memory(kernel) 50 | kernel = self._optimize_shared_memory(kernel) 51 | kernel = self._optimize_texture_memory(kernel) 52 | kernel = self._optimize_atomics(kernel) 53 | 54 | return kernel 55 | 56 | def _analyze_memory_accesses(self, kernel: CUDAKernel): 57 | """Analyze all memory accesses in kernel""" 58 | self.memory_accesses.clear() 59 | 60 | def visit_node(node: CUDANode): 61 | if access := self._detect_memory_access(node): 62 | self.memory_accesses.append(access) 63 | 64 | kernel.traverse(visit_node) 65 | 66 | # Group and analyze patterns 67 | self._analyze_access_patterns() 68 | 69 | def _detect_memory_access(self, node: CUDANode) -> Optional[MemoryAccess]: 70 | """Detect memory access type and pattern""" 71 | if not hasattr(node, 'cuda_type'): 72 | return None 73 | 74 | # Check for array access 75 | if self._is_array_access(node): 76 | pattern = self._determine_access_pattern(node) 77 | scope = self._determine_memory_scope(node) 78 | 79 | return MemoryAccess( 80 | node=node, 81 | type=pattern, 82 | scope=scope, 83 | stride=self._calculate_stride(node), 84 | vector_width=self._detect_vector_width(node), 85 | alignment=self._check_alignment(node) 86 | ) 87 | 88 | return None 89 | 90 | def _is_array_access(self, node: CUDANode) -> bool: 91 | """Check if node represents array access""" 92 | return hasattr(node, 'is_pointer') and node.is_pointer 93 | 94 | def _determine_access_pattern(self, node: CUDANode) -> MemoryAccessPattern: 95 | """Determine memory access pattern""" 96 | thread_idx = self._find_thread_index(node) 97 | if not thread_idx: 98 | return MemoryAccessPattern.RANDOM 99 | 100 | # Check for coalesced access 101 | if self._is_coalesced_access(node, thread_idx): 102 | 
return MemoryAccessPattern.COALESCED 103 | 104 | # Check for strided access 105 | stride = self._calculate_stride(node) 106 | if stride: 107 | return MemoryAccessPattern.STRIDED 108 | 109 | # Check for broadcast 110 | if self._is_broadcast_access(node): 111 | return MemoryAccessPattern.BROADCAST 112 | 113 | return MemoryAccessPattern.RANDOM 114 | 115 | def _optimize_global_memory(self, kernel: CUDAKernel) -> CUDAKernel: 116 | """Optimize global memory access patterns""" 117 | coalescing_opportunities = [ 118 | access for access in self.memory_accesses 119 | if access.scope == "global" and access.type != MemoryAccessPattern.COALESCED 120 | ] 121 | 122 | # Apply vectorization where possible 123 | for access in coalescing_opportunities: 124 | if self._can_vectorize(access): 125 | kernel = self._apply_vectorization(kernel, access) 126 | 127 | # Optimize array indexing 128 | kernel = self._optimize_array_indexing(kernel) 129 | 130 | # Add padding for alignment 131 | kernel = self._add_memory_padding(kernel) 132 | 133 | return kernel 134 | 135 | def _optimize_shared_memory(self, kernel: CUDAKernel) -> CUDAKernel: 136 | """Optimize shared memory usage""" 137 | shared_vars = [ 138 | node for node in kernel.children 139 | if isinstance(node, CUDASharedMemory) 140 | ] 141 | 142 | total_size = 0 143 | for var in shared_vars: 144 | # Optimize bank conflicts 145 | var = self._resolve_bank_conflicts(var) 146 | 147 | # Track size 148 | size = self._calculate_shared_memory_size(var) 149 | total_size += size 150 | 151 | if total_size > self.shared_memory_limit: 152 | logging.warning(f"Shared memory usage {total_size} exceeds Metal limit {self.shared_memory_limit}") 153 | 154 | return kernel 155 | 156 | def _optimize_texture_memory(self, kernel: CUDAKernel) -> CUDAKernel: 157 | """Optimize texture memory usage""" 158 | # Find read-only array accesses that could use textures 159 | candidate_arrays = [ 160 | access for access in self.memory_accesses 161 | if access.scope == "global" and access.is_read and not access.is_atomic 162 | ] 163 | 164 | for access in candidate_arrays: 165 | if self._should_use_texture(access): 166 | kernel = self._convert_to_texture(kernel, access) 167 | 168 | return kernel 169 | 170 | def _optimize_atomics(self, kernel: CUDAKernel) -> CUDAKernel: 171 | """Optimize atomic operations""" 172 | atomic_accesses = [ 173 | access for access in self.memory_accesses 174 | if access.is_atomic 175 | ] 176 | 177 | for access in atomic_accesses: 178 | # Try to use simdgroup operations 179 | if self._can_use_simdgroup(access): 180 | kernel = self._convert_to_simdgroup(kernel, access) 181 | else: 182 | # Optimize atomic memory layout 183 | kernel = self._optimize_atomic_layout(kernel, access) 184 | 185 | return kernel 186 | 187 | def _resolve_bank_conflicts(self, shared_var: CUDASharedMemory) -> CUDASharedMemory: 188 | """Resolve shared memory bank conflicts""" 189 | if not self._has_bank_conflicts(shared_var): 190 | return shared_var 191 | 192 | # Add padding to avoid conflicts 193 | padding = self._calculate_padding(shared_var) 194 | shared_var.size += padding 195 | 196 | return shared_var 197 | 198 | def _calculate_padding(self, var: CUDASharedMemory) -> int: 199 | """Calculate padding to avoid bank conflicts""" 200 | type_size = self._get_type_size(var.cuda_type) 201 | banks = 32 # Metal uses 32 banks 202 | 203 | if var.size % banks == 0: 204 | return 0 205 | 206 | return banks - (var.size % banks) 207 | 208 | def _can_vectorize(self, access: MemoryAccess) -> bool: 209 | """Check if memory 
access can be vectorized""" 210 | if not access.stride: 211 | return False 212 | 213 | # Check if stride matches vector size 214 | return ( 215 | access.stride in self.vector_sizes and 216 | access.alignment >= access.stride * 4 and # 4 bytes per element 217 | not access.is_atomic 218 | ) 219 | 220 | def _should_use_texture(self, access: MemoryAccess) -> bool: 221 | """Determine if array should use texture memory""" 222 | return ( 223 | access.is_read and 224 | not access.is_atomic and 225 | access.type in {MemoryAccessPattern.RANDOM, MemoryAccessPattern.STRIDED} and 226 | self._get_type_size(access.node.cuda_type) <= 16 # Max texture element size 227 | ) 228 | 229 | def _can_use_simdgroup(self, access: MemoryAccess) -> bool: 230 | """Check if atomic can use simdgroup operations""" 231 | return ( 232 | access.is_atomic and 233 | access.type == MemoryAccessPattern.SEQUENTIAL and 234 | self._is_reduction_pattern(access) 235 | ) 236 | 237 | def _get_type_size(self, cuda_type: CUDAType) -> int: 238 | """Get size of CUDA type in bytes""" 239 | size_map = { 240 | CUDAType.CHAR: 1, 241 | CUDAType.SHORT: 2, 242 | CUDAType.INT: 4, 243 | CUDAType.FLOAT: 4, 244 | CUDAType.DOUBLE: 8, 245 | } 246 | return size_map.get(cuda_type, 4) # Default to 4 bytes 247 | 248 | def get_optimization_report(self) -> Dict[str, Any]: 249 | """Generate memory optimization report""" 250 | return { 251 | "access_patterns": { 252 | pattern.value: len([a for a in self.memory_accesses if a.type == pattern]) 253 | for pattern in MemoryAccessPattern 254 | }, 255 | "vectorization_opportunities": len([ 256 | a for a in self.memory_accesses if self._can_vectorize(a) 257 | ]), 258 | "texture_candidates": len([ 259 | a for a in self.memory_accesses if self._should_use_texture(a) 260 | ]), 261 | "bank_conflicts": len([ 262 | a for a in self.memory_accesses 263 | if a.scope == "shared" and self._has_bank_conflicts(a.node) 264 | ]), 265 | "simdgroup_opportunities": len([ 266 | a for a in self.memory_accesses if self._can_use_simdgroup(a) 267 | ]) 268 | } -------------------------------------------------------------------------------- /templates/metal/kernel_template.metal: -------------------------------------------------------------------------------- 1 | #include <metal_stdlib> 2 | #include <metal_atomic> 3 | #include <metal_math> 4 | #include <metal_simdgroup> 5 | 6 | using namespace metal; 7 | 8 | // Utility functions for thread/block mapping 9 | namespace cuda { 10 | // Thread indexing 11 | struct uint3 { 12 | uint x, y, z; 13 | }; 14 | 15 | struct float3 { 16 | float x, y, z; 17 | }; 18 | 19 | // Device functions for CUDA compatibility 20 | METAL_FUNC uint3 get_thread_idx( 21 | uint3 thread_position_in_threadgroup, 22 | uint3 threads_per_threadgroup 23 | ) { 24 | return uint3{ 25 | thread_position_in_threadgroup.x, 26 | thread_position_in_threadgroup.y, 27 | thread_position_in_threadgroup.z 28 | }; 29 | } 30 | 31 | METAL_FUNC uint3 get_block_idx( 32 | uint3 threadgroup_position_in_grid, 33 | uint3 threads_per_threadgroup 34 | ) { 35 | return uint3{ 36 | threadgroup_position_in_grid.x, 37 | threadgroup_position_in_grid.y, 38 | threadgroup_position_in_grid.z 39 | }; 40 | } 41 | 42 | // Atomic operations 43 | template <typename T> 44 | METAL_FUNC T atomicAdd(device atomic_uint* addr, T val) { 45 | return atomic_fetch_add_explicit(addr, val, memory_order_relaxed); 46 | } 47 | 48 | template <typename T> 49 | METAL_FUNC T atomicMax(device atomic_uint* addr, T val) { 50 | return atomic_fetch_max_explicit(addr, val, memory_order_relaxed); 51 | } 52 | 53 | // Sync functions 54 | METAL_FUNC void __syncthreads() { 55 |
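/* CUDA's __syncthreads() is both an execution barrier and a memory fence for the block; Metal's threadgroup_barrier(mem_flags::mem_threadgroup) below provides the same semantics at threadgroup scope. */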
threadgroup_barrier(mem_flags::mem_threadgroup); 56 | } 57 | 58 | METAL_FUNC void __threadfence() { 59 | threadgroup_barrier(mem_flags::mem_device); 60 | } 61 | 62 | // Math functions 63 | METAL_FUNC float __fdividef(float a, float b) { 64 | return a / b; 65 | } 66 | 67 | METAL_FUNC float __expf(float x) { 68 | return metal::exp(x); 69 | } 70 | } 71 | 72 | // Kernel struct for shared state 73 | struct KernelState { 74 | uint3 thread_idx; 75 | uint3 block_idx; 76 | uint3 block_dim; 77 | uint3 grid_dim; 78 | uint simd_lane_id; 79 | uint simd_group_id; 80 | }; 81 | 82 | // Initialize kernel state 83 | METAL_FUNC KernelState init_kernel_state( 84 | uint3 thread_position_in_threadgroup [[thread_position_in_threadgroup]], 85 | uint3 threadgroup_position_in_grid [[threadgroup_position_in_grid]], 86 | uint3 threads_per_threadgroup [[threads_per_threadgroup]], 87 | uint3 threadgroups_per_grid [[threadgroups_per_grid]] 88 | ) { 89 | KernelState state; 90 | 91 | state.thread_idx = cuda::get_thread_idx( 92 | thread_position_in_threadgroup, 93 | threads_per_threadgroup 94 | ); 95 | 96 | state.block_idx = cuda::get_block_idx( 97 | threadgroup_position_in_grid, 98 | threads_per_threadgroup 99 | ); 100 | 101 | state.block_dim = threads_per_threadgroup; 102 | state.grid_dim = threadgroups_per_grid; 103 | 104 | state.simd_lane_id = thread_position_in_threadgroup.x & 0x1F; 105 | state.simd_group_id = thread_position_in_threadgroup.x >> 5; 106 | 107 | return state; 108 | } 109 | 110 | // Common kernel parameters struct 111 | struct KernelParams { 112 | uint problem_size; 113 | uint batch_size; 114 | float learning_rate; 115 | // Add other common parameters 116 | }; 117 | 118 | // Example kernel - will be replaced by translation 119 | kernel void example_kernel( 120 | device float* input [[buffer(0)]], 121 | device float* output [[buffer(1)]], 122 | constant KernelParams& params [[buffer(2)]], 123 | uint3 thread_position_in_threadgroup [[thread_position_in_threadgroup]], 124 | uint3 threadgroup_position_in_grid [[threadgroup_position_in_grid]], 125 | uint3 threads_per_threadgroup [[threads_per_threadgroup]], 126 | uint3 threadgroups_per_grid [[threadgroups_per_grid]] 127 | ) { 128 | // Initialize kernel state 129 | KernelState state = init_kernel_state( 130 | thread_position_in_threadgroup, 131 | threadgroup_position_in_grid, 132 | threads_per_threadgroup, 133 | threadgroups_per_grid 134 | ); 135 | 136 | // Example shared memory 137 | threadgroup float shared_data[1024]; 138 | 139 | // Example CUDA-style indexing 140 | uint idx = (state.block_idx.x * state.block_dim.x) + state.thread_idx.x; 141 | if (idx >= params.problem_size) return; 142 | 143 | // Example computation with shared memory 144 | shared_data[state.thread_idx.x] = input[idx]; 145 | cuda::__syncthreads(); 146 | 147 | output[idx] = shared_data[state.thread_idx.x] * params.learning_rate; 148 | } 149 | // CUDA Performance Primitives (cuBLAS-like functions) 150 | namespace cublas { 151 | // Matrix multiply 152 | METAL_FUNC void gemm( 153 | device const float* A, 154 | device const float* B, 155 | device float* C, 156 | uint M, uint N, uint K, 157 | threadgroup float* shared_mem [[threadgroup(0)]] 158 | ) { 159 | constexpr uint TILE_SIZE = 16; 160 | uint2 tid = uint2(threadIdx_x, threadIdx_y); 161 | uint2 bid = uint2(blockIdx_x, blockIdx_y); 162 | 163 | // Tile start positions 164 | uint row = bid.y * TILE_SIZE + tid.y; 165 | uint col = bid.x * TILE_SIZE + tid.x; 166 | 167 | // Accumulator for dot product 168 | float acc = 0.0f; 169 | 170 | // Loop 
over tiles 171 | for (uint t = 0; t < K; t += TILE_SIZE) { 172 | // Load tile into shared memory 173 | threadgroup float* tile_A = shared_mem; 174 | threadgroup float* tile_B = shared_mem + TILE_SIZE * TILE_SIZE; 175 | 176 | if (row < M && (t + tid.x) < K) 177 | tile_A[tid.y * TILE_SIZE + tid.x] = A[row * K + t + tid.x]; 178 | if (col < N && (t + tid.y) < K) 179 | tile_B[tid.y * TILE_SIZE + tid.x] = B[(t + tid.y) * N + col]; 180 | 181 | threadgroup_barrier(mem_flags::mem_threadgroup); 182 | 183 | // Compute partial dot product 184 | for (uint k = 0; k < TILE_SIZE; k++) { 185 | acc += tile_A[tid.y * TILE_SIZE + k] * 186 | tile_B[k * TILE_SIZE + tid.x]; 187 | } 188 | 189 | threadgroup_barrier(mem_flags::mem_threadgroup); 190 | } 191 | 192 | // Store result 193 | if (row < M && col < N) 194 | C[row * N + col] = acc; 195 | } 196 | 197 | // Vector operations 198 | METAL_FUNC void axpy( 199 | device const float* x, 200 | device float* y, 201 | float alpha, 202 | uint n 203 | ) { 204 | uint idx = (blockIdx_x * blockDim_x) + threadIdx_x; 205 | if (idx < n) 206 | y[idx] = alpha * x[idx] + y[idx]; 207 | } 208 | } 209 | 210 | // Common Deep Learning Primitives 211 | namespace cudnn { 212 | // ReLU activation 213 | METAL_FUNC void relu( 214 | device const float* input, 215 | device float* output, 216 | uint size 217 | ) { 218 | uint idx = (blockIdx_x * blockDim_x) + threadIdx_x; 219 | if (idx < size) 220 | output[idx] = max(0.0f, input[idx]); 221 | } 222 | 223 | // Softmax 224 | METAL_FUNC void softmax( 225 | device const float* input, 226 | device float* output, 227 | uint batch_size, 228 | uint feature_size, 229 | threadgroup float* shared_mem [[threadgroup(0)]] 230 | ) { 231 | uint tid = threadIdx_x; 232 | uint bid = blockIdx_x; 233 | 234 | if (bid >= batch_size) return; 235 | 236 | // Find max value 237 | float max_val = -INFINITY; 238 | for (uint i = tid; i < feature_size; i += blockDim_x) 239 | max_val = max(max_val, input[bid * feature_size + i]); 240 | 241 | threadgroup float* shared_max = shared_mem; 242 | shared_max[tid] = max_val; 243 | threadgroup_barrier(mem_flags::mem_threadgroup); 244 | 245 | // Reduce to find global max 246 | for (uint stride = blockDim_x/2; stride > 0; stride >>= 1) { 247 | if (tid < stride) 248 | shared_max[tid] = max(shared_max[tid], shared_max[tid + stride]); 249 | threadgroup_barrier(mem_flags::mem_threadgroup); 250 | } 251 | max_val = shared_max[0]; 252 | 253 | // Compute exp and sum 254 | float sum = 0.0f; 255 | for (uint i = tid; i < feature_size; i += blockDim_x) { 256 | float val = exp(input[bid * feature_size + i] - max_val); 257 | output[bid * feature_size + i] = val; 258 | sum += val; 259 | } 260 | 261 | threadgroup float* shared_sum = shared_mem; 262 | shared_sum[tid] = sum; 263 | threadgroup_barrier(mem_flags::mem_threadgroup); 264 | 265 | // Reduce to find global sum 266 | for (uint stride = blockDim_x/2; stride > 0; stride >>= 1) { 267 | if (tid < stride) 268 | shared_sum[tid] += shared_sum[tid + stride]; 269 | threadgroup_barrier(mem_flags::mem_threadgroup); 270 | } 271 | sum = shared_sum[0]; 272 | 273 | // Normalize 274 | for (uint i = tid; i < feature_size; i += blockDim_x) 275 | output[bid * feature_size + i] /= sum; 276 | } 277 | } 278 | 279 | // Memory optimization utilities 280 | namespace cuda_utils { 281 | // Coalesced memory copy 282 | METAL_FUNC void coalesced_copy( 283 | device const float* src, 284 | device float* dst, 285 | uint size 286 | ) { 287 | uint idx = (blockIdx_x * blockDim_x) + threadIdx_x; 288 | if (idx >= size) return; 289 | 
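/* Fast-path assumption for the branch below: when idx is a multiple of 4 and at least four elements remain, the four consecutive floats are moved as a single 16-byte float4, keeping the transaction vectorized and coalesced across the SIMD-group. MTLBuffer contents are sufficiently aligned for this, so the 16-byte-aligned cast is legal under that assumption. */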
290 | // Vector load/store when possible 291 | if ((idx + 3) < size && (idx % 4) == 0) { 292 | float4 vec = *reinterpret_cast<const device float4*>(&src[idx]); 293 | *reinterpret_cast<device float4*>(&dst[idx]) = vec; 294 | } else { 295 | dst[idx] = src[idx]; 296 | } 297 | } 298 | 299 | // Strided memory access pattern 300 | METAL_FUNC void strided_copy( 301 | device const float* src, 302 | device float* dst, 303 | uint size, 304 | uint stride 305 | ) { 306 | uint idx = threadIdx_x + blockDim_x * blockIdx_x; 307 | uint offset = idx * stride; 308 | 309 | if (offset >= size) return; 310 | 311 | for (uint i = 0; i < stride && (offset + i) < size; i++) 312 | dst[offset + i] = src[offset + i]; 313 | } 314 | } 315 | 316 | // Warp-level primitives 317 | namespace cuda_warp { 318 | // Warp reduce sum 319 | METAL_FUNC float warp_reduce_sum(float val) { 320 | 321 | 322 | // Butterfly reduction across the SIMD-group 323 | for (uint offset = METAL_WARP_SIZE/2; offset > 0; offset >>= 1) 324 | val += simd_shuffle_xor(val, offset); 325 | 326 | return val; 327 | } 328 | 329 | // Warp reduce max 330 | METAL_FUNC float warp_reduce_max(float val) { 331 | 332 | 333 | for (uint offset = METAL_WARP_SIZE/2; offset > 0; offset >>= 1) 334 | val = max(val, simd_shuffle_xor(val, offset)); 335 | 336 | return val; 337 | } 338 | 339 | // Warp broadcast 340 | METAL_FUNC float warp_broadcast(float val, uint src_lane) { 341 | return simd_broadcast(val, src_lane); 342 | } 343 | } -------------------------------------------------------------------------------- /cli/cli.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | from pathlib import Path 4 | from typing import Dict, List, Optional, Union, Any 5 | from concurrent.futures import ThreadPoolExecutor 6 | from dataclasses import dataclass 7 | import os  # used by the worker-count calculation in __init__ 8 | from ..parser.cuda_parser import CudaParser 9 | from ..translator.kernel_translator import KernelTranslator 10 | from ..translator.host_adapter import HostAdapter 11 | from ..optimizer.metal_optimizer import MetalOptimizer 12 | from ..utils.error_handler import CudaTranslationError 13 | from ..utils.logger import get_logger 14 | from .config_parser import ConfigParser, MetalConfig 15 | 16 | logger = get_logger(__name__) 17 | 18 | @dataclass 19 | class TranslationConfig: 20 | """Translation configuration parameters""" 21 | input_path: Path 22 | output_path: Path 23 | metal_target: str = "2.4" 24 | optimization_level: int = 2 25 | generate_tests: bool = True 26 | preserve_comments: bool = True 27 | source_map: bool = True 28 | enable_profiling: bool = False 29 | 30 | class CLI: 31 | """ 32 | Production-grade CLI implementation for CUDA to Metal translation. 33 | Thread-safe, optimized for performance, with comprehensive error handling. 34 | """ 35 | 36 | def __init__(self): 37 | """Initialize CLI with required components""" 38 | self.parser = CudaParser() 39 | self.kernel_translator = KernelTranslator() 40 | self.host_adapter = HostAdapter() 41 | self.optimizer = MetalOptimizer() 42 | self.config_parser = ConfigParser() 43 | 44 | # Thread pool for parallel processing 45 | self.executor = ThreadPoolExecutor(max_workers=min(32, (os.cpu_count() or 1) * 4)) 46 | 47 | # Translation cache for performance 48 | self._translation_cache: Dict[str, Any] = {} 49 | 50 | def run(self) -> int: 51 | """ 52 | Main entry point for CLI execution.
53 | Returns exit code (0 for success, non-zero for error) 54 | """ 55 | try: 56 | args = self._parse_arguments() 57 | config = self._load_configuration(args) 58 | 59 | if args.command == 'translate': 60 | return self._handle_translation(args, config) 61 | elif args.command == 'validate': 62 | return self._handle_validation(args) 63 | elif args.command == 'analyze': 64 | return self._handle_analysis(args) 65 | 66 | logger.error(f"Unknown command: {args.command}") 67 | return 1 68 | 69 | except Exception as e: 70 | logger.error(f"Error during execution: {str(e)}") 71 | return 1 72 | finally: 73 | self.executor.shutdown(wait=True) 74 | 75 | def _parse_arguments(self) -> argparse.Namespace: 76 | """Parse and validate command line arguments""" 77 | parser = argparse.ArgumentParser( 78 | description='CUDA to Metal Translation Tool', 79 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 80 | ) 81 | 82 | parser.add_argument( 83 | '--verbose', '-v', 84 | action='count', 85 | default=0, 86 | help='Increase output verbosity' 87 | ) 88 | 89 | parser.add_argument( 90 | '--config', 91 | type=str, 92 | help='Path to configuration file' 93 | ) 94 | 95 | subparsers = parser.add_subparsers(dest='command', required=True) 96 | 97 | # Translation command 98 | translate_parser = subparsers.add_parser('translate') 99 | translate_parser.add_argument( 100 | 'input', 101 | type=str, 102 | help='Input CUDA file or directory' 103 | ) 104 | translate_parser.add_argument( 105 | 'output', 106 | type=str, 107 | help='Output directory for Metal code' 108 | ) 109 | translate_parser.add_argument( 110 | '--language', 111 | choices=['swift', 'objc'], 112 | default='swift', 113 | help='Output language for host code' 114 | ) 115 | translate_parser.add_argument( 116 | '--optimize', 117 | type=int, 118 | choices=[0, 1, 2, 3], 119 | default=2, 120 | help='Optimization level' 121 | ) 122 | 123 | # Validation command 124 | validate_parser = subparsers.add_parser('validate') 125 | validate_parser.add_argument( 126 | 'input', 127 | type=str, 128 | help='Input CUDA file or directory to validate' 129 | ) 130 | 131 | # Analysis command 132 | analyze_parser = subparsers.add_parser('analyze') 133 | analyze_parser.add_argument( 134 | 'input', 135 | type=str, 136 | help='Input CUDA file or directory to analyze' 137 | ) 138 | 139 | args = parser.parse_args() 140 | 141 | # Set logging level based on verbosity 142 | if args.verbose == 1: 143 | logging.getLogger().setLevel(logging.INFO) 144 | elif args.verbose >= 2: 145 | logging.getLogger().setLevel(logging.DEBUG) 146 | 147 | return args 148 | 149 | def _load_configuration(self, args: argparse.Namespace) -> Dict[str, Any]: 150 | """Load and validate configuration from file""" 151 | if not args.config: 152 | return {} 153 | 154 | try: 155 | return self.config_parser.parse(args.config) 156 | except Exception as e: 157 | logger.error(f"Failed to parse configuration: {e}") 158 | raise 159 | 160 | def _handle_translation(self, args: argparse.Namespace, config: Dict[str, Any]) -> int: 161 | """Handle translation command with full error handling""" 162 | try: 163 | input_path = Path(args.input) 164 | output_path = Path(args.output) 165 | 166 | # Validate paths 167 | if not input_path.exists(): 168 | raise CudaTranslationError(f"Input path does not exist: {input_path}") 169 | 170 | output_path.mkdir(parents=True, exist_ok=True) 171 | 172 | if input_path.is_file(): 173 | return self._translate_file(input_path, output_path, args, config) 174 | elif input_path.is_dir(): 175 | return 
self._translate_directory(input_path, output_path, args, config) 176 | 177 | logger.error(f"Invalid input path: {input_path}") 178 | return 1 179 | 180 | except Exception as e: 181 | logger.error(f"Translation failed: {e}") 182 | return 1 183 | 184 | def _translate_file(self, input_file: Path, output_dir: Path, 185 | args: argparse.Namespace, config: Dict[str, Any]) -> int: 186 | """Translate single CUDA file to Metal""" 187 | try: 188 | logger.info(f"Translating file: {input_file}") 189 | 190 | # Parse CUDA code 191 | ast = self.parser.parse_file(str(input_file)) 192 | 193 | # Apply optimizations 194 | if args.optimize > 0: 195 | ast = self.optimizer.optimize(ast, args.optimize) 196 | 197 | # Generate Metal code 198 | metal_code = self.kernel_translator.translate_kernel(ast) 199 | 200 | # Generate host code 201 | if args.language == 'swift': 202 | host_code = self._generate_swift_host_code(ast) 203 | else: 204 | host_code = self._generate_objc_host_code(ast) 205 | 206 | # Write output files 207 | output_base = output_dir / input_file.stem 208 | metal_file = output_base.with_suffix('.metal') 209 | host_file = output_base.with_suffix( 210 | '.swift' if args.language == 'swift' else '.m' 211 | ) 212 | 213 | metal_file.write_text(metal_code) 214 | host_file.write_text(host_code) 215 | 216 | logger.info(f"Successfully translated {input_file}") 217 | return 0 218 | 219 | except Exception as e: 220 | logger.error(f"Failed to translate {input_file}: {e}") 221 | return 1 222 | 223 | def _generate_swift_host_code(self, ast: Any) -> str: 224 | """Generate Swift host code with proper Metal setup""" 225 | metal_code = [] 226 | 227 | # Import statements 228 | metal_code.append(""" 229 | import Metal 230 | import MetalKit 231 | 232 | // MARK: - Metal Setup 233 | guard let device = MTLCreateSystemDefaultDevice() else { 234 | fatalError("Metal is not supported on this device") 235 | } 236 | 237 | guard let commandQueue = device.makeCommandQueue() else { 238 | fatalError("Failed to create command queue") 239 | } 240 | """) 241 | 242 | # Add buffer creation 243 | for buffer in self._extract_buffers(ast): 244 | metal_code.append(self._generate_swift_buffer(buffer)) 245 | 246 | # Add kernel execution 247 | for kernel in self._extract_kernels(ast): 248 | metal_code.append(self._generate_swift_kernel_execution(kernel)) 249 | 250 | return "\n".join(metal_code) 251 | 252 | def _generate_objc_host_code(self, ast: Any) -> str: 253 | """Generate Objective-C host code with proper Metal setup""" 254 | metal_code = [] 255 | 256 | # Import and setup 257 | metal_code.append(""" 258 | #import 259 | #import 260 | 261 | id device = MTLCreateSystemDefaultDevice(); 262 | if (!device) { 263 | NSLog(@"Metal is not supported on this device"); 264 | return; 265 | } 266 | 267 | id commandQueue = [device newCommandQueue]; 268 | if (!commandQueue) { 269 | NSLog(@"Failed to create command queue"); 270 | return; 271 | } 272 | """) 273 | 274 | # Add buffer creation 275 | for buffer in self._extract_buffers(ast): 276 | metal_code.append(self._generate_objc_buffer(buffer)) 277 | 278 | # Add kernel execution 279 | for kernel in self._extract_kernels(ast): 280 | metal_code.append(self._generate_objc_kernel_execution(kernel)) 281 | 282 | return "\n".join(metal_code) 283 | 284 | def _extract_kernels(self, ast: Any) -> List[Any]: 285 | """Extract kernel nodes from AST""" 286 | kernels = [] 287 | for node in ast.walk_preorder(): 288 | if hasattr(node, 'is_kernel') and node.is_kernel(): 289 | kernels.append(node) 290 | return kernels 291 | 292 
| def _extract_buffers(self, ast: Any) -> List[Any]: 293 | """Extract buffer nodes from AST""" 294 | buffers = [] 295 | for node in ast.walk_preorder(): 296 | if hasattr(node, 'is_buffer') and node.is_buffer(): 297 | buffers.append(node) 298 | return buffers 299 | 300 | def cleanup(self): 301 | """Clean up resources""" 302 | try: 303 | self.executor.shutdown(wait=True) 304 | except Exception as e: 305 | logger.error(f"Error during cleanup: {e}") 306 | 307 | # Direct script execution 308 | def main(): 309 | """Main entry point for CLI""" 310 | cli = CLI() 311 | try: 312 | return cli.run() 313 | finally: 314 | cli.cleanup() 315 | 316 | if __name__ == '__main__': 317 | import sys 318 | sys.exit(main()) -------------------------------------------------------------------------------- /templates/objc/kernel_wrapper.m: -------------------------------------------------------------------------------- 1 | #import <Foundation/Foundation.h> 2 | #import <Metal/Metal.h> 3 | #import "kernel_wrapper.h" 4 | 5 | // CUDA-style error codes 6 | typedef NS_ENUM(NSInteger, CUDAError) { 7 | cudaSuccess = 0, 8 | cudaErrorDeviceNotFound = 1, 9 | cudaErrorMemoryAllocation = 2, 10 | cudaErrorInvalidValue = 3, 11 | cudaErrorLaunchFailure = 4 12 | }; 13 | 14 | @implementation CUDAMetalDevice { 15 | id<MTLDevice> _device; 16 | id<MTLCommandQueue> _commandQueue; 17 | NSMutableDictionary<NSString*, id<MTLComputePipelineState>>* _kernelPipelineStates; 18 | NSMutableDictionary<NSString*, id<MTLFunction>>* _kernelFunctions; 19 | NSMutableDictionary<NSValue*, id<MTLBuffer>>* _allocatedBuffers; 20 | } 21 | 22 | - (instancetype)init { 23 | self = [super init]; 24 | if (self) { 25 | _device = MTLCreateSystemDefaultDevice(); 26 | if (!_device) { 27 | return nil; 28 | } 29 | 30 | _commandQueue = [_device newCommandQueue]; 31 | if (!_commandQueue) { 32 | return nil; 33 | } 34 | 35 | _kernelPipelineStates = [NSMutableDictionary new]; 36 | _kernelFunctions = [NSMutableDictionary new]; 37 | _allocatedBuffers = [NSMutableDictionary new]; 38 | } 39 | return self; 40 | } 41 | 42 | // CUDA Memory Management 43 | - (CUDAError)cudaMalloc:(void**)ptr size:(size_t)size { 44 | id<MTLBuffer> buffer = [_device newBufferWithLength:size 45 | options:MTLResourceStorageModeShared]; 46 | if (!buffer) { 47 | return cudaErrorMemoryAllocation; 48 | } 49 | 50 | *ptr = buffer.contents; 51 | [_allocatedBuffers setObject:buffer forKey:[NSValue valueWithPointer:*ptr]]; 52 | 53 | return cudaSuccess; 54 | } 55 | 56 | - (CUDAError)cudaFree:(void*)ptr { 57 | [_allocatedBuffers removeObjectForKey:[NSValue valueWithPointer:ptr]]; 58 | return cudaSuccess; 59 | } 60 | 61 | - (CUDAError)cudaMemcpy:(void*)dst 62 | src:(const void*)src 63 | size:(size_t)size 64 | kind:(CUDAMemcpyKind)kind { 65 | switch (kind) { 66 | case cudaMemcpyHostToDevice: { 67 | id<MTLBuffer> buffer = [_allocatedBuffers objectForKey:[NSValue valueWithPointer:dst]]; 68 | if (!buffer) return cudaErrorInvalidValue; 69 | memcpy(buffer.contents, src, size); 70 | break; 71 | } 72 | 73 | case cudaMemcpyDeviceToHost: { 74 | id<MTLBuffer> buffer = [_allocatedBuffers objectForKey:[NSValue valueWithPointer:src]]; 75 | if (!buffer) return cudaErrorInvalidValue; 76 | memcpy(dst, buffer.contents, size); 77 | break; 78 | } 79 | 80 | case cudaMemcpyDeviceToDevice: { 81 | id<MTLBuffer> srcBuffer = [_allocatedBuffers objectForKey:[NSValue valueWithPointer:src]]; 82 | id<MTLBuffer> dstBuffer = [_allocatedBuffers objectForKey:[NSValue valueWithPointer:dst]]; 83 | if (!srcBuffer || !dstBuffer) return cudaErrorInvalidValue; 84 | 85 | id<MTLCommandBuffer> commandBuffer = [_commandQueue commandBuffer]; 86 | id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer blitCommandEncoder]; 87 | 88 | [blitEncoder copyFromBuffer:srcBuffer 89 | sourceOffset:0 90 | toBuffer:dstBuffer
91 | destinationOffset:0 92 | size:size]; 93 | 94 | [blitEncoder endEncoding]; 95 | [commandBuffer commit]; 96 | [commandBuffer waitUntilCompleted]; 97 | break; 98 | } 99 | } 100 | return cudaSuccess; 101 | } 102 | 103 | // Kernel Management 104 | - (CUDAError)loadMetalLibraryWithURL:(NSURL*)url error:(NSError**)error { 105 | id library = [_device newLibraryWithURL:url error:error]; 106 | if (!library) { 107 | return cudaErrorLaunchFailure; 108 | } 109 | 110 | // Load all kernel functions 111 | for (NSString* functionName in library.functionNames) { 112 | id function = [library newFunctionWithName:functionName]; 113 | if (!function) continue; 114 | 115 | _kernelFunctions[functionName] = function; 116 | 117 | // Create pipeline state 118 | id pipelineState = 119 | [_device newComputePipelineStateWithFunction:function error:error]; 120 | if (pipelineState) { 121 | _kernelPipelineStates[functionName] = pipelineState; 122 | } 123 | } 124 | 125 | return cudaSuccess; 126 | } 127 | 128 | // CUDA Kernel Launch 129 | - (CUDAError)launchKernel:(NSString*)name 130 | gridDim:(MTLSize)gridDim 131 | blockDim:(MTLSize)blockDim 132 | arguments:(NSArray>*)arguments { 133 | 134 | id pipelineState = _kernelPipelineStates[name]; 135 | if (!pipelineState) { 136 | return cudaErrorLaunchFailure; 137 | } 138 | 139 | id commandBuffer = [_commandQueue commandBuffer]; 140 | id computeEncoder = [commandBuffer computeCommandEncoder]; 141 | 142 | // Set compute pipeline state 143 | [computeEncoder setComputePipelineState:pipelineState]; 144 | 145 | // Set buffer arguments 146 | [arguments enumerateObjectsUsingBlock:^(id buffer, NSUInteger idx, BOOL *stop) { 147 | [computeEncoder setBuffer:buffer offset:0 atIndex:idx]; 148 | }]; 149 | 150 | // Calculate threadgroup size 151 | NSUInteger threadGroupWidth = blockDim.width; 152 | NSUInteger threadGroupHeight = blockDim.height; 153 | NSUInteger threadGroupDepth = blockDim.depth; 154 | 155 | MTLSize threadsPerThreadgroup = MTLSizeMake(threadGroupWidth, 156 | threadGroupHeight, 157 | threadGroupDepth); 158 | 159 | // Dispatch threads 160 | [computeEncoder dispatchThreadgroups:gridDim 161 | threadsPerThreadgroup:threadsPerThreadgroup]; 162 | 163 | [computeEncoder endEncoding]; 164 | [commandBuffer commit]; 165 | 166 | return cudaSuccess; 167 | } 168 | 169 | // Helper Methods 170 | - (CUDAError)setBuffer:(void*)data 171 | size:(size_t)size 172 | forKernel:(NSString*)kernelName 173 | atIndex:(NSUInteger)index { 174 | 175 | id buffer = [_device newBufferWithBytes:data 176 | length:size 177 | options:MTLResourceStorageModeShared]; 178 | if (!buffer) { 179 | return cudaErrorMemoryAllocation; 180 | } 181 | 182 | _allocatedBuffers[[NSValue valueWithPointer:buffer.contents]] = buffer; 183 | return cudaSuccess; 184 | } 185 | 186 | // CUDA Event Management 187 | - (CUDAError)cudaEventCreate:(cudaEvent_t*)event { 188 | *event = (cudaEvent_t)[_device newEvent]; 189 | return cudaSuccess; 190 | } 191 | 192 | - (CUDAError)cudaEventRecord:(cudaEvent_t)event stream:(cudaStream_t)stream { 193 | id commandBuffer = (__bridge id)stream; 194 | [commandBuffer encodeWait:(__bridge id)event value:0]; 195 | return cudaSuccess; 196 | } 197 | 198 | - (CUDAError)cudaEventSynchronize:(cudaEvent_t)event { 199 | [(id)event notifyListener:nil 200 | atValue:0 201 | block:^(id event, uint64_t value){}]; 202 | return cudaSuccess; 203 | } 204 | 205 | // CUDA Stream Management 206 | - (CUDAError)cudaStreamCreate:(cudaStream_t*)stream { 207 | *stream = (cudaStream_t)CFBridgingRetain([_commandQueue commandBuffer]); 
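/* Caveat: an MTLCommandBuffer is a single-use object, so this mapping only approximates a CUDA stream for one round of work; a dedicated MTLCommandQueue per stream would be a closer analogue. */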
208 | return cudaSuccess; 209 | } 210 | 211 | - (CUDAError)cudaStreamSynchronize:(cudaStream_t)stream { 212 | id commandBuffer = (__bridge id)stream; 213 | [commandBuffer waitUntilCompleted]; 214 | return cudaSuccess; 215 | } 216 | 217 | // Device Synchronization 218 | - (CUDAError)cudaDeviceSynchronize { 219 | [_commandQueue insertDebugCaptureBoundary]; 220 | return cudaSuccess; 221 | } 222 | 223 | @end 224 | 225 | // Kernel Parameters 226 | @implementation KernelParameters 227 | 228 | - (instancetype)initWithProblemSize:(NSUInteger)problemSize 229 | batchSize:(NSUInteger)batchSize 230 | learningRate:(float)learningRate { 231 | self = [super init]; 232 | if (self) { 233 | _problemSize = problemSize; 234 | _batchSize = batchSize; 235 | _learningRate = learningRate; 236 | } 237 | return self; 238 | } 239 | 240 | - (id)asMetalBufferWithDevice:(id)device { 241 | return [device newBufferWithBytes:self 242 | length:sizeof(KernelParameters) 243 | options:MTLResourceStorageModeShared]; 244 | } 245 | 246 | @end 247 | 248 | // Header file for the above implementation 249 | @interface CUDAMetalDevice : NSObject 250 | 251 | // CUDA Memory Management 252 | - (CUDAError)cudaMalloc:(void**)ptr size:(size_t)size; 253 | - (CUDAError)cudaFree:(void*)ptr; 254 | - (CUDAError)cudaMemcpy:(void*)dst 255 | src:(const void*)src 256 | size:(size_t)size 257 | kind:(CUDAMemcpyKind)kind; 258 | 259 | // Kernel Management 260 | - (CUDAError)loadMetalLibraryWithURL:(NSURL*)url error:(NSError**)error; 261 | - (CUDAError)launchKernel:(NSString*)name 262 | gridDim:(MTLSize)gridDim 263 | blockDim:(MTLSize)blockDim 264 | arguments:(NSArray>*)arguments; 265 | 266 | // Event Management 267 | - (CUDAError)cudaEventCreate:(cudaEvent_t*)event; 268 | - (CUDAError)cudaEventRecord:(cudaEvent_t)event stream:(cudaStream_t)stream; 269 | - (CUDAError)cudaEventSynchronize:(cudaEvent_t)event; 270 | 271 | // Stream Management 272 | - (CUDAError)cudaStreamCreate:(cudaStream_t*)stream; 273 | - (CUDAError)cudaStreamSynchronize:(cudaStream_t)stream; 274 | 275 | // Device Synchronization 276 | - (CUDAError)cudaDeviceSynchronize; 277 | 278 | @end -------------------------------------------------------------------------------- /generator/swift_generator.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Set, Optional, Union 2 | from pathlib import Path 3 | import logging 4 | from threading import Lock 5 | 6 | from ..utils.error_handler import CudaTranslationError 7 | from ..utils.logger import get_logger 8 | from ..parser.ast_nodes import CUDAKernel 9 | 10 | logger = get_logger(__name__) 11 | 12 | class SwiftGenerator: 13 | """ 14 | Production-grade Swift code generator for Metal kernel integration. 15 | Handles host-side code generation with proper memory management and error handling. 
16 | """ 17 | 18 | def __init__(self): 19 | self._lock = Lock() 20 | self._cache: Dict[str, str] = {} 21 | 22 | # Metal-specific settings 23 | self.metal_settings = { 24 | 'max_buffers': 31, 25 | 'max_buffer_size': 256 * 1024 * 1024, # 256MB 26 | 'preferred_alignment': 256, 27 | 'max_command_buffers': 32 28 | } 29 | 30 | def generate_host_code(self, kernel: CUDAKernel, class_name: Optional[str] = None) -> str: 31 | """Generate Swift host code for Metal kernel execution.""" 32 | try: 33 | # Generate core components 34 | class_name = class_name or f"{kernel.name}Kernel" 35 | imports = self._generate_imports() 36 | class_def = self._generate_class_definition(class_name, kernel) 37 | buffer_management = self._generate_buffer_management(kernel) 38 | kernel_execution = self._generate_kernel_execution(kernel) 39 | error_handling = self._generate_error_handling() 40 | 41 | # Combine all components 42 | swift_code = f""" 43 | {imports} 44 | 45 | // MARK: - Metal Kernel Implementation 46 | {class_def} 47 | 48 | // MARK: - Properties 49 | private let device: MTLDevice 50 | private let commandQueue: MTLCommandQueue 51 | private let pipelineState: MTLComputePipelineState 52 | private var buffers: [String: MTLBuffer] = [:] 53 | 54 | // MARK: - Initialization 55 | init() throws {{ 56 | guard let device = MTLCreateSystemDefaultDevice() else {{ 57 | throw MetalError.deviceNotFound 58 | }} 59 | self.device = device 60 | 61 | guard let commandQueue = device.makeCommandQueue() else {{ 62 | throw MetalError.commandQueueCreationFailed 63 | }} 64 | self.commandQueue = commandQueue 65 | 66 | self.pipelineState = try Self.createPipelineState(device: device) 67 | }} 68 | 69 | // MARK: - Pipeline Setup 70 | private static func createPipelineState(device: MTLDevice) throws -> MTLComputePipelineState {{ 71 | guard let library = device.makeDefaultLibrary() else {{ 72 | throw MetalError.libraryCreationFailed 73 | }} 74 | 75 | guard let kernelFunction = library.makeFunction(name: "{kernel.name}") else {{ 76 | throw MetalError.functionNotFound 77 | }} 78 | 79 | do {{ 80 | return try device.makeComputePipelineState(function: kernelFunction) 81 | }} catch {{ 82 | throw MetalError.pipelineCreationFailed 83 | }} 84 | }} 85 | 86 | {buffer_management} 87 | 88 | // MARK: - Kernel Execution 89 | {kernel_execution} 90 | 91 | {error_handling} 92 | }} 93 | 94 | // MARK: - Extension for Async/Await Support 95 | extension {class_name} {{ 96 | /// Execute kernel with async/await support 97 | func executeAsync( 98 | {self._generate_parameter_list(kernel)} 99 | ) async throws {{ 100 | try await withCheckedThrowingContinuation {{ continuation in 101 | execute( 102 | {self._generate_argument_list(kernel)}, 103 | completion: {{ result in 104 | switch result {{ 105 | case .success: 106 | continuation.resume() 107 | case .failure(let error): 108 | continuation.resume(throwing: error) 109 | }} 110 | }} 111 | ) 112 | }} 113 | }} 114 | 115 | /// Execute kernel with completion handler 116 | func execute( 117 | {self._generate_parameter_list(kernel)}, 118 | completion: @escaping (Result) -> Void 119 | ) {{ 120 | do {{ 121 | // Validate input parameters 122 | try validateInputs({self._generate_validation_list(kernel)}) 123 | 124 | // Create command buffer and encoder 125 | guard let commandBuffer = commandQueue.makeCommandBuffer(), 126 | let encoder = commandBuffer.makeComputeCommandEncoder() else {{ 127 | throw MetalError.commandEncodingFailed 128 | }} 129 | 130 | // Configure encoder 131 | encoder.setComputePipelineState(pipelineState) 132 | 133 
| // Set buffers 134 | try setBuffers(encoder: encoder, {self._generate_buffer_list(kernel)}) 135 | 136 | // Calculate optimal thread configuration 137 | let threadGroupSize = MTLSize(width: {kernel.thread_config.block_size[0]}, 138 | height: {kernel.thread_config.block_size[1]}, 139 | depth: {kernel.thread_config.block_size[2]}) 140 | let gridSize = calculateGridSize(dataSize: dataSize, threadGroupSize: threadGroupSize) 141 | 142 | // Dispatch threads 143 | encoder.dispatchThreadgroups(gridSize, threadsPerThreadgroup: threadGroupSize) 144 | encoder.endEncoding() 145 | 146 | // Add completion handler 147 | commandBuffer.addCompletedHandler {{ buffer in 148 | if let error = buffer.error {{ 149 | completion(.failure(MetalError.executionFailed(error))) 150 | }} else {{ 151 | completion(.success(())) 152 | }} 153 | }} 154 | 155 | // Commit command buffer 156 | commandBuffer.commit() 157 | 158 | }} catch {{ 159 | completion(.failure(error)) 160 | }} 161 | }} 162 | 163 | // MARK: - Private Helper Methods 164 | private func validateInputs({self._generate_parameter_list(kernel)}) throws {{ 165 | // Implement input validation logic based on kernel requirements 166 | {self._generate_validation_code(kernel)} 167 | }} 168 | 169 | private func setBuffers( 170 | encoder: MTLComputeCommandEncoder, 171 | {self._generate_parameter_list(kernel)} 172 | ) throws {{ 173 | // Set buffers with proper error handling 174 | {self._generate_buffer_setup_code(kernel)} 175 | }} 176 | 177 | private func calculateGridSize(dataSize: Int, threadGroupSize: MTLSize) -> MTLSize {{ 178 | let w = (dataSize + threadGroupSize.width - 1) / threadGroupSize.width 179 | return MTLSizeMake(w, 1, 1) 180 | }} 181 | }} 182 | 183 | // MARK: - Error Types 184 | enum MetalError: LocalizedError {{ 185 | case deviceNotFound 186 | case libraryCreationFailed 187 | case functionNotFound 188 | case pipelineCreationFailed 189 | case commandQueueCreationFailed 190 | case commandEncodingFailed 191 | case invalidBufferSize 192 | case bufferAllocationFailed 193 | case executionFailed(Error) 194 | case invalidInputParameters(String) 195 | 196 | var errorDescription: String? 
{{ 197 | switch self {{ 198 | case .deviceNotFound: 199 | return "Metal device not found" 200 | case .libraryCreationFailed: 201 | return "Failed to create Metal library" 202 | case .functionNotFound: 203 | return "Metal kernel function not found" 204 | case .pipelineCreationFailed: 205 | return "Failed to create compute pipeline state" 206 | case .commandQueueCreationFailed: 207 | return "Failed to create command queue" 208 | case .commandEncodingFailed: 209 | return "Failed to create command encoder" 210 | case .invalidBufferSize: 211 | return "Invalid buffer size specified" 212 | case .bufferAllocationFailed: 213 | return "Failed to allocate Metal buffer" 214 | case .executionFailed(let error): 215 | return "Kernel execution failed: \\(error.localizedDescription)" 216 | case .invalidInputParameters(let message): 217 | return "Invalid input parameters: \\(message)" 218 | }} 219 | }} 220 | }} 221 | 222 | // MARK: - Buffer Management Extension 223 | private extension {class_name} {{ 224 | func createBuffer(from data: [T], options: MTLResourceOptions = .storageModeShared) throws -> MTLBuffer {{ 225 | let size = MemoryLayout.stride * data.count 226 | guard size > 0 else {{ 227 | throw MetalError.invalidBufferSize 228 | }} 229 | 230 | guard let buffer = device.makeBuffer(bytes: data, 231 | length: size, 232 | options: options) else {{ 233 | throw MetalError.bufferAllocationFailed 234 | }} 235 | 236 | return buffer 237 | }} 238 | 239 | func createBuffer(size: Int, options: MTLResourceOptions = .storageModeShared) throws -> MTLBuffer {{ 240 | guard size > 0 else {{ 241 | throw MetalError.invalidBufferSize 242 | }} 243 | 244 | guard let buffer = device.makeBuffer(length: size, 245 | options: options) else {{ 246 | throw MetalError.bufferAllocationFailed 247 | }} 248 | 249 | return buffer 250 | }} 251 | }} 252 | """ 253 | 254 | return swift_code 255 | 256 | except Exception as e: 257 | logger.error(f"Failed to generate Swift host code: {str(e)}") 258 | raise CudaTranslationError(f"Swift code generation failed: {str(e)}") 259 | 260 | def _generate_imports(self) -> str: 261 | """Generate required import statements.""" 262 | return """ 263 | import Metal 264 | import MetalKit 265 | import Foundation 266 | """ 267 | 268 | def _generate_class_definition(self, class_name: str, kernel: CUDAKernel) -> str: 269 | """Generate class definition with documentation.""" 270 | return f""" 271 | /// Metal kernel wrapper for {kernel.name} 272 | /// Provides type-safe interface for kernel execution with proper error handling 273 | final class {class_name} {{""" 274 | 275 | def _generate_parameter_list(self, kernel: CUDAKernel) -> str: 276 | """Generate parameter list for function signatures.""" 277 | params = [] 278 | for param in kernel.parameters: 279 | swift_type = self._cuda_type_to_swift(param.cuda_type) 280 | params.append(f"{param.name}: {swift_type}") 281 | return ", ".join(params) 282 | 283 | def _generate_validation_code(self, kernel: CUDAKernel) -> str: 284 | """Generate input validation code.""" 285 | validations = [] 286 | for param in kernel.parameters: 287 | if param.is_buffer: 288 | validations.append(f""" 289 | if {param.name}.count == 0 {{ 290 | throw MetalError.invalidInputParameters("Empty buffer for {param.name}") 291 | }}""") 292 | return "\n".join(validations) 293 | 294 | def _generate_buffer_setup_code(self, kernel: CUDAKernel) -> str: 295 | """Generate buffer setup code.""" 296 | setups = [] 297 | for idx, param in enumerate(kernel.parameters): 298 | if param.is_buffer: 299 | 
setups.append(f""" 300 | let {param.name}Buffer = try createBuffer(from: {param.name}) 301 | encoder.setBuffer({param.name}Buffer, offset: 0, index: {idx})""") 302 | return "\n".join(setups) 303 | 304 | def _cuda_type_to_swift(self, cuda_type: str) -> str: 305 | """Convert CUDA type to Swift type.""" 306 | type_mapping = { 307 | 'float': '[Float]', 308 | 'double': '[Double]', 309 | 'int': '[Int32]', 310 | 'unsigned int': '[UInt32]', 311 | 'long': '[Int64]', 312 | 'unsigned long': '[UInt64]', 313 | } 314 | return type_mapping.get(cuda_type, '[Float]') # Default to [Float] if type not found 315 | 316 | def cleanup(self): 317 | """Cleanup any resources.""" 318 | with self._lock: 319 | self._cache.clear() -------------------------------------------------------------------------------- /templates/swift/kernel_wrapper.swift: -------------------------------------------------------------------------------- 1 | import Metal 2 | import MetalKit 3 | 4 | // CUDA-like host wrapper for Metal GPU kernels 5 | class CUDAMetalDevice { 6 | // Metal objects 7 | private let device: MTLDevice 8 | private let commandQueue: MTLCommandQueue 9 | private var kernelPipelineStates: [String: MTLComputePipelineState] = [:] 10 | private var kernelFunctions: [String: MTLFunction] = [:] 11 | 12 | // Buffer management 13 | private var allocatedBuffers: [UnsafeMutableRawPointer: MTLBuffer] = [:] 14 | private var bufferSizes: [MTLBuffer: Int] = [:] 15 | 16 | // CUDA-like error handling 17 | enum CUDAError: Error { 18 | case deviceNotFound 19 | case kernelNotFound 20 | case outOfMemory 21 | case invalidValue 22 | case launchFailure 23 | } 24 | 25 | init() throws { 26 | guard let metalDevice = MTLCreateSystemDefaultDevice() else { 27 | throw CUDAError.deviceNotFound 28 | } 29 | self.device = metalDevice 30 | guard let queue = device.makeCommandQueue() else { 31 | throw CUDAError.deviceNotFound 32 | } 33 | self.commandQueue = queue 34 | } 35 | 36 | // CUDA Memory Management 37 | func cudaMalloc(_ size: Int) throws -> UnsafeMutablePointer { 38 | guard let buffer = device.makeBuffer(length: size, options: .storageModeShared) else { 39 | throw CUDAError.outOfMemory 40 | } 41 | 42 | let pointer = UnsafeMutableRawPointer(buffer.contents()) 43 | allocatedBuffers[pointer] = buffer 44 | bufferSizes[buffer] = size 45 | 46 | return pointer.assumingMemoryBound(to: T.self) 47 | } 48 | 49 | func cudaFree(_ pointer: UnsafeMutableRawPointer) { 50 | allocatedBuffers.removeValue(forKey: pointer) 51 | } 52 | 53 | func cudaMemcpy(_ dst: UnsafeMutablePointer, 54 | _ src: UnsafePointer, 55 | _ size: Int, 56 | _ direction: CudaMemcpyKind) throws { 57 | switch direction { 58 | case .hostToDevice: 59 | guard let buffer = allocatedBuffers[UnsafeMutableRawPointer(mutating: dst)] else { 60 | throw CUDAError.invalidValue 61 | } 62 | memcpy(buffer.contents(), src, size) 63 | 64 | case .deviceToHost: 65 | guard let buffer = allocatedBuffers[UnsafeMutableRawPointer(mutating: src)] else { 66 | throw CUDAError.invalidValue 67 | } 68 | memcpy(dst, buffer.contents(), size) 69 | 70 | case .deviceToDevice: 71 | guard let srcBuffer = allocatedBuffers[UnsafeMutableRawPointer(mutating: src)], 72 | let dstBuffer = allocatedBuffers[UnsafeMutableRawPointer(mutating: dst)] else { 73 | throw CUDAError.invalidValue 74 | } 75 | let commandBuffer = commandQueue.makeCommandBuffer() 76 | let blitEncoder = commandBuffer?.makeBlitCommandEncoder() 77 | blitEncoder?.copy(from: srcBuffer, sourceOffset: 0, 78 | to: dstBuffer, destinationOffset: 0, 79 | size: size) 80 | 
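/* Note: unlike the Objective-C wrapper, this path commits without waitUntilCompleted(), so the device-to-device copy is asynchronous; callers needing cudaMemcpy's blocking semantics should wait on the command buffer after commit(). */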
103 |     // Kernel Management
104 |     func loadMetalLibrary(url: URL) throws {
105 |         guard let library = try? device.makeLibrary(URL: url) else {
106 |             throw CUDAError.kernelNotFound
107 |         }
108 | 
109 |         // Load all kernel functions
110 |         for functionName in library.functionNames {
111 |             guard let function = library.makeFunction(name: functionName) else { continue }
112 |             kernelFunctions[functionName] = function
113 | 
114 |             // Create pipeline state
115 |             if let pipelineState = try? device.makeComputePipelineState(function: function) {
116 |                 kernelPipelineStates[functionName] = pipelineState
117 |             }
118 |         }
119 |     }
120 | 
121 |     // CUDA Kernel Launch
122 |     func launchKernel(name: String,
123 |                       gridSize: (Int, Int, Int),
124 |                       blockSize: (Int, Int, Int),
125 |                       arguments: [MTLBuffer],
126 |                       completion: ((Error?) -> Void)? = nil) throws {
127 |         guard let pipelineState = kernelPipelineStates[name] else {
128 |             throw CUDAError.kernelNotFound
129 |         }
130 | 
131 |         // Create command buffer and encoder
132 |         guard let commandBuffer = commandQueue.makeCommandBuffer(),
133 |               let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
134 |             throw CUDAError.launchFailure
135 |         }
136 | 
137 |         computeEncoder.setComputePipelineState(pipelineState)
138 | 
139 |         // Set buffers
140 |         for (index, buffer) in arguments.enumerated() {
141 |             computeEncoder.setBuffer(buffer, offset: 0, index: index)
142 |         }
143 | 
144 |         // Convert sizes to Metal (gridSize counts threadgroups, as a CUDA grid counts blocks)
145 |         let threadgroupsPerGrid = MTLSize(width: gridSize.0, height: gridSize.1, depth: gridSize.2)
146 |         let threadsPerThreadgroup = MTLSize(width: blockSize.0, height: blockSize.1, depth: blockSize.2)
147 | 
148 |         // Dispatch
149 |         computeEncoder.dispatchThreadgroups(threadgroupsPerGrid,
150 |                                             threadsPerThreadgroup: threadsPerThreadgroup)
151 | 
152 |         computeEncoder.endEncoding()
153 | 
154 |         if let completion = completion {
155 |             commandBuffer.addCompletedHandler { _ in
156 |                 completion(nil)
157 |             }
158 |         }
159 | 
160 |         commandBuffer.commit()
161 |     }
162 | 
163 |     // CUDA Synchronization: commit an empty command buffer and block until the
164 |     // queue drains, mirroring cudaDeviceSynchronize semantics.
165 |     func cudaDeviceSynchronize() {
166 |         guard let commandBuffer = commandQueue.makeCommandBuffer() else { return }
167 |         commandBuffer.commit()
168 |         commandBuffer.waitUntilCompleted()
169 |     }
170 | 
171 |     enum CudaMemcpyKind {
172 |         case hostToDevice
173 |         case deviceToHost
174 |         case deviceToDevice
175 |     }
176 | }
177 | 
178 | // Example usage extension
179 | extension CUDAMetalDevice {
180 |     func createBuffer<T>(_ data: [T]) throws -> MTLBuffer {
181 |         let size = MemoryLayout<T>.stride * data.count
182 |         guard let buffer = device.makeBuffer(length: size, options: .storageModeShared) else {
183 |             throw CUDAError.outOfMemory
184 |         }
185 |         memcpy(buffer.contents(), data, size)
186 |         return buffer
187 |     }
188 | }
189 | 
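190 | // Launch sketch (illustrative addition): assumes a compiled library containing a
191 | // kernel named "vector_add" that consumes three float buffers.
192 | extension CUDAMetalDevice {
193 |     func exampleVectorAddLaunch(libraryURL: URL, a: [Float], b: [Float]) throws -> MTLBuffer {
194 |         try loadMetalLibrary(url: libraryURL)
195 | 
196 |         let bufA = try createBuffer(a)
197 |         let bufB = try createBuffer(b)
198 |         let bufOut = try createBuffer([Float](repeating: 0, count: a.count))
199 | 
200 |         // One thread per element: 256-wide blocks, grid rounded up as in CUDA.
201 |         let blockWidth = 256
202 |         let gridWidth = (a.count + blockWidth - 1) / blockWidth
203 |         try launchKernel(name: "vector_add",
204 |                          gridSize: (gridWidth, 1, 1),
205 |                          blockSize: (blockWidth, 1, 1),
206 |                          arguments: [bufA, bufB, bufOut])
207 |         cudaDeviceSynchronize()
208 |         return bufOut
209 |     }
210 | }
211 | 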
212 | // Advanced Memory Management
213 | extension CUDAMetalDevice {
214 |     // 2D Memory Allocation
215 |     func cudaMallocPitch<T>(width: Int, height: Int) throws -> (UnsafeMutablePointer<T>, Int) {
216 |         let pitch = (width * MemoryLayout<T>.stride + 255) & ~255  // 256-byte alignment
217 |         let size = pitch * height
218 | 
219 |         guard let buffer = device.makeBuffer(length: size, options: .storageModeShared) else {
220 |             throw CUDAError.outOfMemory
221 |         }
222 | 
223 |         let pointer = buffer.contents().assumingMemoryBound(to: T.self)
224 |         allocatedBuffers[UnsafeMutableRawPointer(pointer)] = buffer
225 | 
226 |         return (pointer, pitch)
227 |     }
228 | 
229 |     // Array Memory Management
230 |     func cudaMallocArray<T>(_ shape: [Int]) throws -> UnsafeMutablePointer<T> {
231 |         let size = shape.reduce(1, *) * MemoryLayout<T>.stride
232 |         return try cudaMalloc(size)
233 |     }
234 | 
235 |     // Managed Memory
236 |     func cudaMallocManaged<T>(_ size: Int) throws -> UnsafeMutablePointer<T> {
237 |         guard let buffer = device.makeBuffer(length: size,
238 |                                              options: [.storageModeShared, .hazardTrackingModeTracked]) else {
239 |             throw CUDAError.outOfMemory
240 |         }
241 | 
242 |         let pointer = buffer.contents().assumingMemoryBound(to: T.self)
243 |         allocatedBuffers[UnsafeMutableRawPointer(pointer)] = buffer
244 | 
245 |         return pointer
246 |     }
247 | 
248 |     // Memory Prefetch
249 |     func cudaMemPrefetchAsync<T>(_ pointer: UnsafeMutablePointer<T>,
250 |                                  count: Int,
251 |                                  location: MemoryLocation) throws {
252 |         guard let buffer = allocatedBuffers[UnsafeMutableRawPointer(pointer)] else {
253 |             throw CUDAError.invalidValue
254 |         }
255 | 
256 |         let commandBuffer = commandQueue.makeCommandBuffer()
257 |         let blitEncoder = commandBuffer?.makeBlitCommandEncoder()
258 | 
259 |         switch location {
260 |         case .device:
261 |             blitEncoder?.synchronize(resource: buffer)
262 |         case .host:
263 |             buffer.didModifyRange(0..<(count * MemoryLayout<T>.stride))
264 |         }
265 | 
266 |         blitEncoder?.endEncoding()
267 |         commandBuffer?.commit()
268 |     }
269 | }
270 | 
271 | // Kernel Profiling
272 | extension CUDAMetalDevice {
273 |     func profileKernel(name: String,
274 |                        gridSize: (Int, Int, Int),
275 |                        blockSize: (Int, Int, Int),
276 |                        arguments: [MTLBuffer]) throws -> KernelProfile {
277 |         guard let pipelineState = kernelPipelineStates[name] else {
278 |             throw CUDAError.kernelNotFound
279 |         }
280 | 
281 |         let commandBuffer = commandQueue.makeCommandBuffer()
282 | 
283 |         let computeEncoder = commandBuffer?.makeComputeCommandEncoder()
284 |         computeEncoder?.setComputePipelineState(pipelineState)
285 | 
286 |         // Set arguments
287 |         for (index, buffer) in arguments.enumerated() {
288 |             computeEncoder?.setBuffer(buffer, offset: 0, index: index)
289 |         }
290 | 
291 |         let threadgroupsPerGrid = MTLSize(width: gridSize.0,
292 |                                           height: gridSize.1,
293 |                                           depth: gridSize.2)
294 | 
295 |         let threadsPerThreadgroup = MTLSize(width: blockSize.0,
296 |                                             height: blockSize.1,
297 |                                             depth: blockSize.2)
298 | 
299 |         computeEncoder?.dispatchThreadgroups(threadgroupsPerGrid,
300 |                                              threadsPerThreadgroup: threadsPerThreadgroup)
301 | 
302 |         computeEncoder?.endEncoding()
303 | 
304 |         var profile = KernelProfile()
305 | 
306 |         commandBuffer?.addCompletedHandler { buffer in
307 |             profile.executionTime = buffer.gpuEndTime - buffer.gpuStartTime
308 |             profile.threadgroups = gridSize.0 * gridSize.1 * gridSize.2
309 |             profile.threadsPerThreadgroup = blockSize.0 * blockSize.1 * blockSize.2
310 |         }
311 | 
312 |         commandBuffer?.commit()
313 |         commandBuffer?.waitUntilCompleted()
314 | 
315 |         return profile
316 |     }
317 | }
318 | 
319 | struct KernelProfile {
320 |     var executionTime: Double = 0
321 |     var threadgroups: Int = 0
322 |     var threadsPerThreadgroup: Int = 0
323 | }
324 | 
325 | enum MemoryLocation {
326 |     case device
327 |     case host
328 | }
--------------------------------------------------------------------------------
/optimizer/unified_optimizer_metal.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Optional, Tuple, Union, Set, Any
2 | from dataclasses import dataclass
3 | from enum import Enum
4 | import logging
5 | from concurrent.futures import ThreadPoolExecutor
6 | from threading import Lock
7 | 
8 | from ..utils.error_handler import CudaTranslationError
9 | from ..utils.logger import get_logger
10 | from ..core.parser.ast_nodes import (
11 |     CUDANode, CUDAKernel, CUDAThreadIdx, CUDABlockIdx
12 | )
13 | from ..utils.metal_math_functions import MetalMathFunction
14 | from ..utils.cuda_to_metal_type_mapping import map_cuda_type_to_metal
15 | 
16 | logger = get_logger(__name__)
17 | 
18 | @dataclass
19 | class OptimizationMetrics:
20 |     compute_intensity: float = 0.0
21 |     memory_pressure: float = 0.0
22 |     thread_divergence: float = 0.0
23 |     bank_conflicts: int = 0
24 |     simd_efficiency: float = 0.0
25 | 
register_pressure: int = 0 26 | 27 | class OptimizationType(Enum): 28 | MEMORY_COALESCING = "memory_coalescing" 29 | SIMD_GROUP = "simd_group" 30 | THREADGROUP_MEMORY = "threadgroup_memory" 31 | TEXTURE_SAMPLING = "texture_sampling" 32 | BARRIER_REDUCTION = "barrier_reduction" 33 | ARITHMETIC = "arithmetic" 34 | LOOP_UNROLLING = "loop_unrolling" 35 | VECTORIZATION = "vectorization" 36 | 37 | class UnifiedMetalOptimizer: 38 | """ 39 | Unified Metal optimization system following NVIDIA patterns. 40 | """ 41 | def __init__(self): 42 | # Constants following NVIDIA GPU patterns 43 | self.WARP_SIZE = 32 44 | self.MAX_THREADS_PER_BLOCK = 1024 45 | self.MAX_BLOCKS_PER_GRID = (2**31-1, 65535, 65535) 46 | self.MAX_SHARED_MEMORY = 48 * 1024 # 48KB 47 | self.L1_CACHE_LINE_SIZE = 128 48 | self.VECTOR_SIZES = {2, 4, 8, 16} 49 | 50 | # Metal-specific limits 51 | self.metal_limits = { 52 | 'max_threads_per_group': 1024, 53 | 'max_threadgroups': (2048, 2048, 2048), 54 | 'shared_memory_size': 32768, # 32KB 55 | 'simd_width': 32 56 | } 57 | 58 | # State management 59 | self.lock = Lock() 60 | self.thread_pool = ThreadPoolExecutor(max_workers=4) 61 | self._optimization_cache: Dict[str, Any] = {} 62 | self.metrics = OptimizationMetrics() 63 | self.applied_optimizations: Set[OptimizationType] = set() 64 | 65 | def optimize(self, kernel: CUDAKernel) -> CUDAKernel: 66 | """ 67 | Main optimization entry point following NVIDIA's optimization hierarchy. 68 | """ 69 | try: 70 | with self.lock: 71 | # Step 1: Analyze kernel characteristics 72 | analysis = self._analyze_kernel(kernel) 73 | 74 | # Step 2: Memory optimizations (highest priority) 75 | kernel = self._optimize_memory_access(kernel, analysis) 76 | kernel = self._optimize_shared_memory(kernel, analysis) 77 | kernel = self._optimize_texture_memory(kernel, analysis) 78 | 79 | # Step 3: Thread hierarchy optimizations 80 | kernel = self._optimize_thread_configuration(kernel, analysis) 81 | kernel = self._optimize_simd_groups(kernel, analysis) 82 | 83 | # Step 4: Arithmetic optimizations 84 | kernel = self._optimize_math_operations(kernel) 85 | kernel = self._optimize_vectorization(kernel) 86 | 87 | # Step 5: Control flow optimizations 88 | kernel = self._optimize_barriers(kernel) 89 | kernel = self._optimize_divergent_code(kernel) 90 | 91 | # Update metrics 92 | self._update_metrics(kernel, analysis) 93 | 94 | return kernel 95 | 96 | except Exception as e: 97 | logger.error(f"Optimization failed: {str(e)}") 98 | raise CudaTranslationError(f"Optimization failed: {str(e)}") 99 | 100 | def _analyze_kernel(self, kernel: CUDAKernel) -> Dict[str, Any]: 101 | """ 102 | Comprehensive kernel analysis following NVIDIA profiling patterns. 
103 | """ 104 | analysis = { 105 | 'memory_patterns': self._analyze_memory_patterns(kernel), 106 | 'thread_hierarchy': self._analyze_thread_hierarchy(kernel), 107 | 'compute_intensity': self._calculate_compute_intensity(kernel), 108 | 'register_pressure': self._estimate_register_pressure(kernel), 109 | 'shared_memory_usage': self._analyze_shared_memory_usage(kernel), 110 | 'thread_divergence': self._analyze_thread_divergence(kernel), 111 | 'bank_conflicts': self._detect_bank_conflicts(kernel), 112 | 'optimization_opportunities': self._identify_optimization_opportunities(kernel) 113 | } 114 | 115 | # Cache analysis results 116 | self._optimization_cache[kernel.name] = analysis 117 | return analysis 118 | 119 | def _optimize_memory_access(self, kernel: CUDAKernel, analysis: Dict[str, Any]) -> CUDAKernel: 120 | """ 121 | Memory access optimization following NVIDIA coalescing patterns. 122 | """ 123 | memory_patterns = analysis['memory_patterns'] 124 | 125 | # Global memory coalescing 126 | if memory_patterns.get('uncoalesced_accesses'): 127 | kernel = self._apply_memory_coalescing(kernel, memory_patterns['uncoalesced_accesses']) 128 | self.applied_optimizations.add(OptimizationType.MEMORY_COALESCING) 129 | 130 | # Shared memory bank conflict resolution 131 | if memory_patterns.get('bank_conflicts'): 132 | kernel = self._resolve_bank_conflicts(kernel, memory_patterns['bank_conflicts']) 133 | self.applied_optimizations.add(OptimizationType.THREADGROUP_MEMORY) 134 | 135 | return kernel 136 | 137 | def _optimize_thread_configuration(self, kernel: CUDAKernel, analysis: Dict[str, Any]) -> CUDAKernel: 138 | """ 139 | Thread configuration optimization following NVIDIA occupancy patterns. 140 | """ 141 | thread_hierarchy = analysis['thread_hierarchy'] 142 | 143 | # Calculate optimal thread block size 144 | optimal_block_size = self._calculate_optimal_block_size( 145 | thread_hierarchy['current_block_size'], 146 | analysis['register_pressure'], 147 | analysis['shared_memory_usage'] 148 | ) 149 | 150 | # Adjust grid size based on block size 151 | optimal_grid_size = self._calculate_optimal_grid_size( 152 | thread_hierarchy['total_threads_needed'], 153 | optimal_block_size 154 | ) 155 | 156 | # Update kernel configuration 157 | kernel.thread_config.block_size = optimal_block_size 158 | kernel.thread_config.grid_size = optimal_grid_size 159 | 160 | return kernel 161 | 162 | def _optimize_simd_groups(self, kernel: CUDAKernel, analysis: Dict[str, Any]) -> CUDAKernel: 163 | """ 164 | SIMD group optimization following NVIDIA warp optimization patterns. 165 | """ 166 | opportunities = analysis['optimization_opportunities'] 167 | 168 | if opportunities.get('simd_operations'): 169 | # Convert appropriate operations to SIMD 170 | kernel = self._convert_to_simd_operations(kernel, opportunities['simd_operations']) 171 | self.applied_optimizations.add(OptimizationType.SIMD_GROUP) 172 | 173 | # Optimize SIMD group synchronization 174 | if opportunities.get('sync_points'): 175 | kernel = self._optimize_simd_sync(kernel, opportunities['sync_points']) 176 | 177 | return kernel 178 | 179 | def _optimize_barriers(self, kernel: CUDAKernel) -> CUDAKernel: 180 | """ 181 | Barrier optimization following NVIDIA synchronization patterns. 
182 | """ 183 | sync_points = self._find_sync_points(kernel) 184 | 185 | optimized_sync_points = [] 186 | for sync in sync_points: 187 | if self._is_barrier_necessary(sync, kernel): 188 | optimized_sync_points.append(self._optimize_barrier_type(sync)) 189 | 190 | kernel = self._replace_sync_points(kernel, optimized_sync_points) 191 | self.applied_optimizations.add(OptimizationType.BARRIER_REDUCTION) 192 | 193 | return kernel 194 | 195 | def _optimize_math_operations(self, kernel: CUDAKernel) -> CUDAKernel: 196 | """ 197 | Math operation optimization following NVIDIA intrinsics patterns. 198 | """ 199 | def optimize_node(node: CUDANode) -> CUDANode: 200 | if isinstance(node, CUDAKernel): 201 | # Optimize math function calls 202 | node = self._optimize_math_functions(node) 203 | 204 | # Apply fast math where appropriate 205 | node = self._apply_fast_math(node) 206 | 207 | # Optimize compound operations 208 | node = self._optimize_compound_operations(node) 209 | 210 | self.applied_optimizations.add(OptimizationType.ARITHMETIC) 211 | 212 | return node 213 | 214 | return self._traverse_and_transform(kernel, optimize_node) 215 | 216 | def _optimize_vectorization(self, kernel: CUDAKernel) -> CUDAKernel: 217 | """ 218 | Vectorization optimization following NVIDIA vectorization patterns. 219 | """ 220 | vectorizable_ops = self._find_vectorizable_operations(kernel) 221 | 222 | if vectorizable_ops: 223 | for op in vectorizable_ops: 224 | vector_width = self._determine_vector_width(op) 225 | if vector_width: 226 | kernel = self._apply_vectorization(kernel, op, vector_width) 227 | self.applied_optimizations.add(OptimizationType.VECTORIZATION) 228 | 229 | return kernel 230 | 231 | def _update_metrics(self, kernel: CUDAKernel, analysis: Dict[str, Any]) -> None: 232 | """ 233 | Update optimization metrics following NVIDIA profiling patterns. 234 | """ 235 | with self.lock: 236 | self.metrics.compute_intensity = analysis['compute_intensity'] 237 | self.metrics.memory_pressure = analysis['memory_patterns'].get('pressure', 0.0) 238 | self.metrics.thread_divergence = len(analysis['thread_divergence']) 239 | self.metrics.bank_conflicts = len(analysis['bank_conflicts']) 240 | self.metrics.simd_efficiency = self._calculate_simd_efficiency(kernel) 241 | self.metrics.register_pressure = analysis['register_pressure'] 242 | 243 | def get_optimization_report(self) -> Dict[str, Any]: 244 | """ 245 | Generate comprehensive optimization report. 
246 | """ 247 | return { 248 | 'applied_optimizations': [opt.value for opt in self.applied_optimizations], 249 | 'metrics': { 250 | 'compute_intensity': self.metrics.compute_intensity, 251 | 'memory_pressure': self.metrics.memory_pressure, 252 | 'thread_divergence': self.metrics.thread_divergence, 253 | 'bank_conflicts': self.metrics.bank_conflicts, 254 | 'simd_efficiency': self.metrics.simd_efficiency, 255 | 'register_pressure': self.metrics.register_pressure 256 | }, 257 | 'recommendations': self._generate_optimization_recommendations(), 258 | 'metal_specific': { 259 | 'threadgroup_size': self._get_optimal_threadgroup_size(), 260 | 'memory_layout': self._get_optimal_memory_layout(), 261 | 'barrier_usage': self._get_barrier_statistics() 262 | } 263 | } 264 | 265 | def _calculate_simd_efficiency(self, kernel: CUDAKernel) -> float: 266 | """Calculate SIMD efficiency based on thread utilization.""" 267 | active_threads = self._count_active_threads(kernel) 268 | total_threads = kernel.thread_config.block_size[0] * \ 269 | kernel.thread_config.block_size[1] * \ 270 | kernel.thread_config.block_size[2] 271 | 272 | return active_threads / (total_threads * self.metal_limits['simd_width']) 273 | 274 | def _generate_optimization_recommendations(self) -> List[Dict[str, str]]: 275 | """Generate optimization recommendations based on metrics.""" 276 | recommendations = [] 277 | 278 | if self.metrics.memory_pressure > 0.8: 279 | recommendations.append({ 280 | 'type': 'memory_access', 281 | 'message': 'High memory pressure detected. Consider using threadgroup memory.' 282 | }) 283 | 284 | if self.metrics.thread_divergence > 0.2: 285 | recommendations.append({ 286 | 'type': 'divergence', 287 | 'message': 'Significant thread divergence detected. Consider restructuring conditionals.' 288 | }) 289 | 290 | if self.metrics.simd_efficiency < 0.7: 291 | recommendations.append({ 292 | 'type': 'simd_usage', 293 | 'message': 'Low SIMD efficiency. Consider adjusting thread group size.' 294 | }) 295 | 296 | return recommendations 297 | 298 | def cleanup(self): 299 | """Cleanup resources.""" 300 | self.thread_pool.shutdown() 301 | self._optimization_cache.clear() -------------------------------------------------------------------------------- /generator/msl_generator.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Set, Optional, Union, Any 2 | from pathlib import Path 3 | import logging 4 | from concurrent.futures import ThreadPoolExecutor 5 | from threading import Lock 6 | 7 | from ..utils.error_handler import CudaTranslationError 8 | from ..utils.logger import get_logger 9 | from ..utils.metal_equivalents import get_metal_equivalent 10 | from ..utils.mapping_tables import MetalMappingRegistry 11 | from ..core.parser.ast_nodes import ( 12 | CUDAKernel, CUDANode, CUDAType, CUDAQualifier 13 | ) 14 | 15 | logger = get_logger(__name__) 16 | 17 | class MetalShaderGenerator: 18 | """ 19 | Production-ready Metal shader generator with comprehensive optimization capabilities. 20 | Thread-safe implementation for parallel shader generation. 
21 | """ 22 | 23 | def __init__(self): 24 | self.mapping_registry = MetalMappingRegistry() 25 | self._lock = Lock() 26 | self._shader_cache: Dict[str, str] = {} 27 | self._function_registry: Dict[str, Dict[str, Any]] = {} 28 | self.executor = ThreadPoolExecutor(max_workers=4) 29 | 30 | # Initialize optimization flags 31 | self.optimization_flags = { 32 | 'vectorize': True, 33 | 'unroll_loops': True, 34 | 'simd_groups': True, 35 | 'memory_coalescing': True, 36 | 'constant_folding': True, 37 | 'barrier_optimization': True 38 | } 39 | 40 | # Metal-specific constraints 41 | self.METAL_LIMITS = { 42 | 'max_threads_per_group': 1024, 43 | 'max_total_threadgroup_memory': 32768, # 32KB 44 | 'simd_width': 32, 45 | 'max_buffers': 31, 46 | 'max_textures': 128 47 | } 48 | 49 | def generate_kernel(self, kernel: CUDAKernel, optimization_level: int = 2) -> str: 50 | """ 51 | Generate optimized Metal kernel from CUDA kernel. 52 | 53 | Args: 54 | kernel: CUDA kernel AST node 55 | optimization_level: 0-3, higher means more aggressive optimization 56 | 57 | Returns: 58 | Optimized Metal shader code 59 | 60 | Raises: 61 | CudaTranslationError: If translation fails 62 | """ 63 | try: 64 | # Check cache first 65 | cache_key = f"{kernel.name}_{optimization_level}" 66 | with self._lock: 67 | if cache_key in self._shader_cache: 68 | return self._shader_cache[cache_key] 69 | 70 | # Validate kernel constraints 71 | self._validate_kernel(kernel) 72 | 73 | # Generate shader components 74 | signature = self._generate_kernel_signature(kernel) 75 | declarations = self._generate_declarations(kernel) 76 | body = self._generate_kernel_body(kernel, optimization_level) 77 | 78 | # Combine and optimize 79 | shader_code = self._optimize_shader( 80 | f"{signature}\n{{\n{declarations}\n{body}\n}}\n", 81 | optimization_level 82 | ) 83 | 84 | # Cache result 85 | with self._lock: 86 | self._shader_cache[cache_key] = shader_code 87 | 88 | return shader_code 89 | 90 | except Exception as e: 91 | logger.error(f"Failed to generate Metal shader for kernel {kernel.name}: {str(e)}") 92 | raise CudaTranslationError(f"Shader generation failed: {str(e)}") 93 | 94 | def _validate_kernel(self, kernel: CUDAKernel) -> None: 95 | """Validate kernel against Metal constraints.""" 96 | # Check thread dimensions 97 | thread_count = kernel.thread_count 98 | if thread_count > self.METAL_LIMITS['max_threads_per_group']: 99 | raise CudaTranslationError( 100 | f"Thread count {thread_count} exceeds Metal limit of {self.METAL_LIMITS['max_threads_per_group']}" 101 | ) 102 | 103 | # Check shared memory usage 104 | shared_mem = kernel.shared_memory_size 105 | if shared_mem > self.METAL_LIMITS['max_total_threadgroup_memory']: 106 | raise CudaTranslationError( 107 | f"Shared memory usage {shared_mem} exceeds Metal limit of {self.METAL_LIMITS['max_total_threadgroup_memory']}" 108 | ) 109 | 110 | # Validate buffer counts 111 | buffer_count = len(kernel.parameters) 112 | if buffer_count > self.METAL_LIMITS['max_buffers']: 113 | raise CudaTranslationError( 114 | f"Buffer count {buffer_count} exceeds Metal limit of {self.METAL_LIMITS['max_buffers']}" 115 | ) 116 | 117 | def _generate_kernel_signature(self, kernel: CUDAKernel) -> str: 118 | """Generate Metal kernel signature with proper attributes.""" 119 | params = [] 120 | for idx, param in enumerate(kernel.parameters): 121 | metal_type = self.mapping_registry.get_metal_type(param.cuda_type) 122 | if not metal_type: 123 | raise CudaTranslationError(f"Unsupported type: {param.cuda_type}") 124 | 125 | # Determine proper 
parameter attributes
126 |             if param.is_buffer:
127 |                 qualifier = "device" if not param.is_readonly else "constant"
128 |                 params.append(f"{qualifier} {metal_type.name}* {param.name} [[buffer({idx})]]")
129 |             else:
130 |                 params.append(f"constant {metal_type.name}& {param.name} [[buffer({idx})]]")
131 | 
132 |         # Add threadgroup attributes
133 |         thread_attrs = [
134 |             "uint3 thread_position_in_grid [[thread_position_in_grid]]",
135 |             "uint3 threadgroup_position [[threadgroup_position_in_grid]]",
136 |             "uint3 threads_per_threadgroup [[threads_per_threadgroup]]"
137 |         ]
138 | 
139 |         return "kernel void " + kernel.name + "(\n    " + ",\n    ".join(params + thread_attrs) + "\n)"
140 | 
141 |     def _generate_declarations(self, kernel: CUDAKernel) -> str:
142 |         """Generate Metal declarations including threadgroup memory."""
143 |         declarations = []
144 | 
145 |         # Add shared memory declarations
146 |         for shared_var in kernel.shared_memory:
147 |             metal_type = self.mapping_registry.get_metal_type(shared_var.cuda_type)
148 |             if not metal_type:
149 |                 raise CudaTranslationError(f"Unsupported shared memory type: {shared_var.cuda_type}")
150 | 
151 |             declarations.append(
152 |                 f"    threadgroup {metal_type.name} {shared_var.name}[{shared_var.size}];"
153 |             )
154 | 
155 |         # Add local variable declarations
156 |         for local_var in kernel.local_variables:
157 |             metal_type = self.mapping_registry.get_metal_type(local_var.cuda_type)
158 |             if not metal_type:
159 |                 raise CudaTranslationError(f"Unsupported local variable type: {local_var.cuda_type}")
160 | 
161 |             declarations.append(
162 |                 f"    thread {metal_type.name} {local_var.name};"
163 |             )
164 | 
165 |         return "\n".join(declarations)
166 | 
167 |     def _generate_kernel_body(self, kernel: CUDAKernel, optimization_level: int) -> str:
168 |         """Generate optimized kernel body code."""
169 |         # Apply pre-processing optimizations
170 |         optimized_nodes = self._optimize_nodes(kernel.body, optimization_level)
171 | 
172 |         # Generate code for each node
173 |         body_code = []
174 |         for node in optimized_nodes:
175 |             try:
176 |                 node_code = self._generate_node_code(node)
177 |                 if node_code:
178 |                     body_code.extend(f"    {line}" for line in node_code.split('\n'))
179 |             except Exception as e:
180 |                 logger.error(f"Failed to generate code for node: {str(e)}")
181 |                 raise CudaTranslationError(f"Code generation failed for node: {str(e)}")
182 | 
183 |         return "\n".join(body_code)
184 | 
185 |     def _optimize_nodes(self, nodes: List[CUDANode], optimization_level: int) -> List[CUDANode]:
186 |         """Apply optimization passes to AST nodes."""
187 |         if optimization_level == 0:
188 |             return nodes
189 | 
190 |         optimizations = [
191 |             self._optimize_memory_access,
192 |             self._optimize_compute_intensity,
193 |             self._optimize_control_flow,
194 |             self._optimize_thread_divergence
195 |         ]
196 | 
197 |         optimized = nodes
198 |         for optimization in optimizations:
199 |             if optimization_level >= 2:
200 |                 optimized = optimization(optimized)
201 | 
202 |         return optimized
203 | 
204 |     def _optimize_shader(self, shader_code: str, optimization_level: int) -> str:
205 |         """Apply final optimization passes to generated shader code."""
206 |         if optimization_level == 0:
207 |             return shader_code
208 | 
209 |         # Apply progressive optimizations
210 |         if optimization_level >= 1:
211 |             shader_code = self._optimize_register_usage(shader_code)
212 |             shader_code = self._optimize_memory_barriers(shader_code)
213 | 
214 |         if optimization_level >= 2:
215 |             shader_code = self._optimize_simd_usage(shader_code)
216 |             shader_code = self._optimize_memory_coalescing(shader_code)
217 | 
218 |         if optimization_level >= 3:
219 |             shader_code = self._optimize_aggressive(shader_code)
220 | 
221 |         return shader_code
222 | 
223 |     def _optimize_register_usage(self, code: str) -> str:
224 |         """Optimize register allocation and usage."""
225 |         # Implement register optimization logic
226 |         return code
227 | 
228 |     def _optimize_memory_barriers(self, code: str) -> str:
229 |         """Optimize memory barrier placement."""
230 |         # Implement barrier optimization logic
231 |         return code
232 | 
233 |     def _optimize_simd_usage(self, code: str) -> str:
234 |         """Optimize SIMD group usage."""
235 |         # Implement SIMD optimization logic
236 |         return code
237 | 
238 |     def _optimize_memory_coalescing(self, code: str) -> str:
239 |         """Optimize memory access patterns."""
240 |         # Implement memory coalescing logic
241 |         return code
242 | 
243 |     def _optimize_aggressive(self, code: str) -> str:
244 |         """Apply aggressive optimizations."""
245 |         # Implement aggressive optimization logic
246 |         return code
247 | 
248 |     def cleanup(self):
249 |         """Cleanup resources."""
250 |         self.executor.shutdown()
251 |         with self._lock:
252 |             self._shader_cache.clear()
253 |             self._function_registry.clear()
254 | 
255 | # Additional helper classes for specific generation tasks
256 | 
257 | class MetalHeaderGenerator:
258 |     """Generates Metal shader headers and type definitions."""
259 | 
260 |     def __init__(self, mapping_registry: MetalMappingRegistry):
261 |         self.mapping_registry = mapping_registry
262 | 
263 |     def generate_header(self, required_types: Set[str]) -> str:
264 |         """Generate Metal header with necessary type definitions."""
265 |         header = [
266 |             "#include <metal_stdlib>",
267 |             "#include <metal_math>",
268 |             "#include <metal_simdgroup>",
269 |             "#include <metal_atomic>",
270 |             "",
271 |             "using namespace metal;",
272 |             ""
273 |         ]
274 | 
275 |         # Add required type definitions
276 |         header.extend(self._generate_type_definitions(required_types))
277 | 
278 |         return "\n".join(header)
279 | 
280 |     def _generate_type_definitions(self, required_types: Set[str]) -> List[str]:
281 |         """Generate necessary type definitions."""
282 |         definitions = []
283 |         for type_name in required_types:
284 |             if metal_type := self.mapping_registry.get_metal_type(type_name):
285 |                 if metal_type.requires_header:
286 |                     definitions.extend(self._generate_type_definition(metal_type))
287 |         return definitions
288 | 
289 |     def _generate_type_definition(self, metal_type: Any) -> List[str]:
290 |         """Generate definition for a specific type."""
291 |         # Implementation for specific type definition generation
292 |         return []
293 | 
294 | class MetalFunctionGenerator:
295 |     """Generates Metal device and helper functions."""
296 | 
297 |     def __init__(self, mapping_registry: MetalMappingRegistry):
298 |         self.mapping_registry = mapping_registry
299 | 
300 |     def generate_device_functions(self, required_functions: Set[str]) -> str:
301 |         """Generate Metal device function implementations."""
302 |         functions = []
303 |         for func_name in required_functions:
304 |             if metal_func := self.mapping_registry.get_metal_function(func_name):
305 |                 functions.append(self._generate_function_implementation(metal_func))
306 | 
307 |         return "\n\n".join(functions)
308 | 
309 |     def _generate_function_implementation(self, metal_func: Any) -> str:
310 |         """Generate implementation for a specific function."""
311 |         # Implementation for specific function generation
312 |         return ""
313 | 
314 | # Usage example:
315 | """
316 | generator = MetalShaderGenerator()
317 | header_gen = MetalHeaderGenerator(generator.mapping_registry)
318 | function_gen =
MetalFunctionGenerator(generator.mapping_registry) 319 | 320 | try: 321 | # Generate shader components 322 | metal_code = generator.generate_kernel(cuda_kernel, optimization_level=2) 323 | header = header_gen.generate_header(required_types) 324 | functions = function_gen.generate_device_functions(required_functions) 325 | 326 | # Combine into final shader 327 | final_shader = f"{header}\n\n{functions}\n\n{metal_code}" 328 | 329 | except CudaTranslationError as e: 330 | logger.error(f"Shader generation failed: {str(e)}") 331 | finally: 332 | generator.cleanup() 333 | """ --------------------------------------------------------------------------------