├── .gitignore ├── Cargo.toml ├── README.md ├── build.rs ├── kernel.cu └── src └── main.rs /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | /target 3 | **/*.rs.bk 4 | Cargo.lock 5 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cudac" 3 | version = "0.1.0" 4 | authors = ["Toshiki Teramura "] 5 | 6 | [build-dependencies] 7 | cc = "*" 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | HowTo: Compile CUDA with nvcc, and link to Rust through FFI 2 | ------------------------------------------------------------ 3 | 4 | Build `kernel.cu` (copied from CUDA sample) into `libvector_add.a` in `build.rs`: 5 | 6 | ```rust 7 | extern crate cc; 8 | 9 | fn main() { 10 | cc::Build::new() 11 | .cuda(true) 12 | .flag("-cudart=shared") 13 | .flag("-gencode") 14 | .flag("arch=compute_61,code=sm_61") 15 | .file("kernel.cu") 16 | .compile("libvector_add.a"); 17 | 18 | /* Link CUDA Runtime (libcudart.so) */ 19 | 20 | // Add link directory 21 | // - This path depends on where you install CUDA (i.e. 
depends on your Linux distribution) 22 | // - This should be set by `$LIBRARY_PATH` 23 | println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); 24 | println!("cargo:rustc-link-lib=cudart"); 25 | 26 | /* Optional: Link CUDA Driver API (libcuda.so) */ 27 | 28 | // println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64/stubs"); 29 | // println!("cargo:rustc-link-lib=cuda"); 30 | } 31 | ``` 32 | 33 | and link this host code into a Rust executable: 34 | 35 | ```rust 36 | #[link(name = "vector_add", kind = "static")] 37 | extern "C" { 38 | fn vectorAdd_main() -> std::os::raw::c_int; 39 | } 40 | 41 | fn main() { 42 | unsafe { 43 | vectorAdd_main(); 44 | } 45 | } 46 | ``` 47 | 48 | ```cuda 49 | /** CUDA Kernel Device code */ 50 | __global__ void vectorAdd(const float *A, const float *B, float *C, int numElements) { 51 | int i = blockDim.x * blockIdx.x + threadIdx.x; 52 | if (i < numElements) 53 | { 54 | C[i] = A[i] + B[i]; 55 | } 56 | } 57 | 58 | /** Host main routine */ 59 | extern "C" { // Avoid C++ name mangling 60 | int vectorAdd_main (void) { 61 | /* call kernel in CUDA/C++ way */ 62 | } 63 | } // extern C 64 | ``` 65 | -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | extern crate cc; 2 | 3 | fn main() { 4 | cc::Build::new() 5 | .cuda(true) 6 | .flag("-cudart=shared") 7 | .flag("-gencode") 8 | .flag("arch=compute_61,code=sm_61") 9 | .file("kernel.cu") 10 | .compile("libvector_add.a"); 11 | 12 | /* Link CUDA Runtime (libcudart.so) */ 13 | 14 | // Add link directory 15 | // - This path depends on where you install CUDA (i.e. 
depends on your Linux distribution) 16 | // - This should be set by `$LIBRARY_PATH` 17 | println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64"); 18 | println!("cargo:rustc-link-lib=cudart"); 19 | 20 | /* Optional: Link CUDA Driver API (libcuda.so) */ 21 | 22 | // println!("cargo:rustc-link-search=native=/usr/local/cuda/lib64/stubs"); 23 | // println!("cargo:rustc-link-lib=cuda"); 24 | } 25 | -------------------------------------------------------------------------------- /kernel.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 9 | * 10 | */ 11 | 12 | /** 13 | * Vector addition: C = A + B. 14 | * 15 | * This sample is a very basic sample that implements element by element 16 | * vector addition. It is the same as the sample illustrating Chapter 2 17 | * of the programming guide with some additions like error checking. 18 | */ 19 | 20 | #include <stdio.h> 21 | 22 | // For the CUDA runtime routines (prefixed with "cuda_") 23 | #include <cuda_runtime.h> 24 | 25 | /** 26 | * CUDA Kernel Device code 27 | * 28 | * Computes the vector addition of A and B into C. The 3 vectors have the same 29 | * number of elements numElements. 
30 | */ 31 | __global__ void 32 | vectorAdd(const float *A, const float *B, float *C, int numElements) 33 | { 34 | int i = blockDim.x * blockIdx.x + threadIdx.x; 35 | 36 | if (i < numElements) 37 | { 38 | C[i] = A[i] + B[i]; 39 | } 40 | } 41 | 42 | /** 43 | * Host main routine 44 | */ 45 | extern "C" { 46 | 47 | int 48 | vectorAdd_main (void) 49 | { 50 | // Error code to check return values for CUDA calls 51 | cudaError_t err = cudaSuccess; 52 | 53 | // Print the vector length to be used, and compute its size 54 | int numElements = 50000; 55 | size_t size = numElements * sizeof(float); 56 | printf("[Vector addition of %d elements]\n", numElements); 57 | 58 | // Allocate the host input vector A 59 | float *h_A = (float *)malloc(size); 60 | 61 | // Allocate the host input vector B 62 | float *h_B = (float *)malloc(size); 63 | 64 | // Allocate the host output vector C 65 | float *h_C = (float *)malloc(size); 66 | 67 | // Verify that allocations succeeded 68 | if (h_A == NULL || h_B == NULL || h_C == NULL) 69 | { 70 | fprintf(stderr, "Failed to allocate host vectors!\n"); 71 | exit(EXIT_FAILURE); 72 | } 73 | 74 | // Initialize the host input vectors 75 | for (int i = 0; i < numElements; ++i) 76 | { 77 | h_A[i] = rand()/(float)RAND_MAX; 78 | h_B[i] = rand()/(float)RAND_MAX; 79 | } 80 | 81 | // Allocate the device input vector A 82 | float *d_A = NULL; 83 | err = cudaMalloc((void **)&d_A, size); 84 | 85 | if (err != cudaSuccess) 86 | { 87 | fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err)); 88 | exit(EXIT_FAILURE); 89 | } 90 | 91 | // Allocate the device input vector B 92 | float *d_B = NULL; 93 | err = cudaMalloc((void **)&d_B, size); 94 | 95 | if (err != cudaSuccess) 96 | { 97 | fprintf(stderr, "Failed to allocate device vector B (error code %s)!\n", cudaGetErrorString(err)); 98 | exit(EXIT_FAILURE); 99 | } 100 | 101 | // Allocate the device output vector C 102 | float *d_C = NULL; 103 | err = cudaMalloc((void **)&d_C, 
size); 104 | 105 | if (err != cudaSuccess) 106 | { 107 | fprintf(stderr, "Failed to allocate device vector C (error code %s)!\n", cudaGetErrorString(err)); 108 | exit(EXIT_FAILURE); 109 | } 110 | 111 | // Copy the host input vectors A and B in host memory to the device input vectors in 112 | // device memory 113 | printf("Copy input data from the host memory to the CUDA device\n"); 114 | err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); 115 | 116 | if (err != cudaSuccess) 117 | { 118 | fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err)); 119 | exit(EXIT_FAILURE); 120 | } 121 | 122 | err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); 123 | 124 | if (err != cudaSuccess) 125 | { 126 | fprintf(stderr, "Failed to copy vector B from host to device (error code %s)!\n", cudaGetErrorString(err)); 127 | exit(EXIT_FAILURE); 128 | } 129 | 130 | // Launch the Vector Add CUDA Kernel 131 | int threadsPerBlock = 256; 132 | int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock; 133 | printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock); 134 | vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements); 135 | err = cudaGetLastError(); 136 | 137 | if (err != cudaSuccess) 138 | { 139 | fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err)); 140 | exit(EXIT_FAILURE); 141 | } 142 | 143 | // Copy the device result vector in device memory to the host result vector 144 | // in host memory. 
145 | printf("Copy output data from the CUDA device to the host memory\n"); 146 | err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); 147 | 148 | if (err != cudaSuccess) 149 | { 150 | fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err)); 151 | exit(EXIT_FAILURE); 152 | } 153 | 154 | // Verify that the result vector is correct 155 | for (int i = 0; i < numElements; ++i) 156 | { 157 | if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) 158 | { 159 | fprintf(stderr, "Result verification failed at element %d!\n", i); 160 | exit(EXIT_FAILURE); 161 | } 162 | } 163 | 164 | printf("Test PASSED\n"); 165 | 166 | // Free device global memory 167 | err = cudaFree(d_A); 168 | 169 | if (err != cudaSuccess) 170 | { 171 | fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err)); 172 | exit(EXIT_FAILURE); 173 | } 174 | 175 | err = cudaFree(d_B); 176 | 177 | if (err != cudaSuccess) 178 | { 179 | fprintf(stderr, "Failed to free device vector B (error code %s)!\n", cudaGetErrorString(err)); 180 | exit(EXIT_FAILURE); 181 | } 182 | 183 | err = cudaFree(d_C); 184 | 185 | if (err != cudaSuccess) 186 | { 187 | fprintf(stderr, "Failed to free device vector C (error code %s)!\n", cudaGetErrorString(err)); 188 | exit(EXIT_FAILURE); 189 | } 190 | 191 | // Free host memory 192 | free(h_A); 193 | free(h_B); 194 | free(h_C); 195 | 196 | // Reset the device and exit 197 | // cudaDeviceReset causes the driver to clean up all state. While 198 | // not mandatory in normal operation, it is good practice. It is also 199 | // needed to ensure correct operation when the application is being 200 | // profiled. Calling cudaDeviceReset causes all profile data to be 201 | // flushed before the application exits 202 | err = cudaDeviceReset(); 203 | 204 | if (err != cudaSuccess) 205 | { 206 | fprintf(stderr, "Failed to deinitialize the device! 
error=%s\n", cudaGetErrorString(err)); 207 | exit(EXIT_FAILURE); 208 | } 209 | 210 | printf("Done\n"); 211 | return 0; 212 | } 213 | 214 | } 215 | 216 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | #[link(name = "vector_add", kind = "static")] 2 | extern "C" { 3 | fn vectorAdd_main(); 4 | } 5 | 6 | fn main() { 7 | unsafe { 8 | vectorAdd_main(); 9 | } 10 | } 11 | --------------------------------------------------------------------------------