├── .gitignore
├── LICENSE
├── README.md
├── sum
│   └── sum.cu
├── info
│   └── info.cu
├── matmul2d
│   └── matmul2d.cu
└── matmul2dsm
    └── matmul2dsm.cu

/.gitignore:
--------------------------------------------------------------------------------
*.i
*.ii
*.gpu
*.ptx
*.cubin
*.fatbin
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Daniel Rossi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CUDA

### Brief
This repository contains multiple examples of code to be run on NVIDIA GPUs, and aims to help you dive deeper into the CUDA programming language. CUDA runs on any machine with an NVIDIA GPU of compute capability > 3.0 ([https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)), so make sure your system is supported. You can run CUDA programs on Linux and Windows machines, but you must install the NVIDIA drivers and the NVIDIA CUDA Compiler (nvcc).

### Available code:
- **info**: displays CUDA and GPU information
- **sum**: adds two arrays of random numbers -> learn how to move data from CPU to GPU and vice versa and run code on the GPU
- **matmul2d**: classical matrix multiplication between two matrices -> learn how to manage multi-dimensional data structures and operate on them
- **matmul2dsm**: matrix multiplication using shared memory -> learn how to use fast on-chip memory to speed up the computation

### Prerequisites:
1. install the NVIDIA drivers: [https://www.nvidia.com/download/index.aspx](https://www.nvidia.com/download/index.aspx)
2. install CUDA on:
   - Ubuntu: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
   - Windows: https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html
3. check everything with ```nvidia-smi``` and ```nvcc --version```

### Compile
To compile a ```.cu``` file, run ```nvcc file_name.cu -o output_file_name```
--------------------------------------------------------------------------------
/sum/sum.cu:
--------------------------------------------------------------------------------
/*
    This program is provided as is without any guarantees or warranty.
    By using this program the user accepts the full responsibility for any
    and all damages that may occur. The author is not responsible for any
    consequences of the use of this program.

    * This program adds two arrays of floats using CUDA.
    *
    * The program first allocates memory for the arrays on the host and device.
    * It then initializes the host arrays with random values.
    * The host arrays are then copied to the device.
    * The add() kernel is then launched on the GPU.
    * The result is then copied back to the host.
    * The program then verifies the result.
    * Finally, the program frees the memory on the device and host.

    * The program takes one command line argument, N, which is the size of the arrays.
    * If no argument is provided, the default value of N is 1.

    * The program can be compiled using the following command:
    * nvcc sum.cu -o sum
    * to run the program, use the following command:
    * ./sum N

    @Author: Daniel Rossi
    @Date: 2023-03-08
    @License: MIT
    @Version: 1.0
*/

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

void setup() {
    // Set the random seed
    srand(time(NULL));
    int device = 0; // Default device id (change if you have more than one GPU)

    // Set the device
    cudaSetDevice(device);
}

// Kernel function to add two arrays
__global__ void add(float *a, float *b, float *c, int n) {
    // Get the index of the current element
    int index = threadIdx.x + blockIdx.x * blockDim.x;

    // Check if the index is within the array bounds
    if (index < n) {
        c[index] = a[index] + b[index];
    }
}

void print_cuda_error(cudaError_t err) {
    if (err != cudaSuccess){
        printf("CUDA error: %s\n", cudaGetErrorString(err));
    }
}

int main(int argc, char **argv) {
    setup();
    int N = 1;

    // Parse command line arguments
    if (argc > 1){
        N = atoi(argv[1]);
    }

    printf("N = %d\n", N);

    float *a, *b, *c;
    float *d_a, *d_b, *d_c;

    // Allocate memory on the host
    a = (float *) malloc(N * sizeof(float));
    b = (float *) malloc(N * sizeof(float));
    c = (float *) malloc(N * sizeof(float));

    // Allocate memory on the device
    cudaError_t err;
    err = cudaMalloc(&d_a, N * sizeof(float));
    print_cuda_error(err);

    err = cudaMalloc(&d_b, N * sizeof(float));
    print_cuda_error(err);

    err = cudaMalloc(&d_c, N * sizeof(float));
    print_cuda_error(err);

    // Initialize host values
    for (int i = 0; i < N; ++i){
        // Generate random values between 0 and 1
        a[i] = rand() / (float)RAND_MAX;
        b[i] = rand() / (float)RAND_MAX;
    }

    // Copy inputs to device
    err = cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
    print_cuda_error(err);

    err = cudaMemcpy(d_b, b, N * sizeof(float), cudaMemcpyHostToDevice);
    print_cuda_error(err);

    // Launch add() kernel on GPU with enough 256-thread blocks to cover N elements
    add<<<(N + 255) / 256, 256>>>(d_a, d_b, d_c, N);
    print_cuda_error(cudaGetLastError()); // check for kernel launch errors

    // Copy result back to host (this cudaMemcpy also synchronizes with the kernel)
    err = cudaMemcpy(c, d_c, N * sizeof(float), cudaMemcpyDeviceToHost);
    print_cuda_error(err);

    // Verify the result
    for (int i = 0; i < N; ++i){
        if (c[i] != (a[i] + b[i])){
            printf("Error: %f + %f != %f\n", a[i], b[i], c[i]);
            break;
        }
    }

    // Free memory on device
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Free memory on host
    free(a);
    free(b);
    free(c);

    printf("Done\n");

    return 0;
}
--------------------------------------------------------------------------------
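sum.cu stages every transfer by hand: allocate on both sides, copy in, launch, copy out. The sketch below shows the same vector add written with unified (managed) memory instead. This is an illustrative variant, not a file in this repository, and it assumes a GPU and driver that support cudaMallocManaged; note the explicit cudaDeviceSynchronize(), needed because no blocking cudaMemcpy follows the launch.

// Hypothetical managed-memory variant of sum.cu (illustrative sketch).
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

__global__ void add(float *a, float *b, float *c, int n) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < n) {
        c[index] = a[index] + b[index];
    }
}

int main(int argc, char **argv) {
    int N = (argc > 1) ? atoi(argv[1]) : 1;
    srand(time(NULL));

    // Managed pointers are valid on both host and device,
    // so the explicit cudaMemcpy calls disappear.
    float *a, *b, *c;
    cudaMallocManaged(&a, N * sizeof(float));
    cudaMallocManaged(&b, N * sizeof(float));
    cudaMallocManaged(&c, N * sizeof(float));

    for (int i = 0; i < N; ++i) {
        a[i] = rand() / (float)RAND_MAX;
        b[i] = rand() / (float)RAND_MAX;
    }

    add<<<(N + 255) / 256, 256>>>(a, b, c, N);
    cudaDeviceSynchronize(); // no blocking copy follows, so synchronize by hand

    for (int i = 0; i < N; ++i) {
        if (c[i] != a[i] + b[i]) {
            printf("Error: %f + %f != %f\n", a[i], b[i], c[i]);
            break;
        }
    }

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 0;
}

The explicit-copy version in sum.cu gives finer control over when transfers happen; managed memory trades that control for simpler code.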
/info/info.cu:
--------------------------------------------------------------------------------
/*
    This program is provided as is without any guarantees or warranty.
    By using this program the user accepts the full responsibility for any
    and all damages that may occur. The author is not responsible for any
    consequences of the use of this program.

    * This program prints information about the GPU device.
    * The program uses the CUDA runtime API to query the device properties.

    * The program takes an optional argument, which is the device id.
    * If no argument is provided, the program will use the default device (device 0).

    * The program can be compiled using the following command:
    * nvcc info.cu -o info
    * to run the program, use the following command:
    * ./info device_id

    @Author: Daniel Rossi
    @Date: 2023-03-11
    @License: MIT
    @Version: 1.0
*/

#include <stdio.h>
#include <stdlib.h>

void info(int device){
    printf("CUDA version: %d.%d\n", CUDART_VERSION / 1000, (CUDART_VERSION % 100) / 10);

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    printf("Using device %d: %s\n", device, prop.name);
    printf("GPU compute capability: %d.%d\n", prop.major, prop.minor);
    printf("Number of multiprocessors: %d\n", prop.multiProcessorCount);
    printf("Total global memory: %lu bytes\n", prop.totalGlobalMem);
    printf("Total constant memory: %lu bytes\n", prop.totalConstMem);
    printf("Shared memory per block: %lu bytes\n", prop.sharedMemPerBlock);
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
    printf("Max threads per multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor);
    printf("Max threads dimensions: (%d, %d, %d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("Max grid size: (%d, %d, %d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("Warp size: %d\n", prop.warpSize);
    printf("Clock rate: %d kHz\n", prop.clockRate);
    printf("Memory clock rate: %d kHz\n", prop.memoryClockRate);
    printf("Memory bus width: %d bits\n", prop.memoryBusWidth);
    // Theoretical peak bandwidth: 2 (DDR) * memory clock (kHz) * bus width (bytes), scaled to GB/s
    printf("Memory bandwidth: %f GB/s\n", 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
    printf("L2 cache size: %d bytes\n", prop.l2CacheSize);
    printf("Registers per block: %d\n", prop.regsPerBlock);
    printf("Registers per multiprocessor: %d\n", prop.regsPerMultiprocessor);
    printf("Device has ECC support: %d\n", prop.ECCEnabled);
    printf("Device has unified addressing: %d\n", prop.unifiedAddressing);
    printf("Device can map host memory: %d\n", prop.canMapHostMemory);
    printf("Async engine count: %d\n", prop.asyncEngineCount);
    printf("Device supports concurrent kernels: %d\n", prop.concurrentKernels);
    printf("PCI bus ID: %d\n", prop.pciBusID);
    printf("PCI device ID: %d\n", prop.pciDeviceID);
    printf("PCI domain ID: %d\n", prop.pciDomainID);
    printf("Device uses TCC driver: %d\n", prop.tccDriver);
    printf("Max memory pitch: %lu bytes\n", prop.memPitch);
    printf("Texture alignment: %lu bytes\n", prop.textureAlignment);
    printf("Texture pitch alignment: %lu bytes\n", prop.texturePitchAlignment);
    printf("Device has GPU overlap: %d\n", prop.deviceOverlap);
    printf("Device has kernel execution timeout: %d\n", prop.kernelExecTimeoutEnabled);
    printf("Device is integrated: %d\n", prop.integrated);
    printf("Compute mode: %d\n", prop.computeMode);
    printf("Max texture 1D size: %d\n", prop.maxTexture1D);
    printf("Max texture 1D linear size: %d\n", prop.maxTexture1DLinear);
    printf("Max texture 1D mipmapped size: %d\n", prop.maxTexture1DMipmap);
    printf("Max texture 2D size: (%d, %d)\n", prop.maxTexture2D[0], prop.maxTexture2D[1]);
    printf("Max texture 2D linear size: (%d, %d, %d)\n", prop.maxTexture2DLinear[0], prop.maxTexture2DLinear[1], prop.maxTexture2DLinear[2]);
    printf("Max texture 2D mipmapped size: (%d, %d)\n", prop.maxTexture2DMipmap[0], prop.maxTexture2DMipmap[1]);
    printf("Max texture 3D size: (%d, %d, %d)\n", prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);
}

int main(int argc, char **argv) {
    int device = 0; // Default device id (change if you have more than one GPU)
    if (argc > 1) {
        device = atoi(argv[1]);
    }

    // Set the device
    cudaSetDevice(device);

    info(device);
}
--------------------------------------------------------------------------------
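info.cu reports on a single device chosen by id. A natural companion, sketched below using only the runtime calls cudaGetDeviceCount and cudaGetDeviceProperties, loops over every visible GPU and prints a one-line summary per device. It is a hypothetical example, not part of this repository.

// Hypothetical companion to info.cu: enumerate all visible GPUs.
#include <stdio.h>

int main(void) {
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        printf("cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("Found %d CUDA device(s)\n", count);
    for (int dev = 0; dev < count; ++dev) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);
        // One line per device: id, name, compute capability, SM count
        printf("Device %d: %s (compute %d.%d, %d multiprocessors)\n",
               dev, prop.name, prop.major, prop.minor, prop.multiProcessorCount);
    }
    return 0;
}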
/matmul2d/matmul2d.cu:
--------------------------------------------------------------------------------
/*
    This program is provided as is without any guarantees or warranty.
    By using this program the user accepts the full responsibility for any
    and all damages that may occur. The author is not responsible for any
    consequences of the use of this program.

    * This program performs matrix multiplication using CUDA.
    * The matrices are generated using three different methods: zeros, ones, random.
    * The program uses the following functions:
    *   getSharedMemory: prints the amount of shared memory per block
    *   matmul: the CUDA kernel which performs the matrix multiplication
    *   matrix: generates a matrix of size n x m of three types: zeros, ones, random
    *   print_matrix: prints a matrix of size n x m
    *   cpu_matmul: performs the matrix multiplication on the CPU
    *   equals: checks if two matrices are equal
    *   parse_args: parses the command line arguments

    * The program takes three command line arguments:
    *   n: the number of rows of the first matrix
    *   m: the number of columns of the first matrix and the number of rows of the second matrix
    *   p: the number of columns of the second matrix
    * If fewer than three arguments are provided, the program uses the default value of 3 for n, m, and p.
    * If exactly one argument is provided, the program uses its value for n, m, and p.

    * The program can be compiled using the following command:
    * nvcc matmul2d.cu -o matmul
    * to run the program, use the following command:
    * ./matmul n m p

    @Author: Daniel Rossi
    @Date: 2023-03-11
    @License: MIT
    @Version: 1.0
*/

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#define BLOCK_SIZE 32

// enums used to generate matrices of different types
enum {
    ZEROS = 0,
    ONES = 1,
    RAND = 2,
};

void getSharedMemory() {
    cudaDeviceProp prop;
    int dev = 0;
    cudaGetDevice(&dev);
    cudaGetDeviceProperties(&prop, dev);
    printf("Shared memory per block: %lu bytes\n", prop.sharedMemPerBlock);
}

__global__ void matmul(float *a, float *b, float *c, size_t n, size_t m, size_t p) {
    size_t row = blockIdx.y * blockDim.y + threadIdx.y; // rows run along the y axis
    size_t col = blockIdx.x * blockDim.x + threadIdx.x; // columns run along the x axis (think of a spreadsheet!)

    float sum = 0;
    if (row < n && col < p){ // check that the current thread is within the matrix boundaries
        for (size_t i = 0; i < m; ++i) {
            sum += a[row * m + i] * b[i * p + col]; // dot product of the row-th row of a and the col-th column of b
        }
        c[row * p + col] = sum;
    }
}


// generates a matrix of size n x m of three types: zeros, ones, random
float *matrix(size_t n, size_t m, int type) {
    float *mat = (float *)malloc(n * m * sizeof(float));
    for (size_t i = 0; i < n * m; i++) {
        if (type == ZEROS) {
            mat[i] = 0;
        } else if (type == ONES) {
            mat[i] = 1;
        } else if (type == RAND) {
            mat[i] = (float)rand() / RAND_MAX;
        }
    }
    return mat;
}


void print_matrix(char name, float *matrix, size_t n, size_t m){
    printf("Matrix %c:\n", name);
    for (size_t i = 0; i < n; ++i) {
        for (size_t j = 0; j < m; ++j) {
            printf("%f ", matrix[i * m + j]);
        }
        printf("\n");
    }
    printf("\n");
}


void cpu_matmul(float *a, float *b, float *c_cpu, size_t n, size_t m, size_t p){
    for (size_t i = 0; i < n; ++i) {
        for (size_t j = 0; j < p; ++j) {
            for (size_t k = 0; k < m; ++k) {
                c_cpu[i * p + j] += a[i * m + k] * b[k * p + j];
            }
        }
    }
}


bool equals(float *a, float *b, size_t n, size_t m) {
    for (size_t i = 0; i < n * m; i++) {
        if (fabsf(a[i] - b[i]) > 1e-3) {
            return false;
        }
    }
    return true;
}


void parse_args(int argc, char **argv, size_t *n, size_t *m, size_t *p) {
    if (argc == 2) {
        *n = atoi(argv[1]);
        *m = atoi(argv[1]);
        *p = atoi(argv[1]);
    } else if (argc >= 4) {
        *n = atoi(argv[1]);
        *m = atoi(argv[2]);
        *p = atoi(argv[3]);
    } else {
        *n = 3;
        *m = 3;
        *p = 3;
    }
}

int main(int argc, char** argv) {
    float *a, *b, *c;
    size_t n, m, p;
    parse_args(argc, argv, &n, &m, &p);

    getSharedMemory();
    srand(41); // set the seed for random number generation

    // generate the matrices
    a = matrix(n, m, RAND);
    b = matrix(m, p, RAND);
    c = matrix(n, p, ZEROS);

    float *dev_a, *dev_b, *dev_c;

    clock_t start_time, end_time;

    start_time = clock();
    // Allocate memory on the device
    cudaMalloc((void **)&dev_a, n * m * sizeof(float));
    cudaMalloc((void **)&dev_b, m * p * sizeof(float));
    cudaMalloc((void **)&dev_c, n * p * sizeof(float));

    // Copy the input matrices from the host to the device
    cudaMemcpy(dev_a, a, n * m * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, m * p * sizeof(float), cudaMemcpyHostToDevice);

    // Calculate the number of blocks needed along rows and columns to cover each
    // matrix dimension using blocks of size BLOCK_SIZE.
    // Adding BLOCK_SIZE - 1 before dividing rounds up, so a last partial block
    // still covers the remaining elements.
    size_t gridRows = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    size_t gridCols = (p + BLOCK_SIZE - 1) / BLOCK_SIZE;

    dim3 dimGrid(gridCols, gridRows);      // the size of the grid of blocks used to perform parallel computations
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); // the size of a CUDA block within the CUDA kernel

    cudaEvent_t start, stop; // events used to measure the time of the kernel execution
    float gpuTime = 0.0f;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    matmul<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c, n, m, p);
    cudaEventRecord(stop, 0);

    cudaEventSynchronize(stop); // Wait for the stop event to complete
    cudaEventElapsedTime(&gpuTime, start, stop);

    cudaMemcpy(c, dev_c, n * p * sizeof(float), cudaMemcpyDeviceToHost);
    end_time = clock();

    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(error));
    }

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    if (n * p <= 25){
        print_matrix('a', a, n, m);
        print_matrix('b', b, m, p);
        print_matrix('c', c, n, p);
    }

    printf("Overall GPU time: %f s\n", (double)(end_time - start_time) / CLOCKS_PER_SEC);
    printf("GPU time: %f s\n", gpuTime / 1000);

    float *c_cpu = matrix(n, p, ZEROS);
    start_time = clock();
    cpu_matmul(a, b, c_cpu, n, m, p);
    end_time = clock();

    printf("Overall CPU time: %f s\n", (double)(end_time - start_time) / CLOCKS_PER_SEC);
    printf("\n");

    if (n * p <= 25){
        print_matrix('x', c_cpu, n, p);
    }
    printf("Matrices are %s\n", equals(c, c_cpu, n, p) ? "equal" : "different");

    free(a);
    free(b);
    free(c);
    free(c_cpu);

    return 0;
}
--------------------------------------------------------------------------------
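matmul2d.cu checks for errors once, with a single cudaGetLastError after the kernel has run. A common alternative is to wrap every runtime call in a checking macro that aborts with the file and line of the failing call. The sketch below shows one way to write such a macro; CUDA_CHECK is a local naming convention, not a CUDA API.

// Hypothetical error-checking helper, shown against the call sequence of matmul2d.cu.
#include <stdio.h>
#include <stdlib.h>

#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage, mirroring matmul2d.cu:
//   CUDA_CHECK(cudaMalloc((void **)&dev_a, n * m * sizeof(float)));
//   CUDA_CHECK(cudaMemcpy(dev_a, a, n * m * sizeof(float), cudaMemcpyHostToDevice));
//   matmul<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c, n, m, p);
//   CUDA_CHECK(cudaGetLastError());      // catches launch configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize()); // catches errors raised during execution

Kernel launches themselves return nothing, which is why the launch is followed by cudaGetLastError and a synchronize rather than being wrapped directly.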
/matmul2dsm/matmul2dsm.cu:
--------------------------------------------------------------------------------
/*
    This program is provided as is without any guarantees or warranty.
    By using this program the user accepts the full responsibility for any
    and all damages that may occur. The author is not responsible for any
    consequences of the use of this program.

    * This program performs matrix multiplication using CUDA and shared memory.
    * The matrices are generated using three different methods: zeros, ones, random.
    * The program uses the following functions:
    *   matmul_sm: the CUDA kernel which performs the matrix multiplication using shared memory
    *   matrix: generates a matrix of size n x m of three types: zeros, ones, random
    *   print_matrix: prints a matrix of size n x m
    *   cpu_matmul: performs the matrix multiplication on the CPU
    *   equals: checks if two matrices are equal
    *   parse_args: parses the command line arguments

    * The program takes three command line arguments:
    *   n: the number of rows of the first matrix
    *   m: the number of columns of the first matrix and the number of rows of the second matrix
    *   p: the number of columns of the second matrix
    * If fewer than three arguments are provided, the program uses the default value of 3 for n, m, and p.
    * If exactly one argument is provided, the program uses its value for n, m, and p.

    * The program can be compiled using the following command:
    * nvcc matmul2dsm.cu -o matmul2dsm
    * to run the program, use the following command:
    * ./matmul2dsm n m p

    @Author: Daniel Rossi
    @Date: 2023-03-12
    @License: MIT
    @Version: 1.0
*/

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#define BLOCK_SIZE 32

// enums used to generate matrices of different types
enum {
    ZEROS = 0,
    ONES = 1,
    RAND = 2,
};


/*
 * CUDA kernel to perform matrix multiplication using shared memory.
 * Shared memory is a memory space shared by all threads in a block.
 * It is fast because it is located on-chip, but it is limited in size.
 */
__global__ void matmul_sm(float *a, float *b, float *c, size_t n, size_t m, size_t p) {
    // Calculate the global row and column indices
    size_t row = blockIdx.y * blockDim.y + threadIdx.y; // row index for the current thread
    size_t col = blockIdx.x * blockDim.x + threadIdx.x; // column index for the current thread

    // Allocate shared memory for the tiles of matrices A and B
    __shared__ float tileA[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float tileB[BLOCK_SIZE][BLOCK_SIZE];

    float sum = 0.0f;

    // Iterate over the tiles of matrices A and B
    for (size_t tileIdx = 0; tileIdx < (m + BLOCK_SIZE - 1) / BLOCK_SIZE; ++tileIdx) {
        /*
         * TILEs:
         * - Consider a 3x3 matrix A stored row-major. The first row is scanned with globalRow = 0
         *   and globalCol = 0, 1, 2; to scan the second row, globalRow must be 3 and globalCol
         *   again 0, 1, 2; and so on.
         * - Thus the maximum value of globalRow is 3x3 = 9, which is why globalRow cannot exceed
         *   n * m. Since we operate within blocks, globalRow is derived from row (the block index
         *   times the block size, plus the thread index) multiplied by the row length m.
         *
         * - globalCol is a little more complicated. Its maximum value is 3 in this example, because
         *   A is a 3x3 matrix. Now suppose BLOCK_SIZE = 2, i.e. 2x2 tiles of 4 elements each
         *   (a BLOCK_SIZE of 2 means each block has 2 threads per dimension). Iterating tileIdx
         *   from 0 to (3 + 2 - 1) / 2 = 2, the first element index is [0 * 2 + 0] = 0, the second
         *   [0 * 2 + 1] = 1, the third [1 * 2 + 0] = 2 and the fourth [1 * 2 + 1] = 3. Thus we are
         *   able to scan the columns of A.
         *
         * - The same logic applies to B, but there the maximum value of globalRow is m, and of
         *   globalCol is p. For a 3x3 matrix B we scan its rows by incrementing globalRow from
         *   0 to 3 and multiplying it by B's row length p.
         *
         * - Why can we do this with 4-element tiles when the matrix is 3x3? Because after the
         *   assignment of the elements to the tiles we synchronize the threads, and positions
         *   that fall outside the matrix are zero-filled, so they contribute nothing to the sums.
         *   In effect we slice the overall matmul into pieces: once the tiles are filled, we have
         *   everything needed to compute the partial result for the current tile.
         */

        size_t globalRow = row * m;
        size_t globalCol = tileIdx * BLOCK_SIZE + threadIdx.x;

        if (globalRow < n * m && globalCol < m) {
            tileA[threadIdx.y][threadIdx.x] = a[globalRow + globalCol];
        } else {
            tileA[threadIdx.y][threadIdx.x] = 0.0f;
        }

        globalRow = tileIdx * BLOCK_SIZE + threadIdx.y;
        globalCol = col;

        if (globalRow < m && globalCol < p) {
            tileB[threadIdx.y][threadIdx.x] = b[globalRow * p + globalCol];
        } else {
            tileB[threadIdx.y][threadIdx.x] = 0.0f;
        }

        // Synchronize threads to ensure all elements are loaded into shared memory
        __syncthreads();

        // Perform the matrix multiplication for the current tile
        for (size_t k = 0; k < BLOCK_SIZE; ++k) {
            sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }

        // Synchronize again so the tiles are not overwritten while still in use
        __syncthreads();
    }

    // Write the result to the output matrix
    if (row < n && col < p) {
        c[row * p + col] = sum;
    }
}

// generates a matrix of size n x m of three types: zeros, ones, random
float *matrix(size_t n, size_t m, int type) {
    float *mat = (float *)malloc(n * m * sizeof(float));
    for (size_t i = 0; i < n * m; i++) {
        if (type == ZEROS) {
            mat[i] = 0;
        } else if (type == ONES) {
            mat[i] = 1;
        } else if (type == RAND) {
            mat[i] = (float)rand() / RAND_MAX;
        }
    }
    return mat;
}


void print_matrix(char name, float *matrix, size_t n, size_t m){
    printf("Matrix %c:\n", name);
    for (size_t i = 0; i < n; ++i) {
        for (size_t j = 0; j < m; ++j) {
            printf("%f ", matrix[i * m + j]);
        }
        printf("\n");
    }
    printf("\n");
}


void cpu_matmul(float *a, float *b, float *c_cpu, size_t n, size_t m, size_t p){
    for (size_t i = 0; i < n; ++i) {
        for (size_t j = 0; j < p; ++j) {
            for (size_t k = 0; k < m; ++k) {
                c_cpu[i * p + j] += a[i * m + k] * b[k * p + j];
            }
        }
    }
}


bool equals(float *gpu, float *cpu, size_t n, size_t m) {
    for (size_t i = 0; i < n * m; i++) {
        if (fabsf(gpu[i] - cpu[i]) > 1e-3) {
            printf("gpu[%lu] = %f, cpu[%lu] = %f\n", i, gpu[i], i, cpu[i]);
            return false;
        }
    }
    return true;
}


void parse_args(int argc, char **argv, size_t *n, size_t *m, size_t *p) {
    if (argc == 2) {
        *n = atoi(argv[1]);
        *m = atoi(argv[1]);
        *p = atoi(argv[1]);
    } else if (argc >= 4) {
        *n = atoi(argv[1]);
        *m = atoi(argv[2]);
        *p = atoi(argv[3]);
    } else {
        *n = 3;
        *m = 3;
        *p = 3;
    }
}

int main(int argc, char** argv) {
    float *a, *b, *c;
    size_t n, m, p;
    parse_args(argc, argv, &n, &m, &p);

    srand(41); // set the seed for random number generation (only used for RAND matrices)

    // generate the matrices
    a = matrix(n, m, ONES);
    b = matrix(m, p, ONES);
    c = matrix(n, p, ZEROS);

    float *dev_a, *dev_b, *dev_c;

    clock_t start_time, end_time;

    start_time = clock();
    // Allocate memory on the device
    cudaMalloc((void **)&dev_a, n * m * sizeof(float));
    cudaMalloc((void **)&dev_b, m * p * sizeof(float));
    cudaMalloc((void **)&dev_c, n * p * sizeof(float));

    // Copy the input matrices from the host to the device
    cudaMemcpy(dev_a, a, n * m * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, m * p * sizeof(float), cudaMemcpyHostToDevice);

    // Calculate the number of blocks needed along rows and columns to cover each
    // matrix dimension using blocks of size BLOCK_SIZE.
    // Adding BLOCK_SIZE - 1 before dividing rounds up, so a last partial block
    // still covers the remaining elements.
    size_t gridRows = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    size_t gridCols = (p + BLOCK_SIZE - 1) / BLOCK_SIZE;

    dim3 dimGrid(gridCols, gridRows);      // the size of the grid of blocks used to perform parallel computations
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); // the size of a CUDA block within the CUDA kernel

    cudaEvent_t start, stop; // events used to measure the time of the kernel execution
    float gpuTime = 0.0f;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    matmul_sm<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c, n, m, p);
    cudaEventRecord(stop, 0);

    cudaEventSynchronize(stop); // Wait for the stop event to complete
    cudaEventElapsedTime(&gpuTime, start, stop);

    cudaMemcpy(c, dev_c, n * p * sizeof(float), cudaMemcpyDeviceToHost);
    end_time = clock();

    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(error));
    }

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    if (n * p <= 25){
        print_matrix('a', a, n, m);
        print_matrix('b', b, m, p);
        print_matrix('c', c, n, p);
    }

    printf("Overall GPU time: %f s\n", (double)(end_time - start_time) / CLOCKS_PER_SEC);
    printf("GPU time: %f s\n", gpuTime / 1000);

    float *c_cpu = matrix(n, p, ZEROS);
    start_time = clock();
    cpu_matmul(a, b, c_cpu, n, m, p);
    end_time = clock();

    printf("Overall CPU time: %f s\n", (double)(end_time - start_time) / CLOCKS_PER_SEC);
    printf("\n");

    if (n * p <= 25){
        print_matrix('x', c_cpu, n, p);
    }
    printf("Matrices are %s\n", equals(c, c_cpu, n, p) ? "equal" : "different");

    free(a);
    free(b);
    free(c);
    free(c_cpu);

    return 0;
}
--------------------------------------------------------------------------------
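The tiled kernel's resource budget follows directly from BLOCK_SIZE: two BLOCK_SIZE x BLOCK_SIZE float tiles occupy 2 * 32 * 32 * 4 = 8192 bytes of shared memory, and each block runs BLOCK_SIZE * BLOCK_SIZE = 1024 threads. A closing sketch, using only the device-property fields already printed by info.cu, checks that budget against the actual device limits; it is an illustrative standalone program, not part of matmul2dsm.cu.

// Hypothetical preflight check for the tiling parameters of matmul2dsm.cu.
#include <stdio.h>

#define BLOCK_SIZE 32

int main(void) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    // tileA + tileB: two BLOCK_SIZE x BLOCK_SIZE float arrays per block
    size_t tileBytes = 2UL * BLOCK_SIZE * BLOCK_SIZE * sizeof(float);
    int threadsPerBlock = BLOCK_SIZE * BLOCK_SIZE;

    printf("Tiles need %zu bytes of shared memory; the device offers %zu per block\n",
           tileBytes, prop.sharedMemPerBlock);
    printf("Each block uses %d threads; the device allows %d per block\n",
           threadsPerBlock, prop.maxThreadsPerBlock);

    if (tileBytes > prop.sharedMemPerBlock || threadsPerBlock > prop.maxThreadsPerBlock) {
        printf("BLOCK_SIZE %d exceeds this device's limits\n", BLOCK_SIZE);
        return 1;
    }
    printf("BLOCK_SIZE %d fits within this device's limits\n", BLOCK_SIZE);
    return 0;
}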