├── .gitignore
├── LICENSE
├── README.md
├── sum
│   └── sum.cu
├── info
│   └── info.cu
├── matmul2d
│   └── matmul2d.cu
└── matmul2dsm
    └── matmul2dsm.cu

/.gitignore:
--------------------------------------------------------------------------------
*.i
*.ii
*.gpu
*.ptx
*.cubin
*.fatbin
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 Daniel Rossi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CUDA

### Brief
This repository contains multiple examples of code to be run on NVIDIA GPUs, and aims to help you dive deeper into the CUDA programming language. CUDA runs on any machine with an NVIDIA GPU of compute capability > 3.0 ([https://developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus)), so make sure your system is supported. You can run CUDA programs on Linux and Windows machines, but you must install the NVIDIA drivers and the NVIDIA CUDA Compiler (nvcc).

### Available code:
- **info**: displays CUDA and GPU information
- **sum**: adds two arrays of random numbers -> learn how to move data from CPU to GPU and vice versa and run code on the GPU
- **matmul2d**: classical matrix multiplication between two matrices -> learn how to manage multi-dimensional data structures and operate on them
- **matmul2dsm**: matrix multiplication using shared memory -> learn how to use fast on-chip memory to speed up the computation

### Prerequisites:
1. install the NVIDIA drivers: [https://www.nvidia.com/download/index.aspx](https://www.nvidia.com/download/index.aspx)
2. install CUDA on:
   - Ubuntu: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
   - Windows: https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html
3. check everything with ```nvidia-smi``` and ```nvcc --version```

### Compile
To compile a ```.cu``` file, run ```nvcc file_name.cu -o output_file_name```
--------------------------------------------------------------------------------
/sum/sum.cu:
--------------------------------------------------------------------------------
/*
    This program is provided as is without any guarantees or warranty.
    By using this program the user accepts the full responsibility for any
    and all damages that may occur. The author is not responsible for any
    consequences of the use of this program.

    * This program adds two arrays of floats using CUDA.
    *
    * The program first allocates memory for the arrays on the host and device.
    * It then initializes the host arrays with random values.
    * The host arrays are then copied to the device.
    * The add() kernel is then launched on the GPU.
    * The result is then copied back to the host.
    * The program then verifies the result.
    * Finally, the program frees the memory on the device and host.

    * The program takes one command line argument, N, which is the size of the arrays.
    * If no argument is provided, the default value of N is 1.

    * The program can be compiled using the following command:
    * nvcc sum.cu -o sum
    * to run the program, use the following command:
    * ./sum N

    @Author: Daniel Rossi
    @Date: 2023-03-08
    @License: MIT
    @Version: 1.0
*/

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

void setup() {
    // Set the random seed
    srand(time(NULL));
    int device = 0; // Default device id (change if you have more than one GPU)

    // Set the device
    cudaSetDevice(device);
}

// Kernel function to add two arrays
__global__ void add(float *a, float *b, float *c, int n) {
    // Get the index of the current element
    int index = threadIdx.x + blockIdx.x * blockDim.x;

    // Check if the index is within the array bounds
    if (index < n) {
        c[index] = a[index] + b[index];
    }
}

void print_cuda_error(cudaError_t err) {
    if (err != cudaSuccess){
        printf("CUDA error: %s\n", cudaGetErrorString(err));
    }
}

int main(int argc, char **argv) {
    setup();
    int N = 1;

    // Parse command line arguments
    if (argc > 1){
        N = atoi(argv[1]);
    }

    printf("N = %d\n", N);

    float *a, *b, *c;
    float *d_a, *d_b, *d_c;

    // Allocate memory on the host
    a = (float *) malloc(N * sizeof(float));
    b = (float *) malloc(N * sizeof(float));
    c = (float *) malloc(N * sizeof(float));

    // Allocate memory on the device
    cudaError_t err;
    err = cudaMalloc(&d_a, N * sizeof(float));
    print_cuda_error(err);

    err = cudaMalloc(&d_b, N * sizeof(float));
    print_cuda_error(err);

    err = cudaMalloc(&d_c, N * sizeof(float));
    print_cuda_error(err);

    // Initialize host values
    for (int i = 0; i < N; ++i){
        // Generate random values between 0 and 1
        a[i] = rand() / (float)RAND_MAX;
        b[i] = rand() / (float)RAND_MAX;
    }

    // Copy inputs to device
    err = cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
    print_cuda_error(err);

    err = cudaMemcpy(d_b, b, N * sizeof(float), cudaMemcpyHostToDevice);
    print_cuda_error(err);

    // Launch add() kernel on GPU with enough 256-thread blocks to cover N elements
    add<<<(N + 255) / 256, 256>>>(d_a, d_b, d_c, N);
    print_cuda_error(cudaGetLastError()); // check for kernel launch errors

    // Copy result back to host (this cudaMemcpy also synchronizes with the kernel)
    err = cudaMemcpy(c, d_c, N * sizeof(float), cudaMemcpyDeviceToHost);
    print_cuda_error(err);

    // Verify the result
    for (int i = 0; i < N; ++i){
        if (c[i] != (a[i] + b[i])){
            printf("Error: %f + %f != %f\n", a[i], b[i], c[i]);
            break;
        }
    }

    // Free memory on device
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // Free memory on host
    free(a);
    free(b);
    free(c);

    printf("Done\n");

    return 0;
}
--------------------------------------------------------------------------------
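sum.cu stages every transfer by hand: allocate on both sides, copy in, launch, copy out. The sketch below shows the same vector add written with unified (managed) memory instead. This is an illustrative variant, not a file in this repository, and it assumes a GPU and driver that support cudaMallocManaged; note the explicit cudaDeviceSynchronize(), needed because no blocking cudaMemcpy follows the launch.

// Hypothetical managed-memory variant of sum.cu (illustrative sketch).
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

__global__ void add(float *a, float *b, float *c, int n) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < n) {
        c[index] = a[index] + b[index];
    }
}

int main(int argc, char **argv) {
    int N = (argc > 1) ? atoi(argv[1]) : 1;
    srand(time(NULL));

    // Managed pointers are valid on both host and device,
    // so the explicit cudaMemcpy calls disappear.
    float *a, *b, *c;
    cudaMallocManaged(&a, N * sizeof(float));
    cudaMallocManaged(&b, N * sizeof(float));
    cudaMallocManaged(&c, N * sizeof(float));

    for (int i = 0; i < N; ++i) {
        a[i] = rand() / (float)RAND_MAX;
        b[i] = rand() / (float)RAND_MAX;
    }

    add<<<(N + 255) / 256, 256>>>(a, b, c, N);
    cudaDeviceSynchronize(); // no blocking copy follows, so synchronize by hand

    for (int i = 0; i < N; ++i) {
        if (c[i] != a[i] + b[i]) {
            printf("Error: %f + %f != %f\n", a[i], b[i], c[i]);
            break;
        }
    }

    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    return 0;
}

The explicit-copy version in sum.cu gives finer control over when transfers happen; managed memory trades that control for simpler code.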
/info/info.cu:
--------------------------------------------------------------------------------
/*
    This program is provided as is without any guarantees or warranty.
    By using this program the user accepts the full responsibility for any
    and all damages that may occur. The author is not responsible for any
    consequences of the use of this program.

    * This program prints information about the GPU device.
    * The program uses the CUDA runtime API to query the device properties.

    * The program takes an optional argument, which is the device id.
    * If no argument is provided, the program will use the default device (device 0).

    * The program can be compiled using the following command:
    * nvcc info.cu -o info
    * to run the program, use the following command:
    * ./info device_id

    @Author: Daniel Rossi
    @Date: 2023-03-11
    @License: MIT
    @Version: 1.0
*/

#include <stdio.h>
#include <stdlib.h>

void info(int device){
    printf("CUDA version: %d.%d\n", CUDART_VERSION / 1000, (CUDART_VERSION % 100) / 10);

    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, device);
    printf("Using device %d: %s\n", device, prop.name);
    printf("GPU compute capability: %d.%d\n", prop.major, prop.minor);
    printf("Number of multiprocessors: %d\n", prop.multiProcessorCount);
    printf("Total global memory: %lu bytes\n", prop.totalGlobalMem);
    printf("Total constant memory: %lu bytes\n", prop.totalConstMem);
    printf("Shared memory per block: %lu bytes\n", prop.sharedMemPerBlock);
    printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
    printf("Max threads per multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor);
    printf("Max threads dimensions: (%d, %d, %d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("Max grid size: (%d, %d, %d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("Warp size: %d\n", prop.warpSize);
    printf("Clock rate: %d kHz\n", prop.clockRate);
    printf("Memory clock rate: %d kHz\n", prop.memoryClockRate);
    printf("Memory bus width: %d bits\n", prop.memoryBusWidth);
    // Theoretical peak bandwidth: 2 (DDR) * memory clock (kHz) * bus width (bytes), scaled to GB/s
    printf("Memory bandwidth: %f GB/s\n", 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
    printf("L2 cache size: %d bytes\n", prop.l2CacheSize);
    printf("Registers per block: %d\n", prop.regsPerBlock);
    printf("Registers per multiprocessor: %d\n", prop.regsPerMultiprocessor);
    printf("Device has ECC support: %d\n", prop.ECCEnabled);
    printf("Device has unified addressing: %d\n", prop.unifiedAddressing);
    printf("Device can map host memory: %d\n", prop.canMapHostMemory);
    printf("Async engine count: %d\n", prop.asyncEngineCount);
    printf("Device supports concurrent kernels: %d\n", prop.concurrentKernels);
    printf("PCI bus ID: %d\n", prop.pciBusID);
    printf("PCI device ID: %d\n", prop.pciDeviceID);
    printf("PCI domain ID: %d\n", prop.pciDomainID);
    printf("Device uses TCC driver: %d\n", prop.tccDriver);
    printf("Max memory pitch: %lu bytes\n", prop.memPitch);
    printf("Texture alignment: %lu bytes\n", prop.textureAlignment);
    printf("Texture pitch alignment: %lu bytes\n", prop.texturePitchAlignment);
    printf("Device has GPU overlap: %d\n", prop.deviceOverlap);
    printf("Device has kernel execution timeout: %d\n", prop.kernelExecTimeoutEnabled);
    printf("Device is integrated: %d\n", prop.integrated);
    printf("Compute mode: %d\n", prop.computeMode);
    printf("Max texture 1D size: %d\n", prop.maxTexture1D);
    printf("Max texture 1D linear size: %d\n", prop.maxTexture1DLinear);
    printf("Max texture 1D mipmapped size: %d\n", prop.maxTexture1DMipmap);
    printf("Max texture 2D size: (%d, %d)\n", prop.maxTexture2D[0], prop.maxTexture2D[1]);
    printf("Max texture 2D linear size: (%d, %d, %d)\n", prop.maxTexture2DLinear[0], prop.maxTexture2DLinear[1], prop.maxTexture2DLinear[2]);
    printf("Max texture 2D mipmapped size: (%d, %d)\n", prop.maxTexture2DMipmap[0], prop.maxTexture2DMipmap[1]);
    printf("Max texture 3D size: (%d, %d, %d)\n", prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]);
}

int main(int argc, char **argv) {
    int device = 0; // Default device id (change if you have more than one GPU)
    if (argc > 1) {
        device = atoi(argv[1]);
    }

    // Set the device
    cudaSetDevice(device);

    info(device);
}
--------------------------------------------------------------------------------
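info.cu reports on a single device chosen by id. A natural companion, sketched below using only the runtime calls cudaGetDeviceCount and cudaGetDeviceProperties, loops over every visible GPU and prints a one-line summary per device. It is a hypothetical example, not part of this repository.

// Hypothetical companion to info.cu: enumerate all visible GPUs.
#include <stdio.h>

int main(void) {
    int count = 0;
    cudaError_t err = cudaGetDeviceCount(&count);
    if (err != cudaSuccess) {
        printf("cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return 1;
    }

    printf("Found %d CUDA device(s)\n", count);
    for (int dev = 0; dev < count; ++dev) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);
        // One line per device: id, name, compute capability, SM count
        printf("Device %d: %s (compute %d.%d, %d multiprocessors)\n",
               dev, prop.name, prop.major, prop.minor, prop.multiProcessorCount);
    }
    return 0;
}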
/matmul2d/matmul2d.cu:
--------------------------------------------------------------------------------
/*
    This program is provided as is without any guarantees or warranty.
    By using this program the user accepts the full responsibility for any
    and all damages that may occur. The author is not responsible for any
    consequences of the use of this program.

    * This program performs matrix multiplication using CUDA.
    * The matrices are generated using three different methods: zeros, ones, random.
    * The program uses the following functions:
    *   getSharedMemory: prints the amount of shared memory per block
    *   matmul: the CUDA kernel which performs the matrix multiplication
    *   matrix: generates a matrix of size n x m of three types: zeros, ones, random
    *   print_matrix: prints a matrix of size n x m
    *   cpu_matmul: performs the matrix multiplication on the CPU
    *   equals: checks if two matrices are equal
    *   parse_args: parses the command line arguments

    * The program takes three command line arguments:
    *   n: the number of rows of the first matrix
    *   m: the number of columns of the first matrix and the number of rows of the second matrix
    *   p: the number of columns of the second matrix
    * If fewer than three arguments are provided, the program uses the default value of 3 for n, m, and p.
    * If exactly one argument is provided, the program uses its value for n, m, and p.

    * The program can be compiled using the following command:
    * nvcc matmul2d.cu -o matmul
    * to run the program, use the following command:
    * ./matmul n m p

    @Author: Daniel Rossi
    @Date: 2023-03-11
    @License: MIT
    @Version: 1.0
*/

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#define BLOCK_SIZE 32

// enums used to generate matrices of different types
enum {
    ZEROS = 0,
    ONES = 1,
    RAND = 2,
};

void getSharedMemory() {
    cudaDeviceProp prop;
    int dev = 0;
    cudaGetDevice(&dev);
    cudaGetDeviceProperties(&prop, dev);
    printf("Shared memory per block: %lu bytes\n", prop.sharedMemPerBlock);
}

__global__ void matmul(float *a, float *b, float *c, size_t n, size_t m, size_t p) {
    size_t row = blockIdx.y * blockDim.y + threadIdx.y; // rows run along the y axis
    size_t col = blockIdx.x * blockDim.x + threadIdx.x; // columns run along the x axis (think of a spreadsheet!)

    float sum = 0;
    if (row < n && col < p){ // check that the current thread is within the matrix boundaries
        for (size_t i = 0; i < m; ++i) {
            sum += a[row * m + i] * b[i * p + col]; // dot product of the row-th row of a and the col-th column of b
        }
        c[row * p + col] = sum;
    }
}


// generates a matrix of size n x m of three types: zeros, ones, random
float *matrix(size_t n, size_t m, int type) {
    float *mat = (float *)malloc(n * m * sizeof(float));
    for (size_t i = 0; i < n * m; i++) {
        if (type == ZEROS) {
            mat[i] = 0;
        } else if (type == ONES) {
            mat[i] = 1;
        } else if (type == RAND) {
            mat[i] = (float)rand() / RAND_MAX;
        }
    }
    return mat;
}


void print_matrix(char name, float *matrix, size_t n, size_t m){
    printf("Matrix %c:\n", name);
    for (size_t i = 0; i < n; ++i) {
        for (size_t j = 0; j < m; ++j) {
            printf("%f ", matrix[i * m + j]);
        }
        printf("\n");
    }
    printf("\n");
}


void cpu_matmul(float *a, float *b, float *c_cpu, size_t n, size_t m, size_t p){
    for (size_t i = 0; i < n; ++i) {
        for (size_t j = 0; j < p; ++j) {
            for (size_t k = 0; k < m; ++k) {
                c_cpu[i * p + j] += a[i * m + k] * b[k * p + j];
            }
        }
    }
}


bool equals(float *a, float *b, size_t n, size_t m) {
    for (size_t i = 0; i < n * m; i++) {
        if (fabsf(a[i] - b[i]) > 1e-3) {
            return false;
        }
    }
    return true;
}


void parse_args(int argc, char **argv, size_t *n, size_t *m, size_t *p) {
    if (argc == 2) {
        *n = atoi(argv[1]);
        *m = atoi(argv[1]);
        *p = atoi(argv[1]);
    } else if (argc >= 4) {
        *n = atoi(argv[1]);
        *m = atoi(argv[2]);
        *p = atoi(argv[3]);
    } else {
        *n = 3;
        *m = 3;
        *p = 3;
    }
}

int main(int argc, char** argv) {
    float *a, *b, *c;
    size_t n, m, p;
    parse_args(argc, argv, &n, &m, &p);

    getSharedMemory();
    srand(41); // set the seed for random number generation

    // generate the matrices
    a = matrix(n, m, RAND);
    b = matrix(m, p, RAND);
    c = matrix(n, p, ZEROS);

    float *dev_a, *dev_b, *dev_c;

    clock_t start_time, end_time;

    start_time = clock();
    // Allocate memory on the device
    cudaMalloc((void **)&dev_a, n * m * sizeof(float));
    cudaMalloc((void **)&dev_b, m * p * sizeof(float));
    cudaMalloc((void **)&dev_c, n * p * sizeof(float));

    // Copy the input matrices from the host to the device
    cudaMemcpy(dev_a, a, n * m * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, m * p * sizeof(float), cudaMemcpyHostToDevice);

    // Calculate the number of blocks needed along rows and columns to cover each
    // matrix dimension using blocks of size BLOCK_SIZE.
    // Adding BLOCK_SIZE - 1 before dividing rounds up, so a last partial block
    // still covers the remaining elements.
    size_t gridRows = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    size_t gridCols = (p + BLOCK_SIZE - 1) / BLOCK_SIZE;

    dim3 dimGrid(gridCols, gridRows);      // the size of the grid of blocks used to perform parallel computations
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); // the size of a CUDA block within the CUDA kernel

    cudaEvent_t start, stop; // events used to measure the time of the kernel execution
    float gpuTime = 0.0f;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    matmul<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c, n, m, p);
    cudaEventRecord(stop, 0);

    cudaEventSynchronize(stop); // Wait for the stop event to complete
    cudaEventElapsedTime(&gpuTime, start, stop);

    cudaMemcpy(c, dev_c, n * p * sizeof(float), cudaMemcpyDeviceToHost);
    end_time = clock();

    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(error));
    }

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    if (n * p <= 25){
        print_matrix('a', a, n, m);
        print_matrix('b', b, m, p);
        print_matrix('c', c, n, p);
    }

    printf("Overall GPU time: %f s\n", (double)(end_time - start_time) / CLOCKS_PER_SEC);
    printf("GPU time: %f s\n", gpuTime / 1000);

    float *c_cpu = matrix(n, p, ZEROS);
    start_time = clock();
    cpu_matmul(a, b, c_cpu, n, m, p);
    end_time = clock();

    printf("Overall CPU time: %f s\n", (double)(end_time - start_time) / CLOCKS_PER_SEC);
    printf("\n");

    if (n * p <= 25){
        print_matrix('x', c_cpu, n, p);
    }
    printf("Matrices are %s\n", equals(c, c_cpu, n, p) ? "equal" : "different");

    free(a);
    free(b);
    free(c);
    free(c_cpu);

    return 0;
}
--------------------------------------------------------------------------------
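matmul2d.cu checks for errors once, with a single cudaGetLastError after the kernel has run. A common alternative is to wrap every runtime call in a checking macro that aborts with the file and line of the failing call. The sketch below shows one way to write such a macro; CUDA_CHECK is a local naming convention, not a CUDA API.

// Hypothetical error-checking helper, shown against the call sequence of matmul2d.cu.
#include <stdio.h>
#include <stdlib.h>

#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

// Usage, mirroring matmul2d.cu:
//   CUDA_CHECK(cudaMalloc((void **)&dev_a, n * m * sizeof(float)));
//   CUDA_CHECK(cudaMemcpy(dev_a, a, n * m * sizeof(float), cudaMemcpyHostToDevice));
//   matmul<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c, n, m, p);
//   CUDA_CHECK(cudaGetLastError());      // catches launch configuration errors
//   CUDA_CHECK(cudaDeviceSynchronize()); // catches errors raised during execution

Kernel launches themselves return nothing, which is why the launch is followed by cudaGetLastError and a synchronize rather than being wrapped directly.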
/matmul2dsm/matmul2dsm.cu:
--------------------------------------------------------------------------------
/*
    This program is provided as is without any guarantees or warranty.
    By using this program the user accepts the full responsibility for any
    and all damages that may occur. The author is not responsible for any
    consequences of the use of this program.

    * This program performs matrix multiplication using CUDA and shared memory.
    * The matrices are generated using three different methods: zeros, ones, random.
    * The program uses the following functions:
    *   matmul_sm: the CUDA kernel which performs the matrix multiplication using shared memory
    *   matrix: generates a matrix of size n x m of three types: zeros, ones, random
    *   print_matrix: prints a matrix of size n x m
    *   cpu_matmul: performs the matrix multiplication on the CPU
    *   equals: checks if two matrices are equal
    *   parse_args: parses the command line arguments

    * The program takes three command line arguments:
    *   n: the number of rows of the first matrix
    *   m: the number of columns of the first matrix and the number of rows of the second matrix
    *   p: the number of columns of the second matrix
    * If fewer than three arguments are provided, the program uses the default value of 3 for n, m, and p.
    * If exactly one argument is provided, the program uses its value for n, m, and p.

    * The program can be compiled using the following command:
    * nvcc matmul2dsm.cu -o matmul2dsm
    * to run the program, use the following command:
    * ./matmul2dsm n m p

    @Author: Daniel Rossi
    @Date: 2023-03-12
    @License: MIT
    @Version: 1.0
*/

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#define BLOCK_SIZE 32

// enums used to generate matrices of different types
enum {
    ZEROS = 0,
    ONES = 1,
    RAND = 2,
};


/*
 * CUDA kernel to perform matrix multiplication using shared memory.
 * Shared memory is a memory space shared by all threads in a block.
 * It is fast because it is located on-chip, but it is limited in size.
 */
__global__ void matmul_sm(float *a, float *b, float *c, size_t n, size_t m, size_t p) {
    // Calculate the global row and column indices
    size_t row = blockIdx.y * blockDim.y + threadIdx.y; // row index for the current thread
    size_t col = blockIdx.x * blockDim.x + threadIdx.x; // column index for the current thread

    // Allocate shared memory for the tiles of matrices A and B
    __shared__ float tileA[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float tileB[BLOCK_SIZE][BLOCK_SIZE];

    float sum = 0.0f;

    // Iterate over the tiles of matrices A and B
    for (size_t tileIdx = 0; tileIdx < (m + BLOCK_SIZE - 1) / BLOCK_SIZE; ++tileIdx) {
        /*
         * TILEs:
         * - Consider a 3x3 matrix A stored row-major. The first row is scanned with globalRow = 0
         *   and globalCol = 0, 1, 2; to scan the second row, globalRow must be 3 and globalCol
         *   again 0, 1, 2; and so on.
         * - Thus the maximum value of globalRow is 3x3 = 9, which is why globalRow cannot exceed
         *   n * m. Since we operate within blocks, globalRow is derived from row (the block index
         *   times the block size, plus the thread index) multiplied by the row length m.
         *
         * - globalCol is a little more complicated. Its maximum value is 3 in this example, because
         *   A is a 3x3 matrix. Now suppose BLOCK_SIZE = 2, i.e. 2x2 tiles of 4 elements each
         *   (a BLOCK_SIZE of 2 means each block has 2 threads per dimension). Iterating tileIdx
         *   from 0 to (3 + 2 - 1) / 2 = 2, the first element index is [0 * 2 + 0] = 0, the second
         *   [0 * 2 + 1] = 1, the third [1 * 2 + 0] = 2 and the fourth [1 * 2 + 1] = 3. Thus we are
         *   able to scan the columns of A.
         *
         * - The same logic applies to B, but there the maximum value of globalRow is m, and of
         *   globalCol is p. For a 3x3 matrix B we scan its rows by incrementing globalRow from
         *   0 to 3 and multiplying it by B's row length p.
         *
         * - Why can we do this with 4-element tiles when the matrix is 3x3? Because after the
         *   assignment of the elements to the tiles we synchronize the threads, and positions
         *   that fall outside the matrix are zero-filled, so they contribute nothing to the sums.
         *   In effect we slice the overall matmul into pieces: once the tiles are filled, we have
         *   everything needed to compute the partial result for the current tile.
         */

        size_t globalRow = row * m;
        size_t globalCol = tileIdx * BLOCK_SIZE + threadIdx.x;

        if (globalRow < n * m && globalCol < m) {
            tileA[threadIdx.y][threadIdx.x] = a[globalRow + globalCol];
        } else {
            tileA[threadIdx.y][threadIdx.x] = 0.0f;
        }

        globalRow = tileIdx * BLOCK_SIZE + threadIdx.y;
        globalCol = col;

        if (globalRow < m && globalCol < p) {
            tileB[threadIdx.y][threadIdx.x] = b[globalRow * p + globalCol];
        } else {
            tileB[threadIdx.y][threadIdx.x] = 0.0f;
        }

        // Synchronize threads to ensure all elements are loaded into shared memory
        __syncthreads();

        // Perform the matrix multiplication for the current tile
        for (size_t k = 0; k < BLOCK_SIZE; ++k) {
            sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }

        // Synchronize again so the tiles are not overwritten while still in use
        __syncthreads();
    }

    // Write the result to the output matrix
    if (row < n && col < p) {
        c[row * p + col] = sum;
    }
}

// generates a matrix of size n x m of three types: zeros, ones, random
float *matrix(size_t n, size_t m, int type) {
    float *mat = (float *)malloc(n * m * sizeof(float));
    for (size_t i = 0; i < n * m; i++) {
        if (type == ZEROS) {
            mat[i] = 0;
        } else if (type == ONES) {
            mat[i] = 1;
        } else if (type == RAND) {
            mat[i] = (float)rand() / RAND_MAX;
        }
    }
    return mat;
}


void print_matrix(char name, float *matrix, size_t n, size_t m){
    printf("Matrix %c:\n", name);
    for (size_t i = 0; i < n; ++i) {
        for (size_t j = 0; j < m; ++j) {
            printf("%f ", matrix[i * m + j]);
        }
        printf("\n");
    }
    printf("\n");
}


void cpu_matmul(float *a, float *b, float *c_cpu, size_t n, size_t m, size_t p){
    for (size_t i = 0; i < n; ++i) {
        for (size_t j = 0; j < p; ++j) {
            for (size_t k = 0; k < m; ++k) {
                c_cpu[i * p + j] += a[i * m + k] * b[k * p + j];
            }
        }
    }
}


bool equals(float *gpu, float *cpu, size_t n, size_t m) {
    for (size_t i = 0; i < n * m; i++) {
        if (fabsf(gpu[i] - cpu[i]) > 1e-3) {
            printf("gpu[%lu] = %f, cpu[%lu] = %f\n", i, gpu[i], i, cpu[i]);
            return false;
        }
    }
    return true;
}


void parse_args(int argc, char **argv, size_t *n, size_t *m, size_t *p) {
    if (argc == 2) {
        *n = atoi(argv[1]);
        *m = atoi(argv[1]);
        *p = atoi(argv[1]);
    } else if (argc >= 4) {
        *n = atoi(argv[1]);
        *m = atoi(argv[2]);
        *p = atoi(argv[3]);
    } else {
        *n = 3;
        *m = 3;
        *p = 3;
    }
}

int main(int argc, char** argv) {
    float *a, *b, *c;
    size_t n, m, p;
    parse_args(argc, argv, &n, &m, &p);

    srand(41); // set the seed for random number generation (only used for RAND matrices)

    // generate the matrices
    a = matrix(n, m, ONES);
    b = matrix(m, p, ONES);
    c = matrix(n, p, ZEROS);

    float *dev_a, *dev_b, *dev_c;

    clock_t start_time, end_time;

    start_time = clock();
    // Allocate memory on the device
    cudaMalloc((void **)&dev_a, n * m * sizeof(float));
    cudaMalloc((void **)&dev_b, m * p * sizeof(float));
    cudaMalloc((void **)&dev_c, n * p * sizeof(float));

    // Copy the input matrices from the host to the device
    cudaMemcpy(dev_a, a, n * m * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, m * p * sizeof(float), cudaMemcpyHostToDevice);

    // Calculate the number of blocks needed along rows and columns to cover each
    // matrix dimension using blocks of size BLOCK_SIZE.
    // Adding BLOCK_SIZE - 1 before dividing rounds up, so a last partial block
    // still covers the remaining elements.
    size_t gridRows = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    size_t gridCols = (p + BLOCK_SIZE - 1) / BLOCK_SIZE;

    dim3 dimGrid(gridCols, gridRows);      // the size of the grid of blocks used to perform parallel computations
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE); // the size of a CUDA block within the CUDA kernel

    cudaEvent_t start, stop; // events used to measure the time of the kernel execution
    float gpuTime = 0.0f;

    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    matmul_sm<<<dimGrid, dimBlock>>>(dev_a, dev_b, dev_c, n, m, p);
    cudaEventRecord(stop, 0);

    cudaEventSynchronize(stop); // Wait for the stop event to complete
    cudaEventElapsedTime(&gpuTime, start, stop);

    cudaMemcpy(c, dev_c, n * p * sizeof(float), cudaMemcpyDeviceToHost);
    end_time = clock();

    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        fprintf(stderr, "ERROR: %s\n", cudaGetErrorString(error));
    }

    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    if (n * p <= 25){
        print_matrix('a', a, n, m);
        print_matrix('b', b, m, p);
        print_matrix('c', c, n, p);
    }

    printf("Overall GPU time: %f s\n", (double)(end_time - start_time) / CLOCKS_PER_SEC);
    printf("GPU time: %f s\n", gpuTime / 1000);

    float *c_cpu = matrix(n, p, ZEROS);
    start_time = clock();
    cpu_matmul(a, b, c_cpu, n, m, p);
    end_time = clock();

    printf("Overall CPU time: %f s\n", (double)(end_time - start_time) / CLOCKS_PER_SEC);
    printf("\n");

    if (n * p <= 25){
        print_matrix('x', c_cpu, n, p);
    }
    printf("Matrices are %s\n", equals(c, c_cpu, n, p) ? "equal" : "different");

    free(a);
    free(b);
    free(c);
    free(c_cpu);

    return 0;
}
--------------------------------------------------------------------------------
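The tiled kernel's resource budget follows directly from BLOCK_SIZE: two BLOCK_SIZE x BLOCK_SIZE float tiles occupy 2 * 32 * 32 * 4 = 8192 bytes of shared memory, and each block runs BLOCK_SIZE * BLOCK_SIZE = 1024 threads. A closing sketch, using only the device-property fields already printed by info.cu, checks that budget against the actual device limits; it is an illustrative standalone program, not part of matmul2dsm.cu.

// Hypothetical preflight check for the tiling parameters of matmul2dsm.cu.
#include <stdio.h>

#define BLOCK_SIZE 32

int main(void) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);

    // tileA + tileB: two BLOCK_SIZE x BLOCK_SIZE float arrays per block
    size_t tileBytes = 2UL * BLOCK_SIZE * BLOCK_SIZE * sizeof(float);
    int threadsPerBlock = BLOCK_SIZE * BLOCK_SIZE;

    printf("Tiles need %zu bytes of shared memory; the device offers %zu per block\n",
           tileBytes, prop.sharedMemPerBlock);
    printf("Each block uses %d threads; the device allows %d per block\n",
           threadsPerBlock, prop.maxThreadsPerBlock);

    if (tileBytes > prop.sharedMemPerBlock || threadsPerBlock > prop.maxThreadsPerBlock) {
        printf("BLOCK_SIZE %d exceeds this device's limits\n", BLOCK_SIZE);
        return 1;
    }
    printf("BLOCK_SIZE %d fits within this device's limits\n", BLOCK_SIZE);
    return 0;
}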