├── .gitignore ├── MP1 ├── MP1.CU └── README.md ├── MP2 ├── MP2.CU └── README.md ├── MP3 ├── MP3.CU └── README.md ├── MP4 ├── MP4.CU └── README.md ├── MP5.1 ├── MP5.1.CU └── README.md ├── MP5.2 ├── MP5.2.CU └── README.md ├── MP6 ├── MP6.CU └── README.md ├── MP7 ├── MP7.CU └── README.md ├── README.md └── ece408_project ├── .vscode └── settings.json ├── README.md ├── _gitignore ├── _gitmodules ├── build_example ├── Makefile └── main.cu ├── ece408_src ├── new-forward.cuh └── new-forward.h ├── final.py ├── m1.1.py ├── m1.2.py ├── m2.1.py ├── m3.1.py ├── m4.1.py ├── rai ├── rai_build.yml ├── reader.py └── report.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | *.i 2 | *.ii 3 | *.gpu 4 | *.ptx 5 | *.cubin 6 | *.fatbin 7 | -------------------------------------------------------------------------------- /MP1/MP1.CU: -------------------------------------------------------------------------------- 1 | // MP 1 2 | #include 3 | 4 | __global__ void vecAdd(float *in1, float *in2, float *out, int len) { 5 | //@@ Insert code to implement vector addition here 6 | int i = blockIdx.x * blockDim.x + threadIdx.x; 7 | if (i>> (deviceInput1, deviceInput2, deviceOutput, size); 53 | cudaDeviceSynchronize(); 54 | wbTime_stop(Compute, "Performing CUDA computation"); 55 | 56 | wbTime_start(Copy, "Copying output memory to the CPU"); 57 | //@@ Copy the GPU memory back to the CPU here 58 | cudaMemcpy(hostOutput, deviceOutput, size, cudaMemcpyDeviceToHost); 59 | 60 | wbTime_stop(Copy, "Copying output memory to the CPU"); 61 | 62 | wbTime_start(GPU, "Freeing GPU Memory"); 63 | //@@ Free the GPU memory here 64 | cudaFree(deviceInput1); cudaFree(deviceInput2); cudaFree(deviceOutput); 65 | wbTime_stop(GPU, "Freeing GPU Memory"); 66 | 67 | wbSolution(args, hostOutput, inputLength); 68 | 69 | free(hostInput1); 70 | free(hostInput2); 71 | free(hostOutput); 72 | 73 | return 0; 74 | } 75 | -------------------------------------------------------------------------------- /MP1/README.md: -------------------------------------------------------------------------------- 1 | # Vector Addition 2 | ## Objective 3 | The purpose of this lab is for you to become familiar with using the CUDA API by implementing a simple vector addition kernel and its associated host code as shown in the lectures. 4 | 5 | ## Prerequisites 6 | Before starting this lab, make sure that: 7 | 8 | - You have completed all week 1 lectures or videos 9 | 10 | - You have completed “Lab Tour with Device Query” MP 11 | 12 | - You have looked over the tutorial document. 13 | 14 | - Chapter 2 of the text book would also be helpful 15 | 16 | ## Instruction 17 | Edit the code in the ‘Code’ tab to perform the following: 18 | 19 | - Allocate device memory 20 | 21 | - Copy host memory to device 22 | 23 | - Initialize thread block and kernel grid dimensions 24 | 25 | - Invoke CUDA kernel 26 | 27 | - Copy results from device to host 28 | 29 | - Free device memory 30 | 31 | - Write the CUDA kernel 32 | 33 | - Instructions about where to place each part of the code is demarcated by the //@@ comment lines. 34 | 35 | ## Suggestions (for all labs) 36 | - The system’s autosave feature is not an excuse to not backup your code and answers to your questions regularly. 37 | 38 | - If you have not done so already, watch the tutorial video. 
39 | 40 | - Do not modify the template code provided -- only insert code where the //@@ demarcation is placed 41 | 42 | - Develop your solution incrementally and test each version thoroughly before moving on to the next version 43 | 44 | - Do not wait until the last minute to attempt the lab. 45 | 46 | - If you get stuck with boundary conditions, grab a pen and paper. It is much easier to figure out the boundary conditions there. 47 | 48 | - Implement the serial CPU version first, this will give you an understanding of the loops 49 | 50 | - Get the first dataset working first. The datasets are ordered so the first one is the easiest to handle 51 | 52 | - Make sure that your algorithm handles non-regular dimensional inputs (not square or multiples of 2). The slides may present the algorithm with nice inputs, since it minimizes the conditions. The datasets reflect different sizes of input that you are expected to handle 53 | 54 | - Make sure that you test your program using all the datasets provided (the datasets can be selected using the dropdown next to the submission button) 55 | 56 | - Check for errors: for example, when developing CUDA code, one can check for if the function call succeeded and print an error if not via the following macro: 57 | ``` 58 | #define wbCheck(stmt) do { \ 59 | cudaError_t err = stmt; \ 60 | if (err != cudaSuccess) { \ 61 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 62 | wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \ 63 | return -1; \ 64 | } \ 65 | } while(0) 66 | An example usage is wbCheck(cudaMalloc(...)). 67 | ``` 68 | -------------------------------------------------------------------------------- /MP2/MP2.CU: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define wbCheck(stmt) \ 4 | do { \ 5 | cudaError_t err = stmt; \ 6 | if (err != cudaSuccess) { \ 7 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 8 | wbLog(ERROR, "Got CUDA error ... 
", cudaGetErrorString(err)); \ 9 | return -1; \ 10 | } \ 11 | } while (0) 12 | 13 | // Compute C = A * B 14 | __global__ void matrixMultiply(float *A, float *B, float *C, int numARows, 15 | int numAColumns, int numBRows, 16 | int numBColumns, int numCRows, 17 | int numCColumns) { 18 | //@@ Insert code to implement matrix multiplication here 19 | int Row = blockIdx.y*blockDim.y + threadIdx.y; 20 | int Col = blockIdx.x*blockDim.x + threadIdx.x; 21 | 22 | 23 | if ((Row < numCRows) && (Col < numCColumns)) { 24 | float ElementVal = 0; 25 | for (int k = 0; k < numAColumns; k++) 26 | ElementVal += A[Row*numAColumns + k]*B[k*numBColumns + Col]; 27 | C[Row*numCColumns + Col] = ElementVal; 28 | } 29 | } 30 | 31 | int main(int argc, char **argv) { 32 | wbArg_t args; 33 | float *hostA; // The A matrix 34 | float *hostB; // The B matrix 35 | float *hostC; // The output C matrix 36 | float *deviceA; 37 | float *deviceB; 38 | float *deviceC; 39 | int numARows; // number of rows in the matrix A 40 | int numAColumns; // number of columns in the matrix A 41 | int numBRows; // number of rows in the matrix B 42 | int numBColumns; // number of columns in the matrix B 43 | int numCRows; // number of rows in the matrix C (you have to set this) 44 | int numCColumns; // number of columns in the matrix C (you have to set 45 | // this) 46 | 47 | args = wbArg_read(argc, argv); 48 | 49 | wbTime_start(Generic, "Importing data and creating memory on host"); 50 | hostA = (float *)wbImport(wbArg_getInputFile(args, 0), &numARows, 51 | &numAColumns); 52 | hostB = (float *)wbImport(wbArg_getInputFile(args, 1), &numBRows, 53 | &numBColumns); 54 | //@@ Set numCRows and numCColumns 55 | numCRows = numARows; 56 | numCColumns = numBColumns; 57 | 58 | //@@ Allocate the hostC matrix 59 | int sizeA, sizeB, sizeC; 60 | sizeA = numARows*numAColumns*sizeof(float); 61 | sizeB = numBRows*numBColumns*sizeof(float); 62 | sizeC = numCRows*numCColumns*sizeof(float); 63 | hostC = (float *)malloc(sizeC); 64 | wbTime_stop(Generic, "Importing data and creating memory on host"); 65 | 66 | wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns); 67 | wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns); 68 | 69 | wbTime_start(GPU, "Allocating GPU memory."); 70 | //@@ Allocate GPU memory here 71 | cudaMalloc((void**) &deviceA, sizeA); 72 | cudaMalloc((void**) &deviceB, sizeB); 73 | cudaMalloc((void**) &deviceC, sizeC); 74 | wbTime_stop(GPU, "Allocating GPU memory."); 75 | 76 | wbTime_start(GPU, "Copying input memory to the GPU."); 77 | //@@ Copy memory to the GPU here 78 | cudaMemcpy(deviceA, hostA, sizeA, cudaMemcpyHostToDevice); 79 | cudaMemcpy(deviceB, hostB, sizeB, cudaMemcpyHostToDevice); 80 | wbTime_stop(GPU, "Copying input memory to the GPU."); 81 | 82 | //@@ Initialize the grid and block dimensions here 83 | dim3 dimGrid(ceil((1.0*numCColumns)/2), ceil((1.0*numCRows))/2, 1); 84 | dim3 dimBlock(2, 2, 1); 85 | 86 | wbTime_start(Compute, "Performing CUDA computation"); 87 | //@@ Launch the GPU Kernel here 88 | matrixMultiply<<>>(deviceA, deviceB, deviceC, numARows,numAColumns, 89 | numBRows, numBColumns, numCRows,numCColumns); 90 | cudaDeviceSynchronize(); 91 | wbTime_stop(Compute, "Performing CUDA computation"); 92 | 93 | wbTime_start(Copy, "Copying output memory to the CPU"); 94 | //@@ Copy the GPU memory back to the CPU here 95 | cudaMemcpy(hostC, deviceC, sizeC, cudaMemcpyDeviceToHost); 96 | wbTime_stop(Copy, "Copying output memory to the CPU"); 97 | 98 | wbTime_start(GPU, "Freeing GPU Memory"); 99 | //@@ 
Free the GPU memory here 100 | cudaFree(deviceA); cudaFree(deviceB); cudaFree(deviceC); 101 | wbTime_stop(GPU, "Freeing GPU Memory"); 102 | 103 | wbSolution(args, hostC, numCRows, numCColumns); 104 | 105 | free(hostA); 106 | free(hostB); 107 | free(hostC); 108 | 109 | return 0; 110 | } 111 | -------------------------------------------------------------------------------- /MP2/README.md: -------------------------------------------------------------------------------- 1 | # Basic Matrix Multiplication 2 | ## Objective 3 | The purpose of this lab is to implement a basic dense matrix multiplication routine. 4 | 5 | # Prerequisites 6 | - Before starting this lab, make sure that: 7 | 8 | - You have completed the “Vector Addition” MP 9 | 10 | - You have completed all week 2 lecture videos 11 | 12 | # Instruction 13 | Edit the code in the ‘Code’ tab to perform the following: 14 | 15 | - allocate device memory 16 | - copy host memory to device 17 | - initialize thread block and kernel grid dimensions 18 | - invoke CUDA kernel 19 | - copy results from device to host 20 | - deallocate device memory 21 | - Instructions about where to place each part of the code is demarcated by the //@@ comment lines. 22 | 23 | # Suggestions (for all labs) 24 | - The system’s autosave feature is not an excuse to not backup your code and answers to your questions regularly. 25 | 26 | - If you have not done so already, watch the tutorial video. 27 | 28 | - Do not modify the template code provided -- only insert code where the //@@ demarcation is placed 29 | 30 | - Develop your solution incrementally and test each version thoroughly before moving on to the next version 31 | 32 | - Do not wait until the last minute to attempt the lab. 33 | 34 | - If you get stuck with boundary conditions, grab a pen and paper. It is much easier to figure out the boundary conditions there. 35 | 36 | - Implement the serial CPU version first, this will give you an understanding of the loops 37 | 38 | - Get the first dataset working first. The datasets are ordered so the first one is the easiest to handle 39 | 40 | - Make sure that your algorithm handles non-regular dimensional inputs (not square or multiples of 2). The slides may present the algorithm with nice inputs, since it minimizes the conditions. The datasets reflect different sizes of input that you are expected to handle 41 | 42 | - Make sure that you test your program using all the datasets provided (the datasets can be selected using the dropdown next to the submission button) 43 | 44 | - Check for errors: for example, when developing CUDA code, one can check for if the function call succeeded and print an error if not via the following macro: 45 | ``` 46 | #define wbCheck(stmt) do { \ 47 | cudaError_t err = stmt; \ 48 | if (err != cudaSuccess) { \ 49 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 50 | wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \ 51 | return -1; \ 52 | } \ 53 | } while(0) 54 | An example usage is wbCheck(cudaMalloc(...)). 55 | ``` 56 | -------------------------------------------------------------------------------- /MP3/MP3.CU: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define wbCheck(stmt) \ 4 | do { \ 5 | cudaError_t err = stmt; \ 6 | if (err != cudaSuccess) { \ 7 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 8 | wbLog(ERROR, "Got CUDA error ... 
", cudaGetErrorString(err)); \ 9 | return -1; \ 10 | } \ 11 | } while (0) 12 | #define TW 16 13 | // Compute C = A * B 14 | __global__ void matrixMultiplyShared(float *A, float *B, float *C, 15 | int numARows, int numAColumns, 16 | int numBRows, int numBColumns, 17 | int numCRows, int numCColumns) { 18 | //@@ Insert code to implement matrix multiplication here 19 | //@@ You have to use shared memory for this MP 20 | 21 | __shared__ float subTileM[TW][TW]; 22 | __shared__ float subTileN[TW][TW]; 23 | 24 | int bx = blockIdx.x; int by = blockIdx.y; 25 | int tx = threadIdx.x; int ty = threadIdx.y; 26 | 27 | int Row = by * TW + ty; 28 | int Col = bx * TW + tx; 29 | 30 | float Pvalue = 0; 31 | 32 | 33 | for (int m = 0; m < ceil((1.0*numAColumns)/(TW) ); m++){ 34 | if (Row < numARows && m*TW + tx < numAColumns){ 35 | subTileM[ty][tx] = A[Row*numAColumns + m*TW + tx]; 36 | } 37 | else 38 | subTileM[ty][tx] = 0; 39 | 40 | if (Col < numBColumns && (m*TW + ty) < numBRows){ 41 | subTileN[ty][tx] = B[(m*TW + ty)*numBColumns + Col]; 42 | } 43 | else 44 | subTileN[ty][tx] = 0; 45 | __syncthreads(); 46 | for (int k = 0; k < TW; k++) 47 | Pvalue += subTileM[ty][k] * subTileN[k][tx]; 48 | 49 | __syncthreads(); 50 | } 51 | if (Row < numCRows && Col < numCColumns) 52 | C[Row*numCColumns + Col] = Pvalue; 53 | } 54 | 55 | 56 | 57 | int main(int argc, char **argv) { 58 | wbArg_t args; 59 | float *hostA; // The A matrix 60 | float *hostB; // The B matrix 61 | float *hostC; // The output C matrix 62 | float *deviceA; 63 | float *deviceB; 64 | float *deviceC; 65 | int numARows; // number of rows in the matrix A 66 | int numAColumns; // number of columns in the matrix A 67 | int numBRows; // number of rows in the matrix B 68 | int numBColumns; // number of columns in the matrix B 69 | int numCRows; // number of rows in the matrix C (you have to set this) 70 | int numCColumns; // number of columns in the matrix C (you have to set 71 | // this) 72 | 73 | args = wbArg_read(argc, argv); 74 | 75 | wbTime_start(Generic, "Importing data and creating memory on host"); 76 | hostA = (float *)wbImport(wbArg_getInputFile(args, 0), &numARows, 77 | &numAColumns); 78 | hostB = (float *)wbImport(wbArg_getInputFile(args, 1), &numBRows, 79 | &numBColumns); 80 | //@@ Set numCRows and numCColumns 81 | numCRows = numARows; 82 | numCColumns = numBColumns; 83 | 84 | //@@ Allocate the hostC matrix 85 | int sizeA, sizeB, sizeC; 86 | sizeA = numARows*numAColumns*sizeof(float); 87 | sizeB = numBRows*numBColumns*sizeof(float); 88 | sizeC = numCRows*numCColumns*sizeof(float); 89 | hostC = (float *)malloc(sizeC); 90 | 91 | wbTime_stop(Generic, "Importing data and creating memory on host"); 92 | 93 | wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns); 94 | wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns); 95 | 96 | wbTime_start(GPU, "Allocating GPU memory."); 97 | //@@ Allocate GPU memory here 98 | cudaMalloc((void**) &deviceA, sizeA); 99 | cudaMalloc((void**) &deviceB, sizeB); 100 | cudaMalloc((void**) &deviceC, sizeC); 101 | 102 | wbTime_stop(GPU, "Allocating GPU memory."); 103 | 104 | wbTime_start(GPU, "Copying input memory to the GPU."); 105 | //@@ Copy memory to the GPU here 106 | cudaMemcpy(deviceA, hostA, sizeA, cudaMemcpyHostToDevice); 107 | cudaMemcpy(deviceB, hostB, sizeB, cudaMemcpyHostToDevice); 108 | wbTime_stop(GPU, "Copying input memory to the GPU."); 109 | 110 | //@@ Initialize the grid and block dimensions here 111 | int x; 112 | int y; 113 | y = ceil((1.0*numCRows)/(TW)); 114 | x = 
ceil((1.0*numCColumns)/(TW)); 115 | dim3 dimGrid(x, y, 1); 116 | dim3 dimBlock(TW, TW, 1); 117 | wbTime_start(Compute, "Performing CUDA computation"); 118 | 119 | //@@ Launch the GPU Kernel here 120 | matrixMultiplyShared<<>>(deviceA, deviceB, deviceC, numARows,numAColumns, 121 | numBRows, numBColumns, numCRows, numCColumns); 122 | cudaDeviceSynchronize(); 123 | wbTime_stop(Compute, "Performing CUDA computation"); 124 | 125 | wbTime_start(Copy, "Copying output memory to the CPU"); 126 | //@@ Copy the GPU memory back to the CPU here 127 | cudaMemcpy(hostC, deviceC, sizeC, cudaMemcpyDeviceToHost); 128 | wbTime_stop(Copy, "Copying output memory to the CPU"); 129 | 130 | wbTime_start(GPU, "Freeing GPU Memory"); 131 | //@@ Free the GPU memory here 132 | cudaFree(deviceA); cudaFree(deviceB); cudaFree(deviceC); 133 | wbTime_stop(GPU, "Freeing GPU Memory"); 134 | 135 | wbSolution(args, hostC, numCRows, numCColumns); 136 | 137 | free(hostA); 138 | free(hostB); 139 | free(hostC); 140 | 141 | return 0; 142 | } 143 | -------------------------------------------------------------------------------- /MP3/README.md: -------------------------------------------------------------------------------- 1 | # Tiled Matrix Multiplication 2 | ## Objective 3 | The purpose of this lab is to implement a tiled dense matrix multiplication routine using shared memory. 4 | 5 | ## Prerequisites 6 | Before starting this lab, make sure that: 7 | 8 | - You have completed the “Basic Matrix Multiplication” MP 9 | 10 | - You have completed all week 3 videos 11 | 12 | # Instruction 13 | - Edit the code in the “Code” tab to perform the following: 14 | 15 | - allocate device memory 16 | - copy host memory to device 17 | - initialize thread block and kernel grid dimensions 18 | - invoke CUDA kernel 19 | - copy results from device to host 20 | - deallocate device memory 21 | - implement the matrix-matrix multiplication routine using shared memory and tiling 22 | - Instructions about where to place each part of the code is demarcated by the //@@ comment lines. 23 | 24 | # Suggestions (for all labs) 25 | - The system’s autosave feature is not an excuse to not backup your code and answers to your questions regularly. 26 | 27 | - If you have not done so already, watch the tutorial video. 28 | 29 | - Do not modify the template code provided -- only insert code where the //@@ demarcation is placed 30 | 31 | - Develop your solution incrementally and test each version thoroughly before moving on to the next version 32 | 33 | - Do not wait until the last minute to attempt the lab. 34 | 35 | - If you get stuck with boundary conditions, grab a pen and paper. It is much easier to figure out the boundary conditions there. 36 | 37 | - Implement the serial CPU version first, this will give you an understanding of the loops 38 | 39 | - Get the first dataset working first. The datasets are ordered so the first one is the easiest to handle 40 | 41 | - Make sure that your algorithm handles non-regular dimensional inputs (not square or multiples of 2). The slides may present the algorithm with nice inputs, since it minimizes the conditions. 
The datasets reflect different sizes of input that you are expected to handle 42 | 43 | - Make sure that you test your program using all the datasets provided (the datasets can be selected using the dropdown next to the submission button) 44 | 45 | - Check for errors: for example, when developing CUDA code, one can check for if the function call succeeded and print an error if not via the following macro: 46 | ``` 47 | #define wbCheck(stmt) do { \ 48 | cudaError_t err = stmt; \ 49 | if (err != cudaSuccess) { \ 50 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 51 | wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \ 52 | return -1; \ 53 | } \ 54 | } while(0) 55 | An example usage is wbCheck(cudaMalloc(...)). 56 | ``` 57 | -------------------------------------------------------------------------------- /MP4/MP4.CU: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define wbCheck(stmt) \ 4 | do { \ 5 | cudaError_t err = stmt; \ 6 | if (err != cudaSuccess) { \ 7 | wbLog(ERROR, "CUDA error: ", cudaGetErrorString(err)); \ 8 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 9 | return -1; \ 10 | } \ 11 | } while (0) 12 | 13 | //@@ Define any useful program-wide constants here 14 | #define TILE_WIDTH 4 15 | #define MASK_WIDTH 3 16 | #define RADIUS 1 17 | 18 | //@@ Define constant memory for device kernel here 19 | __constant__ float MASK[MASK_WIDTH*MASK_WIDTH*MASK_WIDTH]; 20 | 21 | __global__ void conv3d(float *input, float *output, const int z_size, 22 | const int y_size, const int x_size) { 23 | //@@ Insert kernel code here 24 | 25 | int bx = blockIdx.x; int by = blockIdx.y; int bz = blockIdx.z; 26 | int tx = threadIdx.x; int ty = threadIdx.y; int tz = threadIdx.z; 27 | 28 | int x_o = bx*TILE_WIDTH + tx; 29 | int y_o = by*TILE_WIDTH + ty; 30 | int z_o = bz*TILE_WIDTH + tz; 31 | 32 | __shared__ float N_ds[TILE_WIDTH][TILE_WIDTH][TILE_WIDTH]; 33 | 34 | if (x_o >=0 && x_o < x_size && y_o >=0 && y_o < y_size && z_o >=0 && z_o < z_size) 35 | N_ds[tz][ty][tx] = input[x_size*y_size*z_o + x_size*y_o + x_o]; 36 | else 37 | N_ds[tz][ty][tx] = 0; 38 | __syncthreads(); 39 | 40 | int x_N_ds_Start = tx - RADIUS; 41 | int y_N_ds_Start = ty - RADIUS; 42 | int z_N_ds_Start = tz - RADIUS; 43 | float Pvalue = 0; 44 | 45 | for (int i = 0; i < MASK_WIDTH; i++) 46 | for (int j = 0; j < MASK_WIDTH; j++) 47 | for (int k = 0; k < MASK_WIDTH; k++){ 48 | 49 | int x_ds_index = x_N_ds_Start + i; 50 | int y_ds_index = y_N_ds_Start + j; 51 | int z_ds_index = z_N_ds_Start + k; 52 | 53 | if (x_ds_index >= 0 && x_ds_index < TILE_WIDTH && 54 | y_ds_index >= 0 && y_ds_index < TILE_WIDTH && 55 | z_ds_index >= 0 && z_ds_index < TILE_WIDTH 56 | ) 57 | Pvalue += N_ds[z_ds_index][y_ds_index][x_ds_index]*MASK[MASK_WIDTH*MASK_WIDTH*k + MASK_WIDTH*j + i]; 58 | else{ 59 | int x_Global = bx*TILE_WIDTH + x_ds_index; 60 | int y_Global = by*TILE_WIDTH + y_ds_index; 61 | int z_Global = bz*TILE_WIDTH + z_ds_index; 62 | 63 | if (x_Global >= 0 && x_Global < x_size && 64 | y_Global >= 0 && y_Global < y_size && 65 | z_Global >= 0 && z_Global < z_size) 66 | Pvalue += input[x_size*y_size*z_Global + x_size*y_Global + x_Global]*MASK[MASK_WIDTH*MASK_WIDTH*k + MASK_WIDTH*j + i]; 67 | } 68 | } 69 | 70 | if (x_o < x_size && y_o < y_size && z_o < z_size) 71 | output[x_size*y_size*z_o + x_size*y_o + x_o] = Pvalue; 72 | 73 | __syncthreads(); 74 | } 75 | 76 | int main(int argc, char *argv[]) { 77 | wbArg_t args; 78 | int z_size; 79 | int y_size; 80 | int x_size; 81 | int inputLength, kernelLength; 
82 | float *hostInput; 83 | float *hostKernel; 84 | float *hostOutput; 85 | float *deviceInput; 86 | float *deviceOutput; 87 | 88 | args = wbArg_read(argc, argv); 89 | 90 | // Import data 91 | hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &inputLength); 92 | hostKernel = (float *)wbImport(wbArg_getInputFile(args, 1), &kernelLength); 93 | hostOutput = (float *)malloc(inputLength * sizeof(float)); 94 | 95 | // First three elements are the input dimensions 96 | z_size = hostInput[0]; 97 | y_size = hostInput[1]; 98 | x_size = hostInput[2]; 99 | wbLog(TRACE, "The input size is ", z_size, "x", y_size, "x", x_size); 100 | assert(z_size * y_size * x_size == inputLength - 3); 101 | assert(kernelLength == 27); 102 | 103 | wbTime_start(GPU, "Doing GPU Computation (memory + compute)"); 104 | 105 | wbTime_start(GPU, "Doing GPU memory allocation"); 106 | //@@ Allocate GPU memory here 107 | // Recall that inputLength is 3 elements longer than the input data 108 | // because the first three elements were the dimensions 109 | cudaMalloc((void**)&deviceInput, x_size*y_size*z_size*sizeof(float)); 110 | cudaMalloc((void**)&deviceOutput, x_size*y_size*z_size*sizeof(float)); 111 | wbTime_stop(GPU, "Doing GPU memory allocation"); 112 | 113 | wbTime_start(Copy, "Copying data to the GPU"); 114 | //@@ Copy input and kernel to GPU here 115 | // Recall that the first three elements of hostInput are dimensions and 116 | // do 117 | // not need to be copied to the gpu 118 | cudaMemcpy(deviceInput, hostInput+3, x_size*y_size*z_size*sizeof(float), cudaMemcpyHostToDevice); 119 | cudaMemcpyToSymbol(MASK, hostKernel, MASK_WIDTH*MASK_WIDTH*MASK_WIDTH*sizeof(float)); 120 | wbTime_stop(Copy, "Copying data to the GPU"); 121 | 122 | wbTime_start(Compute, "Doing the computation on the GPU"); 123 | //@@ Initialize grid and block dimensions here 124 | dim3 dimGrid(ceil((1.0*x_size)/TILE_WIDTH), ceil((1.0*y_size)/TILE_WIDTH), ceil((1.0*z_size)/TILE_WIDTH)); 125 | dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, TILE_WIDTH); 126 | //@@ Launch the GPU kernel here 127 | conv3d<<>>(deviceInput, deviceOutput, z_size, 128 | y_size, x_size); 129 | cudaDeviceSynchronize(); 130 | wbTime_stop(Compute, "Doing the computation on the GPU"); 131 | 132 | wbTime_start(Copy, "Copying data from the GPU"); 133 | //@@ Copy the device memory back to the host here 134 | // Recall that the first three elements of the output are the dimensions 135 | // and should not be set here (they are set below) 136 | cudaMemcpy(hostOutput+3 , deviceOutput, z_size * y_size * x_size * sizeof(float), cudaMemcpyDeviceToHost); 137 | wbTime_stop(Copy, "Copying data from the GPU"); 138 | 139 | wbTime_stop(GPU, "Doing GPU Computation (memory + compute)"); 140 | 141 | // Set the output dimensions for correctness checking 142 | hostOutput[0] = z_size; 143 | hostOutput[1] = y_size; 144 | hostOutput[2] = x_size; 145 | wbSolution(args, hostOutput, inputLength); 146 | 147 | // Free device memory 148 | cudaFree(deviceInput); 149 | cudaFree(deviceOutput); 150 | 151 | // Free host memory 152 | free(hostInput); 153 | free(hostOutput); 154 | return 0; 155 | } 156 | -------------------------------------------------------------------------------- /MP4/README.md: -------------------------------------------------------------------------------- 1 | ## 3D Convolution 2 | # Objective 3 | The purpose of this lab is to implement a 3D convolution using constant memory for the kernel and 3D shared memory tiling. 
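As a reference point, placing the fixed 3x3x3 mask in constant memory follows the pattern already used in MP4.CU above; the sketch below only restates that pattern (MASK, MASK_WIDTH, and hostKernel are the names from the template, not new requirements).
```
#define MASK_WIDTH 3

// Device side: the 3x3x3 convolution mask lives in constant memory.
__constant__ float MASK[MASK_WIDTH * MASK_WIDTH * MASK_WIDTH];

// Host side: copy the 27 mask values (read via wbImport) into constant memory.
cudaMemcpyToSymbol(MASK, hostKernel,
                   MASK_WIDTH * MASK_WIDTH * MASK_WIDTH * sizeof(float));
```
Constant memory is a good fit here because every thread reads the same small, read-only mask, so the reads are served out of the constant cache.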
4 | 5 | ## Prerequisite Before starting this lab, make sure that: 6 | 7 | - You have completed the “Tiled Matrix Multiplication” MP 8 | 9 | - You have completed all week 4 videos 10 | 11 | # Instructions 12 | - Edit the code to implement a 3D convolution with a 3x3x3 kernel in constant memory and a 3D shared-memory tiling. 13 | 14 | - Edit the code to launch the kernel you implemented. The function should launch 3D CUDA grid and blocks, where each thread is responsible for computing a single element of the output. 15 | 16 | - Answer the questions found in the questions tab. 17 | 18 | ## Algorithm Specification 19 | You will be implementing the following 3D convolution. 20 | ``` 21 | for z_out = 0 to z_size - 1: 22 | for y_out = 0 to y_size - 1: 23 | for x_out = 0 to x_size - 1: { 24 | let res = 0; 25 | for z_mask = - MASK_RADIUS to MASK_RADIUS: 26 | for y_mask = - MASK_RADIUS to MASK_RADIUS: 27 | for x_mask = - MASK_RADIUS to MASK_RADIUS: 28 | let z_in = z_out + z_mask; 29 | let y_in = y_out + y_mask; 30 | let x_in = x_out + x_mask; 31 | // Pad boundary with 0 32 | if (z_in >= 0 && z_in < z_size && 33 | y_in >= 0 && y_in < y_size && 34 | x_in >= 0 && x_in < x_size) then 35 | res += mask[z_mask + MASK_RADIUS][y_mask + MASK_RADIUS][x_mask + MASK_RADIUS] * in[z_in][y_in][x_in] 36 | } 37 | out[z_out][y_out][x_out] = res; 38 | } 39 | ``` 40 | - The kernel size is fixed to 3x3x3, given MASK_WIDTH = 3 and MASK_RADIUS = 1. 41 | - Halo elements should be read as 0. 42 | - You should support input data of any size. 43 | - Note that the input and output size is the same. 44 | ## Other Notes 45 | - The raw format of the input data is a flattened array, where the first three elements are the z_size, y_size, and x_size respectively. For example, a 5x4x3 input array will look like 46 | 47 | - float inputData[] = { 5.0, 4.0, 3.0, ... < 60 floats > } 48 | - A point (z,y,x) may be accessed at z * (y_size * x_size) + y * (x_size) + x. 49 | 50 | - The template code reads the first three elements into z_size, y_size, and x_size. You will need to copy the rest of the data to the device. 51 | 52 | - Likewise, the result needs to have the sizes prepended for WebGPU to check your result correctly. The template code does that as well, but you must copy the data into outputData from the fourth element on. 53 | 54 | - Remember that you can get a pointer to the fourth element of the array with &arry[3]. 55 | -------------------------------------------------------------------------------- /MP5.1/MP5.1.CU: -------------------------------------------------------------------------------- 1 | // MP Reduction 2 | // Given a list (lst) of length n 3 | // Output its sum = lst[0] + lst[1] + ... + lst[n-1]; 4 | 5 | #include 6 | 7 | #define BLOCK_SIZE 512 //@@ You can change this 8 | 9 | #define wbCheck(stmt) \ 10 | do { \ 11 | cudaError_t err = stmt; \ 12 | if (err != cudaSuccess) { \ 13 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 14 | wbLog(ERROR, "Got CUDA error ... 
", cudaGetErrorString(err)); \ 15 | return -1; \ 16 | } \ 17 | } while (0) 18 | 19 | __global__ void total(float *input, float *output, int len) { 20 | //@@ Load a segment of the input vector into shared memory 21 | //@@ Traverse the reduction tree 22 | //@@ Write the computed sum of the block to the output vector at the 23 | //@@ correct index 24 | __shared__ float partialSum[2*BLOCK_SIZE]; 25 | 26 | unsigned int t = threadIdx.x; 27 | unsigned int start = 2*blockIdx.x*blockDim.x; 28 | 29 | for (int k = 0; k < 2; k++){ 30 | if ((t + start + k*BLOCK_SIZE)>>(deviceInput, deviceOutput, numInputElements); 99 | cudaDeviceSynchronize(); 100 | wbTime_stop(Compute, "Performing CUDA computation"); 101 | 102 | wbTime_start(Copy, "Copying output memory to the CPU"); 103 | //@@ Copy the GPU memory back to the CPU here 104 | cudaMemcpy(hostOutput, deviceOutput, sizeof(float) * numOutputElements, cudaMemcpyDeviceToHost); 105 | wbTime_stop(Copy, "Copying output memory to the CPU"); 106 | 107 | /******************************************************************** 108 | * Reduce output vector on the host 109 | * NOTE: One could also perform the reduction of the output vector 110 | * recursively and support any size input. For simplicity, we do not 111 | * require that for this lab. 112 | ********************************************************************/ 113 | for (ii = 1; ii < numOutputElements; ii++) { 114 | hostOutput[0] += hostOutput[ii]; 115 | } 116 | 117 | wbTime_start(GPU, "Freeing GPU Memory"); 118 | //@@ Free the GPU memory here 119 | cudaFree(deviceInput); 120 | cudaFree(deviceOutput); 121 | wbTime_stop(GPU, "Freeing GPU Memory"); 122 | 123 | wbSolution(args, hostOutput, 1); 124 | 125 | free(hostInput); 126 | free(hostOutput); 127 | 128 | return 0; 129 | } 130 | -------------------------------------------------------------------------------- /MP5.1/README.md: -------------------------------------------------------------------------------- 1 | # List Reduction 2 | ## Objective 3 | Implement a kernel and associated host code that performs reduction of a 1D list stored in a C array. The reduction should give the sum of the list. You should implement the improved kernel discussed in the lecture. Your kernel should be able to handle input lists of arbitrary length. 4 | 5 | # Prerequisites 6 | - Before starting this lab, make sure that: 7 | 8 | - You have completed week 4 lecture videos 9 | # Instruction 10 | For simplicity, you can assume that the input list will contain at most 2048 x 65535 elements so that it can be handled by only one kernel launch. The boundary condition can be handled by filling ‘identity value (0 for sum)’ into the shared memory of the last block when the length is not a multiple of the thread block size. Write a host (CPU) loop to calculate the total of the reduction sums of each section generated by individual blocks. 
11 | 12 | Edit the code in the ‘Code’ tab to perform the following: 13 | 14 | - allocate device memory 15 | - copy host memory to device 16 | - initialize thread block and kernel grid dimensions 17 | - invoke CUDA kernel 18 | - copy results from device to host 19 | - deallocate device memory 20 | - implement the improved reduction kernel 21 | - use shared memory to reduce the number of global accesses, handle the boundary conditions when loading input list elements into the shared memory 22 | - implement a CPU loop to perform final reduction based on the sums of sections generated by the thread blocks after copying the partial sum array back to the host memory 23 | - Instructions about where to place each part of the code is demarcated by the //@@ comment lines. 24 | 25 | -------------------------------------------------------------------------------- /MP5.2/MP5.2.CU: -------------------------------------------------------------------------------- 1 | // MP Scan 2 | // Given a list (lst) of length n 3 | // Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ... 4 | // + 5 | // lst[n-1]} 6 | 7 | #include 8 | 9 | #define BLOCK_SIZE 512 //@@ You can change this 10 | 11 | #define wbCheck(stmt) \ 12 | do { \ 13 | cudaError_t err = stmt; \ 14 | if (err != cudaSuccess) { \ 15 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 16 | wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \ 17 | return -1; \ 18 | } \ 19 | } while (0) 20 | 21 | __global__ void scan(float *input, float *output, int len, float *psum) { 22 | //@@ Modify the body of this function to complete the functionality of 23 | //@@ the scan on the device 24 | //@@ You may need multiple kernel calls; write your kernels before this 25 | //@@ function and call them from the host 26 | 27 | //Load into shared memory 28 | 29 | __shared__ int T[2*BLOCK_SIZE]; 30 | 31 | int tx = threadIdx.x + (blockIdx.x * blockDim.x * 2); 32 | if (tx < len) 33 | T[threadIdx.x] = input[tx]; 34 | else 35 | T[threadIdx.x] = 0; 36 | __syncthreads(); 37 | 38 | if ((tx + blockDim.x) < len) 39 | T[threadIdx.x + blockDim.x] = input[tx + blockDim.x]; 40 | else 41 | T[threadIdx.x + blockDim.x] = 0; 42 | 43 | __syncthreads(); 44 | 45 | // First Scan Step 46 | int stride = 1; 47 | while(stride < 2*BLOCK_SIZE) 48 | { 49 | __syncthreads(); 50 | int index = (threadIdx.x+1)*stride*2 - 1; 51 | if(index < 2*BLOCK_SIZE && (index-stride) >= 0) 52 | T[index] += T[index-stride]; 53 | stride = stride*2; 54 | } 55 | 56 | // post_scan 57 | stride = BLOCK_SIZE/2; 58 | while(stride > 0) 59 | { 60 | __syncthreads(); 61 | int index = (threadIdx.x+1)*stride*2 - 1; 62 | if((index+stride) < 2*BLOCK_SIZE) 63 | { 64 | T[index+stride] += T[index]; 65 | } 66 | stride = stride / 2; 67 | } 68 | 69 | 70 | // copy to output 71 | 72 | //Write result back to Global memory 73 | __syncthreads(); 74 | if (tx < len) 75 | output[tx] = T[threadIdx.x]; 76 | if ((tx + blockDim.x) < len) 77 | output[tx + blockDim.x] = T[threadIdx.x + blockDim.x]; 78 | 79 | // store partial sum 80 | if(threadIdx.x == 0) 81 | psum[blockIdx.x] = T[2*BLOCK_SIZE-1]; 82 | 83 | } 84 | 85 | __global__ void add(float *psum, float *output, int len){ 86 | __shared__ float increment; 87 | 88 | if (threadIdx.x == 0){ 89 | if (blockIdx.x == 0) 90 | increment = 0; 91 | else 92 | increment = psum[blockIdx.x - 1]; 93 | } 94 | __syncthreads(); 95 | 96 | for(int k = 0; k < 2; ++k){ 97 | int tile = (blockIdx.x * blockDim.x * 2) + threadIdx.x + (k * BLOCK_SIZE); 98 | if(tile < len){ 99 | output[tile] += increment; 100 | } 
101 | } 102 | 103 | } 104 | 105 | 106 | 107 | int main(int argc, char **argv) { 108 | wbArg_t args; 109 | float *hostInput; // The input 1D list 110 | float *hostOutput; // The output list 111 | float *deviceInput; 112 | float *deviceOutput; 113 | float *deviceBuffer; //extra 114 | float *auxSum; //extra 115 | float *tmp; //extra 116 | int numElements; // number of elements in the list 117 | 118 | args = wbArg_read(argc, argv); 119 | 120 | wbTime_start(Generic, "Importing data and creating memory on host"); 121 | hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &numElements); 122 | hostOutput = (float *)malloc(numElements * sizeof(float)); 123 | wbTime_stop(Generic, "Importing data and creating memory on host"); 124 | 125 | wbLog(TRACE, "The number of input elements in the input is ", 126 | numElements); 127 | 128 | wbTime_start(GPU, "Allocating GPU memory."); 129 | int numBlocks = ceil((numElements*1.0) / (BLOCK_SIZE*2)); 130 | wbCheck(cudaMalloc((void **)&deviceInput, numElements * sizeof(float))); 131 | wbCheck(cudaMalloc((void **)&deviceOutput, numElements * sizeof(float))); 132 | wbCheck(cudaMalloc((void **)&deviceBuffer, numBlocks * sizeof(float))); 133 | wbCheck(cudaMalloc((void **)&auxSum, numBlocks * sizeof(float))); 134 | wbCheck(cudaMalloc((void **)&tmp, sizeof(float))); 135 | 136 | wbTime_stop(GPU, "Allocating GPU memory."); 137 | 138 | wbTime_start(GPU, "Clearing output memory."); 139 | wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float))); 140 | wbTime_stop(GPU, "Clearing output memory."); 141 | 142 | wbTime_start(GPU, "Copying input memory to the GPU."); 143 | wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float), 144 | cudaMemcpyHostToDevice)); 145 | wbTime_stop(GPU, "Copying input memory to the GPU."); 146 | 147 | //@@ Initialize the grid and block dimensions here 148 | dim3 DimBlock(BLOCK_SIZE, 1, 1); 149 | dim3 DimGrid(numBlocks, 1, 1); 150 | wbTime_start(Compute, "Performing CUDA computation"); 151 | //@@ Modify this to complete the functionality of the scan 152 | //@@ on the deivce 153 | 154 | //Scan Input 155 | scan<<>>(deviceInput, deviceOutput, numElements, deviceBuffer); 156 | cudaDeviceSynchronize(); 157 | 158 | //Scan deviceBuffer 159 | dim3 singleDimGrid(1, 1, 1); 160 | scan<<>>(deviceBuffer, auxSum, numBlocks, tmp); 161 | cudaDeviceSynchronize(); 162 | 163 | //Add Input & deviceBuffer 164 | add<<>>(auxSum, deviceOutput, numElements); 165 | cudaDeviceSynchronize(); 166 | 167 | wbTime_stop(Compute, "Performing CUDA computation"); 168 | 169 | wbTime_start(Copy, "Copying output memory to the CPU"); 170 | wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements * sizeof(float), 171 | cudaMemcpyDeviceToHost)); 172 | wbTime_stop(Copy, "Copying output memory to the CPU"); 173 | 174 | wbTime_start(GPU, "Freeing GPU Memory"); 175 | cudaFree(deviceInput); 176 | cudaFree(deviceOutput); 177 | wbTime_stop(GPU, "Freeing GPU Memory"); 178 | 179 | wbSolution(args, hostOutput, numElements); 180 | 181 | free(hostInput); 182 | free(hostOutput); 183 | 184 | return 0; 185 | } 186 | -------------------------------------------------------------------------------- /MP5.2/README.md: -------------------------------------------------------------------------------- 1 | # List Scan (Parallel Scan) 2 | ## Objective 3 | The purpose of this lab is to implement one or more kernels and their associated host code to perform parallel scan on a 1D list. The scan operator used will be addition. 
You should implement the work- efficient kernel discussed in lecture. Your kernel should be able to handle input lists of arbitrary length. However, for simplicity, you can assume that the input list will be at most 2,048 * 2,048 elements. 4 | 5 | ## Prerequisites 6 | Before starting this lab, make sure that: 7 | 8 | - You have completed all week 4 lecture videos 9 | - You have completed all week 5 lecture videos 10 | - You have completed the List Reduction Lab 11 | ## Instruction 12 | The boundary condition can be handled by filling ‘identity value (0 for sum)’ into the shared memory of the last block when the length is not a multiple of the thread block size. 13 | 14 | You will need to launch multiple kernels to complete the parallel scan as discussed in the lecture. 15 | 16 | Edit the code in the code tab to perform the following: 17 | 18 | - allocate device memory 19 | - copy host memory to device 20 | - initialize thread block and kernel grid dimensions 21 | - invoke CUDA kernel 22 | - copy results from device to host 23 | - deallocate device memory 24 | - implement the work-efficient scan kernel to generate per-block scan array and store the block sums into an auxiliary block sum array. 25 | - use shared memory to reduce the number of global memory accesses, handle the boundary conditions when loading input list elements into the shared memory 26 | - reuse the kernel to perform scan on the auxiliary block sum array to translate the elements into accumulative block sums. Note that - - this kernel will be launched with only one block. 27 | - implement the kernel that adds the accumulative block sums to the appropriate elements of the per-block scan array to complete the scan for all the elements. 28 | - Instructions about where to place each part of the code is demarcated by the //@@ comment lines. 
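As a small worked example of how the three kernel launches fit together (illustrative numbers only, using a section size of 4 rather than 2 * BLOCK_SIZE):
```
input          : [3, 1, 7, 0 | 4, 1, 6, 3]
per-block scan : [3, 4, 11, 11 | 4, 5, 11, 14]    (kernel 1; also emits block sums [11, 14])
scanned sums   : [11, 25]                         (kernel 2, launched with one block)
final result   : [3, 4, 11, 11 | 15, 16, 22, 25]  (kernel 3 adds 11 to every element of section 2)
```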
29 | -------------------------------------------------------------------------------- /MP6/MP6.CU: -------------------------------------------------------------------------------- 1 | // Histogram Equalization 2 | 3 | #include 4 | 5 | #define HISTOGRAM_LENGTH 256 6 | 7 | //@@ insert code here 8 | __global__ void float2Char(float *input, unsigned char *output, int width, int height){ 9 | int bx = blockIdx.x; int by = blockIdx.y; int bz = blockIdx.z; 10 | int tx = threadIdx.x; int ty = threadIdx.y; 11 | int x = bx*blockDim.x + tx; 12 | int y = by*blockDim.y + ty; 13 | if(y < height && x < width){ 14 | int idx = (width * height)*bz + (width)*y + x; 15 | output[idx] = (unsigned char) (255 * input[idx]); 16 | } 17 | } 18 | 19 | __global__ void rgb2Gray(unsigned char *input, unsigned char *output, int width, int height){ 20 | int bx = blockIdx.x; int by = blockIdx.y; 21 | int tx = threadIdx.x; int ty = threadIdx.y; 22 | int x = bx*blockDim.x + tx; 23 | int y = by*blockDim.y + ty; 24 | if(y < height && x < width){ 25 | int idx = y * (width) + x; 26 | uint8_t r = input[3 * idx]; 27 | uint8_t g = input[3 * idx + 1]; 28 | uint8_t b = input[3 * idx + 2]; 29 | output[idx] = (unsigned char) (0.21*r + 0.71*g + 0.07*b); 30 | } 31 | } 32 | 33 | __global__ void gray2Hist(unsigned char *input, unsigned int *output, int width, int height){ 34 | int bx = blockIdx.x; int by = blockIdx.y; 35 | int tx = threadIdx.x; int ty = threadIdx.y; 36 | int x = bx*blockDim.x + tx; 37 | int y = by*blockDim.y + ty; 38 | 39 | __shared__ unsigned int hist[HISTOGRAM_LENGTH]; 40 | int tIdx = blockDim.x*ty + tx; 41 | if (tIdx < HISTOGRAM_LENGTH) 42 | hist[tIdx] = 0; 43 | 44 | __syncthreads(); 45 | 46 | if (x < width && y < height) { 47 | int idx = y * (width) + x; 48 | unsigned char val = input[idx]; 49 | atomicAdd(&(hist[val]), 1); 50 | } 51 | 52 | __syncthreads(); 53 | if (tIdx < HISTOGRAM_LENGTH) 54 | atomicAdd(&(output[tIdx]), hist[tIdx]); 55 | 56 | } 57 | 58 | __global__ void hist2CDF(unsigned int *input, float *output, int width, int height){ 59 | __shared__ unsigned int cdf[HISTOGRAM_LENGTH]; 60 | int id = threadIdx.x; 61 | 62 | if(id < HISTOGRAM_LENGTH) 63 | cdf[id] = input[id]; 64 | __syncthreads(); 65 | 66 | //reduction 67 | int stride = 1; 68 | while(stride < HISTOGRAM_LENGTH) 69 | { 70 | __syncthreads(); 71 | int index = (threadIdx.x+1)*stride*2 - 1; 72 | if(index < HISTOGRAM_LENGTH && (index-stride) >= 0) 73 | cdf[index] += cdf[index-stride]; 74 | stride = stride*2; 75 | } 76 | 77 | stride = HISTOGRAM_LENGTH/4; 78 | while(stride > 0) 79 | { 80 | __syncthreads(); 81 | int index = (threadIdx.x+1)*stride*2 - 1; 82 | if((index+stride) < HISTOGRAM_LENGTH) 83 | { 84 | cdf[index+stride] += cdf[index]; 85 | } 86 | stride = stride / 2; 87 | } 88 | __syncthreads(); 89 | output[id] = cdf[id] / ((float)(width * height)); 90 | } 91 | 92 | 93 | __global__ void equal(unsigned char *img, float *cdf, int width, int height){ 94 | int bx = blockIdx.x; int by = blockIdx.y; int bz = blockIdx.z; 95 | int tx = threadIdx.x; int ty = threadIdx.y; 96 | int x = bx*blockDim.x + tx; 97 | int y = by*blockDim.y + ty; 98 | 99 | if(x < width && y < height){ 100 | int idx = (width * height)*bz + (width)*y + x; 101 | float v = 255*(cdf[img[idx]] - cdf[0])/(1.0 - cdf[0]); 102 | img[idx] = (unsigned char) min(max(v, 0.0), 255.0); 103 | } 104 | } 105 | 106 | __global__ void uint2float(unsigned char *input, float *output, int width, int height){ 107 | int bx = blockIdx.x; int by = blockIdx.y; int bz = blockIdx.z; 108 | int tx = threadIdx.x; int ty = 
threadIdx.y; 109 | int x = bx*blockDim.x + tx; 110 | int y = by*blockDim.y + ty; 111 | if(x < width && y < height){ 112 | int idx = (width * height)*bz + (width)*y + x; 113 | output[idx] = (float) (input[idx] / 255.0); 114 | } 115 | } 116 | 117 | int main(int argc, char **argv) { 118 | wbArg_t args; 119 | int imageWidth; 120 | int imageHeight; 121 | int imageChannels; 122 | wbImage_t inputImage; 123 | wbImage_t outputImage; 124 | float *hostInputImageData; 125 | float *hostOutputImageData; 126 | const char *inputImageFile; 127 | 128 | //@@ Insert more code here 129 | float *deviceFloat; 130 | unsigned char *deviceUint; 131 | unsigned char *deviceGray; 132 | unsigned int *deviceHist; 133 | float *deviceCDF; 134 | 135 | args = wbArg_read(argc, argv); /* parse the input arguments */ 136 | inputImageFile = wbArg_getInputFile(args, 0); 137 | wbTime_start(Generic, "Importing data and creating memory on host"); 138 | inputImage = wbImport(inputImageFile); 139 | imageWidth = wbImage_getWidth(inputImage); 140 | imageHeight = wbImage_getHeight(inputImage); 141 | imageChannels = wbImage_getChannels(inputImage); 142 | outputImage = wbImage_new(imageWidth, imageHeight, imageChannels); 143 | hostInputImageData = wbImage_getData(inputImage);//get image data 144 | hostOutputImageData = wbImage_getData(outputImage); 145 | wbTime_stop(Generic, "Importing data and creating memory on host"); 146 | 147 | //@@ insert code here 148 | cudaMalloc((void**) &deviceFloat, imageWidth * imageHeight * imageChannels * sizeof(float)); 149 | cudaMalloc((void**) &deviceUint, imageWidth * imageHeight * imageChannels * sizeof(unsigned char)); 150 | cudaMalloc((void**) &deviceGray, imageWidth * imageHeight * sizeof(unsigned char)); 151 | cudaMalloc((void**) &deviceHist, HISTOGRAM_LENGTH * sizeof(unsigned int)); 152 | cudaMemset((void *) deviceHist, 0, HISTOGRAM_LENGTH * sizeof(unsigned int)); 153 | cudaMalloc((void**) &deviceCDF, HISTOGRAM_LENGTH * sizeof(float)); 154 | 155 | cudaMemcpy(deviceFloat, hostInputImageData, 156 | imageWidth*imageHeight*imageChannels*sizeof(float), cudaMemcpyHostToDevice); 157 | 158 | 159 | dim3 dimGrid; 160 | dim3 dimBlock; 161 | 162 | dimGrid = dim3(ceil(imageWidth/32.0), ceil(imageHeight/32.0), imageChannels); 163 | dimBlock = dim3(32, 32, 1); 164 | float2Char<<>>(deviceFloat, deviceUint, imageWidth, imageHeight); 165 | cudaDeviceSynchronize(); 166 | 167 | 168 | dimGrid = dim3(ceil(imageWidth/32.0), ceil(imageHeight/32.0), 1); 169 | dimBlock = dim3(32, 32, 1); 170 | rgb2Gray<<>>(deviceUint, deviceGray, imageWidth, imageHeight); 171 | cudaDeviceSynchronize(); 172 | 173 | 174 | dimGrid = dim3(ceil(imageWidth/32.0), ceil(imageHeight/32.0), 1); 175 | dimBlock = dim3(32, 32, 1); 176 | gray2Hist<<>>(deviceGray, deviceHist, imageWidth, imageHeight); 177 | cudaDeviceSynchronize(); 178 | 179 | 180 | dimGrid = dim3(1, 1, 1); 181 | dimBlock = dim3(HISTOGRAM_LENGTH, 1, 1); 182 | hist2CDF<<>>(deviceHist, deviceCDF, imageWidth, imageHeight); 183 | cudaDeviceSynchronize(); 184 | 185 | 186 | dimGrid = dim3(ceil(imageWidth/32.0), ceil(imageHeight/32.0), imageChannels); 187 | dimBlock = dim3(32, 32, 1); 188 | equal<<>>(deviceUint, deviceCDF, imageWidth, imageHeight); 189 | cudaDeviceSynchronize(); 190 | 191 | 192 | dimGrid = dim3(ceil(imageWidth/32.0), ceil(imageHeight/32.0), imageChannels); 193 | dimBlock = dim3(32, 32, 1); 194 | uint2float<<>>(deviceUint, deviceFloat, imageWidth, imageHeight); 195 | cudaDeviceSynchronize(); 196 | 197 | cudaMemcpy(hostOutputImageData, deviceFloat, 198 | 
imageWidth*imageHeight*imageChannels*sizeof(float), cudaMemcpyDeviceToHost); 199 | 200 | wbSolution(args, outputImage); 201 | 202 | //@@ insert code here 203 | cudaFree(deviceFloat); 204 | cudaFree(deviceUint); 205 | cudaFree(deviceGray); 206 | cudaFree(deviceHist); 207 | cudaFree(deviceCDF); 208 | 209 | return 0; 210 | } 211 | -------------------------------------------------------------------------------- /MP6/README.md: -------------------------------------------------------------------------------- 1 | # Histogram 2 | ## Objective 3 | The purpose of this lab is to implement an efficient histogramming equalization algorithm for an input image. Like the image convolution MP, the image is represented as RGB float values. You will convert that to GrayScale unsigned char values and compute the histogram. Based on the histogram, you will compute a histogram equalization function which you will then apply to the original image to get the color corrected image. 4 | 5 | ## Prerequisites 6 | Before starting this lab, make sure that: 7 | 8 | - You have completed all week 7 lecture videos 9 | ## Instruction 10 | - Edit the code in the code tab to perform the following: 11 | 12 | - Cast the image to unsigned char 13 | 14 | - Convert the image from RGB to Gray Scale. You will find one of the lectures and textbook chapters helpful. 15 | 16 | - Compute the histogram of the image 17 | 18 | - Compute the scan (prefix sum) of the histogram to arrive at the histogram equalization function 19 | 20 | - Apply the equalization function to the input image to get the color corrected image 21 | 22 | ## Background 23 | In this section we discuss some of the background details of the histogram equalization algorithm. For images that represent the full color space, we expect an image’s histogram to be evenly distributed. This means that we expect the bin values in the histogram to be pixel_count/.256, assuming that we scale the pixel luminous values so that they fit between 0 and 256. This algorithm adjusts an image’s histogram so that all bins have equal probability. 24 | 25 | ### image 26 | 27 | We first need to convert the image to gray scale by computing it’s luminosity values that represent the brightness of the image and would allow us to simplify the histogram computation. 28 | 29 | ### Gray 30 | 31 | The histogram computes the number of pixels having a specific brightness value. Dividing by the number of pixels (width * height) gives us the probability of a luminosity value to occur in an image. 32 | 33 | ### OrigProb 34 | 35 | A color balanced image is expected to have a uniform distribution of the luminosity values. 36 | 37 | This means that if we compute the Cumulative Distribution Function (CDF) we expect a linear curve for a color equalized image. For images that are not color equalized, we expect the curve to be non-linear. 38 | 39 | ### origcdf 40 | 41 | The algorithm equalizes the curve by computing a transformation function to map the original CDF to the desired CDF (the desired CDF being an almost linear function). 42 | 43 | ### newcdf 44 | 45 | The computed transformation is applied to the original image to produce the equalized image. 46 | 47 | ### newimg 48 | 49 | Note that the CDF of the histogram of the new image has been transformed into an almost linear curve. 50 | 51 | ### compare 52 | 53 | ## Implementation Steps 54 | Here we show the steps to be performed. The computation to be performed by each kernel is illustrated with serial pseudo code. 
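For instance, the first step below (casting the float image to unsigned char) maps onto a CUDA kernel roughly as follows. This is only a sketch: the name castToUChar is illustrative (the template code above calls it float2Char), and one thread handles one element of the flattened image.
```
__global__ void castToUChar(const float *in, unsigned char *out,
                            int width, int height) {
  int x = blockIdx.x * blockDim.x + threadIdx.x;   // column
  int y = blockIdx.y * blockDim.y + threadIdx.y;   // row
  if (x < width && y < height) {
    // blockIdx.z selects one width*height-sized chunk of the flattened
    // image; the cast is element-wise, so the channel layout does not matter.
    int idx = (width * height) * blockIdx.z + y * width + x;
    out[idx] = (unsigned char)(255 * in[idx]);     // [0,1] float -> 0..255
  }
}
```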
55 | 56 | ### Cast the image from float to unsigned char 57 | Implement a kernel that casts the image from float * to unsigned char *. 58 | ``` 59 | for ii from 0 to (width * height * channels) do 60 | ucharImage[ii] = (unsigned char) (255 * inputImage[ii]) 61 | end 62 | ``` 63 | 64 | ### Convert the image from RGB to GrayScale 65 | Implement a kernel that converts the RGB image to GrayScale. A sample sequential pseudo code is shown below. You will find one the lectures and one of the textbook chapters helpful. 66 | ``` 67 | for ii from 0 to height do 68 | for jj from 0 to width do 69 | idx = ii * width + jj 70 | # here channels is 3 71 | r = ucharImage[3*idx] 72 | g = ucharImage[3*idx + 1] 73 | b = ucharImage[3*idx + 2] 74 | grayImage[idx] = (unsigned char) (0.21*r + 0.71*g + 0.07*b) 75 | end 76 | end 77 | ``` 78 | 79 | ### Compute the histogram of grayImage 80 | Implement a kernel that computes the histogram (like in the lectures) of the image. A sample pseudo code is shown below. You will find one of the lectures and one of the textbook chapters helpful. 81 | ``` 82 | histogram = [0, ...., 0] # here len(histogram) = 256 83 | for ii from 0 to width * height do 84 | histogram[grayImage[ii]]++ 85 | end 86 | ``` 87 | 88 | ### Compute the Cumulative Distribution Function of histogram 89 | This is a scan operation like you have done in the previous lab. A sample sequential pseudo code is shown below. 90 | ``` 91 | cdf[0] = p(histogram[0]) 92 | for ii from 1 to 256 do 93 | cdf[ii] = cdf[ii - 1] + p(histogram[ii]) 94 | end 95 | ``` 96 | Where p() calculates the probability of a pixel to be in a histogram bin 97 | ``` 98 | def p(x): 99 | return x / (width * height) 100 | end 101 | ``` 102 | Compute the minimum value of the CDF. The maximal value of the CDF should be 1.0. 103 | 104 | ### Define the histogram equalization function 105 | The histogram equalization function (correct) remaps the cdf of the histogram of the image to a linear function and is defined as 106 | ``` 107 | def correct_color(val) 108 | return clamp(255*(cdf[val] - cdfmin)/(1.0 - cdfmin), 0, 255.0) 109 | end 110 | ``` 111 | ``` 112 | def clamp(x, start, end) 113 | return min(max(x, start), end) 114 | end 115 | ``` 116 | 117 | ### Apply the histogram equalization function 118 | Once you have implemented all of the above, then you are ready to correct the input image. This can be done by writing a kernel to apply the correct_color() function to the RGB pixel values in parallel. 119 | ``` 120 | for ii from 0 to (width * height * channels) do 121 | ucharImage[ii] = correct_color(ucharImage[ii]) 122 | end 123 | ``` 124 | 125 | ### Cast back to float 126 | ``` 127 | for ii from 0 to (width * height * channels) do 128 | outputImage[ii] = (float) (ucharImage[ii]/255.0) 129 | end 130 | ``` 131 | And you’re done 132 | 133 | ### Image Format 134 | For people who are developing on their own system. The images are stored in PPM (P6) format, this means that you can (if you want) create your own input images. The easiest way to create image is via external tools. You can use tools such as bmptoppm. 135 | -------------------------------------------------------------------------------- /MP7/MP7.CU: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define wbCheck(stmt) \ 4 | do { \ 5 | cudaError_t err = stmt; \ 6 | if (err != cudaSuccess) { \ 7 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 8 | wbLog(ERROR, "Got CUDA error ... 
", cudaGetErrorString(err)); \ 9 | return -1; \ 10 | } \ 11 | } while (0) 12 | 13 | #define TILE_WIDTH 1024 14 | 15 | __global__ void spmvJDSKernel(float *out, int *matColStart, int *matCols, 16 | int *matRowPerm, int *matRows, 17 | float *matData, float *vec, int dim) { 18 | //@@ insert spmv kernel for jds format 19 | int x = blockIdx.x * blockDim.x + threadIdx.x; 20 | if (x < dim) { 21 | float dot = 0; 22 | int elem = 0; 23 | while (elem < matRows[x]){ 24 | dot += matData[matColStart[elem] + x] * vec[matCols[matColStart[elem] + x]]; 25 | elem++; 26 | } 27 | out[matRowPerm[x]] = dot; 28 | } 29 | } 30 | 31 | static void spmvJDS(float *out, int *matColStart, int *matCols, 32 | int *matRowPerm, int *matRows, float *matData, 33 | float *vec, int dim) { 34 | 35 | //@@ invoke spmv kernel for jds format 36 | 37 | spmvJDSKernel<<>>(out, matColStart, matCols, matRowPerm, matRows, matData, vec, dim); 38 | 39 | } 40 | 41 | int main(int argc, char **argv) { 42 | wbArg_t args; 43 | int *hostCSRCols; 44 | int *hostCSRRows; 45 | float *hostCSRData; 46 | int *hostJDSColStart; 47 | int *hostJDSCols; 48 | int *hostJDSRowPerm; 49 | int *hostJDSRows; 50 | float *hostJDSData; 51 | float *hostVector; 52 | float *hostOutput; 53 | int *deviceJDSColStart; 54 | int *deviceJDSCols; 55 | int *deviceJDSRowPerm; 56 | int *deviceJDSRows; 57 | float *deviceJDSData; 58 | float *deviceVector; 59 | float *deviceOutput; 60 | int dim, ncols, nrows, ndata; 61 | int maxRowNNZ; 62 | 63 | args = wbArg_read(argc, argv); 64 | 65 | wbTime_start(Generic, "Importing data and creating memory on host"); 66 | hostCSRCols = (int *)wbImport(wbArg_getInputFile(args, 0), &ncols, "Integer"); 67 | hostCSRRows = (int *)wbImport(wbArg_getInputFile(args, 1), &nrows, "Integer"); 68 | hostCSRData = (float *)wbImport(wbArg_getInputFile(args, 2), &ndata, "Real"); 69 | hostVector = (float *)wbImport(wbArg_getInputFile(args, 3), &dim, "Real"); 70 | 71 | hostOutput = (float *)malloc(sizeof(float) * dim); 72 | 73 | wbTime_stop(Generic, "Importing data and creating memory on host"); 74 | 75 | CSRToJDS(dim, hostCSRRows, hostCSRCols, hostCSRData, &hostJDSRowPerm, &hostJDSRows, 76 | &hostJDSColStart, &hostJDSCols, &hostJDSData); 77 | maxRowNNZ = hostJDSRows[0]; 78 | 79 | wbTime_start(GPU, "Allocating GPU memory."); 80 | cudaMalloc((void **)&deviceJDSColStart, sizeof(int) * maxRowNNZ); 81 | cudaMalloc((void **)&deviceJDSCols, sizeof(int) * ndata); 82 | cudaMalloc((void **)&deviceJDSRowPerm, sizeof(int) * dim); 83 | cudaMalloc((void **)&deviceJDSRows, sizeof(int) * dim); 84 | cudaMalloc((void **)&deviceJDSData, sizeof(float) * ndata); 85 | 86 | cudaMalloc((void **)&deviceVector, sizeof(float) * dim); 87 | cudaMalloc((void **)&deviceOutput, sizeof(float) * dim); 88 | wbTime_stop(GPU, "Allocating GPU memory."); 89 | 90 | wbTime_start(GPU, "Copying input memory to the GPU."); 91 | cudaMemcpy(deviceJDSColStart, hostJDSColStart, sizeof(int) * maxRowNNZ, 92 | cudaMemcpyHostToDevice); 93 | cudaMemcpy(deviceJDSCols, hostJDSCols, sizeof(int) * ndata, cudaMemcpyHostToDevice); 94 | cudaMemcpy(deviceJDSRowPerm, hostJDSRowPerm, sizeof(int) * dim, cudaMemcpyHostToDevice); 95 | cudaMemcpy(deviceJDSRows, hostJDSRows, sizeof(int) * dim, cudaMemcpyHostToDevice); 96 | cudaMemcpy(deviceJDSData, hostJDSData, sizeof(float) * ndata, cudaMemcpyHostToDevice); 97 | cudaMemcpy(deviceVector, hostVector, sizeof(float) * dim, cudaMemcpyHostToDevice); 98 | wbTime_stop(GPU, "Copying input memory to the GPU."); 99 | 100 | wbTime_start(Compute, "Performing CUDA computation"); 101 | 
spmvJDS(deviceOutput, deviceJDSColStart, deviceJDSCols, deviceJDSRowPerm, deviceJDSRows, 102 | deviceJDSData, deviceVector, dim); 103 | cudaDeviceSynchronize(); 104 | wbTime_stop(Compute, "Performing CUDA computation"); 105 | 106 | wbTime_start(Copy, "Copying output memory to the CPU"); 107 | cudaMemcpy(hostOutput, deviceOutput, sizeof(float) * dim, cudaMemcpyDeviceToHost); 108 | wbTime_stop(Copy, "Copying output memory to the CPU"); 109 | 110 | wbTime_start(GPU, "Freeing GPU Memory"); 111 | cudaFree(deviceVector); 112 | cudaFree(deviceOutput); 113 | cudaFree(deviceJDSColStart); 114 | cudaFree(deviceJDSCols); 115 | cudaFree(deviceJDSRowPerm); 116 | cudaFree(deviceJDSRows); 117 | cudaFree(deviceJDSData); 118 | 119 | wbTime_stop(GPU, "Freeing GPU Memory"); 120 | 121 | wbSolution(args, hostOutput, dim); 122 | 123 | free(hostCSRCols); 124 | free(hostCSRRows); 125 | free(hostCSRData); 126 | free(hostVector); 127 | free(hostOutput); 128 | free(hostJDSColStart); 129 | free(hostJDSCols); 130 | free(hostJDSRowPerm); 131 | free(hostJDSRows); 132 | free(hostJDSData); 133 | 134 | return 0; 135 | } 136 | -------------------------------------------------------------------------------- /MP7/README.md: -------------------------------------------------------------------------------- 1 | # Sparse Matrix Multiplication (JDS) 2 | ## Objective 3 | The purpose of this lab is to implement a SpMV (Sparse Matrix Vector Multiplication) kernel for an input sparse matrix based on the Jagged Diagonal Storage (JDS) transposed format. 4 | 5 | ## Prerequisites 6 | Before starting this lab, make sure that: 7 | 8 | - You have completed all week 8 lecture videos 9 | - You have completed MP-6 10 | ## Instructions 11 | Edit the kernel and the host function in the file to implement sparse matrix-vector multiplication using the JDS format. The kernel shall be launched so that each thread will generate one output Y element. The kernel should have each thread to use the appropriate elements of the JDS data array, the JDS col index array, JDS row index array, and the JDS transposed col ptr array to generate one Y element. 12 | 13 | Instructions about where to place each part of the code is demarcated by the //@@ comment lines. 14 | 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UIUC-CS-483-Parallel Programming 2 | ## Official Description 3 | Parallel programming with emphasis on developing applications for processors with many computation cores. Computational thinking, forms of parallelism, programming models, mapping computations to parallel hardware, efficient data structures, paradigms for efficient parallel algorithms, and application case studies. 
4 | ## Lab Equipment 5 | Linux based cluster system 6 | 7 | ## Lab Software 8 | C Programming Language and CUDA Software Development Kit, WebGPU for labs, RAI for final project 9 | 10 | ## Topical Prerequisites 11 | C programming, Basic data structures, Introduction to computer organization 12 | -------------------------------------------------------------------------------- /ece408_project/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.formatting.formatOnSave": true, 3 | "files.associations": { 4 | "*.cu": "cpp", 5 | "*.hu": "cpp", 6 | "*.cuh": "cpp", 7 | "chrono": "cpp" 8 | } 9 | } -------------------------------------------------------------------------------- /ece408_project/README.md: -------------------------------------------------------------------------------- 1 | # ECE408/CS483 Final Project 2 | 3 | ## Introduction 4 | 5 | This is the skeleton code for the Fall 2019 ECE408 / CS483 / CSE408 course project. 6 | In this project, you will: 7 | 8 | * Get practical experience by using, profiling, and modifying MXNet, a standard open-source neural-network framework. 9 | * Demonstrate command of CUDA and optimization approaches by designing and implementing an optimized neural-network convolution layer forward pass. 10 | 11 | The project will be broken up into 4 milestones and a final submission. Read the description of the final report before starting, so you can collect the necessary info along the way. 12 | Each milestone (except milestone 1) will consist of an updated report (culminating in the final report). Append each milestone's deliverable at the beginning of the document such that your latest milestone is at the beginning of the report. 13 | 14 | You will be working in teams of 3 (no excuse here). 15 | Chicago city scholars can form teams with on campus students. 16 | 17 | You are expected to adhere to University of Illinois academic integrity standards. 18 | Do not attempt to subvert any of the performance-measurement aspects of the final project. 19 | If you are unsure about whether something does not meet those guidelines, ask a member of the teaching staff. 20 | 21 | ## Table of Contents 22 | 23 | * [Milestone 1: Due 10/06/2019 @5pm](#milestone-1) 24 | * [Milestone 2: Due 10/12/2019 @5pm](#milestone-2) 25 | * [Milestone 3: Due 10/19/2019 @5pm](#milestone-3) 26 | * [Milestone 4: Due 11/21/2019 @5pm](#milestone-4) 27 | * [Final Submission: Due 12/19/2019 @5pm](#final-submission) 28 | * [Rubric](#rubric) 29 | * [Final Report](#final-report) 30 | * [Extras](#extras) 31 | 32 | ## Milestone 1 33 | 34 | Due October 06 @ 5pm 35 | 36 | | Deliverables | 37 | | ------------ | 38 | | Register your team in the google sheet. | 39 | 40 | You and your team should agree on a team name and enter it in this [google sheet](https://docs.google.com/spreadsheets/d/1vhThuFT0isnYPac8Gnh7Pp9FMOTWkZ6RcNq-C8ND9LQ/edit#gid=0). Graduate students can use this [google form](https://docs.google.com/forms/d/e/1FAIpQLScDU5QrC9pKsaI8KGKqT4HjjZodSlcYfr-IlR3d7qxzwpFCeg/viewform?usp=sf_link). 41 | 42 | ## Milestone 2 43 | 44 | Due October 12 @ 5pm 45 | 46 | As with all milestones, you will include an updated PDF `report.pdf` in the project directory you submit with rai. 47 | This report should contain all of the deliverables. 48 | This report should contain your names, netids, rai ids (if different), team names, and school affiliation (Chicago Scholars or on campus students). 
49 | 50 | | Deliverables | 51 | | ------------ | 52 | | Report: Include a list of all kernels that collectively consume more than 90% of the program time. | 53 | | Report: Include a list of all CUDA API calls that collectively consume more than 90% of the program time. | 54 | | Report: Include an explanation of the difference between kernels and API calls | 55 | | Report: Show output of rai running MXNet on the CPU | 56 | | Report: List program run time | 57 | | Report: Show output of rai running MXNet on the GPU | 58 | | Report: List program run time | 59 | | Create a CPU implementation | 60 | | Report: List whole program execution time | 61 | | Report: List Op Times | 62 | | Use `rai -p --queue rai_amd64_ece408 --submit=m2` to mark your job for grading | 63 | 64 | Clone this repository to get the project folder. 65 | 66 | git clone https://github.com/illinois-impact/ece408_project.git 67 | 68 | Download the rai binary for your platform from [here](https://drive.google.com/drive/folders/1Pp84x3So9OEHUwRHQVZcRP441wRsO-UV). 69 | You will probably use it for development, and definitely use it for submission. 70 | 71 | You should have received a `.rai_profile` file by email. 72 | Put that file in `~/.rai_profile` (Linux/macOS). 73 | Your `.rai_profile` should look something like this (indented with space!) 74 | 75 | profile: 76 | firstname: 77 | lastname: 78 | username: 79 | email: 80 | access_key: 81 | secret_key: 82 | affiliation: uiuc 83 | 84 | You will need to add your team name in the following way: 85 | 86 | profile: 87 | firstname: 88 | lastname: 89 | username: 90 | email: 91 | access_key: 92 | secret_key: 93 | affiliation: uiuc 94 | team: 95 | name: 96 | 97 | Some more info is available on the [Client Documentation Page](https://github.com/rai-project/rai). 98 | 99 | Run the built-in MXNet forward pass using rai 100 | 101 | Consult `m1.1py` to examine the neural-network architecture used in this project. 102 | 103 | Use RAI to run a batch forward pass on some test data. 104 | 105 | rai -p --queue rai_amd64_ece408 106 | 107 | This will upload your project directory to rai (running on AWS) and move it to `/src`, where the execution specified in `rai_build.yml` will occur. 108 | 109 | The `image:` key specifies the environment that the rest of the execution will occur in. 110 | This environment includes a prebuilt MXNet (so rai will only do a partial compile with your code) as well as the model definition and the training data. 111 | 112 | The `resources:` key specifies what computation resources will be available to the execution. 113 | 114 | The `commands:` key specifies the recipe that rai will execute. First, the project files are copied to the `/build` directory. 115 | Then the files in `ece408_src` are copied to `src/operator/custom/` in the MXNet source tree. 116 | MXNet is recompiled, and the Python bindings are installed. 117 | `python /src/m1.1.py` runs the `m1.1.py` python program. 118 | 119 | You should see the following output: 120 | 121 | Loading fashion-mnist data... done 122 | Loading model... done 123 | New Inference 124 | EvalMetric: {'accuracy': 0.8154} 125 | 126 | Modify `rai_build.yml` to use `/usr/bin/time` to measure the elapsed time of the whole program. 127 | 128 | - /usr/bin/time python m1.1.py 129 | 130 | Next, we will run on the GPU! 131 | 132 | Compare `m1.2.py` and `m1.1.py`. You'll see that it is the same, except for `mx.gpu()` has been substituted for `mx.cpu()`. This is how we tell MXNet that we wish to use a GPU instead of a CPU. 
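For reference, the only relevant difference between the two scripts is the context passed when the trained model is loaded. A minimal sketch (the same calls appear in `m1.1.py` and `m1.2.py` later in this repository):

    import mxnet as mx
    # m1.1.py loads the trained model on the CPU ...
    lenet_model = mx.mod.Module.load(prefix='/models/baseline', epoch=2, context=mx.cpu())
    # ... while m1.2.py loads the same model on the GPU instead
    lenet_model = mx.mod.Module.load(prefix='/models/baseline', epoch=2, context=mx.gpu())
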
133 | 134 | Modify `rai_build.yml` to time `python m1.2.py` 135 | 136 | Again, submit the job to rai 137 | 138 | rai -p --queue rai_amd64_ece408 139 | 140 | Next, we will learn how to use `nvprof` to profile the execution 141 | 142 | Once you've gotten the appropriate accuracy results, generate a profile using nvprof. You will be able to use nvprof to evaluate how effective your optimizations are. 143 | As described above, make sure `rai_build.yml` is configured for a GPU run. 144 | Then, modify `rai_build.yml` to generate a profile instead of just execuing the code. 145 | 146 | nvprof python m1.2.py 147 | 148 | You should see something that looks like the following: 149 | 150 | ~~~bash 151 | ==278== NVPROF is profiling process 278, command: python m1.2.py 152 | Loading model... done 153 | New Inference 154 | EvalMetric: {'accuracy': 0.8154} 155 | ==15163== Profiling application: python m1.2.py 156 | ==15163== Profiling result: 157 | Type Time(%) Time Calls Avg Min Max Name 158 | GPU activities: 39.80% 16.602ms 20 830.11us 1.1200us 16.092ms [CUDA memcpy HtoD] 159 | 20.28% 8.4577ms 1 8.4577ms 8.4577ms 8.4577ms void cudnn::detail::implicit_convolve_sgemm 160 | 11.89% 4.9587ms 1 4.9587ms 4.9587ms 4.9587ms volta_cgemm_64x32_tn 161 | 7.11% 2.9642ms 2 1.4821ms 25.760us 2.9384ms void op_generic_tensor_kernel 162 | 163 | ... 164 | 165 | API calls: 42.14% 3.03300s 22 137.86ms 13.006us 1.56281s cudaStreamCreateWithFlags 166 | 34.07% 2.45202s 24 102.17ms 117.07us 2.44545s cudaMemGetInfo 167 | 21.32% 1.53449s 19 80.763ms 805ns 407.00ms cudaFree 168 | 1.18% 84.772ms 912 92.951us 308ns 38.118ms cudaFuncSetAttribute 169 | 0.47% 33.977ms 9 3.7753ms 33.322us 16.253ms cudaMemcpy2DAsync 170 | 171 | ... 172 | ~~~ 173 | 174 | The GPU Activities section shows the kernels and memory transfers, and the API calls section shows the CUDA API calls that are executed. 175 | There are columns corresponding to percentage of time consumed, total time, number of calls, and average/min/max time of those calls. 176 | Think about the distinction between a CUDA API call and a kernel launch, and describe it briefly in your report. 177 | The CUDA documentation describes [kernels](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#kernels) and the [programming interface](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programming-interface). 178 | 179 | You can find more information about nvprof in the [CUDA Toolkit Documentation](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) 180 | 181 | ### Create a CPU Implementation 182 | 183 | See the [description](#skeleton-code-description) of the skeleton code for background information, including the data storage layout of the tensors. 184 | 185 | Modify `ece408_src/new-forward.h` to implement the forward convolution described in Chapter 16 of the textbook. 186 | The performance of the CPU convolution is not part of the project evaluation. 187 | The algorithm is also below, for your convenience 188 | 189 | for b = 0 .. B // for each image in the batch 190 | for m = 0 .. M // for each output feature maps 191 | for h = 0 .. H_out // for each output element 192 | for w = 0 .. W_out 193 | { 194 | y[b][m][h][w] = 0; 195 | for c = 0 .. C // sum over all input feature maps 196 | for p = 0 .. K // KxK filter 197 | for q = 0 .. K 198 | y[b][m][h][w] += x[b][c][h + p][w + q] * k[m][c][p][q] 199 | } 200 | 201 | Unlike the convolutions described in the class, note that this one is not centered on the input image. 
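For reference only, here is a rough sketch of how that loop nest could look in plain C++ with flat, row-major arrays. It is not the graded implementation (which must live in `forward()` in `new-forward.h` and use the mshadow tensors), and the helper name `conv_forward_cpu` is made up for illustration:

    // Sketch: straightforward CPU forward convolution over a batch.
    // x: B x C x H x W inputs, k: M x C x K x K filters, y: B x M x H_out x W_out outputs.
    void conv_forward_cpu(float *y, const float *x, const float *k,
                          int B, int M, int C, int H, int W, int K) {
        const int H_out = H - K + 1;   // no padding, stride 1
        const int W_out = W - K + 1;
        for (int b = 0; b < B; ++b)                  // for each image in the batch
            for (int m = 0; m < M; ++m)              // for each output feature map
                for (int h = 0; h < H_out; ++h)      // for each output element
                    for (int w = 0; w < W_out; ++w) {
                        float acc = 0.0f;
                        for (int c = 0; c < C; ++c)          // sum over all input feature maps
                            for (int p = 0; p < K; ++p)      // KxK filter
                                for (int q = 0; q < K; ++q)
                                    acc += x[((b * C + c) * H + (h + p)) * W + (w + q)] *
                                           k[((m * C + c) * K + p) * K + q];
                        y[((b * M + m) * H_out + h) * W_out + w] = acc;
                    }
    }
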
202 | 203 | Because this operator is different than the built-in MXNet operator, you will need to load a different model. 204 | `m2.1.py` handles this for you. 205 | Modify `rai_build.yml` to invoke 206 | 207 | python m2.1.py 208 | 209 | When your implementation is correct, you should see output like this: 210 | 211 | Loading fashion-mnist data... done 212 | Loading model... done 213 | New Inference 214 | Op Time: 10.906517 215 | Op Time: 58.887046 216 | Correctness: 0.7653 Model: ece408 217 | 218 | 219 | Every time your layer is invoked, it will print the "Op Time," the time spent working on that layer. 220 | Since the network has two convolutional layers, two times will be printed. 221 | You can time the whole program execution by modifying `rai_build.yml` with 222 | 223 | /usr/bin/time python m2.1.py 224 | 225 | `m2.1.py` takes one optional argument: the dataset size. 226 | If the correctness for each possible model is as below, you can be reasonably confident your implementation is right. 227 | The correctness does depend on the data size. 228 | 229 | For example, to check your correctness on the full data size of 10000, you could modify `rai_build.yml` to run 230 | 231 | python m2.1.py 10000 232 | 233 | | Model | Number of Images | Correctness | 234 | |-------------| -----| ----- | 235 | | ece408 | 100 | 0.76 | 236 | | ece408 | 1000 | 0.767 | 237 | | ece408 | 10000 (default) | 0.7653 | 238 | 239 | (Final model that will be used for internal evaluation shall be different.) 240 | 241 | The provided `m2.1.py` is identical to the one used by `--submit=m2`. 242 | You may modify `m2.1.py` as you please, but check that `--submit=m2` will still invoke your code correctly. 243 | 244 | Use 245 | 246 | rai -p --queue rai_amd64_ece408 --submit=m2 247 | 248 | to mark your submission. 249 | 250 | ## Milestone 3 251 | 252 | Due October 19 @ 5pm 253 | 254 | | Deliverables | 255 | | ------------ | 256 | | Everything from Milestone 2 | 257 | | Implement a GPU Convolution | 258 | | Correctness and timing with 3 different dataset sizes | 259 | | Report: demonstrate `nvprof` profiling the execution | 260 | | Use `rai -p --queue rai_amd64_ece408 --submit=m3` to mark your job for grading | 261 | 262 | ### Create a GPU Implementation 263 | 264 | Modify `ece408_src/new-forward.cuh` to create GPU implementation of the forward convolution. 265 | 266 | Modify `rai_build.yml` to run 267 | 268 | python m3.1.py 269 | 270 | to use your GPU implementation. 271 | When it is correct, it will show the same correctness as Milestone 2. 272 | 273 | ### Use `nvprof` and NVVP for initial Performance Results 274 | 275 | First, ensure you are using correct image in rai_build.yml file 276 | 277 | `image: illinoisimpact/ece408_mxnet_docker:amd64-gpu-latest-fa19` 278 | 279 | Modify `rai_build.yml` to use nvprof to save some timeline and analysis information, as described in [nvprof](#profiling). 280 | Use the NVIDIA Visual Profiler to find the execution of your kernel, and show it in your report. 281 | The [NVVP on EWS](#nvvp-on-ews) section describes how to install NVVP. 282 | 283 | Use 284 | 285 | rai -p --queue rai_amd64_ece408 --submit=m3 286 | 287 | to mark your submission. 288 | 289 | `m3.1.py` takes one optional argument: the dataset size. 290 | If the correctness for each possible model is as below, you can be reasonably confident your implementation is right. 291 | The correctness does depend on the data size. 
292 | 293 | For example, you could modify `rai_build.yml` to run 294 | 295 | python m3.1.py 296 | 297 | | Model | Number of Images | Correctness | 298 | |-------------| -----| ----- | 299 | | ece408 | 100 | 0.76 | 300 | | ece408 | 1000 | 0.767 | 301 | | ece408 | 10000 (default) | 0.7653 | 302 | 303 | (Final model that will be used for internal evaluation shall be different.) 304 | 305 | ## Milestone 4 306 | 307 | Due November 21 @ 5pm 308 | 309 | | Deliverables | 310 | | ------------ | 311 | | Everything from Milestone 3 | 312 | | Implement three GPU optimizations | 313 | | Report: Describe the optimization | 314 | | Report: demonstrate `nvprof` profiling the execution | 315 | | Report: use NVVP to analyze your optimization | 316 | | Use `rai -p --queue rai_amd64_ece408 --submit=m4` to mark your job for grading | 317 | 318 | ### 3.1 Add three GPU Optimization 319 | 320 | For this milestone, you should attempt at least three GPU optimizations (see [optimizations](#optimizations)). 321 | 322 | Describe the optimizations in your `report.pdf`. 323 | 324 | ### 3.2 Performance Analysis with `nvprof` and NVVP 325 | 326 | Use the NVIDIA Visual Profiler and your analysis information to describe the effect that your optimizations had on the performance of your convolution. 327 | If possible, you should try to separate the effect of each optimization in your analysis. 328 | 329 | Use 330 | 331 | rai -p --queue rai_amd64_ece408 --submit=m4 332 | 333 | to submit your project folder. 334 | 335 | ## Final Submission 336 | 337 | Due December 19 @ 5pm 338 | 339 | | Deliverables | 340 | | ------------ | 341 | | Everything from Milestone 4 | 342 | | Implement final GPU optimizations | 343 | | Report: Describe and analyze the optimizations | 344 | | Report: demonstrate `nvprof` profiling the execution | 345 | | Use `rai -p --queue rai_amd64_ece408 --submit=final` to mark your job for grading | 346 | 347 | ### Optimized Layer 348 | 349 | Optimize your GPU convolution (see [optimizations](#optimizations)). 350 | 351 | Your implementation must work with `rai -p --queue rai_amd64_ece408 --submit=final`. 352 | This means all your source files must be in `ece408_src`, and your implementation must work when they are copied to `src/operator/custom` in the MXNet tree, and `make` is invoked on the MXNet tree. 353 | This is done in the provided `rai_build.yml`. 354 | Likewise, the provided `final.py` provides an example of the script that will be used to time your implementation. 355 | 356 | All of your code for this and the later milestones must be executed between `auto start = ...` and `auto end = ...` in `new-inl.h`. 357 | The easiest way to ensure this is that all of your code should be in `forward()` or called by `forward()` from `new-forward.cuh` or `new-forward.h`. 358 | Do not modify any timing-related code. 359 | 360 | Use `rai -p --queue rai_amd64_ece408 --submit=final` to submit your project folder. 361 | 362 | ### Final Report 363 | 364 | You've been building this final report through all the milestones. 365 | Keep the content from the earlier milestones, but be sure to include the following: 366 | 367 | * Your team name 368 | * Your team member names 369 | * your netids 370 | * your UINs 371 | 372 | The final report should include at least the following information for each optimization 373 | 374 | 1. **Optimization Approach and Results** 375 | * how you identified the optimization opportunity 376 | * why you thought the approach would be fruitful 377 | * the effect of the optimization. 
was it fruitful, and why or why not. Use nvprof and NVVP to justify your explanation. 378 | * Any external references used during identification or development of the optimization 379 | * How your team organized and divided up this work. 380 | 2. **References** (as needed) 381 | 3. **(Optional) Suggestions for Improving Next Year** 382 | 383 | ### Rubric 384 | 385 | The overall project score will be computed as follows: 386 | 387 | 1. Milestone 1 ( 5% ) 388 | 2. Milestone 2 ( 10% ) 389 | 3. Milestone 3 ( 10% ) 390 | 4. Milestone 4 ( 30% ) 391 | * Optimization 1 ( 10% ) 392 | * Optimization 2 ( 10% ) 393 | * Optimization 3 ( 10% ) 394 | 5. Final Optimizations ( 30% ) 395 | * Optimization 4 ( 10% ) 396 | * Optimization 5 ( 10% ) 397 | * Optimization 6 ( 10% ) 398 | * Additional Optimizations / detailed insights ( up to +10% extra!!! ) 399 | 6. Performance Ranking ( 10% ) 400 | 7. Report Style (5 %) 401 | * Clear, concise writing, good layout, and good organization will be rewarded. 402 | 403 | Each optimization will be graded as follows: 404 | 405 | 1. Explanation of Performance Impact ( 40% ) 406 | 2. Correctness ( 60% ) 407 | 408 | The Performance Ranking will be graded as follows: 409 | 410 | 1. The median performance will be determined (how well the class did as a whole) 411 | 2. Your performance will be converted to a number of standard deviations above/below that median (how well you did compared to the class). 412 | 3. That value will be linearly mapped into the space of 0-10 to determine the ranking grade. 413 | 414 | The ranking is determined by the total run time of the two layer invocations. 415 | If your implementation is not correct, you will get a 0 for this component of the grade. 416 | The `rai ranking` command is not the final word: the staff will re-run all final submissions multiple times and choose the fastest result as your time. 417 | THe ranking is determined solely by the values printed by `Op Time:` during your run. 418 | That `Op Time` is computed by wrapping the MXNet op that you implement in a timer. 419 | 420 | ## Optimizations 421 | 422 | We are going to suggest a set of possible optimizations for you to attempt. 423 | 424 | * Unroll + shared-memory Matrix multiply 425 | * Shared Memory convolution 426 | * Kernel fusion for unrolling and matrix-multiplication 427 | * Weight matrix (kernel values) in constant memory 428 | * Tuning with restrict and loop unrolling (considered as one optimization only if you do both) 429 | * An advanced matrix multiplication algorithm (register-tiled, for example) 430 | * Sweeping various parameters to find best values (block sizes, amount of thread coarsening) 431 | * Exploiting parallelism in input images, input channels, and output channels. 432 | * Multiple kernel implementations for different layer sizes 433 | * Input channel reduction: tree 434 | * Input channel reduction: atomics 435 | * ... 436 | 437 | Other optimizations that do not fit in here may also be considered as optimizations. 438 | If in doubt, contact the course staff. 439 | 440 | ## Extras 441 | 442 | ### Checking for Errors 443 | 444 | Within MXNet, you can use `MSHADOW_CUDA_CALL(...);` as is done in `new-forward.cuh`. 445 | Or, you can define a macro/function similar to `wbCheck` used in WebGPU. 446 | 447 | ### Profiling 448 | 449 | You can gather detailed GPU profile information with `nvprof` and view that information with `nvvp`. 
450 | 451 | You can see some simple information like so (as we did in milestone 1): 452 | 453 | nvprof 454 | 455 | You can gather a timeline file like the following: 456 | 457 | nvprof -o timeline.nvprof 458 | 459 | This will generate timeline.nvprof. 460 | 461 | You can additionally gather some detailed performance metrics. 462 | 463 | nvprof -o timeline.nvprof 464 | nvprof --kernels "::forward:1" --analysis-metrics -o forward1_analysis.nvprof 465 | nvprof --kernels "::forward:2" --analysis-metrics -o forward2_analysis.nvprof 466 | 467 | This will generate `timeline.nvprof` and `*analysis.nvprof`. 468 | `--analysis-metrics` significantly slows the run time, you may wish to modify the python scripts to run on smaller datasets during this profiling. 469 | 470 | You will need to follow the link rai prints after the execution to retrieve these files. 471 | You can use the NVIDIA Visual Profiler (nvvp) to import those files. 472 | You will need to install nvvp on your own machine. It can be downloaded as part of the CUDA SDK. 473 | 474 | To import the files: 475 | * File > import > select nvprof > next > single process > next 476 | * timeline data file should be your timeline.nvprof 477 | * event/metrics data file should be your analysis.nvprof. 478 | * finish 479 | 480 | ### NVVP on EWS 481 | 482 | The process will be similar for any machine without an NVIDIA GPU (like your linux laptop). 483 | 484 | If you wish to install it on Windows or macOS, the CUDA Toolkit installer may partially fail if you do not have an NVIDIA GPU. 485 | The teaching staff doesn't support this, but you may be able to figure it out. 486 | 487 | Establish an ssh session with x-forwarding 488 | 489 | ssh -Y @linux.ews.illinois.edu 490 | 491 | Download CUDA toolkit for CentOS 7 and install to `~/software/cuda-10.0` (You may choose a different location). 492 | This takes a while (1GB+ download and install). 493 | 494 | mkdir -p $HOME/software \ 495 | && wget https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux -O cuda10.run \ 496 | && chmod +x cuda10.run \ 497 | && ./cuda10.run --silent --toolkit --toolkitpath=$HOME/software/cuda-10.0 498 | 499 | Free up your EWS space (I'm not sure what the disk quotas are) 500 | 501 | rm cuda10.run 502 | 503 | Optional: modify .bashrc to add `~/software/cuda-10.0/bin` to your path. Or, just run it directly 504 | 505 | ~/software/cuda-10.0/bin/nvvp & 506 | 507 | ### Comparing GPU implementation to CPU implementation 508 | 509 | It may be hard to directly debug by inspecting values during the forward pass since the weights are already trained and the input data is from a real dataset. 510 | You can always extract your implementations into a separate set of files, generate your own test data, and modify `rai_build.yml` to build execute your separate test code instead of the MXNet code while developing. 511 | 512 | A simple code is provided in `build_example`. You could modify the `build` step of rai_build.yml in the following way to compile and run it: 513 | 514 | commands: 515 | build: 516 | - echo "Building arbitrary code" 517 | - make -C /src/build_example 518 | - echo "Running compiled code" 519 | - /src/build_example/main 520 | 521 | ### Offline Development 522 | 523 | If you'd like to develop using a local copy of MXNet, you may do so. Keep in mind your project will be evaluated through rai. Your submission must work through rai. 524 | 525 | Let's use the following directory structure for these instructions. 
The directories will be created each step along the way. 526 | 527 | 528 | ├── fashion-mnist 529 | ├── incubator-mxnet 530 | ├── m1.1.py 531 | ├── m1.2.py 532 | ├── m2.1.py 533 | ├── m3.1.py 534 | ├── m4.1.py 535 | └── models 536 | 537 | The MXNet instructions are available [here](https://mxnet.incubator.apache.org/get_started/install.html). A short form of them follows for Ubuntu. 538 | 539 | # install mxnet prereqs 540 | sudo apt install -y build-essential git libopenblas-dev liblapack-dev libopencv-dev python-pip python-dev python-setuptools python-numpy 541 | # download MXNet release 1.3.0 542 | git clone --single-branch --depth 1 --branch v1.3.0 --recursive https://github.com/apache/incubator-mxnet 543 | # build MXNet 544 | nice -n20 make -C incubator-mxnet -j`nproc` USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 USE_BLAS=openblas 545 | # install python bindings 546 | pip2 install --user -e incubator-mxnet/python 547 | 548 | You can always uninstall the python package with 549 | 550 | pip2 uninstall mxnet 551 | 552 | The training dataset is a modified version of the mxnet dataset. The scripts to generate it are written in python3 553 | 554 | # install data-generation prereqs 555 | sudo apt install python3 python3-pip 556 | pip3 install --user numpy scikit-image 557 | mkdir -p fashion-mnist 558 | wget -P fashion-mnist \ 559 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/scripts/generate-data.py \ 560 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/scripts/reader.py 561 | 562 | Run the generation script. It will download the fashion-mnist dataset and resize it, which may take a few minutes and consume a few hundred megabytes of disk space 563 | 564 | chmod +x fashion-mnist/generate-data.py 565 | fashion-mnist/generate-data.py fashion-mnist 566 | 567 | Download the trained models (for the existing MXNet implementation and your implementation) using 568 | 569 | mkdir -p models \ 570 | && wget -P models \ 571 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/models/baseline-0002.params \ 572 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/models/baseline-symbol.json \ 573 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/models/ece408-002.params \ 574 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/models/ece408-symbol.json 575 | 576 | Download the scripts we use for evaluation (needs to be modified to use 74x74 input image size) 577 | 578 | wget \ 579 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/scripts/m1.1.py \ 580 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/scripts/m1.2.py \ 581 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/scripts/m2.1.py \ 582 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/scripts/m3.1.py \ 583 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/scripts/m4.1.py 584 | 585 | 586 | Download the skeleton source files into incubator-mxnet. This is also where you will put the skeleton code from `ece408_src`. 
587 | 588 | wget -P incubator-mxnet/src/operator/custom \ 589 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/ece408_src/new.cc \ 590 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/ece408_src/new.cu \ 591 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/ece408_src/new-inl.h 592 | 593 | Modify the python forward convolution scripts to point to where you downloaded fashion-mnist 594 | 595 | ... load_mnist(path="fashion-mnist", ...) 596 | 597 | Modify the python forward convolution scripts to point to where you downloaded the models 598 | 599 | lenet_model = mx.mod.Module.load(prefix='models/baseline' ... 600 | 601 | Build your modified MXNet 602 | 603 | cp <your ece408_src files> incubator-mxnet/src/operator/custom 604 | make -C incubator-mxnet USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 605 | 606 | 607 | ### Skeleton Code Description 608 | 609 | `new-forward.h` and `new-forward.cuh` contain skeleton implementations for CPU and GPU convolutions. You can complete the project by modifying only these two files. These functions are called from `Forward()` in `new-inl.h`. 610 | 611 | The code in `new-inl.h`, `new.cc`, and `new.cu` describes the convolution layer to MXNet. You should not modify these files. They are provided for your curiosity. 612 | As of rai 0.2.20, when you use the `--submit` flag, a golden version of these files from [here](https://github.com/cwpearson/2017fa_ece408_mxnet_docker/tree/master/ece408-src) is used. 613 | 614 | | File | Function | Description | 615 | | -- | -- | -- | 616 | | `new-forward.h` | `forward()` | Your CPU implementation goes here. | 617 | | `new-forward.cuh` | `forward()` | Your GPU host code goes here. | 618 | | `new-forward.cuh` | `forward_kernel()` | Your GPU kernel implementation goes here. | 619 | | -- | -- | -- | 620 | | `new-inl.h` | `InferShape()` | Computes shape of output tensor from input and kernel shape | 621 | | `new-inl.h` | `InferType()` | Computes type of the output tensor based on the inputs. | 622 | | `new-inl.h` | `Forward()` | Defines the operations of the forward pass. Calls our implementation. | 623 | | `new-inl.h` | `Backward()` | Defines the operations of the backward (training) pass. Not used in this project. | 624 | | `new-inl.h` | `struct NewParam` | Defines the arguments passed to the operator in python. | 625 | | `new.cc` | `CreateOperatorEx()` | Called by MXNet to create the appropriate operator for a CPU or GPU execution. | 626 | | `new.cc` | `CreateOp()` | Creates the CPU operator. | 627 | | `new.cu` | `CreateOp()` | Creates the GPU operator when CUDA is enabled. | 628 | 629 | The `x`, `y`, and `k` tensors constructed in `new-inl.h`/`Forward()` have the following data layout: 630 | 631 | | Tensor | Description | Data Layout | 632 | | -- | -- | -- | 633 | | `x` | Input data | batch size * input channels * y * x | 634 | | `y` | Output data | batch size * output channels * y * x | 635 | | `k` | Kernel weights | output channels * input channels * y * x | 636 | 637 | You can see this being constructed in `new-inl.h`/`InferShape()`. 638 | 639 | 640 | ### Installing CUDA locally 641 | 642 | The Docker container that we use to run your code runs on CUDA 10.0. 643 | To view the nvprof results, you need to install the CUDA toolkit locally. 644 | 645 | You can download the CUDA toolkit from: https://developer.nvidia.com/cuda-downloads. 646 | Follow the installation instructions. 647 | 648 | If you don't have a CUDA-enabled (NVIDIA) GPU, then don't install the driver. 
Just use the CUDA toolkit and it should work smoothly. 649 | If you are stuck on how to use it, please visit the TA office hours. 650 | 651 | We might consider updating the CUDA toolkit version inside the Docker container. We will let you know if we do. 652 | 653 | ## License 654 | 655 | NCSA/UIUC © 2018 [Carl Pearson](https://cwpearson.github.io) 656 | 657 | Modified in fall 2018 [Vikram](https://github.com/msharmavikram/) 658 | 659 | Last modified by Rui Lan and Zhichun Wan 660 | 661 | ## Final Report 662 | - [Final Report](https://github.com/leo811121/UIUC-CS-483-Parallel-Programming/blob/master/ece408_project/report.pdf) 663 | -------------------------------------------------------------------------------- /ece408_project/_gitignore: -------------------------------------------------------------------------------- 1 | *.pyc -------------------------------------------------------------------------------- /ece408_project/_gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leo811121/UIUC-CS-483-Parallel-Programming/0aa3d6097073c4dd5de7f9a52b54e9d230a4df4a/ece408_project/_gitmodules -------------------------------------------------------------------------------- /ece408_project/build_example/Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: main 3 | 4 | main: main.cu 5 | nvcc main.cu -o main 6 | 7 | .PHONY: clean 8 | clean: 9 | rm -f main -------------------------------------------------------------------------------- /ece408_project/build_example/main.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | int main(void) { 4 | fprintf(stdout, "Hello world\n"); 5 | 6 | int deviceCount = 0; 7 | cudaError_t error_id = cudaGetDeviceCount(&deviceCount); 8 | 9 | if (error_id != cudaSuccess) { 10 | printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); 11 | printf("Result = FAIL\n"); 12 | exit(EXIT_FAILURE); 13 | } else { 14 | printf("Found %d GPUs\n", deviceCount); 15 | } 16 | 17 | } -------------------------------------------------------------------------------- /ece408_project/ece408_src/new-forward.cuh: -------------------------------------------------------------------------------- 1 | 2 | #ifndef MXNET_OPERATOR_NEW_FORWARD_CUH_ 3 | #define MXNET_OPERATOR_NEW_FORWARD_CUH_ 4 | 5 | #include 6 | #define TILE_WIDTH 5 7 | #define TILE_WIDTH1 16 8 | #define TILE_WIDTH2 24 9 | 10 | #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] 11 | #define x4d(i3, i2, i1, i0) x[(i3) * (C * H * W) + (i2) * (H * W) + (i1) * (W) + i0] 12 | #define k4d(i3, i2, i1, i0) k[(i3) * (C * K * K) + (i2) * (K * K) + (i1) * (K) + i0] 13 | #define X_unroll(i2, i1, i0) X_unroll[(i2) * (C * K * K * H_out * W_out) + (i1) * (H_out * W_out) + i0] 14 | #define k_unroll(i1, i0) k_unroll[i1 * (C * K * K) + i0] 15 | #define xunroll3d(i2, i1, i0) x_unroll[(i2) * (C * K * K * H_out * W_out) + (i1) * (H_out * W_out) + i0] 16 | #define kunroll2d(i1, i0) k_unroll[(i1) * (C * K * K) + i0] 17 | 18 | #define TILE_SZ_A 128 19 | #define TILE_SZ_B 8 20 | 21 | #define kernel4d(i3, i2, i1, i0) kernel[(i3) * (C * K * K) + (i2) * (K * K) + (i1) * (K) + i0] 22 | 23 | 24 | namespace mxnet 25 | { 26 | namespace op 27 | { 28 | __constant__ float kernel[7200]; 29 | 30 | //baseline 31 | __global__ void forward_kernel(float *y, const float *x, const float *k, const int B, const int M, const int C, const 
int H, const int W, const int K) 32 | { 33 | 34 | const int H_out = H - K + 1; 35 | const int W_out = W - K + 1; 36 | 37 | int W_grid = ceil((float)W_out / TILE_WIDTH); 38 | const int b = blockIdx.x; 39 | const int m = blockIdx.y; 40 | const int h = blockIdx.z / W_grid * TILE_WIDTH + threadIdx.y; 41 | const int w = blockIdx.z % W_grid * TILE_WIDTH + threadIdx.x; 42 | if(h < H_out && w < W_out){ 43 | float val = 0; 44 | //#pragma unroll 2 45 | for(int c = 0; c < C; c++){ 46 | //#pragma unroll 5 47 | for(int p = 0; p < K; p++){ 48 | //#pragma unroll 5 49 | for(int q = 0; q < K; q++){ 50 | //val += x4d(b, c, h+p, w+q) * k4d(m, c, p, q); 51 | val += x4d(b, c, h+p, w+q) * kernel4d(m, c, p, q); 52 | } 53 | } 54 | } 55 | y4d(b, m, h, w) = val; 56 | } 57 | } 58 | 59 | 60 | //kernel for first convolution layer 61 | __global__ void forward_kernel1(float * __restrict__ y, const float * __restrict__ x, const float * __restrict__ k, const int B, const int M, const int C, const int H, const int W, const int K) 62 | { 63 | 64 | const int H_out = H - K + 1; 65 | const int W_out = W - K + 1; 66 | 67 | //int W_grid = ceil((float)W_out / TILE_WIDTH); 68 | int W_grid = ceil((float)W_out / 16); 69 | const int b = blockIdx.x; 70 | const int m = blockIdx.y; 71 | //const int h = blockIdx.z / W_grid * TILE_WIDTH + threadIdx.y; 72 | //const int w = blockIdx.z % W_grid * TILE_WIDTH + threadIdx.x; 73 | const int h = blockIdx.z / W_grid * 16 + threadIdx.y; 74 | const int w = blockIdx.z % W_grid * 16 + threadIdx.x; 75 | if(h < H_out && w < W_out){ 76 | float val = 0; 77 | //#pragma unroll 2 78 | //for(int c = 0; c < C; c++){ 79 | #pragma unroll 5 80 | for(int p = 0; p < K; p++){ 81 | #pragma unroll 5 82 | for(int q = 0; q < K; q++){ 83 | //val += x4d(b, c, h+p, w+q) * k4d(m, c, p, q); 84 | //val += x4d(b, c, h+p, w+q) * kernel4d(m, c, p, q); 85 | val += x4d(b, 0, h+p, w+q) * kernel4d(m, 0, p, q); 86 | } 87 | } 88 | //} 89 | y4d(b, m, h, w) = val; 90 | } 91 | } 92 | 93 | 94 | //kernel for second convolution layer 95 | __global__ void forward_kernel2(float* /*__restrict__*/ y, const float* /*__restrict__*/ x, const float* __restrict__ k, const int B, const int M, const int C, const int H, const int W, const int K) 96 | { 97 | 98 | const int H_out = H - K + 1; 99 | const int W_out = W - K + 1; 100 | 101 | int W_grid = ceil((float)W_out / TILE_WIDTH); 102 | const int b = blockIdx.x; 103 | const int m = blockIdx.y; 104 | const int h = blockIdx.z / W_grid * TILE_WIDTH + threadIdx.y; 105 | const int w = blockIdx.z % W_grid * TILE_WIDTH + threadIdx.x; 106 | if(h < H_out && w < W_out){ 107 | float val = 0; 108 | #pragma unroll 12 109 | for(int c = 0; c < C; c++){ 110 | #pragma unroll 5 111 | for(int p = 0; p < K; p++){ 112 | #pragma unroll 5 113 | for(int q = 0; q < K; q++){ 114 | //val += x4d(b, c, h+p, w+q) * k4d(m, c, p, q); 115 | val += x4d(b, c, h+p, w+q) * kernel4d(m, c, p, q); 116 | } 117 | } 118 | } 119 | y4d(b, m, h, w) = val; 120 | } 121 | } 122 | 123 | 124 | __global__ void unroll_x(const int C, const int H, const int W, const int K, const float* x, float* X_unroll, const int B, int B_pre) 125 | { 126 | int t = blockIdx.x * blockDim.x + threadIdx.x; 127 | int b = blockIdx.y * blockDim.y + threadIdx.y; 128 | int c, s, h_out, w_out, w_unroll, h_unroll, h_base, p, q; 129 | int H_out = H - K + 1; 130 | int W_out = W - K + 1; 131 | int out_number = H_out * W_out; 132 | if (t < C * out_number && b < (B-B_pre)) { 133 | c = t / out_number; 134 | s = t % out_number; 135 | h_out = s / W_out; 136 | w_out = s % W_out; 137 | 
w_unroll = h_out * W_out + w_out; 138 | h_base = c * K * K; 139 | 140 | 141 | for(p = 0; p < K; p++){ 142 | for(q = 0; q < K; q++) { 143 | h_unroll = h_base + p * K + q; 144 | X_unroll(b, h_unroll, w_unroll) = x4d(b+B_pre, c, h_out + p, w_out + q); 145 | } 146 | } 147 | 148 | } 149 | } 150 | 151 | __global__ void matrixMultiply(float *A, float *B, float *C, int numARows, 152 | int numAColumns, int numBRows, 153 | int numBColumns, int numCRows, 154 | int numCColumns, int batch, int B_pre) { 155 | 156 | int Row = blockIdx.y * blockDim.y + threadIdx.y; 157 | int Col = blockIdx.x * blockDim.x + threadIdx.x; 158 | int Layer = blockIdx.z * blockDim.z + threadIdx.z; 159 | 160 | int tx = threadIdx.x; 161 | int ty = threadIdx.y; 162 | 163 | __shared__ float tileA[TILE_WIDTH][TILE_WIDTH]; 164 | __shared__ float tileB[TILE_WIDTH][TILE_WIDTH]; 165 | 166 | int numTiles = numAColumns/TILE_WIDTH; 167 | if (numAColumns%TILE_WIDTH) numTiles++; 168 | 169 | float CVal = 0.0; 170 | 171 | 172 | for(int a = 0; a < numTiles; ++a){ 173 | 174 | if(Row < numARows && a * TILE_WIDTH + tx < numAColumns){ 175 | tileA[ty][tx] = A[Row * numAColumns + a * TILE_WIDTH + tx]; 176 | } 177 | 178 | else tileA[ty][tx] = 0.0; 179 | 180 | if(a * TILE_WIDTH + ty < numBRows && Col < numBColumns){ 181 | tileB[ty][tx] = B[(Layer) * numBColumns * numBRows + (a * TILE_WIDTH + ty) * numBColumns + Col]; 182 | } 183 | else tileB[ty][tx] = 0.0; 184 | __syncthreads(); 185 | 186 | for(int k = 0; k < TILE_WIDTH; ++k){ 187 | CVal += tileA[ty][k] * tileB[k][tx]; 188 | } 189 | __syncthreads(); 190 | 191 | } 192 | if(Row < numCRows && Col < numCColumns && Layer < (batch-B_pre)){ 193 | C[(Layer+B_pre) * numCColumns * numCRows + Row * numCColumns + Col] = CVal; 194 | } 195 | } 196 | 197 | __global__ void reg_matrixMultiply(float *A, float *B, float *C, int numARows, 198 | int numAColumns, int numBRows, 199 | int numBColumns, int numCRows, 200 | int numCColumns, int batch, int B_pre) { 201 | 202 | __shared__ float tile[TILE_SZ_A]; 203 | 204 | int ty = threadIdx.y; 205 | int row = ty + blockIdx.y*blockDim.y; 206 | int Layer = blockIdx.z * blockDim.z + threadIdx.z; 207 | float reg = 0.0; 208 | float Pvalues[TILE_SZ_B]={0}; 209 | 210 | for(int ph=0;ph 491 | void forward(mshadow::Tensor &y, const mshadow::Tensor &x, const mshadow::Tensor &w) 492 | { 493 | 494 | // Extract the tensor dimensions into B,M,C,H,W,K 495 | // ... 
496 | 497 | const int B = x.shape_[0]; 498 | const int C = x.shape_[1]; 499 | const int H = x.shape_[2]; 500 | const int W = x.shape_[3]; 501 | 502 | const int M = y.shape_[1]; 503 | const int K = w.shape_[3]; 504 | 505 | int H_out = H - K + 1; 506 | int W_out = W - K + 1; 507 | 508 | //printf("Hout= %d, WOUT = %d, K = %d , M = %d\n", H_out, W_out, K, M); 509 | 510 | 511 | /*********************************different layer ***********************************************************/ 512 | 513 | if(C==1){ 514 | 515 | //unroll directly to share memory 516 | dim3 gridDim1(ceil(H_out*W_out/(1.0*TILE_WIDTH1)),ceil(M/(1.0*TILE_WIDTH1)),B); 517 | dim3 blockDim1(TILE_WIDTH1,TILE_WIDTH1,1); 518 | unrolltoshare1<<>>(C, K, M, H, W ,W_out, H_out, x.dptr_, w.dptr_, y.dptr_); 519 | 520 | /* 521 | int W_grid = ceil(1.0*W_out / 16); 522 | int H_grid = ceil(1.0*H_out / 16); 523 | 524 | int Z = H_grid * W_grid; 525 | 526 | cudaMemcpyToSymbol(kernel, w.dptr_, w.shape_[0]*w.shape_[1]*w.shape_[2]*w.shape_[3]* sizeof(float), 0, cudaMemcpyDeviceToDevice); 527 | // Set the kernel dimensions 528 | dim3 blockDim(16, 16, 1); 529 | dim3 gridDim(B, M, Z); 530 | 531 | // Call the kernel 532 | forward_kernel1<<>>(y.dptr_,x.dptr_,w.dptr_,B,M,C,H,W,K); 533 | */ 534 | 535 | } 536 | 537 | else if(C==12){ 538 | 539 | //unroll directly to share memory 540 | dim3 gridDim2(ceil(H_out*W_out/(1.0*TILE_WIDTH2)),ceil(M/(1.0*TILE_WIDTH2)),B); 541 | dim3 blockDim2(TILE_WIDTH2,TILE_WIDTH2,1); 542 | unrolltoshare2<<>>(C, K, M, H, W ,W_out, H_out, x.dptr_, w.dptr_, y.dptr_); 543 | 544 | /* 545 | int W_grid = ceil(1.0*W_out / 32); 546 | int H_grid = ceil(1.0*H_out / 32); 547 | 548 | int Z = H_grid * W_grid; 549 | 550 | //constant memory 551 | cudaMemcpyToSymbol(kernel, w.dptr_, w.shape_[0]*w.shape_[1]*w.shape_[2]*w.shape_[3]* sizeof(float), 0, cudaMemcpyDeviceToDevice); 552 | // Set the kernel dimensions 553 | dim3 blockDim(32, 32, 1); 554 | dim3 gridDim(B, M, Z); 555 | 556 | // Call the kernel 557 | forward_kernel2<<>>(y.dptr_,x.dptr_,w.dptr_,B,M,C,H,W,K); 558 | */ 559 | } 560 | 561 | 562 | 563 | /*********************************unroll + share memory multiplication***********************************************************/ 564 | /* 565 | //parameters for matrix multiply 566 | int numARows; 567 | int numAColumns; 568 | int numBRows; 569 | int numBColumns; 570 | int numCRows; 571 | int numCColumns; 572 | 573 | numARows = M; 574 | numAColumns = C *K *K; 575 | 576 | numBRows = C *K * K; 577 | numBColumns = H_out * W_out; 578 | 579 | numCRows = numARows; 580 | numCColumns = numBColumns; 581 | 582 | //unroll x & matrix multi. 
583 | //part1 584 | int B_curr = 4000; 585 | int B_pre = 0; 586 | float *x_unroll; 587 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 588 | dim3 unrollGrid(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 589 | dim3 unrollBlock(TILE_WIDTH, TILE_WIDTH, 1); 590 | unroll_x<<>>(C, H, W, K, x.dptr_, x_unroll, B_curr, B_pre); 591 | 592 | dim3 matrixGrid(numCColumns/TILE_WIDTH,numCRows/TILE_WIDTH,(B_curr-B_pre)); 593 | if (numCColumns%TILE_WIDTH) matrixGrid.x++; 594 | if (numCRows%TILE_WIDTH) matrixGrid.y++; 595 | dim3 matrixBlock(TILE_WIDTH,TILE_WIDTH,1); 596 | matrixMultiply<<>>(w.dptr_, x_unroll, y.dptr_, numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns,B_curr, B_pre); 597 | cudaFree(x_unroll); 598 | 599 | //part2 600 | B_curr = 8000; 601 | B_pre = 4000; 602 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 603 | dim3 unrollGrid_2(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 604 | dim3 unrollBlock_2(TILE_WIDTH, TILE_WIDTH, 1); 605 | unroll_x<<>>(C, H, W, K, x.dptr_, x_unroll, B_curr, B_pre); 606 | 607 | dim3 matrixGrid2(numCColumns/TILE_WIDTH,numCRows/TILE_WIDTH,(B_curr-B_pre)); 608 | if (numCColumns%TILE_WIDTH) matrixGrid2.x++; 609 | if (numCRows%TILE_WIDTH) matrixGrid2.y++; 610 | //dim3 matrixGrid(ceil((float)numCColumns/TILE_WIDTH),ceil((float)numCRows/TILE_WIDTH,(B_curr-B_pre))); 611 | dim3 matrixBlock2(TILE_WIDTH,TILE_WIDTH,1); 612 | matrixMultiply<<>>(w.dptr_, x_unroll, y.dptr_, numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns,B_curr, B_pre); 613 | cudaFree(x_unroll); 614 | 615 | //part3 616 | B_curr = 10000; 617 | B_pre = 8000; 618 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 619 | dim3 unrollGrid_3(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 620 | dim3 unrollBlock_3(TILE_WIDTH, TILE_WIDTH, 1); 621 | unroll_x<<>>(C, H, W, K, x.dptr_, x_unroll, B_curr, B_pre); 622 | 623 | dim3 matrixGrid3(numCColumns/TILE_WIDTH,numCRows/TILE_WIDTH,(B_curr-B_pre)); 624 | if (numCColumns%TILE_WIDTH) matrixGrid3.x++; 625 | if (numCRows%TILE_WIDTH) matrixGrid3.y++; 626 | //dim3 matrixGrid(ceil((float)numCColumns/TILE_WIDTH),ceil((float)numCRows/TILE_WIDTH,(B_curr-B_pre))); 627 | dim3 matrixBlock3(TILE_WIDTH,TILE_WIDTH,1); 628 | matrixMultiply<<>>(w.dptr_, x_unroll, y.dptr_, numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns,B_curr, B_pre); 629 | cudaFree(x_unroll); 630 | */ 631 | /**********************************************unroll + register tiling multiplication*****************************************************************************************************************/ 632 | 633 | /* 634 | //parameters for matrix multiply 635 | int numARows; 636 | int numAColumns; 637 | int numBRows; 638 | int numBColumns; 639 | int numCRows; 640 | int numCColumns; 641 | 642 | numARows = M; 643 | numAColumns = C *K *K; 644 | 645 | numBRows = C *K * K; 646 | numBColumns = H_out * W_out; 647 | 648 | numCRows = numARows; 649 | numCColumns = numBColumns; 650 | 651 | //reg tiling matrix multiply 652 | //part1 653 | 654 | int B_curr = 4000; 655 | int B_pre = 0; 656 | float *x_unroll; 657 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 658 | dim3 unrollGrid(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 659 | dim3 unrollBlock(TILE_WIDTH, TILE_WIDTH, 1); 660 | 
unroll_x<<>>(C, H, W, K, x.dptr_, x_unroll, B_curr, B_pre); 661 | 662 | dim3 tileGrid((numCColumns-1)/TILE_SZ_B +1,(numCRows-1)/TILE_SZ_A+1,(B_curr-B_pre)); 663 | dim3 tileBlock(1,TILE_SZ_A,1); 664 | reg_matrixMultiply<<>>(w.dptr_, x_unroll, y.dptr_, numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns,B_curr, B_pre); 665 | cudaFree(x_unroll); 666 | 667 | //part2 668 | B_curr = 8000; 669 | B_pre = 4000; 670 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 671 | dim3 unrollGrid2(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 672 | dim3 unrollBlock2(TILE_WIDTH, TILE_WIDTH, 1); 673 | unroll_x<<>>(C, H, W, K, x.dptr_, x_unroll, B_curr, B_pre); 674 | 675 | dim3 tileGrid2((numCColumns-1)/TILE_SZ_B +1,(numCRows-1)/TILE_SZ_A+1,(B_curr-B_pre)); 676 | dim3 tileBlock2(1,TILE_SZ_A,1); 677 | reg_matrixMultiply<<>>(w.dptr_, x_unroll, y.dptr_, numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns,B_curr, B_pre); 678 | cudaFree(x_unroll); 679 | 680 | //part3 681 | B_curr = 10000; 682 | B_pre = 8000; 683 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 684 | dim3 unrollGrid3(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 685 | dim3 unrollBlock3(TILE_WIDTH, TILE_WIDTH, 1); 686 | unroll_x<<>>(C, H, W, K, x.dptr_, x_unroll, B_curr, B_pre); 687 | 688 | dim3 tileGrid3((numCColumns-1)/TILE_SZ_B +1,(numCRows-1)/TILE_SZ_A+1,(B_curr-B_pre)); 689 | dim3 tileBlock3(1,TILE_SZ_A,1); 690 | reg_matrixMultiply<<>>(w.dptr_, x_unroll, y.dptr_, numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns,B_curr, B_pre); 691 | cudaFree(x_unroll); 692 | */ 693 | /************************************************fusion unroll sharmemory multiplication*************************************************************************/ 694 | 695 | /* 696 | //parameters for matrix multiply 697 | int numARows; 698 | int numAColumns; 699 | int numBRows; 700 | int numBColumns; 701 | int numCRows; 702 | int numCColumns; 703 | 704 | numARows = M; 705 | numAColumns = C *K *K; 706 | 707 | numBRows = C *K * K; 708 | numBColumns = H_out * W_out; 709 | 710 | numCRows = numARows; 711 | numCColumns = numBColumns; 712 | 713 | 714 | //fusion 715 | //part1 716 | float *x_unroll; 717 | int B_curr = 4000; 718 | int B_pre = 0; 719 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 720 | dim3 unrollGrid(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 721 | dim3 unrollBlock(TILE_WIDTH, TILE_WIDTH, 1); 722 | dim3 fusionGrid(unrollGrid.x,unrollGrid.y, B_curr-B_pre); 723 | dim3 fusionBlock(TILE_WIDTH,TILE_WIDTH, 1); 724 | fusion<<>>(C, H, W, K, M, B_curr, B_pre, x.dptr_, y.dptr_, w.dptr_, x_unroll,numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns); 725 | cudaFree(x_unroll); 726 | 727 | //part2 728 | B_curr = 8000; 729 | B_pre = 4000; 730 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 731 | dim3 unrollGrid2(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 732 | dim3 unrollBlock2(TILE_WIDTH, TILE_WIDTH, 1); 733 | dim3 fusionGrid2(unrollGrid2.x,unrollGrid2.y, B_curr-B_pre); 734 | dim3 fusionBlock2(TILE_WIDTH,TILE_WIDTH, 1); 735 | fusion<<>>(C, H, W, K, M, B_curr, B_pre, x.dptr_, y.dptr_, w.dptr_, x_unroll,numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns); 736 | cudaFree(x_unroll); 737 | 738 | //part3 739 | //part3 
740 | B_curr = 10000; 741 | B_pre = 8000; 742 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 743 | dim3 unrollGrid3(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 744 | dim3 unrollBlock3(TILE_WIDTH, TILE_WIDTH, 1); 745 | dim3 fusionGrid3(unrollGrid3.x,unrollGrid3.y, B_curr-B_pre); 746 | dim3 fusionBlock3(TILE_WIDTH,TILE_WIDTH, 1); 747 | fusion<<>>(C, H, W, K, M, B_curr, B_pre, x.dptr_, y.dptr_, w.dptr_, x_unroll,numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns); 748 | cudaFree(x_unroll); 749 | MSHADOW_CUDA_CALL(cudaDeviceSynchronize()); 750 | */ 751 | } 752 | 753 | 754 | template 755 | void forward(mshadow::Tensor &y, const mshadow::Tensor &x, const mshadow::Tensor &w) 756 | { 757 | //CHECK_EQ(0,1) << "Remove this line and replace it with your implementation."; 758 | } 759 | } 760 | } 761 | 762 | #undef y4d 763 | #undef x4d 764 | #undef k4d 765 | #undef X_unroll 766 | #undef funroll2d 767 | #endif 768 | 769 | -------------------------------------------------------------------------------- /ece408_project/ece408_src/new-forward.h: -------------------------------------------------------------------------------- 1 | #ifndef MXNET_OPERATOR_NEW_FORWARD_H_ 2 | #define MXNET_OPERATOR_NEW_FORWARD_H_ 3 | 4 | #include 5 | 6 | namespace mxnet 7 | { 8 | namespace op 9 | { 10 | 11 | 12 | template 13 | void forward(mshadow::Tensor &y, const mshadow::Tensor &x, const mshadow::Tensor &k) 14 | { 15 | /* 16 | Modify this function to implement the forward pass described in Chapter 16. 17 | The code in 16 is for a single image. 18 | We have added an additional dimension to the tensors to support an entire mini-batch 19 | The goal here is to be correct, not fast (this is the CPU implementation.) 
20 | */ 21 | 22 | const int B = x.shape_[0]; 23 | const int M = y.shape_[1]; 24 | const int C = x.shape_[1]; 25 | const int H = x.shape_[2]; 26 | const int W = x.shape_[3]; 27 | const int K = k.shape_[3]; 28 | const int H_out = H - K + 1; const int W_out = W - K + 1; // output spatial dimensions 29 | for (int b = 0; b < B; ++b) { 30 | for(int m = 0; m < M; ++m){ 31 | for(int h = 0; h < H_out; ++h){ 32 | for(int w = 0; w < W_out; ++w){ 33 | y[b][m][h][w] = 0; 34 | for(int c = 0; c < C; ++c){ 35 | for(int p = 0; p < K; ++p){ 36 | for(int q = 0; q < K; ++q){ 37 | y[b][m][h][w] += x[b][c][h + p][w + q] * k[m][c][p][q]; 38 | } 39 | } 40 | } 41 | } 42 | } 43 | } 44 | } 45 | 46 | 47 | 48 | 49 | } 50 | } 51 | } 52 | 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /ece408_project/final.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import mxnet as mx 4 | import logging 5 | import sys 6 | from reader import load_mnist 7 | 8 | MODEL_DIR = "/models" 9 | model_prefix = "ece408" 10 | dataset_size = float("inf") 11 | 12 | if len(sys.argv) > 1: 13 | dataset_size = int(sys.argv[1]) 14 | if len(sys.argv) > 2: 15 | print "Usage:", sys.argv[0], "<dataset size>" 16 | print " <dataset size> = [0 - 10000]" 17 | sys.exit(-1) 18 | 19 | # Log to stdout for MXNet 20 | logging.getLogger().setLevel(logging.DEBUG) # logging to stdout 21 | 22 | print "Loading fashion-mnist data...", 23 | test_images, test_labels = load_mnist( 24 | path="/fashion-mnist", rows=70, cols=70, kind="t10k-70") 25 | print "done" 26 | 27 | # Reduce the size of the dataset, if desired 28 | dataset_size = max(0, min(dataset_size, 10000)) 29 | test_images = test_images[:dataset_size] 30 | test_labels = test_labels[:dataset_size] 31 | 32 | # Cap batch size at the size of our training data 33 | batch_size = len(test_images) 34 | 35 | # Get iterators that cover the dataset 36 | test_iter = mx.io.NDArrayIter( 37 | test_images, test_labels, batch_size) 38 | 39 | # Evaluate the network 40 | print "Loading model...", 41 | lenet_model = mx.mod.Module.load( 42 | prefix=MODEL_DIR + "/" + model_prefix, epoch=2, context=mx.gpu()) 43 | lenet_model.bind(data_shapes=test_iter.provide_data, 44 | label_shapes=test_iter.provide_label) 45 | print "done" 46 | 47 | print "New Inference" 48 | acc = mx.metric.Accuracy() 49 | lenet_model.score(test_iter, acc) 50 | print "Correctness:", acc.get()[1], "Model:", model_prefix 51 | -------------------------------------------------------------------------------- /ece408_project/m1.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import mxnet as mx 4 | import logging 5 | from reader import load_mnist 6 | 7 | # Log to stdout for MXNet 8 | logging.getLogger().setLevel(logging.DEBUG) # logging to stdout 9 | 10 | print "Loading fashion-mnist data...", 11 | test_images, test_labels = load_mnist( 12 | path="/fashion-mnist", rows=70, cols=70, kind="t10k-70") 13 | print "done" 14 | 15 | # Do everything in a single batch 16 | batch_size = len(test_images) 17 | 18 | # Get iterators that cover the dataset 19 | test_iter = mx.io.NDArrayIter( 20 | test_images, test_labels, batch_size) 21 | 22 | # Evaluate the network 23 | print "Loading model...", 24 | lenet_model = mx.mod.Module.load( 25 | prefix='/models/baseline', epoch=2, context=mx.cpu()) 26 | lenet_model.bind(data_shapes=test_iter.provide_data, 27 | label_shapes=test_iter.provide_label) 28 | print "done" 29 | 30 | print "New Inference" 31 | acc = mx.metric.Accuracy() 32 | lenet_model.score(test_iter, acc) 33 | 
print(acc) 34 | -------------------------------------------------------------------------------- /ece408_project/m1.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import mxnet as mx 4 | import logging 5 | from reader import load_mnist 6 | 7 | # Log to stdout for MXNet 8 | logging.getLogger().setLevel(logging.DEBUG) # logging to stdout 9 | 10 | print "Loading fashion-mnist data...", 11 | test_images, test_labels = load_mnist( 12 | path="/fashion-mnist", rows=70, cols=70, kind="t10k-70") 13 | print "done" 14 | 15 | # Do everything in a single batch 16 | batch_size = len(test_images) 17 | 18 | # Get iterators that cover the dataset 19 | test_iter = mx.io.NDArrayIter( 20 | test_images, test_labels, batch_size) 21 | 22 | # Evaluate the network 23 | print "Loading model...", 24 | lenet_model = mx.mod.Module.load( 25 | prefix='/models/baseline', epoch=2, context=mx.gpu()) 26 | lenet_model.bind(data_shapes=test_iter.provide_data, 27 | label_shapes=test_iter.provide_label) 28 | print "done" 29 | 30 | print "New Inference" 31 | acc = mx.metric.Accuracy() 32 | lenet_model.score(test_iter, acc) 33 | print(acc) 34 | -------------------------------------------------------------------------------- /ece408_project/m2.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import mxnet as mx 4 | import logging 5 | import sys 6 | from reader import load_mnist 7 | 8 | MODEL_DIR = "/models" 9 | model_prefix = "ece408" 10 | dataset_size = float("inf") 11 | 12 | if len(sys.argv) > 1: 13 | dataset_size = int(sys.argv[1]) 14 | if len(sys.argv) > 2: 15 | print "Usage:", sys.argv[0], "" 16 | print " = [0 - 10000]" 17 | sys.exit(-1) 18 | 19 | # Log to stdout for MXNet 20 | logging.getLogger().setLevel(logging.DEBUG) # logging to stdout 21 | 22 | print "Loading fashion-mnist data...", 23 | test_images, test_labels = load_mnist( 24 | path="/fashion-mnist", rows=70, cols=70, kind="t10k-70") 25 | print "done" 26 | 27 | # Reduce the size of the dataset, if desired 28 | dataset_size = max(0, min(dataset_size, 10000)) 29 | test_images = test_images[:dataset_size] 30 | test_labels = test_labels[:dataset_size] 31 | 32 | # Cap batch size at the size of our training data 33 | batch_size = len(test_images) 34 | 35 | # Get iterators that cover the dataset 36 | test_iter = mx.io.NDArrayIter( 37 | test_images, test_labels, batch_size) 38 | 39 | # Evaluate the network 40 | print "Loading model...", 41 | lenet_model = mx.mod.Module.load( 42 | prefix=MODEL_DIR + "/" + model_prefix, epoch=2, context=mx.cpu()) 43 | lenet_model.bind(data_shapes=test_iter.provide_data, 44 | label_shapes=test_iter.provide_label) 45 | print "done" 46 | 47 | print "New Inference" 48 | acc = mx.metric.Accuracy() 49 | lenet_model.score(test_iter, acc) 50 | print "Correctness:", acc.get()[1], "Model:", model_prefix 51 | -------------------------------------------------------------------------------- /ece408_project/m3.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import mxnet as mx 4 | import logging 5 | import sys 6 | from reader import load_mnist 7 | 8 | MODEL_DIR = "/models" 9 | model_prefix = "ece408" 10 | dataset_size = float("inf") 11 | 12 | if len(sys.argv) > 1: 13 | dataset_size = int(sys.argv[1]) 14 | if len(sys.argv) > 2: 15 | print "Usage:", sys.argv[0], "" 16 | print " = [0 - 10000]" 17 | sys.exit(-1) 18 | 19 | # Log to stdout for MXNet 
20 | logging.getLogger().setLevel(logging.DEBUG) # logging to stdout 21 | 22 | print "Loading fashion-mnist data...", 23 | test_images, test_labels = load_mnist( 24 | path="/fashion-mnist", rows=70, cols=70, kind="t10k-70") 25 | print "done" 26 | 27 | # Reduce the size of the dataset, if desired 28 | dataset_size = max(0, min(dataset_size, 10000)) 29 | test_images = test_images[:dataset_size] 30 | test_labels = test_labels[:dataset_size] 31 | 32 | # Cap batch size at the size of our training data 33 | batch_size = len(test_images) 34 | 35 | # Get iterators that cover the dataset 36 | test_iter = mx.io.NDArrayIter( 37 | test_images, test_labels, batch_size) 38 | 39 | # Evaluate the network 40 | print "Loading model...", 41 | lenet_model = mx.mod.Module.load( 42 | prefix=MODEL_DIR + "/" + model_prefix, epoch=2, context=mx.gpu()) 43 | lenet_model.bind(data_shapes=test_iter.provide_data, 44 | label_shapes=test_iter.provide_label) 45 | print "done" 46 | 47 | print "New Inference" 48 | acc = mx.metric.Accuracy() 49 | lenet_model.score(test_iter, acc) 50 | print "Correctness:", acc.get()[1], "Model:", model_prefix 51 | -------------------------------------------------------------------------------- /ece408_project/m4.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import mxnet as mx 4 | import logging 5 | import sys 6 | from reader import load_mnist 7 | 8 | MODEL_DIR = "/models" 9 | model_prefix = "ece408" 10 | dataset_size = float("inf") 11 | 12 | if len(sys.argv) > 1: 13 | dataset_size = int(sys.argv[1]) 14 | if len(sys.argv) > 2: 15 | print "Usage:", sys.argv[0], "" 16 | print " = [0 - 10000]" 17 | sys.exit(-1) 18 | 19 | # Log to stdout for MXNet 20 | logging.getLogger().setLevel(logging.DEBUG) # logging to stdout 21 | 22 | print "Loading fashion-mnist data...", 23 | test_images, test_labels = load_mnist( 24 | path="/fashion-mnist", rows=70, cols=70, kind="t10k-70") 25 | print "done" 26 | 27 | # Reduce the size of the dataset, if desired 28 | dataset_size = max(0, min(dataset_size, 10000)) 29 | test_images = test_images[:dataset_size] 30 | test_labels = test_labels[:dataset_size] 31 | 32 | # Cap batch size at the size of our training data 33 | batch_size = len(test_images) 34 | 35 | # Get iterators that cover the dataset 36 | test_iter = mx.io.NDArrayIter( 37 | test_images, test_labels, batch_size) 38 | 39 | # Evaluate the network 40 | print "Loading model...", 41 | lenet_model = mx.mod.Module.load( 42 | prefix=MODEL_DIR + "/" + model_prefix, epoch=2, context=mx.gpu()) 43 | lenet_model.bind(data_shapes=test_iter.provide_data, 44 | label_shapes=test_iter.provide_label) 45 | print "done" 46 | 47 | print "New Inference" 48 | acc = mx.metric.Accuracy() 49 | lenet_model.score(test_iter, acc) 50 | print "Correctness:", acc.get()[1], "Model:", model_prefix 51 | -------------------------------------------------------------------------------- /ece408_project/rai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leo811121/UIUC-CS-483-Parallel-Programming/0aa3d6097073c4dd5de7f9a52b54e9d230a4df4a/ece408_project/rai -------------------------------------------------------------------------------- /ece408_project/rai_build.yml: -------------------------------------------------------------------------------- 1 | rai: 2 | version: 0.2 3 | image: illinoisimpact/ece408_mxnet_docker:amd64-gpu-latest-fa19 4 | resources: 5 | cpu: 6 | architecture: amd64 7 | gpu: 8 | 
architecture: volta 9 | count: 1 10 | network: false 11 | commands: 12 | build: 13 | - /bin/bash -c "cp -rv /ece408_src/* /mxnet/src/operator/custom" # copy golden files to mxnet source tree 14 | - /bin/bash -c "cp -rv /src/* /build" # copy the project folder to /build so everything appears in the upload 15 | - /bin/bash -c "for src in ece408_src/*; do cp -v $src /mxnet/src/operator/custom/.; done" # copy source files to mxnet tree 16 | - nice -n20 make -C /mxnet # build mxnet 17 | - pip2 install --user -e /mxnet/python # install python bindings 18 | #- /usr/bin/time python m1.1.py # execute code 19 | #- /usr/bin/time python m1.2.py # execute code 20 | #- /usr/bin/time python m2.1.py # execute code 21 | #- /usr/bin/time python m3.1.py # execute code 22 | #- /usr/bin/time python m4.1.py # execute code 23 | #- /usr/bin/time python final.py # execute code 24 | #- nvprof python m1.2.py 25 | #- python m3.1.py 26 | #- python m3.1.py 100 27 | #- nvprof -o timeline.nvprof python m3.1.py 28 | #- nvprof --kernels "::forward:1" --analysis-metrics -o forward1_analysis.nvprof python m3.1.py 29 | #- nvprof --kernels "::forward:2" --analysis-metrics -o forward2_analysis.nvprof python m3.1.py 30 | - python m4.1.py 10000 31 | #- nvprof python m4.1.py 10000 32 | #- nvprof -o timeline.nvprof python m4.1.py 10000 33 | #- nvprof --kernels "::unrolltoshare1:1" --analysis-metrics -o unrolltoshare1.nvprof python m4.1.py 10000 34 | #- nvprof --kernels "::unrolltoshare2:1" --analysis-metrics -o unrolltoshare2.nvprof python m4.1.py 10000 35 | -------------------------------------------------------------------------------- /ece408_project/reader.py: -------------------------------------------------------------------------------- 1 | def load_mnist(path, rows, cols, kind): 2 | import os 3 | import gzip 4 | import numpy as np 5 | 6 | filters = 1 7 | 8 | """Load MNIST data from `path`""" 9 | labels_path = os.path.join(path, 10 | '%s-labels-idx1-ubyte.gz' 11 | % kind) 12 | images_path = os.path.join(path, 13 | '%s-images-idx3-ubyte.gz' 14 | % kind) 15 | 16 | with gzip.open(labels_path, 'rb') as lbpath: 17 | labels = np.frombuffer(lbpath.read(), dtype=np.uint8, 18 | offset=8) 19 | labels.reshape(len(labels)) 20 | 21 | with gzip.open(images_path, 'rb') as imgpath: 22 | images = np.frombuffer(imgpath.read(), dtype=np.uint8, 23 | offset=16).reshape(len(labels), filters, rows, cols) 24 | 25 | return images, labels 26 | 27 | 28 | def store_mnist(path, images, labels, kind): 29 | import os 30 | import gzip 31 | import numpy as np 32 | import struct 33 | 34 | """Store data to `path`""" 35 | labels_path = os.path.join(path, 36 | '%s-labels-idx1-ubyte.gz' 37 | % kind) 38 | images_path = os.path.join(path, 39 | '%s-images-idx3-ubyte.gz' 40 | % kind) 41 | 42 | with gzip.open(labels_path, 'wb') as lbpath: 43 | lbpath.write(struct.pack("i", 0)) # magic 44 | lbpath.write(struct.pack("i", labels.size)) # number of items (32b) 45 | lbpath.write(labels.tobytes()) 46 | 47 | with gzip.open(images_path, 'wb') as imgpath: 48 | imgpath.write(struct.pack("i", 0)) # magic number 49 | # number of images (32b) 50 | imgpath.write(struct.pack("i", images.shape[0])) 51 | # number of rows (32b) 52 | imgpath.write(struct.pack("i", images.shape[1])) 53 | # number of cols (32b) 54 | imgpath.write(struct.pack("i", images.shape[2])) 55 | imgpath.write(images.tobytes()) 56 | 57 | return images, labels 58 | -------------------------------------------------------------------------------- /ece408_project/report.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/leo811121/UIUC-CS-483-Parallel-Programming/0aa3d6097073c4dd5de7f9a52b54e9d230a4df4a/ece408_project/report.pdf --------------------------------------------------------------------------------
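A minimal usage sketch for reader.py's load_mnist, assuming the gzipped fashion-mnist archives live under /fashion-mnist inside the project's docker image, as the m*.py scripts expect; the path, rows, cols, and kind arguments mirror those scripts, and the rest is illustrative:

```python
# Python 2, matching the project's scripts: load the 70x70 fashion-mnist test split.
from reader import load_mnist

test_images, test_labels = load_mnist(
    path="/fashion-mnist", rows=70, cols=70, kind="t10k-70")

# load_mnist returns images shaped (N, 1, 70, 70) and labels shaped (N,), both uint8.
print test_images.shape, test_labels.shape
```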