├── .gitignore ├── MP1 ├── MP1.CU └── README.md ├── MP2 ├── MP2.CU └── README.md ├── MP3 ├── MP3.CU └── README.md ├── MP4 ├── MP4.CU └── README.md ├── MP5.1 ├── MP5.1.CU └── README.md ├── MP5.2 ├── MP5.2.CU └── README.md ├── MP6 ├── MP6.CU └── README.md ├── MP7 ├── MP7.CU └── README.md ├── README.md └── ece408_project ├── .vscode └── settings.json ├── README.md ├── _gitignore ├── _gitmodules ├── build_example ├── Makefile └── main.cu ├── ece408_src ├── new-forward.cuh └── new-forward.h ├── final.py ├── m1.1.py ├── m1.2.py ├── m2.1.py ├── m3.1.py ├── m4.1.py ├── rai ├── rai_build.yml ├── reader.py └── report.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | *.i 2 | *.ii 3 | *.gpu 4 | *.ptx 5 | *.cubin 6 | *.fatbin 7 | -------------------------------------------------------------------------------- /MP1/MP1.CU: -------------------------------------------------------------------------------- 1 | // MP 1 2 | #include 3 | 4 | __global__ void vecAdd(float *in1, float *in2, float *out, int len) { 5 | //@@ Insert code to implement vector addition here 6 | int i = blockIdx.x * blockDim.x + threadIdx.x; 7 | if (i>> (deviceInput1, deviceInput2, deviceOutput, size); 53 | cudaDeviceSynchronize(); 54 | wbTime_stop(Compute, "Performing CUDA computation"); 55 | 56 | wbTime_start(Copy, "Copying output memory to the CPU"); 57 | //@@ Copy the GPU memory back to the CPU here 58 | cudaMemcpy(hostOutput, deviceOutput, size, cudaMemcpyDeviceToHost); 59 | 60 | wbTime_stop(Copy, "Copying output memory to the CPU"); 61 | 62 | wbTime_start(GPU, "Freeing GPU Memory"); 63 | //@@ Free the GPU memory here 64 | cudaFree(deviceInput1); cudaFree(deviceInput2); cudaFree(deviceOutput); 65 | wbTime_stop(GPU, "Freeing GPU Memory"); 66 | 67 | wbSolution(args, hostOutput, inputLength); 68 | 69 | free(hostInput1); 70 | free(hostInput2); 71 | free(hostOutput); 72 | 73 | return 0; 74 | } 75 | -------------------------------------------------------------------------------- /MP1/README.md: -------------------------------------------------------------------------------- 1 | # Vector Addition 2 | ## Objective 3 | The purpose of this lab is for you to become familiar with using the CUDA API by implementing a simple vector addition kernel and its associated host code as shown in the lectures. 4 | 5 | ## Prerequisites 6 | Before starting this lab, make sure that: 7 | 8 | - You have completed all week 1 lectures or videos 9 | 10 | - You have completed “Lab Tour with Device Query” MP 11 | 12 | - You have looked over the tutorial document. 13 | 14 | - Chapter 2 of the text book would also be helpful 15 | 16 | ## Instruction 17 | Edit the code in the ‘Code’ tab to perform the following: 18 | 19 | - Allocate device memory 20 | 21 | - Copy host memory to device 22 | 23 | - Initialize thread block and kernel grid dimensions 24 | 25 | - Invoke CUDA kernel 26 | 27 | - Copy results from device to host 28 | 29 | - Free device memory 30 | 31 | - Write the CUDA kernel 32 | 33 | - Instructions about where to place each part of the code is demarcated by the //@@ comment lines. 34 | 35 | ## Suggestions (for all labs) 36 | - The system’s autosave feature is not an excuse to not backup your code and answers to your questions regularly. 37 | 38 | - If you have not done so already, watch the tutorial video. 
39 | 40 | - Do not modify the template code provided -- only insert code where the //@@ demarcation is placed 41 | 42 | - Develop your solution incrementally and test each version thoroughly before moving on to the next version 43 | 44 | - Do not wait until the last minute to attempt the lab. 45 | 46 | - If you get stuck with boundary conditions, grab a pen and paper. It is much easier to figure out the boundary conditions there. 47 | 48 | - Implement the serial CPU version first, this will give you an understanding of the loops 49 | 50 | - Get the first dataset working first. The datasets are ordered so the first one is the easiest to handle 51 | 52 | - Make sure that your algorithm handles non-regular dimensional inputs (not square or multiples of 2). The slides may present the algorithm with nice inputs, since it minimizes the conditions. The datasets reflect different sizes of input that you are expected to handle 53 | 54 | - Make sure that you test your program using all the datasets provided (the datasets can be selected using the dropdown next to the submission button) 55 | 56 | - Check for errors: for example, when developing CUDA code, one can check for if the function call succeeded and print an error if not via the following macro: 57 | ``` 58 | #define wbCheck(stmt) do { \ 59 | cudaError_t err = stmt; \ 60 | if (err != cudaSuccess) { \ 61 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 62 | wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \ 63 | return -1; \ 64 | } \ 65 | } while(0) 66 | An example usage is wbCheck(cudaMalloc(...)). 67 | ``` 68 | -------------------------------------------------------------------------------- /MP2/MP2.CU: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define wbCheck(stmt) \ 4 | do { \ 5 | cudaError_t err = stmt; \ 6 | if (err != cudaSuccess) { \ 7 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 8 | wbLog(ERROR, "Got CUDA error ... 
", cudaGetErrorString(err)); \ 9 | return -1; \ 10 | } \ 11 | } while (0) 12 | 13 | // Compute C = A * B 14 | __global__ void matrixMultiply(float *A, float *B, float *C, int numARows, 15 | int numAColumns, int numBRows, 16 | int numBColumns, int numCRows, 17 | int numCColumns) { 18 | //@@ Insert code to implement matrix multiplication here 19 | int Row = blockIdx.y*blockDim.y + threadIdx.y; 20 | int Col = blockIdx.x*blockDim.x + threadIdx.x; 21 | 22 | 23 | if ((Row < numCRows) && (Col < numCColumns)) { 24 | float ElementVal = 0; 25 | for (int k = 0; k < numAColumns; k++) 26 | ElementVal += A[Row*numAColumns + k]*B[k*numBColumns + Col]; 27 | C[Row*numCColumns + Col] = ElementVal; 28 | } 29 | } 30 | 31 | int main(int argc, char **argv) { 32 | wbArg_t args; 33 | float *hostA; // The A matrix 34 | float *hostB; // The B matrix 35 | float *hostC; // The output C matrix 36 | float *deviceA; 37 | float *deviceB; 38 | float *deviceC; 39 | int numARows; // number of rows in the matrix A 40 | int numAColumns; // number of columns in the matrix A 41 | int numBRows; // number of rows in the matrix B 42 | int numBColumns; // number of columns in the matrix B 43 | int numCRows; // number of rows in the matrix C (you have to set this) 44 | int numCColumns; // number of columns in the matrix C (you have to set 45 | // this) 46 | 47 | args = wbArg_read(argc, argv); 48 | 49 | wbTime_start(Generic, "Importing data and creating memory on host"); 50 | hostA = (float *)wbImport(wbArg_getInputFile(args, 0), &numARows, 51 | &numAColumns); 52 | hostB = (float *)wbImport(wbArg_getInputFile(args, 1), &numBRows, 53 | &numBColumns); 54 | //@@ Set numCRows and numCColumns 55 | numCRows = numARows; 56 | numCColumns = numBColumns; 57 | 58 | //@@ Allocate the hostC matrix 59 | int sizeA, sizeB, sizeC; 60 | sizeA = numARows*numAColumns*sizeof(float); 61 | sizeB = numBRows*numBColumns*sizeof(float); 62 | sizeC = numCRows*numCColumns*sizeof(float); 63 | hostC = (float *)malloc(sizeC); 64 | wbTime_stop(Generic, "Importing data and creating memory on host"); 65 | 66 | wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns); 67 | wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns); 68 | 69 | wbTime_start(GPU, "Allocating GPU memory."); 70 | //@@ Allocate GPU memory here 71 | cudaMalloc((void**) &deviceA, sizeA); 72 | cudaMalloc((void**) &deviceB, sizeB); 73 | cudaMalloc((void**) &deviceC, sizeC); 74 | wbTime_stop(GPU, "Allocating GPU memory."); 75 | 76 | wbTime_start(GPU, "Copying input memory to the GPU."); 77 | //@@ Copy memory to the GPU here 78 | cudaMemcpy(deviceA, hostA, sizeA, cudaMemcpyHostToDevice); 79 | cudaMemcpy(deviceB, hostB, sizeB, cudaMemcpyHostToDevice); 80 | wbTime_stop(GPU, "Copying input memory to the GPU."); 81 | 82 | //@@ Initialize the grid and block dimensions here 83 | dim3 dimGrid(ceil((1.0*numCColumns)/2), ceil((1.0*numCRows))/2, 1); 84 | dim3 dimBlock(2, 2, 1); 85 | 86 | wbTime_start(Compute, "Performing CUDA computation"); 87 | //@@ Launch the GPU Kernel here 88 | matrixMultiply<<>>(deviceA, deviceB, deviceC, numARows,numAColumns, 89 | numBRows, numBColumns, numCRows,numCColumns); 90 | cudaDeviceSynchronize(); 91 | wbTime_stop(Compute, "Performing CUDA computation"); 92 | 93 | wbTime_start(Copy, "Copying output memory to the CPU"); 94 | //@@ Copy the GPU memory back to the CPU here 95 | cudaMemcpy(hostC, deviceC, sizeC, cudaMemcpyDeviceToHost); 96 | wbTime_stop(Copy, "Copying output memory to the CPU"); 97 | 98 | wbTime_start(GPU, "Freeing GPU Memory"); 99 | //@@ 
Free the GPU memory here 100 | cudaFree(deviceA); cudaFree(deviceB); cudaFree(deviceC); 101 | wbTime_stop(GPU, "Freeing GPU Memory"); 102 | 103 | wbSolution(args, hostC, numCRows, numCColumns); 104 | 105 | free(hostA); 106 | free(hostB); 107 | free(hostC); 108 | 109 | return 0; 110 | } 111 | -------------------------------------------------------------------------------- /MP2/README.md: -------------------------------------------------------------------------------- 1 | # Basic Matrix Multiplication 2 | ## Objective 3 | The purpose of this lab is to implement a basic dense matrix multiplication routine. 4 | 5 | # Prerequisites 6 | - Before starting this lab, make sure that: 7 | 8 | - You have completed the “Vector Addition” MP 9 | 10 | - You have completed all week 2 lecture videos 11 | 12 | # Instruction 13 | Edit the code in the ‘Code’ tab to perform the following: 14 | 15 | - allocate device memory 16 | - copy host memory to device 17 | - initialize thread block and kernel grid dimensions 18 | - invoke CUDA kernel 19 | - copy results from device to host 20 | - deallocate device memory 21 | - Instructions about where to place each part of the code is demarcated by the //@@ comment lines. 22 | 23 | # Suggestions (for all labs) 24 | - The system’s autosave feature is not an excuse to not backup your code and answers to your questions regularly. 25 | 26 | - If you have not done so already, watch the tutorial video. 27 | 28 | - Do not modify the template code provided -- only insert code where the //@@ demarcation is placed 29 | 30 | - Develop your solution incrementally and test each version thoroughly before moving on to the next version 31 | 32 | - Do not wait until the last minute to attempt the lab. 33 | 34 | - If you get stuck with boundary conditions, grab a pen and paper. It is much easier to figure out the boundary conditions there. 35 | 36 | - Implement the serial CPU version first, this will give you an understanding of the loops 37 | 38 | - Get the first dataset working first. The datasets are ordered so the first one is the easiest to handle 39 | 40 | - Make sure that your algorithm handles non-regular dimensional inputs (not square or multiples of 2). The slides may present the algorithm with nice inputs, since it minimizes the conditions. The datasets reflect different sizes of input that you are expected to handle 41 | 42 | - Make sure that you test your program using all the datasets provided (the datasets can be selected using the dropdown next to the submission button) 43 | 44 | - Check for errors: for example, when developing CUDA code, one can check for if the function call succeeded and print an error if not via the following macro: 45 | ``` 46 | #define wbCheck(stmt) do { \ 47 | cudaError_t err = stmt; \ 48 | if (err != cudaSuccess) { \ 49 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 50 | wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \ 51 | return -1; \ 52 | } \ 53 | } while(0) 54 | An example usage is wbCheck(cudaMalloc(...)). 55 | ``` 56 | -------------------------------------------------------------------------------- /MP3/MP3.CU: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define wbCheck(stmt) \ 4 | do { \ 5 | cudaError_t err = stmt; \ 6 | if (err != cudaSuccess) { \ 7 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 8 | wbLog(ERROR, "Got CUDA error ... 
", cudaGetErrorString(err)); \ 9 | return -1; \ 10 | } \ 11 | } while (0) 12 | #define TW 16 13 | // Compute C = A * B 14 | __global__ void matrixMultiplyShared(float *A, float *B, float *C, 15 | int numARows, int numAColumns, 16 | int numBRows, int numBColumns, 17 | int numCRows, int numCColumns) { 18 | //@@ Insert code to implement matrix multiplication here 19 | //@@ You have to use shared memory for this MP 20 | 21 | __shared__ float subTileM[TW][TW]; 22 | __shared__ float subTileN[TW][TW]; 23 | 24 | int bx = blockIdx.x; int by = blockIdx.y; 25 | int tx = threadIdx.x; int ty = threadIdx.y; 26 | 27 | int Row = by * TW + ty; 28 | int Col = bx * TW + tx; 29 | 30 | float Pvalue = 0; 31 | 32 | 33 | for (int m = 0; m < ceil((1.0*numAColumns)/(TW) ); m++){ 34 | if (Row < numARows && m*TW + tx < numAColumns){ 35 | subTileM[ty][tx] = A[Row*numAColumns + m*TW + tx]; 36 | } 37 | else 38 | subTileM[ty][tx] = 0; 39 | 40 | if (Col < numBColumns && (m*TW + ty) < numBRows){ 41 | subTileN[ty][tx] = B[(m*TW + ty)*numBColumns + Col]; 42 | } 43 | else 44 | subTileN[ty][tx] = 0; 45 | __syncthreads(); 46 | for (int k = 0; k < TW; k++) 47 | Pvalue += subTileM[ty][k] * subTileN[k][tx]; 48 | 49 | __syncthreads(); 50 | } 51 | if (Row < numCRows && Col < numCColumns) 52 | C[Row*numCColumns + Col] = Pvalue; 53 | } 54 | 55 | 56 | 57 | int main(int argc, char **argv) { 58 | wbArg_t args; 59 | float *hostA; // The A matrix 60 | float *hostB; // The B matrix 61 | float *hostC; // The output C matrix 62 | float *deviceA; 63 | float *deviceB; 64 | float *deviceC; 65 | int numARows; // number of rows in the matrix A 66 | int numAColumns; // number of columns in the matrix A 67 | int numBRows; // number of rows in the matrix B 68 | int numBColumns; // number of columns in the matrix B 69 | int numCRows; // number of rows in the matrix C (you have to set this) 70 | int numCColumns; // number of columns in the matrix C (you have to set 71 | // this) 72 | 73 | args = wbArg_read(argc, argv); 74 | 75 | wbTime_start(Generic, "Importing data and creating memory on host"); 76 | hostA = (float *)wbImport(wbArg_getInputFile(args, 0), &numARows, 77 | &numAColumns); 78 | hostB = (float *)wbImport(wbArg_getInputFile(args, 1), &numBRows, 79 | &numBColumns); 80 | //@@ Set numCRows and numCColumns 81 | numCRows = numARows; 82 | numCColumns = numBColumns; 83 | 84 | //@@ Allocate the hostC matrix 85 | int sizeA, sizeB, sizeC; 86 | sizeA = numARows*numAColumns*sizeof(float); 87 | sizeB = numBRows*numBColumns*sizeof(float); 88 | sizeC = numCRows*numCColumns*sizeof(float); 89 | hostC = (float *)malloc(sizeC); 90 | 91 | wbTime_stop(Generic, "Importing data and creating memory on host"); 92 | 93 | wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns); 94 | wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns); 95 | 96 | wbTime_start(GPU, "Allocating GPU memory."); 97 | //@@ Allocate GPU memory here 98 | cudaMalloc((void**) &deviceA, sizeA); 99 | cudaMalloc((void**) &deviceB, sizeB); 100 | cudaMalloc((void**) &deviceC, sizeC); 101 | 102 | wbTime_stop(GPU, "Allocating GPU memory."); 103 | 104 | wbTime_start(GPU, "Copying input memory to the GPU."); 105 | //@@ Copy memory to the GPU here 106 | cudaMemcpy(deviceA, hostA, sizeA, cudaMemcpyHostToDevice); 107 | cudaMemcpy(deviceB, hostB, sizeB, cudaMemcpyHostToDevice); 108 | wbTime_stop(GPU, "Copying input memory to the GPU."); 109 | 110 | //@@ Initialize the grid and block dimensions here 111 | int x; 112 | int y; 113 | y = ceil((1.0*numCRows)/(TW)); 114 | x = 
ceil((1.0*numCColumns)/(TW)); 115 | dim3 dimGrid(x, y, 1); 116 | dim3 dimBlock(TW, TW, 1); 117 | wbTime_start(Compute, "Performing CUDA computation"); 118 | 119 | //@@ Launch the GPU Kernel here 120 | matrixMultiplyShared<<>>(deviceA, deviceB, deviceC, numARows,numAColumns, 121 | numBRows, numBColumns, numCRows, numCColumns); 122 | cudaDeviceSynchronize(); 123 | wbTime_stop(Compute, "Performing CUDA computation"); 124 | 125 | wbTime_start(Copy, "Copying output memory to the CPU"); 126 | //@@ Copy the GPU memory back to the CPU here 127 | cudaMemcpy(hostC, deviceC, sizeC, cudaMemcpyDeviceToHost); 128 | wbTime_stop(Copy, "Copying output memory to the CPU"); 129 | 130 | wbTime_start(GPU, "Freeing GPU Memory"); 131 | //@@ Free the GPU memory here 132 | cudaFree(deviceA); cudaFree(deviceB); cudaFree(deviceC); 133 | wbTime_stop(GPU, "Freeing GPU Memory"); 134 | 135 | wbSolution(args, hostC, numCRows, numCColumns); 136 | 137 | free(hostA); 138 | free(hostB); 139 | free(hostC); 140 | 141 | return 0; 142 | } 143 | -------------------------------------------------------------------------------- /MP3/README.md: -------------------------------------------------------------------------------- 1 | # Tiled Matrix Multiplication 2 | ## Objective 3 | The purpose of this lab is to implement a tiled dense matrix multiplication routine using shared memory. 4 | 5 | ## Prerequisites 6 | Before starting this lab, make sure that: 7 | 8 | - You have completed the “Basic Matrix Multiplication” MP 9 | 10 | - You have completed all week 3 videos 11 | 12 | # Instruction 13 | - Edit the code in the “Code” tab to perform the following: 14 | 15 | - allocate device memory 16 | - copy host memory to device 17 | - initialize thread block and kernel grid dimensions 18 | - invoke CUDA kernel 19 | - copy results from device to host 20 | - deallocate device memory 21 | - implement the matrix-matrix multiplication routine using shared memory and tiling 22 | - Instructions about where to place each part of the code is demarcated by the //@@ comment lines. 23 | 24 | # Suggestions (for all labs) 25 | - The system’s autosave feature is not an excuse to not backup your code and answers to your questions regularly. 26 | 27 | - If you have not done so already, watch the tutorial video. 28 | 29 | - Do not modify the template code provided -- only insert code where the //@@ demarcation is placed 30 | 31 | - Develop your solution incrementally and test each version thoroughly before moving on to the next version 32 | 33 | - Do not wait until the last minute to attempt the lab. 34 | 35 | - If you get stuck with boundary conditions, grab a pen and paper. It is much easier to figure out the boundary conditions there. 36 | 37 | - Implement the serial CPU version first, this will give you an understanding of the loops 38 | 39 | - Get the first dataset working first. The datasets are ordered so the first one is the easiest to handle 40 | 41 | - Make sure that your algorithm handles non-regular dimensional inputs (not square or multiples of 2). The slides may present the algorithm with nice inputs, since it minimizes the conditions. 
The datasets reflect different sizes of input that you are expected to handle 42 | 43 | - Make sure that you test your program using all the datasets provided (the datasets can be selected using the dropdown next to the submission button) 44 | 45 | - Check for errors: for example, when developing CUDA code, one can check for if the function call succeeded and print an error if not via the following macro: 46 | ``` 47 | #define wbCheck(stmt) do { \ 48 | cudaError_t err = stmt; \ 49 | if (err != cudaSuccess) { \ 50 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 51 | wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \ 52 | return -1; \ 53 | } \ 54 | } while(0) 55 | An example usage is wbCheck(cudaMalloc(...)). 56 | ``` 57 | -------------------------------------------------------------------------------- /MP4/MP4.CU: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define wbCheck(stmt) \ 4 | do { \ 5 | cudaError_t err = stmt; \ 6 | if (err != cudaSuccess) { \ 7 | wbLog(ERROR, "CUDA error: ", cudaGetErrorString(err)); \ 8 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 9 | return -1; \ 10 | } \ 11 | } while (0) 12 | 13 | //@@ Define any useful program-wide constants here 14 | #define TILE_WIDTH 4 15 | #define MASK_WIDTH 3 16 | #define RADIUS 1 17 | 18 | //@@ Define constant memory for device kernel here 19 | __constant__ float MASK[MASK_WIDTH*MASK_WIDTH*MASK_WIDTH]; 20 | 21 | __global__ void conv3d(float *input, float *output, const int z_size, 22 | const int y_size, const int x_size) { 23 | //@@ Insert kernel code here 24 | 25 | int bx = blockIdx.x; int by = blockIdx.y; int bz = blockIdx.z; 26 | int tx = threadIdx.x; int ty = threadIdx.y; int tz = threadIdx.z; 27 | 28 | int x_o = bx*TILE_WIDTH + tx; 29 | int y_o = by*TILE_WIDTH + ty; 30 | int z_o = bz*TILE_WIDTH + tz; 31 | 32 | __shared__ float N_ds[TILE_WIDTH][TILE_WIDTH][TILE_WIDTH]; 33 | 34 | if (x_o >=0 && x_o < x_size && y_o >=0 && y_o < y_size && z_o >=0 && z_o < z_size) 35 | N_ds[tz][ty][tx] = input[x_size*y_size*z_o + x_size*y_o + x_o]; 36 | else 37 | N_ds[tz][ty][tx] = 0; 38 | __syncthreads(); 39 | 40 | int x_N_ds_Start = tx - RADIUS; 41 | int y_N_ds_Start = ty - RADIUS; 42 | int z_N_ds_Start = tz - RADIUS; 43 | float Pvalue = 0; 44 | 45 | for (int i = 0; i < MASK_WIDTH; i++) 46 | for (int j = 0; j < MASK_WIDTH; j++) 47 | for (int k = 0; k < MASK_WIDTH; k++){ 48 | 49 | int x_ds_index = x_N_ds_Start + i; 50 | int y_ds_index = y_N_ds_Start + j; 51 | int z_ds_index = z_N_ds_Start + k; 52 | 53 | if (x_ds_index >= 0 && x_ds_index < TILE_WIDTH && 54 | y_ds_index >= 0 && y_ds_index < TILE_WIDTH && 55 | z_ds_index >= 0 && z_ds_index < TILE_WIDTH 56 | ) 57 | Pvalue += N_ds[z_ds_index][y_ds_index][x_ds_index]*MASK[MASK_WIDTH*MASK_WIDTH*k + MASK_WIDTH*j + i]; 58 | else{ 59 | int x_Global = bx*TILE_WIDTH + x_ds_index; 60 | int y_Global = by*TILE_WIDTH + y_ds_index; 61 | int z_Global = bz*TILE_WIDTH + z_ds_index; 62 | 63 | if (x_Global >= 0 && x_Global < x_size && 64 | y_Global >= 0 && y_Global < y_size && 65 | z_Global >= 0 && z_Global < z_size) 66 | Pvalue += input[x_size*y_size*z_Global + x_size*y_Global + x_Global]*MASK[MASK_WIDTH*MASK_WIDTH*k + MASK_WIDTH*j + i]; 67 | } 68 | } 69 | 70 | if (x_o < x_size && y_o < y_size && z_o < z_size) 71 | output[x_size*y_size*z_o + x_size*y_o + x_o] = Pvalue; 72 | 73 | __syncthreads(); 74 | } 75 | 76 | int main(int argc, char *argv[]) { 77 | wbArg_t args; 78 | int z_size; 79 | int y_size; 80 | int x_size; 81 | int inputLength, kernelLength; 
82 | float *hostInput; 83 | float *hostKernel; 84 | float *hostOutput; 85 | float *deviceInput; 86 | float *deviceOutput; 87 | 88 | args = wbArg_read(argc, argv); 89 | 90 | // Import data 91 | hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &inputLength); 92 | hostKernel = (float *)wbImport(wbArg_getInputFile(args, 1), &kernelLength); 93 | hostOutput = (float *)malloc(inputLength * sizeof(float)); 94 | 95 | // First three elements are the input dimensions 96 | z_size = hostInput[0]; 97 | y_size = hostInput[1]; 98 | x_size = hostInput[2]; 99 | wbLog(TRACE, "The input size is ", z_size, "x", y_size, "x", x_size); 100 | assert(z_size * y_size * x_size == inputLength - 3); 101 | assert(kernelLength == 27); 102 | 103 | wbTime_start(GPU, "Doing GPU Computation (memory + compute)"); 104 | 105 | wbTime_start(GPU, "Doing GPU memory allocation"); 106 | //@@ Allocate GPU memory here 107 | // Recall that inputLength is 3 elements longer than the input data 108 | // because the first three elements were the dimensions 109 | cudaMalloc((void**)&deviceInput, x_size*y_size*z_size*sizeof(float)); 110 | cudaMalloc((void**)&deviceOutput, x_size*y_size*z_size*sizeof(float)); 111 | wbTime_stop(GPU, "Doing GPU memory allocation"); 112 | 113 | wbTime_start(Copy, "Copying data to the GPU"); 114 | //@@ Copy input and kernel to GPU here 115 | // Recall that the first three elements of hostInput are dimensions and 116 | // do 117 | // not need to be copied to the gpu 118 | cudaMemcpy(deviceInput, hostInput+3, x_size*y_size*z_size*sizeof(float), cudaMemcpyHostToDevice); 119 | cudaMemcpyToSymbol(MASK, hostKernel, MASK_WIDTH*MASK_WIDTH*MASK_WIDTH*sizeof(float)); 120 | wbTime_stop(Copy, "Copying data to the GPU"); 121 | 122 | wbTime_start(Compute, "Doing the computation on the GPU"); 123 | //@@ Initialize grid and block dimensions here 124 | dim3 dimGrid(ceil((1.0*x_size)/TILE_WIDTH), ceil((1.0*y_size)/TILE_WIDTH), ceil((1.0*z_size)/TILE_WIDTH)); 125 | dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, TILE_WIDTH); 126 | //@@ Launch the GPU kernel here 127 | conv3d<<>>(deviceInput, deviceOutput, z_size, 128 | y_size, x_size); 129 | cudaDeviceSynchronize(); 130 | wbTime_stop(Compute, "Doing the computation on the GPU"); 131 | 132 | wbTime_start(Copy, "Copying data from the GPU"); 133 | //@@ Copy the device memory back to the host here 134 | // Recall that the first three elements of the output are the dimensions 135 | // and should not be set here (they are set below) 136 | cudaMemcpy(hostOutput+3 , deviceOutput, z_size * y_size * x_size * sizeof(float), cudaMemcpyDeviceToHost); 137 | wbTime_stop(Copy, "Copying data from the GPU"); 138 | 139 | wbTime_stop(GPU, "Doing GPU Computation (memory + compute)"); 140 | 141 | // Set the output dimensions for correctness checking 142 | hostOutput[0] = z_size; 143 | hostOutput[1] = y_size; 144 | hostOutput[2] = x_size; 145 | wbSolution(args, hostOutput, inputLength); 146 | 147 | // Free device memory 148 | cudaFree(deviceInput); 149 | cudaFree(deviceOutput); 150 | 151 | // Free host memory 152 | free(hostInput); 153 | free(hostOutput); 154 | return 0; 155 | } 156 | -------------------------------------------------------------------------------- /MP4/README.md: -------------------------------------------------------------------------------- 1 | ## 3D Convolution 2 | # Objective 3 | The purpose of this lab is to implement a 3D convolution using constant memory for the kernel and 3D shared memory tiling. 
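As a reference point, placing the fixed 3x3x3 mask in constant memory follows the pattern already used in MP4.CU above; the sketch below only restates that pattern (MASK, MASK_WIDTH, and hostKernel are the names from the template, not new requirements).
```
#define MASK_WIDTH 3

// Device side: the 3x3x3 convolution mask lives in constant memory.
__constant__ float MASK[MASK_WIDTH * MASK_WIDTH * MASK_WIDTH];

// Host side: copy the 27 mask values (read via wbImport) into constant memory.
cudaMemcpyToSymbol(MASK, hostKernel,
                   MASK_WIDTH * MASK_WIDTH * MASK_WIDTH * sizeof(float));
```
Constant memory is a good fit here because every thread reads the same small, read-only mask, so the reads are served out of the constant cache.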
4 | 5 | ## Prerequisite Before starting this lab, make sure that: 6 | 7 | - You have completed the “Tiled Matrix Multiplication” MP 8 | 9 | - You have completed all week 4 videos 10 | 11 | # Instructions 12 | - Edit the code to implement a 3D convolution with a 3x3x3 kernel in constant memory and a 3D shared-memory tiling. 13 | 14 | - Edit the code to launch the kernel you implemented. The function should launch 3D CUDA grid and blocks, where each thread is responsible for computing a single element of the output. 15 | 16 | - Answer the questions found in the questions tab. 17 | 18 | ## Algorithm Specification 19 | You will be implementing the following 3D convolution. 20 | ``` 21 | for z_out = 0 to z_size - 1: 22 | for y_out = 0 to y_size - 1: 23 | for x_out = 0 to x_size - 1: { 24 | let res = 0; 25 | for z_mask = - MASK_RADIUS to MASK_RADIUS: 26 | for y_mask = - MASK_RADIUS to MASK_RADIUS: 27 | for x_mask = - MASK_RADIUS to MASK_RADIUS: 28 | let z_in = z_out + z_mask; 29 | let y_in = y_out + y_mask; 30 | let x_in = x_out + x_mask; 31 | // Pad boundary with 0 32 | if (z_in >= 0 && z_in < z_size && 33 | y_in >= 0 && y_in < y_size && 34 | x_in >= 0 && x_in < x_size) then 35 | res += mask[z_mask + MASK_RADIUS][y_mask + MASK_RADIUS][x_mask + MASK_RADIUS] * in[z_in][y_in][x_in] 36 | } 37 | out[z_out][y_out][x_out] = res; 38 | } 39 | ``` 40 | - The kernel size is fixed to 3x3x3, given MASK_WIDTH = 3 and MASK_RADIUS = 1. 41 | - Halo elements should be read as 0. 42 | - You should support input data of any size. 43 | - Note that the input and output size is the same. 44 | ## Other Notes 45 | - The raw format of the input data is a flattened array, where the first three elements are the z_size, y_size, and x_size respectively. For example, a 5x4x3 input array will look like 46 | 47 | - float inputData[] = { 5.0, 4.0, 3.0, ... < 60 floats > } 48 | - A point (z,y,x) may be accessed at z * (y_size * x_size) + y * (x_size) + x. 49 | 50 | - The template code reads the first three elements into z_size, y_size, and x_size. You will need to copy the rest of the data to the device. 51 | 52 | - Likewise, the result needs to have the sizes prepended for WebGPU to check your result correctly. The template code does that as well, but you must copy the data into outputData from the fourth element on. 53 | 54 | - Remember that you can get a pointer to the fourth element of the array with &arry[3]. 55 | -------------------------------------------------------------------------------- /MP5.1/MP5.1.CU: -------------------------------------------------------------------------------- 1 | // MP Reduction 2 | // Given a list (lst) of length n 3 | // Output its sum = lst[0] + lst[1] + ... + lst[n-1]; 4 | 5 | #include 6 | 7 | #define BLOCK_SIZE 512 //@@ You can change this 8 | 9 | #define wbCheck(stmt) \ 10 | do { \ 11 | cudaError_t err = stmt; \ 12 | if (err != cudaSuccess) { \ 13 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 14 | wbLog(ERROR, "Got CUDA error ... 
", cudaGetErrorString(err)); \ 15 | return -1; \ 16 | } \ 17 | } while (0) 18 | 19 | __global__ void total(float *input, float *output, int len) { 20 | //@@ Load a segment of the input vector into shared memory 21 | //@@ Traverse the reduction tree 22 | //@@ Write the computed sum of the block to the output vector at the 23 | //@@ correct index 24 | __shared__ float partialSum[2*BLOCK_SIZE]; 25 | 26 | unsigned int t = threadIdx.x; 27 | unsigned int start = 2*blockIdx.x*blockDim.x; 28 | 29 | for (int k = 0; k < 2; k++){ 30 | if ((t + start + k*BLOCK_SIZE)>>(deviceInput, deviceOutput, numInputElements); 99 | cudaDeviceSynchronize(); 100 | wbTime_stop(Compute, "Performing CUDA computation"); 101 | 102 | wbTime_start(Copy, "Copying output memory to the CPU"); 103 | //@@ Copy the GPU memory back to the CPU here 104 | cudaMemcpy(hostOutput, deviceOutput, sizeof(float) * numOutputElements, cudaMemcpyDeviceToHost); 105 | wbTime_stop(Copy, "Copying output memory to the CPU"); 106 | 107 | /******************************************************************** 108 | * Reduce output vector on the host 109 | * NOTE: One could also perform the reduction of the output vector 110 | * recursively and support any size input. For simplicity, we do not 111 | * require that for this lab. 112 | ********************************************************************/ 113 | for (ii = 1; ii < numOutputElements; ii++) { 114 | hostOutput[0] += hostOutput[ii]; 115 | } 116 | 117 | wbTime_start(GPU, "Freeing GPU Memory"); 118 | //@@ Free the GPU memory here 119 | cudaFree(deviceInput); 120 | cudaFree(deviceOutput); 121 | wbTime_stop(GPU, "Freeing GPU Memory"); 122 | 123 | wbSolution(args, hostOutput, 1); 124 | 125 | free(hostInput); 126 | free(hostOutput); 127 | 128 | return 0; 129 | } 130 | -------------------------------------------------------------------------------- /MP5.1/README.md: -------------------------------------------------------------------------------- 1 | # List Reduction 2 | ## Objective 3 | Implement a kernel and associated host code that performs reduction of a 1D list stored in a C array. The reduction should give the sum of the list. You should implement the improved kernel discussed in the lecture. Your kernel should be able to handle input lists of arbitrary length. 4 | 5 | # Prerequisites 6 | - Before starting this lab, make sure that: 7 | 8 | - You have completed week 4 lecture videos 9 | # Instruction 10 | For simplicity, you can assume that the input list will contain at most 2048 x 65535 elements so that it can be handled by only one kernel launch. The boundary condition can be handled by filling ‘identity value (0 for sum)’ into the shared memory of the last block when the length is not a multiple of the thread block size. Write a host (CPU) loop to calculate the total of the reduction sums of each section generated by individual blocks. 
11 | 12 | Edit the code in the ‘Code’ tab to perform the following: 13 | 14 | - allocate device memory 15 | - copy host memory to device 16 | - initialize thread block and kernel grid dimensions 17 | - invoke CUDA kernel 18 | - copy results from device to host 19 | - deallocate device memory 20 | - implement the improved reduction kernel 21 | - use shared memory to reduce the number of global accesses, handle the boundary conditions when loading input list elements into the shared memory 22 | - implement a CPU loop to perform final reduction based on the sums of sections generated by the thread blocks after copying the partial sum array back to the host memory 23 | - Instructions about where to place each part of the code is demarcated by the //@@ comment lines. 24 | 25 | -------------------------------------------------------------------------------- /MP5.2/MP5.2.CU: -------------------------------------------------------------------------------- 1 | // MP Scan 2 | // Given a list (lst) of length n 3 | // Output its prefix sum = {lst[0], lst[0] + lst[1], lst[0] + lst[1] + ... 4 | // + 5 | // lst[n-1]} 6 | 7 | #include 8 | 9 | #define BLOCK_SIZE 512 //@@ You can change this 10 | 11 | #define wbCheck(stmt) \ 12 | do { \ 13 | cudaError_t err = stmt; \ 14 | if (err != cudaSuccess) { \ 15 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 16 | wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \ 17 | return -1; \ 18 | } \ 19 | } while (0) 20 | 21 | __global__ void scan(float *input, float *output, int len, float *psum) { 22 | //@@ Modify the body of this function to complete the functionality of 23 | //@@ the scan on the device 24 | //@@ You may need multiple kernel calls; write your kernels before this 25 | //@@ function and call them from the host 26 | 27 | //Load into shared memory 28 | 29 | __shared__ int T[2*BLOCK_SIZE]; 30 | 31 | int tx = threadIdx.x + (blockIdx.x * blockDim.x * 2); 32 | if (tx < len) 33 | T[threadIdx.x] = input[tx]; 34 | else 35 | T[threadIdx.x] = 0; 36 | __syncthreads(); 37 | 38 | if ((tx + blockDim.x) < len) 39 | T[threadIdx.x + blockDim.x] = input[tx + blockDim.x]; 40 | else 41 | T[threadIdx.x + blockDim.x] = 0; 42 | 43 | __syncthreads(); 44 | 45 | // First Scan Step 46 | int stride = 1; 47 | while(stride < 2*BLOCK_SIZE) 48 | { 49 | __syncthreads(); 50 | int index = (threadIdx.x+1)*stride*2 - 1; 51 | if(index < 2*BLOCK_SIZE && (index-stride) >= 0) 52 | T[index] += T[index-stride]; 53 | stride = stride*2; 54 | } 55 | 56 | // post_scan 57 | stride = BLOCK_SIZE/2; 58 | while(stride > 0) 59 | { 60 | __syncthreads(); 61 | int index = (threadIdx.x+1)*stride*2 - 1; 62 | if((index+stride) < 2*BLOCK_SIZE) 63 | { 64 | T[index+stride] += T[index]; 65 | } 66 | stride = stride / 2; 67 | } 68 | 69 | 70 | // copy to output 71 | 72 | //Write result back to Global memory 73 | __syncthreads(); 74 | if (tx < len) 75 | output[tx] = T[threadIdx.x]; 76 | if ((tx + blockDim.x) < len) 77 | output[tx + blockDim.x] = T[threadIdx.x + blockDim.x]; 78 | 79 | // store partial sum 80 | if(threadIdx.x == 0) 81 | psum[blockIdx.x] = T[2*BLOCK_SIZE-1]; 82 | 83 | } 84 | 85 | __global__ void add(float *psum, float *output, int len){ 86 | __shared__ float increment; 87 | 88 | if (threadIdx.x == 0){ 89 | if (blockIdx.x == 0) 90 | increment = 0; 91 | else 92 | increment = psum[blockIdx.x - 1]; 93 | } 94 | __syncthreads(); 95 | 96 | for(int k = 0; k < 2; ++k){ 97 | int tile = (blockIdx.x * blockDim.x * 2) + threadIdx.x + (k * BLOCK_SIZE); 98 | if(tile < len){ 99 | output[tile] += increment; 100 | } 
101 | } 102 | 103 | } 104 | 105 | 106 | 107 | int main(int argc, char **argv) { 108 | wbArg_t args; 109 | float *hostInput; // The input 1D list 110 | float *hostOutput; // The output list 111 | float *deviceInput; 112 | float *deviceOutput; 113 | float *deviceBuffer; //extra 114 | float *auxSum; //extra 115 | float *tmp; //extra 116 | int numElements; // number of elements in the list 117 | 118 | args = wbArg_read(argc, argv); 119 | 120 | wbTime_start(Generic, "Importing data and creating memory on host"); 121 | hostInput = (float *)wbImport(wbArg_getInputFile(args, 0), &numElements); 122 | hostOutput = (float *)malloc(numElements * sizeof(float)); 123 | wbTime_stop(Generic, "Importing data and creating memory on host"); 124 | 125 | wbLog(TRACE, "The number of input elements in the input is ", 126 | numElements); 127 | 128 | wbTime_start(GPU, "Allocating GPU memory."); 129 | int numBlocks = ceil((numElements*1.0) / (BLOCK_SIZE*2)); 130 | wbCheck(cudaMalloc((void **)&deviceInput, numElements * sizeof(float))); 131 | wbCheck(cudaMalloc((void **)&deviceOutput, numElements * sizeof(float))); 132 | wbCheck(cudaMalloc((void **)&deviceBuffer, numBlocks * sizeof(float))); 133 | wbCheck(cudaMalloc((void **)&auxSum, numBlocks * sizeof(float))); 134 | wbCheck(cudaMalloc((void **)&tmp, sizeof(float))); 135 | 136 | wbTime_stop(GPU, "Allocating GPU memory."); 137 | 138 | wbTime_start(GPU, "Clearing output memory."); 139 | wbCheck(cudaMemset(deviceOutput, 0, numElements * sizeof(float))); 140 | wbTime_stop(GPU, "Clearing output memory."); 141 | 142 | wbTime_start(GPU, "Copying input memory to the GPU."); 143 | wbCheck(cudaMemcpy(deviceInput, hostInput, numElements * sizeof(float), 144 | cudaMemcpyHostToDevice)); 145 | wbTime_stop(GPU, "Copying input memory to the GPU."); 146 | 147 | //@@ Initialize the grid and block dimensions here 148 | dim3 DimBlock(BLOCK_SIZE, 1, 1); 149 | dim3 DimGrid(numBlocks, 1, 1); 150 | wbTime_start(Compute, "Performing CUDA computation"); 151 | //@@ Modify this to complete the functionality of the scan 152 | //@@ on the deivce 153 | 154 | //Scan Input 155 | scan<<>>(deviceInput, deviceOutput, numElements, deviceBuffer); 156 | cudaDeviceSynchronize(); 157 | 158 | //Scan deviceBuffer 159 | dim3 singleDimGrid(1, 1, 1); 160 | scan<<>>(deviceBuffer, auxSum, numBlocks, tmp); 161 | cudaDeviceSynchronize(); 162 | 163 | //Add Input & deviceBuffer 164 | add<<>>(auxSum, deviceOutput, numElements); 165 | cudaDeviceSynchronize(); 166 | 167 | wbTime_stop(Compute, "Performing CUDA computation"); 168 | 169 | wbTime_start(Copy, "Copying output memory to the CPU"); 170 | wbCheck(cudaMemcpy(hostOutput, deviceOutput, numElements * sizeof(float), 171 | cudaMemcpyDeviceToHost)); 172 | wbTime_stop(Copy, "Copying output memory to the CPU"); 173 | 174 | wbTime_start(GPU, "Freeing GPU Memory"); 175 | cudaFree(deviceInput); 176 | cudaFree(deviceOutput); 177 | wbTime_stop(GPU, "Freeing GPU Memory"); 178 | 179 | wbSolution(args, hostOutput, numElements); 180 | 181 | free(hostInput); 182 | free(hostOutput); 183 | 184 | return 0; 185 | } 186 | -------------------------------------------------------------------------------- /MP5.2/README.md: -------------------------------------------------------------------------------- 1 | # List Scan (Parallel Scan) 2 | ## Objective 3 | The purpose of this lab is to implement one or more kernels and their associated host code to perform parallel scan on a 1D list. The scan operator used will be addition. 
You should implement the work- efficient kernel discussed in lecture. Your kernel should be able to handle input lists of arbitrary length. However, for simplicity, you can assume that the input list will be at most 2,048 * 2,048 elements. 4 | 5 | ## Prerequisites 6 | Before starting this lab, make sure that: 7 | 8 | - You have completed all week 4 lecture videos 9 | - You have completed all week 5 lecture videos 10 | - You have completed the List Reduction Lab 11 | ## Instruction 12 | The boundary condition can be handled by filling ‘identity value (0 for sum)’ into the shared memory of the last block when the length is not a multiple of the thread block size. 13 | 14 | You will need to launch multiple kernels to complete the parallel scan as discussed in the lecture. 15 | 16 | Edit the code in the code tab to perform the following: 17 | 18 | - allocate device memory 19 | - copy host memory to device 20 | - initialize thread block and kernel grid dimensions 21 | - invoke CUDA kernel 22 | - copy results from device to host 23 | - deallocate device memory 24 | - implement the work-efficient scan kernel to generate per-block scan array and store the block sums into an auxiliary block sum array. 25 | - use shared memory to reduce the number of global memory accesses, handle the boundary conditions when loading input list elements into the shared memory 26 | - reuse the kernel to perform scan on the auxiliary block sum array to translate the elements into accumulative block sums. Note that - - this kernel will be launched with only one block. 27 | - implement the kernel that adds the accumulative block sums to the appropriate elements of the per-block scan array to complete the scan for all the elements. 28 | - Instructions about where to place each part of the code is demarcated by the //@@ comment lines. 
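As a small worked example of how the three kernel launches fit together (illustrative numbers only, using a section size of 4 rather than 2 * BLOCK_SIZE):
```
input          : [3, 1, 7, 0 | 4, 1, 6, 3]
per-block scan : [3, 4, 11, 11 | 4, 5, 11, 14]    (kernel 1; also emits block sums [11, 14])
scanned sums   : [11, 25]                         (kernel 2, launched with one block)
final result   : [3, 4, 11, 11 | 15, 16, 22, 25]  (kernel 3 adds 11 to every element of section 2)
```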
29 | -------------------------------------------------------------------------------- /MP6/MP6.CU: -------------------------------------------------------------------------------- 1 | // Histogram Equalization 2 | 3 | #include 4 | 5 | #define HISTOGRAM_LENGTH 256 6 | 7 | //@@ insert code here 8 | __global__ void float2Char(float *input, unsigned char *output, int width, int height){ 9 | int bx = blockIdx.x; int by = blockIdx.y; int bz = blockIdx.z; 10 | int tx = threadIdx.x; int ty = threadIdx.y; 11 | int x = bx*blockDim.x + tx; 12 | int y = by*blockDim.y + ty; 13 | if(y < height && x < width){ 14 | int idx = (width * height)*bz + (width)*y + x; 15 | output[idx] = (unsigned char) (255 * input[idx]); 16 | } 17 | } 18 | 19 | __global__ void rgb2Gray(unsigned char *input, unsigned char *output, int width, int height){ 20 | int bx = blockIdx.x; int by = blockIdx.y; 21 | int tx = threadIdx.x; int ty = threadIdx.y; 22 | int x = bx*blockDim.x + tx; 23 | int y = by*blockDim.y + ty; 24 | if(y < height && x < width){ 25 | int idx = y * (width) + x; 26 | uint8_t r = input[3 * idx]; 27 | uint8_t g = input[3 * idx + 1]; 28 | uint8_t b = input[3 * idx + 2]; 29 | output[idx] = (unsigned char) (0.21*r + 0.71*g + 0.07*b); 30 | } 31 | } 32 | 33 | __global__ void gray2Hist(unsigned char *input, unsigned int *output, int width, int height){ 34 | int bx = blockIdx.x; int by = blockIdx.y; 35 | int tx = threadIdx.x; int ty = threadIdx.y; 36 | int x = bx*blockDim.x + tx; 37 | int y = by*blockDim.y + ty; 38 | 39 | __shared__ unsigned int hist[HISTOGRAM_LENGTH]; 40 | int tIdx = blockDim.x*ty + tx; 41 | if (tIdx < HISTOGRAM_LENGTH) 42 | hist[tIdx] = 0; 43 | 44 | __syncthreads(); 45 | 46 | if (x < width && y < height) { 47 | int idx = y * (width) + x; 48 | unsigned char val = input[idx]; 49 | atomicAdd(&(hist[val]), 1); 50 | } 51 | 52 | __syncthreads(); 53 | if (tIdx < HISTOGRAM_LENGTH) 54 | atomicAdd(&(output[tIdx]), hist[tIdx]); 55 | 56 | } 57 | 58 | __global__ void hist2CDF(unsigned int *input, float *output, int width, int height){ 59 | __shared__ unsigned int cdf[HISTOGRAM_LENGTH]; 60 | int id = threadIdx.x; 61 | 62 | if(id < HISTOGRAM_LENGTH) 63 | cdf[id] = input[id]; 64 | __syncthreads(); 65 | 66 | //reduction 67 | int stride = 1; 68 | while(stride < HISTOGRAM_LENGTH) 69 | { 70 | __syncthreads(); 71 | int index = (threadIdx.x+1)*stride*2 - 1; 72 | if(index < HISTOGRAM_LENGTH && (index-stride) >= 0) 73 | cdf[index] += cdf[index-stride]; 74 | stride = stride*2; 75 | } 76 | 77 | stride = HISTOGRAM_LENGTH/4; 78 | while(stride > 0) 79 | { 80 | __syncthreads(); 81 | int index = (threadIdx.x+1)*stride*2 - 1; 82 | if((index+stride) < HISTOGRAM_LENGTH) 83 | { 84 | cdf[index+stride] += cdf[index]; 85 | } 86 | stride = stride / 2; 87 | } 88 | __syncthreads(); 89 | output[id] = cdf[id] / ((float)(width * height)); 90 | } 91 | 92 | 93 | __global__ void equal(unsigned char *img, float *cdf, int width, int height){ 94 | int bx = blockIdx.x; int by = blockIdx.y; int bz = blockIdx.z; 95 | int tx = threadIdx.x; int ty = threadIdx.y; 96 | int x = bx*blockDim.x + tx; 97 | int y = by*blockDim.y + ty; 98 | 99 | if(x < width && y < height){ 100 | int idx = (width * height)*bz + (width)*y + x; 101 | float v = 255*(cdf[img[idx]] - cdf[0])/(1.0 - cdf[0]); 102 | img[idx] = (unsigned char) min(max(v, 0.0), 255.0); 103 | } 104 | } 105 | 106 | __global__ void uint2float(unsigned char *input, float *output, int width, int height){ 107 | int bx = blockIdx.x; int by = blockIdx.y; int bz = blockIdx.z; 108 | int tx = threadIdx.x; int ty = 
threadIdx.y; 109 | int x = bx*blockDim.x + tx; 110 | int y = by*blockDim.y + ty; 111 | if(x < width && y < height){ 112 | int idx = (width * height)*bz + (width)*y + x; 113 | output[idx] = (float) (input[idx] / 255.0); 114 | } 115 | } 116 | 117 | int main(int argc, char **argv) { 118 | wbArg_t args; 119 | int imageWidth; 120 | int imageHeight; 121 | int imageChannels; 122 | wbImage_t inputImage; 123 | wbImage_t outputImage; 124 | float *hostInputImageData; 125 | float *hostOutputImageData; 126 | const char *inputImageFile; 127 | 128 | //@@ Insert more code here 129 | float *deviceFloat; 130 | unsigned char *deviceUint; 131 | unsigned char *deviceGray; 132 | unsigned int *deviceHist; 133 | float *deviceCDF; 134 | 135 | args = wbArg_read(argc, argv); /* parse the input arguments */ 136 | inputImageFile = wbArg_getInputFile(args, 0); 137 | wbTime_start(Generic, "Importing data and creating memory on host"); 138 | inputImage = wbImport(inputImageFile); 139 | imageWidth = wbImage_getWidth(inputImage); 140 | imageHeight = wbImage_getHeight(inputImage); 141 | imageChannels = wbImage_getChannels(inputImage); 142 | outputImage = wbImage_new(imageWidth, imageHeight, imageChannels); 143 | hostInputImageData = wbImage_getData(inputImage);//get image data 144 | hostOutputImageData = wbImage_getData(outputImage); 145 | wbTime_stop(Generic, "Importing data and creating memory on host"); 146 | 147 | //@@ insert code here 148 | cudaMalloc((void**) &deviceFloat, imageWidth * imageHeight * imageChannels * sizeof(float)); 149 | cudaMalloc((void**) &deviceUint, imageWidth * imageHeight * imageChannels * sizeof(unsigned char)); 150 | cudaMalloc((void**) &deviceGray, imageWidth * imageHeight * sizeof(unsigned char)); 151 | cudaMalloc((void**) &deviceHist, HISTOGRAM_LENGTH * sizeof(unsigned int)); 152 | cudaMemset((void *) deviceHist, 0, HISTOGRAM_LENGTH * sizeof(unsigned int)); 153 | cudaMalloc((void**) &deviceCDF, HISTOGRAM_LENGTH * sizeof(float)); 154 | 155 | cudaMemcpy(deviceFloat, hostInputImageData, 156 | imageWidth*imageHeight*imageChannels*sizeof(float), cudaMemcpyHostToDevice); 157 | 158 | 159 | dim3 dimGrid; 160 | dim3 dimBlock; 161 | 162 | dimGrid = dim3(ceil(imageWidth/32.0), ceil(imageHeight/32.0), imageChannels); 163 | dimBlock = dim3(32, 32, 1); 164 | float2Char<<>>(deviceFloat, deviceUint, imageWidth, imageHeight); 165 | cudaDeviceSynchronize(); 166 | 167 | 168 | dimGrid = dim3(ceil(imageWidth/32.0), ceil(imageHeight/32.0), 1); 169 | dimBlock = dim3(32, 32, 1); 170 | rgb2Gray<<>>(deviceUint, deviceGray, imageWidth, imageHeight); 171 | cudaDeviceSynchronize(); 172 | 173 | 174 | dimGrid = dim3(ceil(imageWidth/32.0), ceil(imageHeight/32.0), 1); 175 | dimBlock = dim3(32, 32, 1); 176 | gray2Hist<<>>(deviceGray, deviceHist, imageWidth, imageHeight); 177 | cudaDeviceSynchronize(); 178 | 179 | 180 | dimGrid = dim3(1, 1, 1); 181 | dimBlock = dim3(HISTOGRAM_LENGTH, 1, 1); 182 | hist2CDF<<>>(deviceHist, deviceCDF, imageWidth, imageHeight); 183 | cudaDeviceSynchronize(); 184 | 185 | 186 | dimGrid = dim3(ceil(imageWidth/32.0), ceil(imageHeight/32.0), imageChannels); 187 | dimBlock = dim3(32, 32, 1); 188 | equal<<>>(deviceUint, deviceCDF, imageWidth, imageHeight); 189 | cudaDeviceSynchronize(); 190 | 191 | 192 | dimGrid = dim3(ceil(imageWidth/32.0), ceil(imageHeight/32.0), imageChannels); 193 | dimBlock = dim3(32, 32, 1); 194 | uint2float<<>>(deviceUint, deviceFloat, imageWidth, imageHeight); 195 | cudaDeviceSynchronize(); 196 | 197 | cudaMemcpy(hostOutputImageData, deviceFloat, 198 | 
imageWidth*imageHeight*imageChannels*sizeof(float), cudaMemcpyDeviceToHost); 199 | 200 | wbSolution(args, outputImage); 201 | 202 | //@@ insert code here 203 | cudaFree(deviceFloat); 204 | cudaFree(deviceUint); 205 | cudaFree(deviceGray); 206 | cudaFree(deviceHist); 207 | cudaFree(deviceCDF); 208 | 209 | return 0; 210 | } 211 | -------------------------------------------------------------------------------- /MP6/README.md: -------------------------------------------------------------------------------- 1 | # Histogram 2 | ## Objective 3 | The purpose of this lab is to implement an efficient histogramming equalization algorithm for an input image. Like the image convolution MP, the image is represented as RGB float values. You will convert that to GrayScale unsigned char values and compute the histogram. Based on the histogram, you will compute a histogram equalization function which you will then apply to the original image to get the color corrected image. 4 | 5 | ## Prerequisites 6 | Before starting this lab, make sure that: 7 | 8 | - You have completed all week 7 lecture videos 9 | ## Instruction 10 | - Edit the code in the code tab to perform the following: 11 | 12 | - Cast the image to unsigned char 13 | 14 | - Convert the image from RGB to Gray Scale. You will find one of the lectures and textbook chapters helpful. 15 | 16 | - Compute the histogram of the image 17 | 18 | - Compute the scan (prefix sum) of the histogram to arrive at the histogram equalization function 19 | 20 | - Apply the equalization function to the input image to get the color corrected image 21 | 22 | ## Background 23 | In this section we discuss some of the background details of the histogram equalization algorithm. For images that represent the full color space, we expect an image’s histogram to be evenly distributed. This means that we expect the bin values in the histogram to be pixel_count/.256, assuming that we scale the pixel luminous values so that they fit between 0 and 256. This algorithm adjusts an image’s histogram so that all bins have equal probability. 24 | 25 | ### image 26 | 27 | We first need to convert the image to gray scale by computing it’s luminosity values that represent the brightness of the image and would allow us to simplify the histogram computation. 28 | 29 | ### Gray 30 | 31 | The histogram computes the number of pixels having a specific brightness value. Dividing by the number of pixels (width * height) gives us the probability of a luminosity value to occur in an image. 32 | 33 | ### OrigProb 34 | 35 | A color balanced image is expected to have a uniform distribution of the luminosity values. 36 | 37 | This means that if we compute the Cumulative Distribution Function (CDF) we expect a linear curve for a color equalized image. For images that are not color equalized, we expect the curve to be non-linear. 38 | 39 | ### origcdf 40 | 41 | The algorithm equalizes the curve by computing a transformation function to map the original CDF to the desired CDF (the desired CDF being an almost linear function). 42 | 43 | ### newcdf 44 | 45 | The computed transformation is applied to the original image to produce the equalized image. 46 | 47 | ### newimg 48 | 49 | Note that the CDF of the histogram of the new image has been transformed into an almost linear curve. 50 | 51 | ### compare 52 | 53 | ## Implementation Steps 54 | Here we show the steps to be performed. The computation to be performed by each kernel is illustrated with serial pseudo code. 
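For instance, the first step below (casting the float image to unsigned char) maps onto a CUDA kernel roughly as follows. This is only a sketch: the name castToUChar is illustrative (the template code above calls it float2Char), and one thread handles one element of the flattened image.
```
__global__ void castToUChar(const float *in, unsigned char *out,
                            int width, int height) {
  int x = blockIdx.x * blockDim.x + threadIdx.x;   // column
  int y = blockIdx.y * blockDim.y + threadIdx.y;   // row
  if (x < width && y < height) {
    // blockIdx.z selects one width*height-sized chunk of the flattened
    // image; the cast is element-wise, so the channel layout does not matter.
    int idx = (width * height) * blockIdx.z + y * width + x;
    out[idx] = (unsigned char)(255 * in[idx]);     // [0,1] float -> 0..255
  }
}
```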
55 | 56 | ### Cast the image from float to unsigned char 57 | Implement a kernel that casts the image from float * to unsigned char *. 58 | ``` 59 | for ii from 0 to (width * height * channels) do 60 | ucharImage[ii] = (unsigned char) (255 * inputImage[ii]) 61 | end 62 | ``` 63 | 64 | ### Convert the image from RGB to GrayScale 65 | Implement a kernel that converts the RGB image to GrayScale. A sample sequential pseudo code is shown below. You will find one the lectures and one of the textbook chapters helpful. 66 | ``` 67 | for ii from 0 to height do 68 | for jj from 0 to width do 69 | idx = ii * width + jj 70 | # here channels is 3 71 | r = ucharImage[3*idx] 72 | g = ucharImage[3*idx + 1] 73 | b = ucharImage[3*idx + 2] 74 | grayImage[idx] = (unsigned char) (0.21*r + 0.71*g + 0.07*b) 75 | end 76 | end 77 | ``` 78 | 79 | ### Compute the histogram of grayImage 80 | Implement a kernel that computes the histogram (like in the lectures) of the image. A sample pseudo code is shown below. You will find one of the lectures and one of the textbook chapters helpful. 81 | ``` 82 | histogram = [0, ...., 0] # here len(histogram) = 256 83 | for ii from 0 to width * height do 84 | histogram[grayImage[ii]]++ 85 | end 86 | ``` 87 | 88 | ### Compute the Cumulative Distribution Function of histogram 89 | This is a scan operation like you have done in the previous lab. A sample sequential pseudo code is shown below. 90 | ``` 91 | cdf[0] = p(histogram[0]) 92 | for ii from 1 to 256 do 93 | cdf[ii] = cdf[ii - 1] + p(histogram[ii]) 94 | end 95 | ``` 96 | Where p() calculates the probability of a pixel to be in a histogram bin 97 | ``` 98 | def p(x): 99 | return x / (width * height) 100 | end 101 | ``` 102 | Compute the minimum value of the CDF. The maximal value of the CDF should be 1.0. 103 | 104 | ### Define the histogram equalization function 105 | The histogram equalization function (correct) remaps the cdf of the histogram of the image to a linear function and is defined as 106 | ``` 107 | def correct_color(val) 108 | return clamp(255*(cdf[val] - cdfmin)/(1.0 - cdfmin), 0, 255.0) 109 | end 110 | ``` 111 | ``` 112 | def clamp(x, start, end) 113 | return min(max(x, start), end) 114 | end 115 | ``` 116 | 117 | ### Apply the histogram equalization function 118 | Once you have implemented all of the above, then you are ready to correct the input image. This can be done by writing a kernel to apply the correct_color() function to the RGB pixel values in parallel. 119 | ``` 120 | for ii from 0 to (width * height * channels) do 121 | ucharImage[ii] = correct_color(ucharImage[ii]) 122 | end 123 | ``` 124 | 125 | ### Cast back to float 126 | ``` 127 | for ii from 0 to (width * height * channels) do 128 | outputImage[ii] = (float) (ucharImage[ii]/255.0) 129 | end 130 | ``` 131 | And you’re done 132 | 133 | ### Image Format 134 | For people who are developing on their own system. The images are stored in PPM (P6) format, this means that you can (if you want) create your own input images. The easiest way to create image is via external tools. You can use tools such as bmptoppm. 135 | -------------------------------------------------------------------------------- /MP7/MP7.CU: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define wbCheck(stmt) \ 4 | do { \ 5 | cudaError_t err = stmt; \ 6 | if (err != cudaSuccess) { \ 7 | wbLog(ERROR, "Failed to run stmt ", #stmt); \ 8 | wbLog(ERROR, "Got CUDA error ... 
", cudaGetErrorString(err)); \ 9 | return -1; \ 10 | } \ 11 | } while (0) 12 | 13 | #define TILE_WIDTH 1024 14 | 15 | __global__ void spmvJDSKernel(float *out, int *matColStart, int *matCols, 16 | int *matRowPerm, int *matRows, 17 | float *matData, float *vec, int dim) { 18 | //@@ insert spmv kernel for jds format 19 | int x = blockIdx.x * blockDim.x + threadIdx.x; 20 | if (x < dim) { 21 | float dot = 0; 22 | int elem = 0; 23 | while (elem < matRows[x]){ 24 | dot += matData[matColStart[elem] + x] * vec[matCols[matColStart[elem] + x]]; 25 | elem++; 26 | } 27 | out[matRowPerm[x]] = dot; 28 | } 29 | } 30 | 31 | static void spmvJDS(float *out, int *matColStart, int *matCols, 32 | int *matRowPerm, int *matRows, float *matData, 33 | float *vec, int dim) { 34 | 35 | //@@ invoke spmv kernel for jds format 36 | 37 | spmvJDSKernel<<>>(out, matColStart, matCols, matRowPerm, matRows, matData, vec, dim); 38 | 39 | } 40 | 41 | int main(int argc, char **argv) { 42 | wbArg_t args; 43 | int *hostCSRCols; 44 | int *hostCSRRows; 45 | float *hostCSRData; 46 | int *hostJDSColStart; 47 | int *hostJDSCols; 48 | int *hostJDSRowPerm; 49 | int *hostJDSRows; 50 | float *hostJDSData; 51 | float *hostVector; 52 | float *hostOutput; 53 | int *deviceJDSColStart; 54 | int *deviceJDSCols; 55 | int *deviceJDSRowPerm; 56 | int *deviceJDSRows; 57 | float *deviceJDSData; 58 | float *deviceVector; 59 | float *deviceOutput; 60 | int dim, ncols, nrows, ndata; 61 | int maxRowNNZ; 62 | 63 | args = wbArg_read(argc, argv); 64 | 65 | wbTime_start(Generic, "Importing data and creating memory on host"); 66 | hostCSRCols = (int *)wbImport(wbArg_getInputFile(args, 0), &ncols, "Integer"); 67 | hostCSRRows = (int *)wbImport(wbArg_getInputFile(args, 1), &nrows, "Integer"); 68 | hostCSRData = (float *)wbImport(wbArg_getInputFile(args, 2), &ndata, "Real"); 69 | hostVector = (float *)wbImport(wbArg_getInputFile(args, 3), &dim, "Real"); 70 | 71 | hostOutput = (float *)malloc(sizeof(float) * dim); 72 | 73 | wbTime_stop(Generic, "Importing data and creating memory on host"); 74 | 75 | CSRToJDS(dim, hostCSRRows, hostCSRCols, hostCSRData, &hostJDSRowPerm, &hostJDSRows, 76 | &hostJDSColStart, &hostJDSCols, &hostJDSData); 77 | maxRowNNZ = hostJDSRows[0]; 78 | 79 | wbTime_start(GPU, "Allocating GPU memory."); 80 | cudaMalloc((void **)&deviceJDSColStart, sizeof(int) * maxRowNNZ); 81 | cudaMalloc((void **)&deviceJDSCols, sizeof(int) * ndata); 82 | cudaMalloc((void **)&deviceJDSRowPerm, sizeof(int) * dim); 83 | cudaMalloc((void **)&deviceJDSRows, sizeof(int) * dim); 84 | cudaMalloc((void **)&deviceJDSData, sizeof(float) * ndata); 85 | 86 | cudaMalloc((void **)&deviceVector, sizeof(float) * dim); 87 | cudaMalloc((void **)&deviceOutput, sizeof(float) * dim); 88 | wbTime_stop(GPU, "Allocating GPU memory."); 89 | 90 | wbTime_start(GPU, "Copying input memory to the GPU."); 91 | cudaMemcpy(deviceJDSColStart, hostJDSColStart, sizeof(int) * maxRowNNZ, 92 | cudaMemcpyHostToDevice); 93 | cudaMemcpy(deviceJDSCols, hostJDSCols, sizeof(int) * ndata, cudaMemcpyHostToDevice); 94 | cudaMemcpy(deviceJDSRowPerm, hostJDSRowPerm, sizeof(int) * dim, cudaMemcpyHostToDevice); 95 | cudaMemcpy(deviceJDSRows, hostJDSRows, sizeof(int) * dim, cudaMemcpyHostToDevice); 96 | cudaMemcpy(deviceJDSData, hostJDSData, sizeof(float) * ndata, cudaMemcpyHostToDevice); 97 | cudaMemcpy(deviceVector, hostVector, sizeof(float) * dim, cudaMemcpyHostToDevice); 98 | wbTime_stop(GPU, "Copying input memory to the GPU."); 99 | 100 | wbTime_start(Compute, "Performing CUDA computation"); 101 | 
spmvJDS(deviceOutput, deviceJDSColStart, deviceJDSCols, deviceJDSRowPerm, deviceJDSRows, 102 | deviceJDSData, deviceVector, dim); 103 | cudaDeviceSynchronize(); 104 | wbTime_stop(Compute, "Performing CUDA computation"); 105 | 106 | wbTime_start(Copy, "Copying output memory to the CPU"); 107 | cudaMemcpy(hostOutput, deviceOutput, sizeof(float) * dim, cudaMemcpyDeviceToHost); 108 | wbTime_stop(Copy, "Copying output memory to the CPU"); 109 | 110 | wbTime_start(GPU, "Freeing GPU Memory"); 111 | cudaFree(deviceVector); 112 | cudaFree(deviceOutput); 113 | cudaFree(deviceJDSColStart); 114 | cudaFree(deviceJDSCols); 115 | cudaFree(deviceJDSRowPerm); 116 | cudaFree(deviceJDSRows); 117 | cudaFree(deviceJDSData); 118 | 119 | wbTime_stop(GPU, "Freeing GPU Memory"); 120 | 121 | wbSolution(args, hostOutput, dim); 122 | 123 | free(hostCSRCols); 124 | free(hostCSRRows); 125 | free(hostCSRData); 126 | free(hostVector); 127 | free(hostOutput); 128 | free(hostJDSColStart); 129 | free(hostJDSCols); 130 | free(hostJDSRowPerm); 131 | free(hostJDSRows); 132 | free(hostJDSData); 133 | 134 | return 0; 135 | } 136 | -------------------------------------------------------------------------------- /MP7/README.md: -------------------------------------------------------------------------------- 1 | # Sparse Matrix Multiplication (JDS) 2 | ## Objective 3 | The purpose of this lab is to implement a SpMV (Sparse Matrix Vector Multiplication) kernel for an input sparse matrix based on the Jagged Diagonal Storage (JDS) transposed format. 4 | 5 | ## Prerequisites 6 | Before starting this lab, make sure that: 7 | 8 | - You have completed all week 8 lecture videos 9 | - You have completed MP-6 10 | ## Instructions 11 | Edit the kernel and the host function in the file to implement sparse matrix-vector multiplication using the JDS format. The kernel shall be launched so that each thread will generate one output Y element. The kernel should have each thread to use the appropriate elements of the JDS data array, the JDS col index array, JDS row index array, and the JDS transposed col ptr array to generate one Y element. 12 | 13 | Instructions about where to place each part of the code is demarcated by the //@@ comment lines. 14 | 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UIUC-CS-483-Parallel Programming 2 | ## Official Description 3 | Parallel programming with emphasis on developing applications for processors with many computation cores. Computational thinking, forms of parallelism, programming models, mapping computations to parallel hardware, efficient data structures, paradigms for efficient parallel algorithms, and application case studies. 
4 | ## Lab Equipment 5 | Linux based cluster system 6 | 7 | ## Lab Software 8 | C Programming Language and CUDA Software Development Kit, WebGPU for labs, RAI for final project 9 | 10 | ## Topical Prerequisites 11 | C programming, Basic data structures, Introduction to computer organization 12 | -------------------------------------------------------------------------------- /ece408_project/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.formatting.formatOnSave": true, 3 | "files.associations": { 4 | "*.cu": "cpp", 5 | "*.hu": "cpp", 6 | "*.cuh": "cpp", 7 | "chrono": "cpp" 8 | } 9 | } -------------------------------------------------------------------------------- /ece408_project/README.md: -------------------------------------------------------------------------------- 1 | # ECE408/CS483 Final Project 2 | 3 | ## Introduction 4 | 5 | This is the skeleton code for the Fall 2019 ECE408 / CS483 / CSE408 course project. 6 | In this project, you will: 7 | 8 | * Get practical experience by using, profiling, and modifying MXNet, a standard open-source neural-network framework. 9 | * Demonstrate command of CUDA and optimization approaches by designing and implementing an optimized neural-network convolution layer forward pass. 10 | 11 | The project will be broken up into 4 milestones and a final submission. Read the description of the final report before starting, so you can collect the necessary info along the way. 12 | Each milestone (except milestone 1) will consist of an updated report (culminating in the final report). Append each milestone's deliverable at the beginning of the document such that your latest milestone is at the beginning of the report. 13 | 14 | You will be working in teams of 3 (no excuse here). 15 | Chicago city scholars can form teams with on campus students. 16 | 17 | You are expected to adhere to University of Illinois academic integrity standards. 18 | Do not attempt to subvert any of the performance-measurement aspects of the final project. 19 | If you are unsure about whether something does not meet those guidelines, ask a member of the teaching staff. 20 | 21 | ## Table of Contents 22 | 23 | * [Milestone 1: Due 10/06/2019 @5pm](#milestone-1) 24 | * [Milestone 2: Due 10/12/2019 @5pm](#milestone-2) 25 | * [Milestone 3: Due 10/19/2019 @5pm](#milestone-3) 26 | * [Milestone 4: Due 11/21/2019 @5pm](#milestone-4) 27 | * [Final Submission: Due 12/19/2019 @5pm](#final-submission) 28 | * [Rubric](#rubric) 29 | * [Final Report](#final-report) 30 | * [Extras](#extras) 31 | 32 | ## Milestone 1 33 | 34 | Due October 06 @ 5pm 35 | 36 | | Deliverables | 37 | | ------------ | 38 | | Register your team in the google sheet. | 39 | 40 | You and your team should agree on a team name and enter it in this [google sheet](https://docs.google.com/spreadsheets/d/1vhThuFT0isnYPac8Gnh7Pp9FMOTWkZ6RcNq-C8ND9LQ/edit#gid=0). Graduate students can use this [google form](https://docs.google.com/forms/d/e/1FAIpQLScDU5QrC9pKsaI8KGKqT4HjjZodSlcYfr-IlR3d7qxzwpFCeg/viewform?usp=sf_link). 41 | 42 | ## Milestone 2 43 | 44 | Due October 12 @ 5pm 45 | 46 | As with all milestones, you will include an updated PDF `report.pdf` in the project directory you submit with rai. 47 | This report should contain all of the deliverables. 48 | This report should contain your names, netids, rai ids (if different), team names, and school affiliation (Chicago Scholars or on campus students). 
49 | 50 | | Deliverables | 51 | | ------------ | 52 | | Report: Include a list of all kernels that collectively consume more than 90% of the program time. | 53 | | Report: Include a list of all CUDA API calls that collectively consume more than 90% of the program time. | 54 | | Report: Include an explanation of the difference between kernels and API calls | 55 | | Report: Show output of rai running MXNet on the CPU | 56 | | Report: List program run time | 57 | | Report: Show output of rai running MXNet on the GPU | 58 | | Report: List program run time | 59 | | Create a CPU implementation | 60 | | Report: List whole program execution time | 61 | | Report: List Op Times | 62 | | Use `rai -p --queue rai_amd64_ece408 --submit=m2` to mark your job for grading | 63 | 64 | Clone this repository to get the project folder. 65 | 66 | git clone https://github.com/illinois-impact/ece408_project.git 67 | 68 | Download the rai binary for your platform from [here](https://drive.google.com/drive/folders/1Pp84x3So9OEHUwRHQVZcRP441wRsO-UV). 69 | You will probably use it for development, and definitely use it for submission. 70 | 71 | You should have received a `.rai_profile` file by email. 72 | Put that file in `~/.rai_profile` (Linux/macOS). 73 | Your `.rai_profile` should look something like this (indented with space!) 74 | 75 | profile: 76 | firstname: 77 | lastname: 78 | username: 79 | email: 80 | access_key: 81 | secret_key: 82 | affiliation: uiuc 83 | 84 | You will need to add your team name in the following way: 85 | 86 | profile: 87 | firstname: 88 | lastname: 89 | username: 90 | email: 91 | access_key: 92 | secret_key: 93 | affiliation: uiuc 94 | team: 95 | name: 96 | 97 | Some more info is available on the [Client Documentation Page](https://github.com/rai-project/rai). 98 | 99 | Run the built-in MXNet forward pass using rai 100 | 101 | Consult `m1.1py` to examine the neural-network architecture used in this project. 102 | 103 | Use RAI to run a batch forward pass on some test data. 104 | 105 | rai -p --queue rai_amd64_ece408 106 | 107 | This will upload your project directory to rai (running on AWS) and move it to `/src`, where the execution specified in `rai_build.yml` will occur. 108 | 109 | The `image:` key specifies the environment that the rest of the execution will occur in. 110 | This environment includes a prebuilt MXNet (so rai will only do a partial compile with your code) as well as the model definition and the training data. 111 | 112 | The `resources:` key specifies what computation resources will be available to the execution. 113 | 114 | The `commands:` key specifies the recipe that rai will execute. First, the project files are copied to the `/build` directory. 115 | Then the files in `ece408_src` are copied to `src/operator/custom/` in the MXNet source tree. 116 | MXNet is recompiled, and the Python bindings are installed. 117 | `python /src/m1.1.py` runs the `m1.1.py` python program. 118 | 119 | You should see the following output: 120 | 121 | Loading fashion-mnist data... done 122 | Loading model... done 123 | New Inference 124 | EvalMetric: {'accuracy': 0.8154} 125 | 126 | Modify `rai_build.yml` to use `/usr/bin/time` to measure the elapsed time of the whole program. 127 | 128 | - /usr/bin/time python m1.1.py 129 | 130 | Next, we will run on the GPU! 131 | 132 | Compare `m1.2.py` and `m1.1.py`. You'll see that it is the same, except for `mx.gpu()` has been substituted for `mx.cpu()`. This is how we tell MXNet that we wish to use a GPU instead of a CPU. 
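For reference, the only relevant difference between the two scripts is the context passed when the trained model is loaded. A minimal sketch (the same calls appear in `m1.1.py` and `m1.2.py` later in this repository):

    import mxnet as mx
    # m1.1.py loads the trained model on the CPU ...
    lenet_model = mx.mod.Module.load(prefix='/models/baseline', epoch=2, context=mx.cpu())
    # ... while m1.2.py loads the same model on the GPU instead
    lenet_model = mx.mod.Module.load(prefix='/models/baseline', epoch=2, context=mx.gpu())
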
133 | 134 | Modify `rai_build.yml` to time `python m1.2.py` 135 | 136 | Again, submit the job to rai 137 | 138 | rai -p --queue rai_amd64_ece408 139 | 140 | Next, we will learn how to use `nvprof` to profile the execution 141 | 142 | Once you've gotten the appropriate accuracy results, generate a profile using nvprof. You will be able to use nvprof to evaluate how effective your optimizations are. 143 | As described above, make sure `rai_build.yml` is configured for a GPU run. 144 | Then, modify `rai_build.yml` to generate a profile instead of just execuing the code. 145 | 146 | nvprof python m1.2.py 147 | 148 | You should see something that looks like the following: 149 | 150 | ~~~bash 151 | ==278== NVPROF is profiling process 278, command: python m1.2.py 152 | Loading model... done 153 | New Inference 154 | EvalMetric: {'accuracy': 0.8154} 155 | ==15163== Profiling application: python m1.2.py 156 | ==15163== Profiling result: 157 | Type Time(%) Time Calls Avg Min Max Name 158 | GPU activities: 39.80% 16.602ms 20 830.11us 1.1200us 16.092ms [CUDA memcpy HtoD] 159 | 20.28% 8.4577ms 1 8.4577ms 8.4577ms 8.4577ms void cudnn::detail::implicit_convolve_sgemm 160 | 11.89% 4.9587ms 1 4.9587ms 4.9587ms 4.9587ms volta_cgemm_64x32_tn 161 | 7.11% 2.9642ms 2 1.4821ms 25.760us 2.9384ms void op_generic_tensor_kernel 162 | 163 | ... 164 | 165 | API calls: 42.14% 3.03300s 22 137.86ms 13.006us 1.56281s cudaStreamCreateWithFlags 166 | 34.07% 2.45202s 24 102.17ms 117.07us 2.44545s cudaMemGetInfo 167 | 21.32% 1.53449s 19 80.763ms 805ns 407.00ms cudaFree 168 | 1.18% 84.772ms 912 92.951us 308ns 38.118ms cudaFuncSetAttribute 169 | 0.47% 33.977ms 9 3.7753ms 33.322us 16.253ms cudaMemcpy2DAsync 170 | 171 | ... 172 | ~~~ 173 | 174 | The GPU Activities section shows the kernels and memory transfers, and the API calls section shows the CUDA API calls that are executed. 175 | There are columns corresponding to percentage of time consumed, total time, number of calls, and average/min/max time of those calls. 176 | Think about the distinction between a CUDA API call and a kernel launch, and describe it briefly in your report. 177 | The CUDA documentation describes [kernels](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#kernels) and the [programming interface](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#programming-interface). 178 | 179 | You can find more information about nvprof in the [CUDA Toolkit Documentation](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) 180 | 181 | ### Create a CPU Implementation 182 | 183 | See the [description](#skeleton-code-description) of the skeleton code for background information, including the data storage layout of the tensors. 184 | 185 | Modify `ece408_src/new-forward.h` to implement the forward convolution described in Chapter 16 of the textbook. 186 | The performance of the CPU convolution is not part of the project evaluation. 187 | The algorithm is also below, for your convenience 188 | 189 | for b = 0 .. B // for each image in the batch 190 | for m = 0 .. M // for each output feature maps 191 | for h = 0 .. H_out // for each output element 192 | for w = 0 .. W_out 193 | { 194 | y[b][m][h][w] = 0; 195 | for c = 0 .. C // sum over all input feature maps 196 | for p = 0 .. K // KxK filter 197 | for q = 0 .. K 198 | y[b][m][h][w] += x[b][c][h + p][w + q] * k[m][c][p][q] 199 | } 200 | 201 | Unlike the convolutions described in the class, note that this one is not centered on the input image. 
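For reference only, here is a rough sketch of how that loop nest could look in plain C++ with flat, row-major arrays. It is not the graded implementation (which must live in `forward()` in `new-forward.h` and use the mshadow tensors), and the helper name `conv_forward_cpu` is made up for illustration:

    // Sketch: straightforward CPU forward convolution over a batch.
    // x: B x C x H x W inputs, k: M x C x K x K filters, y: B x M x H_out x W_out outputs.
    void conv_forward_cpu(float *y, const float *x, const float *k,
                          int B, int M, int C, int H, int W, int K) {
        const int H_out = H - K + 1;   // no padding, stride 1
        const int W_out = W - K + 1;
        for (int b = 0; b < B; ++b)                  // for each image in the batch
            for (int m = 0; m < M; ++m)              // for each output feature map
                for (int h = 0; h < H_out; ++h)      // for each output element
                    for (int w = 0; w < W_out; ++w) {
                        float acc = 0.0f;
                        for (int c = 0; c < C; ++c)          // sum over all input feature maps
                            for (int p = 0; p < K; ++p)      // KxK filter
                                for (int q = 0; q < K; ++q)
                                    acc += x[((b * C + c) * H + (h + p)) * W + (w + q)] *
                                           k[((m * C + c) * K + p) * K + q];
                        y[((b * M + m) * H_out + h) * W_out + w] = acc;
                    }
    }
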
202 | 203 | Because this operator is different than the built-in MXNet operator, you will need to load a different model. 204 | `m2.1.py` handles this for you. 205 | Modify `rai_build.yml` to invoke 206 | 207 | python m2.1.py 208 | 209 | When your implementation is correct, you should see output like this: 210 | 211 | Loading fashion-mnist data... done 212 | Loading model... done 213 | New Inference 214 | Op Time: 10.906517 215 | Op Time: 58.887046 216 | Correctness: 0.7653 Model: ece408 217 | 218 | 219 | Every time your layer is invoked, it will print the "Op Time," the time spent working on that layer. 220 | Since the network has two convolutional layers, two times will be printed. 221 | You can time the whole program execution by modifying `rai_build.yml` with 222 | 223 | /usr/bin/time python m2.1.py 224 | 225 | `m2.1.py` takes one optional argument: the dataset size. 226 | If the correctness for each possible model is as below, you can be reasonably confident your implementation is right. 227 | The correctness does depend on the data size. 228 | 229 | For example, to check your correctness on the full data size of 10000, you could modify `rai_build.yml` to run 230 | 231 | python m2.1.py 10000 232 | 233 | | Model | Number of Images | Correctness | 234 | |-------------| -----| ----- | 235 | | ece408 | 100 | 0.76 | 236 | | ece408 | 1000 | 0.767 | 237 | | ece408 | 10000 (default) | 0.7653 | 238 | 239 | (Final model that will be used for internal evaluation shall be different.) 240 | 241 | The provided `m2.1.py` is identical to the one used by `--submit=m2`. 242 | You may modify `m2.1.py` as you please, but check that `--submit=m2` will still invoke your code correctly. 243 | 244 | Use 245 | 246 | rai -p --queue rai_amd64_ece408 --submit=m2 247 | 248 | to mark your submission. 249 | 250 | ## Milestone 3 251 | 252 | Due October 19 @ 5pm 253 | 254 | | Deliverables | 255 | | ------------ | 256 | | Everything from Milestone 2 | 257 | | Implement a GPU Convolution | 258 | | Correctness and timing with 3 different dataset sizes | 259 | | Report: demonstrate `nvprof` profiling the execution | 260 | | Use `rai -p --queue rai_amd64_ece408 --submit=m3` to mark your job for grading | 261 | 262 | ### Create a GPU Implementation 263 | 264 | Modify `ece408_src/new-forward.cuh` to create GPU implementation of the forward convolution. 265 | 266 | Modify `rai_build.yml` to run 267 | 268 | python m3.1.py 269 | 270 | to use your GPU implementation. 271 | When it is correct, it will show the same correctness as Milestone 2. 272 | 273 | ### Use `nvprof` and NVVP for initial Performance Results 274 | 275 | First, ensure you are using correct image in rai_build.yml file 276 | 277 | `image: illinoisimpact/ece408_mxnet_docker:amd64-gpu-latest-fa19` 278 | 279 | Modify `rai_build.yml` to use nvprof to save some timeline and analysis information, as described in [nvprof](#profiling). 280 | Use the NVIDIA Visual Profiler to find the execution of your kernel, and show it in your report. 281 | The [NVVP on EWS](#nvvp-on-ews) section describes how to install NVVP. 282 | 283 | Use 284 | 285 | rai -p --queue rai_amd64_ece408 --submit=m3 286 | 287 | to mark your submission. 288 | 289 | `m3.1.py` takes one optional argument: the dataset size. 290 | If the correctness for each possible model is as below, you can be reasonably confident your implementation is right. 291 | The correctness does depend on the data size. 
292 | 293 | For example, you could modify `rai_build.yml` to run 294 | 295 | python m3.1.py 296 | 297 | | Model | Number of Images | Correctness | 298 | |-------------| -----| ----- | 299 | | ece408 | 100 | 0.76 | 300 | | ece408 | 1000 | 0.767 | 301 | | ece408 | 10000 (default) | 0.7653 | 302 | 303 | (Final model that will be used for internal evaluation shall be different.) 304 | 305 | ## Milestone 4 306 | 307 | Due November 21 @ 5pm 308 | 309 | | Deliverables | 310 | | ------------ | 311 | | Everything from Milestone 3 | 312 | | Implement three GPU optimizations | 313 | | Report: Describe the optimization | 314 | | Report: demonstrate `nvprof` profiling the execution | 315 | | Report: use NVVP to analyze your optimization | 316 | | Use `rai -p --queue rai_amd64_ece408 --submit=m4` to mark your job for grading | 317 | 318 | ### 3.1 Add three GPU Optimization 319 | 320 | For this milestone, you should attempt at least three GPU optimizations (see [optimizations](#optimizations)). 321 | 322 | Describe the optimizations in your `report.pdf`. 323 | 324 | ### 3.2 Performance Analysis with `nvprof` and NVVP 325 | 326 | Use the NVIDIA Visual Profiler and your analysis information to describe the effect that your optimizations had on the performance of your convolution. 327 | If possible, you should try to separate the effect of each optimization in your analysis. 328 | 329 | Use 330 | 331 | rai -p --queue rai_amd64_ece408 --submit=m4 332 | 333 | to submit your project folder. 334 | 335 | ## Final Submission 336 | 337 | Due December 19 @ 5pm 338 | 339 | | Deliverables | 340 | | ------------ | 341 | | Everything from Milestone 4 | 342 | | Implement final GPU optimizations | 343 | | Report: Describe and analyze the optimizations | 344 | | Report: demonstrate `nvprof` profiling the execution | 345 | | Use `rai -p --queue rai_amd64_ece408 --submit=final` to mark your job for grading | 346 | 347 | ### Optimized Layer 348 | 349 | Optimize your GPU convolution (see [optimizations](#optimizations)). 350 | 351 | Your implementation must work with `rai -p --queue rai_amd64_ece408 --submit=final`. 352 | This means all your source files must be in `ece408_src`, and your implementation must work when they are copied to `src/operator/custom` in the MXNet tree, and `make` is invoked on the MXNet tree. 353 | This is done in the provided `rai_build.yml`. 354 | Likewise, the provided `final.py` provides an example of the script that will be used to time your implementation. 355 | 356 | All of your code for this and the later milestones must be executed between `auto start = ...` and `auto end = ...` in `new-inl.h`. 357 | The easiest way to ensure this is that all of your code should be in `forward()` or called by `forward()` from `new-forward.cuh` or `new-forward.h`. 358 | Do not modify any timing-related code. 359 | 360 | Use `rai -p --queue rai_amd64_ece408 --submit=final` to submit your project folder. 361 | 362 | ### Final Report 363 | 364 | You've been building this final report through all the milestones. 365 | Keep the content from the earlier milestones, but be sure to include the following: 366 | 367 | * Your team name 368 | * Your team member names 369 | * your netids 370 | * your UINs 371 | 372 | The final report should include at least the following information for each optimization 373 | 374 | 1. **Optimization Approach and Results** 375 | * how you identified the optimization opportunity 376 | * why you thought the approach would be fruitful 377 | * the effect of the optimization. 
was it fruitful, and why or why not. Use nvprof and NVVP to justify your explanation. 378 | * Any external references used during identification or development of the optimization 379 | * How your team organized and divided up this work. 380 | 2. **References** (as needed) 381 | 3. **(Optional) Suggestions for Improving Next Year** 382 | 383 | ### Rubric 384 | 385 | The overall project score will be computed as follows: 386 | 387 | 1. Milestone 1 ( 5% ) 388 | 2. Milestone 2 ( 10% ) 389 | 3. Milestone 3 ( 10% ) 390 | 4. Milestone 4 ( 30% ) 391 | * Optimization 1 ( 10% ) 392 | * Optimization 2 ( 10% ) 393 | * Optimization 3 ( 10% ) 394 | 5. Final Optimizations ( 30% ) 395 | * Optimization 4 ( 10% ) 396 | * Optimization 5 ( 10% ) 397 | * Optimization 6 ( 10% ) 398 | * Additional Optimizations / detailed insights ( up to +10% extra!!! ) 399 | 6. Performance Ranking ( 10% ) 400 | 7. Report Style (5 %) 401 | * Clear, concise writing, good layout, and good organization will be rewarded. 402 | 403 | Each optimization will be graded as follows: 404 | 405 | 1. Explanation of Performance Impact ( 40% ) 406 | 2. Correctness ( 60% ) 407 | 408 | The Performance Ranking will be graded as follows: 409 | 410 | 1. The median performance will be determined (how well the class did as a whole) 411 | 2. Your performance will be converted to a number of standard deviations above/below that median (how well you did compared to the class). 412 | 3. That value will be linearly mapped into the space of 0-10 to determine the ranking grade. 413 | 414 | The ranking is determined by the total run time of the two layer invocations. 415 | If your implementation is not correct, you will get a 0 for this component of the grade. 416 | The `rai ranking` command is not the final word: the staff will re-run all final submissions multiple times and choose the fastest result as your time. 417 | THe ranking is determined solely by the values printed by `Op Time:` during your run. 418 | That `Op Time` is computed by wrapping the MXNet op that you implement in a timer. 419 | 420 | ## Optimizations 421 | 422 | We are going to suggest a set of possible optimizations for you to attempt. 423 | 424 | * Unroll + shared-memory Matrix multiply 425 | * Shared Memory convolution 426 | * Kernel fusion for unrolling and matrix-multiplication 427 | * Weight matrix (kernel values) in constant memory 428 | * Tuning with restrict and loop unrolling (considered as one optimization only if you do both) 429 | * An advanced matrix multiplication algorithm (register-tiled, for example) 430 | * Sweeping various parameters to find best values (block sizes, amount of thread coarsening) 431 | * Exploiting parallelism in input images, input channels, and output channels. 432 | * Multiple kernel implementations for different layer sizes 433 | * Input channel reduction: tree 434 | * Input channel reduction: atomics 435 | * ... 436 | 437 | Other optimizations that do not fit in here may also be considered as optimizations. 438 | If in doubt, contact the course staff. 439 | 440 | ## Extras 441 | 442 | ### Checking for Errors 443 | 444 | Within MXNet, you can use `MSHADOW_CUDA_CALL(...);` as is done in `new-forward.cuh`. 445 | Or, you can define a macro/function similar to `wbCheck` used in WebGPU. 446 | 447 | ### Profiling 448 | 449 | You can gather detailed GPU profile information with `nvprof` and view that information with `nvvp`. 
450 | 451 | You can see some simple information like so (as we did in milestone 1): 452 | 453 | nvprof 454 | 455 | You can gather a timeline file like the following: 456 | 457 | nvprof -o timeline.nvprof 458 | 459 | This will generate timeline.nvprof. 460 | 461 | You can additionally gather some detailed performance metrics. 462 | 463 | nvprof -o timeline.nvprof 464 | nvprof --kernels "::forward:1" --analysis-metrics -o forward1_analysis.nvprof 465 | nvprof --kernels "::forward:2" --analysis-metrics -o forward2_analysis.nvprof 466 | 467 | This will generate `timeline.nvprof` and `*analysis.nvprof`. 468 | `--analysis-metrics` significantly slows the run time, you may wish to modify the python scripts to run on smaller datasets during this profiling. 469 | 470 | You will need to follow the link rai prints after the execution to retrieve these files. 471 | You can use the NVIDIA Visual Profiler (nvvp) to import those files. 472 | You will need to install nvvp on your own machine. It can be downloaded as part of the CUDA SDK. 473 | 474 | To import the files: 475 | * File > import > select nvprof > next > single process > next 476 | * timeline data file should be your timeline.nvprof 477 | * event/metrics data file should be your analysis.nvprof. 478 | * finish 479 | 480 | ### NVVP on EWS 481 | 482 | The process will be similar for any machine without an NVIDIA GPU (like your linux laptop). 483 | 484 | If you wish to install it on Windows or macOS, the CUDA Toolkit installer may partially fail if you do not have an NVIDIA GPU. 485 | The teaching staff doesn't support this, but you may be able to figure it out. 486 | 487 | Establish an ssh session with x-forwarding 488 | 489 | ssh -Y @linux.ews.illinois.edu 490 | 491 | Download CUDA toolkit for CentOS 7 and install to `~/software/cuda-10.0` (You may choose a different location). 492 | This takes a while (1GB+ download and install). 493 | 494 | mkdir -p $HOME/software \ 495 | && wget https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux -O cuda10.run \ 496 | && chmod +x cuda10.run \ 497 | && ./cuda10.run --silent --toolkit --toolkitpath=$HOME/software/cuda-10.0 498 | 499 | Free up your EWS space (I'm not sure what the disk quotas are) 500 | 501 | rm cuda10.run 502 | 503 | Optional: modify .bashrc to add `~/software/cuda-10.0/bin` to your path. Or, just run it directly 504 | 505 | ~/software/cuda-10.0/bin/nvvp & 506 | 507 | ### Comparing GPU implementation to CPU implementation 508 | 509 | It may be hard to directly debug by inspecting values during the forward pass since the weights are already trained and the input data is from a real dataset. 510 | You can always extract your implementations into a separate set of files, generate your own test data, and modify `rai_build.yml` to build execute your separate test code instead of the MXNet code while developing. 511 | 512 | A simple code is provided in `build_example`. You could modify the `build` step of rai_build.yml in the following way to compile and run it: 513 | 514 | commands: 515 | build: 516 | - echo "Building arbitrary code" 517 | - make -C /src/build_example 518 | - echo "Running compiled code" 519 | - /src/build_example/main 520 | 521 | ### Offline Development 522 | 523 | If you'd like to develop using a local copy of MXNet, you may do so. Keep in mind your project will be evaluated through rai. Your submission must work through rai. 524 | 525 | Let's use the following directory structure for these instructions. 
The directories will be created each step along the way. 526 | 527 | 528 | ├── fashion-mnist 529 | ├── incubator-mxnet 530 | ├── m1.1.py 531 | ├── m1.2.py 532 | ├── m2.1.py 533 | ├── m3.1.py 534 | ├── m4.1.py 535 | └── models 536 | 537 | The MXNet instructions are available [here](https://mxnet.incubator.apache.org/get_started/install.html). A short form of them follows for Ubuntu. 538 | 539 | # install mxnet prereqs 540 | sudo apt install -y build-essential git libopenblas-dev liblapack-dev libopencv-dev python-pip python-dev python-setuptools python-numpy 541 | # download MXNet release 1.3.0 542 | git clone --single-branch --depth 1 --branch v1.3.0 --recursive https://github.com/apache/incubator-mxnet 543 | # build MXNet 544 | nice -n20 make -C incubator-mxnet -j`nproc` USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 USE_BLAS=openblas 545 | # install python bindings 546 | pip2 install --user -e incubator-mxnet/python 547 | 548 | You can always uninstall the python package with 549 | 550 | pip2 uninstall mxnet 551 | 552 | The training dataset is a modified version of the mxnet dataset. The scripts to generate it are written in python3 553 | 554 | # install data-generation prereqs 555 | sudo apt install python3 python3-pip 556 | pip3 install --user numpy scikit-image 557 | mkdir -p fashion-mnist 558 | wget -P fashion-mnist \ 559 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/scripts/generate-data.py \ 560 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/scripts/reader.py 561 | 562 | Run the generation script. It will download the fashion-mnist dataset and resize it, which may take a few minutes and consume a few hundred megabytes of disk space 563 | 564 | chmod +x fashion-mnist/generate-data.py 565 | fashion-mnist/generate-data.py fashion-mnist 566 | 567 | Download the trained models (for the existing MXNet implementation and your implementation) using 568 | 569 | mkdir -p models \ 570 | && wget -P models \ 571 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/models/baseline-0002.params \ 572 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/models/baseline-symbol.json \ 573 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/models/ece408-002.params \ 574 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/models/ece408-symbol.json 575 | 576 | Download the scripts we use for evaluation (needs to be modified to use 74x74 input image size) 577 | 578 | wget \ 579 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/scripts/m1.1.py \ 580 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/scripts/m1.2.py \ 581 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/scripts/m2.1.py \ 582 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/scripts/m3.1.py \ 583 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/scripts/m4.1.py 584 | 585 | 586 | Download the skeleton source files into incubator-mxnet. This is also where you will put the skeleton code from `ece408_src`. 
587 | 588 | wget -P incubator-mxnet/src/operator/custom \ 589 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/ece408_src/new.cc \ 590 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/ece408_src/new.cu \ 591 | https://github.com/illinois-impact/ece408_mxnet_docker/raw/2019sp/ece408_src/new-inl.h 592 | 593 | Modify the python forward convolution scripts to point to where you downloaded fashion-mnist 594 | 595 | ... load_mnist(path="fashion-mnist", ...) 596 | 597 | Modify the python forward convolution scripts to point to where you downloaded the models 598 | 599 | lenet_model = mx.mod.Module.load(prefix='models/baseline' ... 600 | 601 | Build your modified MXNet 602 | 603 | cp <your ece408_src files> incubator-mxnet/src/operator/custom 604 | make -C incubator-mxnet USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1 605 | 606 | 607 | ### Skeleton Code Description 608 | 609 | `new-forward.h` and `new-forward.cuh` contain skeleton implementations for CPU and GPU convolutions. You can complete the project by modifying only these two files. These functions are called from `Forward()` in `new-inl.h`. 610 | 611 | The code in `new-inl.h`, `new.cc`, and `new.cu` describes the convolution layer to MXNet. You should not modify these files. They are provided for your curiosity. 612 | As of rai 0.2.20, when you use the `--submit` flag, a golden version of these files from [here](https://github.com/cwpearson/2017fa_ece408_mxnet_docker/tree/master/ece408-src) is used. 613 | 614 | | File | Function | Description | 615 | | -- | -- | -- | 616 | | `new-forward.h` | `forward()` | Your CPU implementation goes here. | 617 | | `new-forward.cuh` | `forward()` | Your GPU host code goes here. | 618 | | `new-forward.cuh` | `forward_kernel()` | Your GPU kernel implementation goes here. | 619 | | -- | -- | -- | 620 | | `new-inl.h` | `InferShape()` | Computes shape of output tensor from input and kernel shape | 621 | | `new-inl.h` | `InferType()` | Computes type of the output tensor based on the inputs. | 622 | | `new-inl.h` | `Forward()` | Defines the operations of the forward pass. Calls our implementation. | 623 | | `new-inl.h` | `Backward()` | Defines the operations of the backward (training) pass. Not used in this project. | 624 | | `new-inl.h` | `struct NewParam` | Defines the arguments passed to the operator in python. | 625 | | `new.cc` | `CreateOperatorEx()` | Called by MXNet to create the appropriate operator for a CPU or GPU execution. | 626 | | `new.cc` | `CreateOp()` | Creates the CPU operator. | 627 | | `new.cu` | `CreateOp()` | Creates the GPU operator when CUDA is enabled. | 628 | 629 | The `x`, `y`, and `k` tensors constructed in `new-inl.h`/`Forward()` have the following data layout: 630 | 631 | | Tensor | Description | Data Layout | 632 | | -- | -- | -- | 633 | | `x` | Input data | batch size * input channels * y * x | 634 | | `y` | Output data | batch size * output channels * y * x | 635 | | `k` | Kernel weights | output channels * input channels * y * x | 636 | 637 | You can see this being constructed in `new-inl.h`/`InferShape()`. 638 | 639 | 640 | ### Installing CUDA locally 641 | 642 | The Docker container that we use to run your code runs on CUDA 10.0. 643 | To view the nvprof results, you need to install the CUDA toolkit locally. 644 | 645 | You can download the CUDA toolkit from: https://developer.nvidia.com/cuda-downloads. 646 | Follow the installation instructions. 647 | 648 | If you don't have a CUDA-enabled (NVIDIA) GPU, then don't install the driver. 
Just use the CUDA toolkit and it should work smoothly. 649 | If you are stuck on how to use it, please visit the TA office hours. 650 | 651 | We might consider updating the CUDA toolkit version inside the Docker container. We will let you know if we do. 652 | 653 | ## License 654 | 655 | NCSA/UIUC © 2018 [Carl Pearson](https://cwpearson.github.io) 656 | 657 | Modified in fall 2018 [Vikram](https://github.com/msharmavikram/) 658 | 659 | Last modified by Rui Lan and Zhichun Wan 660 | 661 | ## Final Report 662 | - [Final Report](https://github.com/leo811121/UIUC-CS-483-Parallel-Programming/blob/master/ece408_project/report.pdf) 663 | -------------------------------------------------------------------------------- /ece408_project/_gitignore: -------------------------------------------------------------------------------- 1 | *.pyc -------------------------------------------------------------------------------- /ece408_project/_gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leo811121/UIUC-CS-483-Parallel-Programming/0aa3d6097073c4dd5de7f9a52b54e9d230a4df4a/ece408_project/_gitmodules -------------------------------------------------------------------------------- /ece408_project/build_example/Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: main 3 | 4 | main: main.cu 5 | nvcc main.cu -o main 6 | 7 | .PHONY: clean 8 | clean: 9 | rm -f main -------------------------------------------------------------------------------- /ece408_project/build_example/main.cu: -------------------------------------------------------------------------------- 1 | #include <cstdio> 2 | #include <cstdlib> 3 | int main(void) { 4 | fprintf(stdout, "Hello world\n"); 5 | 6 | int deviceCount = 0; 7 | cudaError_t error_id = cudaGetDeviceCount(&deviceCount); 8 | 9 | if (error_id != cudaSuccess) { 10 | printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id, cudaGetErrorString(error_id)); 11 | printf("Result = FAIL\n"); 12 | exit(EXIT_FAILURE); 13 | } else { 14 | printf("Found %d GPUs\n", deviceCount); 15 | } 16 | 17 | } -------------------------------------------------------------------------------- /ece408_project/ece408_src/new-forward.cuh: -------------------------------------------------------------------------------- 1 | 2 | #ifndef MXNET_OPERATOR_NEW_FORWARD_CUH_ 3 | #define MXNET_OPERATOR_NEW_FORWARD_CUH_ 4 | 5 | #include 6 | #define TILE_WIDTH 5 7 | #define TILE_WIDTH1 16 8 | #define TILE_WIDTH2 24 9 | 10 | #define y4d(i3, i2, i1, i0) y[(i3) * (M * H_out * W_out) + (i2) * (H_out * W_out) + (i1) * (W_out) + i0] 11 | #define x4d(i3, i2, i1, i0) x[(i3) * (C * H * W) + (i2) * (H * W) + (i1) * (W) + i0] 12 | #define k4d(i3, i2, i1, i0) k[(i3) * (C * K * K) + (i2) * (K * K) + (i1) * (K) + i0] 13 | #define X_unroll(i2, i1, i0) X_unroll[(i2) * (C * K * K * H_out * W_out) + (i1) * (H_out * W_out) + i0] 14 | #define k_unroll(i1, i0) k_unroll[i1 * (C * K * K) + i0] 15 | #define xunroll3d(i2, i1, i0) x_unroll[(i2) * (C * K * K * H_out * W_out) + (i1) * (H_out * W_out) + i0] 16 | #define kunroll2d(i1, i0) k_unroll[(i1) * (C * K * K) + i0] 17 | 18 | #define TILE_SZ_A 128 19 | #define TILE_SZ_B 8 20 | 21 | #define kernel4d(i3, i2, i1, i0) kernel[(i3) * (C * K * K) + (i2) * (K * K) + (i1) * (K) + i0] 22 | 23 | 24 | namespace mxnet 25 | { 26 | namespace op 27 | { 28 | __constant__ float kernel[7200]; 29 | 30 | //baseline 31 | __global__ void forward_kernel(float *y, const float *x, const float *k, const int B, const int M, const int C, const 
int H, const int W, const int K) 32 | { 33 | 34 | const int H_out = H - K + 1; 35 | const int W_out = W - K + 1; 36 | 37 | int W_grid = ceil((float)W_out / TILE_WIDTH); 38 | const int b = blockIdx.x; 39 | const int m = blockIdx.y; 40 | const int h = blockIdx.z / W_grid * TILE_WIDTH + threadIdx.y; 41 | const int w = blockIdx.z % W_grid * TILE_WIDTH + threadIdx.x; 42 | if(h < H_out && w < W_out){ 43 | float val = 0; 44 | //#pragma unroll 2 45 | for(int c = 0; c < C; c++){ 46 | //#pragma unroll 5 47 | for(int p = 0; p < K; p++){ 48 | //#pragma unroll 5 49 | for(int q = 0; q < K; q++){ 50 | //val += x4d(b, c, h+p, w+q) * k4d(m, c, p, q); 51 | val += x4d(b, c, h+p, w+q) * kernel4d(m, c, p, q); 52 | } 53 | } 54 | } 55 | y4d(b, m, h, w) = val; 56 | } 57 | } 58 | 59 | 60 | //kernel for first convolution layer 61 | __global__ void forward_kernel1(float * __restrict__ y, const float * __restrict__ x, const float * __restrict__ k, const int B, const int M, const int C, const int H, const int W, const int K) 62 | { 63 | 64 | const int H_out = H - K + 1; 65 | const int W_out = W - K + 1; 66 | 67 | //int W_grid = ceil((float)W_out / TILE_WIDTH); 68 | int W_grid = ceil((float)W_out / 16); 69 | const int b = blockIdx.x; 70 | const int m = blockIdx.y; 71 | //const int h = blockIdx.z / W_grid * TILE_WIDTH + threadIdx.y; 72 | //const int w = blockIdx.z % W_grid * TILE_WIDTH + threadIdx.x; 73 | const int h = blockIdx.z / W_grid * 16 + threadIdx.y; 74 | const int w = blockIdx.z % W_grid * 16 + threadIdx.x; 75 | if(h < H_out && w < W_out){ 76 | float val = 0; 77 | //#pragma unroll 2 78 | //for(int c = 0; c < C; c++){ 79 | #pragma unroll 5 80 | for(int p = 0; p < K; p++){ 81 | #pragma unroll 5 82 | for(int q = 0; q < K; q++){ 83 | //val += x4d(b, c, h+p, w+q) * k4d(m, c, p, q); 84 | //val += x4d(b, c, h+p, w+q) * kernel4d(m, c, p, q); 85 | val += x4d(b, 0, h+p, w+q) * kernel4d(m, 0, p, q); 86 | } 87 | } 88 | //} 89 | y4d(b, m, h, w) = val; 90 | } 91 | } 92 | 93 | 94 | //kernel for second convolution layer 95 | __global__ void forward_kernel2(float* /*__restrict__*/ y, const float* /*__restrict__*/ x, const float* __restrict__ k, const int B, const int M, const int C, const int H, const int W, const int K) 96 | { 97 | 98 | const int H_out = H - K + 1; 99 | const int W_out = W - K + 1; 100 | 101 | int W_grid = ceil((float)W_out / TILE_WIDTH); 102 | const int b = blockIdx.x; 103 | const int m = blockIdx.y; 104 | const int h = blockIdx.z / W_grid * TILE_WIDTH + threadIdx.y; 105 | const int w = blockIdx.z % W_grid * TILE_WIDTH + threadIdx.x; 106 | if(h < H_out && w < W_out){ 107 | float val = 0; 108 | #pragma unroll 12 109 | for(int c = 0; c < C; c++){ 110 | #pragma unroll 5 111 | for(int p = 0; p < K; p++){ 112 | #pragma unroll 5 113 | for(int q = 0; q < K; q++){ 114 | //val += x4d(b, c, h+p, w+q) * k4d(m, c, p, q); 115 | val += x4d(b, c, h+p, w+q) * kernel4d(m, c, p, q); 116 | } 117 | } 118 | } 119 | y4d(b, m, h, w) = val; 120 | } 121 | } 122 | 123 | 124 | __global__ void unroll_x(const int C, const int H, const int W, const int K, const float* x, float* X_unroll, const int B, int B_pre) 125 | { 126 | int t = blockIdx.x * blockDim.x + threadIdx.x; 127 | int b = blockIdx.y * blockDim.y + threadIdx.y; 128 | int c, s, h_out, w_out, w_unroll, h_unroll, h_base, p, q; 129 | int H_out = H - K + 1; 130 | int W_out = W - K + 1; 131 | int out_number = H_out * W_out; 132 | if (t < C * out_number && b < (B-B_pre)) { 133 | c = t / out_number; 134 | s = t % out_number; 135 | h_out = s / W_out; 136 | w_out = s % W_out; 137 | 
w_unroll = h_out * W_out + w_out; 138 | h_base = c * K * K; 139 | 140 | 141 | for(p = 0; p < K; p++){ 142 | for(q = 0; q < K; q++) { 143 | h_unroll = h_base + p * K + q; 144 | X_unroll(b, h_unroll, w_unroll) = x4d(b+B_pre, c, h_out + p, w_out + q); 145 | } 146 | } 147 | 148 | } 149 | } 150 | 151 | __global__ void matrixMultiply(float *A, float *B, float *C, int numARows, 152 | int numAColumns, int numBRows, 153 | int numBColumns, int numCRows, 154 | int numCColumns, int batch, int B_pre) { 155 | 156 | int Row = blockIdx.y * blockDim.y + threadIdx.y; 157 | int Col = blockIdx.x * blockDim.x + threadIdx.x; 158 | int Layer = blockIdx.z * blockDim.z + threadIdx.z; 159 | 160 | int tx = threadIdx.x; 161 | int ty = threadIdx.y; 162 | 163 | __shared__ float tileA[TILE_WIDTH][TILE_WIDTH]; 164 | __shared__ float tileB[TILE_WIDTH][TILE_WIDTH]; 165 | 166 | int numTiles = numAColumns/TILE_WIDTH; 167 | if (numAColumns%TILE_WIDTH) numTiles++; 168 | 169 | float CVal = 0.0; 170 | 171 | 172 | for(int a = 0; a < numTiles; ++a){ 173 | 174 | if(Row < numARows && a * TILE_WIDTH + tx < numAColumns){ 175 | tileA[ty][tx] = A[Row * numAColumns + a * TILE_WIDTH + tx]; 176 | } 177 | 178 | else tileA[ty][tx] = 0.0; 179 | 180 | if(a * TILE_WIDTH + ty < numBRows && Col < numBColumns){ 181 | tileB[ty][tx] = B[(Layer) * numBColumns * numBRows + (a * TILE_WIDTH + ty) * numBColumns + Col]; 182 | } 183 | else tileB[ty][tx] = 0.0; 184 | __syncthreads(); 185 | 186 | for(int k = 0; k < TILE_WIDTH; ++k){ 187 | CVal += tileA[ty][k] * tileB[k][tx]; 188 | } 189 | __syncthreads(); 190 | 191 | } 192 | if(Row < numCRows && Col < numCColumns && Layer < (batch-B_pre)){ 193 | C[(Layer+B_pre) * numCColumns * numCRows + Row * numCColumns + Col] = CVal; 194 | } 195 | } 196 | 197 | __global__ void reg_matrixMultiply(float *A, float *B, float *C, int numARows, 198 | int numAColumns, int numBRows, 199 | int numBColumns, int numCRows, 200 | int numCColumns, int batch, int B_pre) { 201 | 202 | __shared__ float tile[TILE_SZ_A]; 203 | 204 | int ty = threadIdx.y; 205 | int row = ty + blockIdx.y*blockDim.y; 206 | int Layer = blockIdx.z * blockDim.z + threadIdx.z; 207 | float reg = 0.0; 208 | float Pvalues[TILE_SZ_B]={0}; 209 | 210 | for(int ph=0;ph 491 | void forward(mshadow::Tensor &y, const mshadow::Tensor &x, const mshadow::Tensor &w) 492 | { 493 | 494 | // Extract the tensor dimensions into B,M,C,H,W,K 495 | // ... 
496 | 497 | const int B = x.shape_[0]; 498 | const int C = x.shape_[1]; 499 | const int H = x.shape_[2]; 500 | const int W = x.shape_[3]; 501 | 502 | const int M = y.shape_[1]; 503 | const int K = w.shape_[3]; 504 | 505 | int H_out = H - K + 1; 506 | int W_out = W - K + 1; 507 | 508 | //printf("Hout= %d, WOUT = %d, K = %d , M = %d\n", H_out, W_out, K, M); 509 | 510 | 511 | /*********************************different layer ***********************************************************/ 512 | 513 | if(C==1){ 514 | 515 | //unroll directly to share memory 516 | dim3 gridDim1(ceil(H_out*W_out/(1.0*TILE_WIDTH1)),ceil(M/(1.0*TILE_WIDTH1)),B); 517 | dim3 blockDim1(TILE_WIDTH1,TILE_WIDTH1,1); 518 | unrolltoshare1<<>>(C, K, M, H, W ,W_out, H_out, x.dptr_, w.dptr_, y.dptr_); 519 | 520 | /* 521 | int W_grid = ceil(1.0*W_out / 16); 522 | int H_grid = ceil(1.0*H_out / 16); 523 | 524 | int Z = H_grid * W_grid; 525 | 526 | cudaMemcpyToSymbol(kernel, w.dptr_, w.shape_[0]*w.shape_[1]*w.shape_[2]*w.shape_[3]* sizeof(float), 0, cudaMemcpyDeviceToDevice); 527 | // Set the kernel dimensions 528 | dim3 blockDim(16, 16, 1); 529 | dim3 gridDim(B, M, Z); 530 | 531 | // Call the kernel 532 | forward_kernel1<<>>(y.dptr_,x.dptr_,w.dptr_,B,M,C,H,W,K); 533 | */ 534 | 535 | } 536 | 537 | else if(C==12){ 538 | 539 | //unroll directly to share memory 540 | dim3 gridDim2(ceil(H_out*W_out/(1.0*TILE_WIDTH2)),ceil(M/(1.0*TILE_WIDTH2)),B); 541 | dim3 blockDim2(TILE_WIDTH2,TILE_WIDTH2,1); 542 | unrolltoshare2<<>>(C, K, M, H, W ,W_out, H_out, x.dptr_, w.dptr_, y.dptr_); 543 | 544 | /* 545 | int W_grid = ceil(1.0*W_out / 32); 546 | int H_grid = ceil(1.0*H_out / 32); 547 | 548 | int Z = H_grid * W_grid; 549 | 550 | //constant memory 551 | cudaMemcpyToSymbol(kernel, w.dptr_, w.shape_[0]*w.shape_[1]*w.shape_[2]*w.shape_[3]* sizeof(float), 0, cudaMemcpyDeviceToDevice); 552 | // Set the kernel dimensions 553 | dim3 blockDim(32, 32, 1); 554 | dim3 gridDim(B, M, Z); 555 | 556 | // Call the kernel 557 | forward_kernel2<<>>(y.dptr_,x.dptr_,w.dptr_,B,M,C,H,W,K); 558 | */ 559 | } 560 | 561 | 562 | 563 | /*********************************unroll + share memory multiplication***********************************************************/ 564 | /* 565 | //parameters for matrix multiply 566 | int numARows; 567 | int numAColumns; 568 | int numBRows; 569 | int numBColumns; 570 | int numCRows; 571 | int numCColumns; 572 | 573 | numARows = M; 574 | numAColumns = C *K *K; 575 | 576 | numBRows = C *K * K; 577 | numBColumns = H_out * W_out; 578 | 579 | numCRows = numARows; 580 | numCColumns = numBColumns; 581 | 582 | //unroll x & matrix multi. 
583 | //part1 584 | int B_curr = 4000; 585 | int B_pre = 0; 586 | float *x_unroll; 587 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 588 | dim3 unrollGrid(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 589 | dim3 unrollBlock(TILE_WIDTH, TILE_WIDTH, 1); 590 | unroll_x<<>>(C, H, W, K, x.dptr_, x_unroll, B_curr, B_pre); 591 | 592 | dim3 matrixGrid(numCColumns/TILE_WIDTH,numCRows/TILE_WIDTH,(B_curr-B_pre)); 593 | if (numCColumns%TILE_WIDTH) matrixGrid.x++; 594 | if (numCRows%TILE_WIDTH) matrixGrid.y++; 595 | dim3 matrixBlock(TILE_WIDTH,TILE_WIDTH,1); 596 | matrixMultiply<<>>(w.dptr_, x_unroll, y.dptr_, numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns,B_curr, B_pre); 597 | cudaFree(x_unroll); 598 | 599 | //part2 600 | B_curr = 8000; 601 | B_pre = 4000; 602 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 603 | dim3 unrollGrid_2(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 604 | dim3 unrollBlock_2(TILE_WIDTH, TILE_WIDTH, 1); 605 | unroll_x<<>>(C, H, W, K, x.dptr_, x_unroll, B_curr, B_pre); 606 | 607 | dim3 matrixGrid2(numCColumns/TILE_WIDTH,numCRows/TILE_WIDTH,(B_curr-B_pre)); 608 | if (numCColumns%TILE_WIDTH) matrixGrid2.x++; 609 | if (numCRows%TILE_WIDTH) matrixGrid2.y++; 610 | //dim3 matrixGrid(ceil((float)numCColumns/TILE_WIDTH),ceil((float)numCRows/TILE_WIDTH,(B_curr-B_pre))); 611 | dim3 matrixBlock2(TILE_WIDTH,TILE_WIDTH,1); 612 | matrixMultiply<<>>(w.dptr_, x_unroll, y.dptr_, numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns,B_curr, B_pre); 613 | cudaFree(x_unroll); 614 | 615 | //part3 616 | B_curr = 10000; 617 | B_pre = 8000; 618 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 619 | dim3 unrollGrid_3(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 620 | dim3 unrollBlock_3(TILE_WIDTH, TILE_WIDTH, 1); 621 | unroll_x<<>>(C, H, W, K, x.dptr_, x_unroll, B_curr, B_pre); 622 | 623 | dim3 matrixGrid3(numCColumns/TILE_WIDTH,numCRows/TILE_WIDTH,(B_curr-B_pre)); 624 | if (numCColumns%TILE_WIDTH) matrixGrid3.x++; 625 | if (numCRows%TILE_WIDTH) matrixGrid3.y++; 626 | //dim3 matrixGrid(ceil((float)numCColumns/TILE_WIDTH),ceil((float)numCRows/TILE_WIDTH,(B_curr-B_pre))); 627 | dim3 matrixBlock3(TILE_WIDTH,TILE_WIDTH,1); 628 | matrixMultiply<<>>(w.dptr_, x_unroll, y.dptr_, numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns,B_curr, B_pre); 629 | cudaFree(x_unroll); 630 | */ 631 | /**********************************************unroll + register tiling multiplication*****************************************************************************************************************/ 632 | 633 | /* 634 | //parameters for matrix multiply 635 | int numARows; 636 | int numAColumns; 637 | int numBRows; 638 | int numBColumns; 639 | int numCRows; 640 | int numCColumns; 641 | 642 | numARows = M; 643 | numAColumns = C *K *K; 644 | 645 | numBRows = C *K * K; 646 | numBColumns = H_out * W_out; 647 | 648 | numCRows = numARows; 649 | numCColumns = numBColumns; 650 | 651 | //reg tiling matrix multiply 652 | //part1 653 | 654 | int B_curr = 4000; 655 | int B_pre = 0; 656 | float *x_unroll; 657 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 658 | dim3 unrollGrid(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 659 | dim3 unrollBlock(TILE_WIDTH, TILE_WIDTH, 1); 660 | 
unroll_x<<>>(C, H, W, K, x.dptr_, x_unroll, B_curr, B_pre); 661 | 662 | dim3 tileGrid((numCColumns-1)/TILE_SZ_B +1,(numCRows-1)/TILE_SZ_A+1,(B_curr-B_pre)); 663 | dim3 tileBlock(1,TILE_SZ_A,1); 664 | reg_matrixMultiply<<>>(w.dptr_, x_unroll, y.dptr_, numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns,B_curr, B_pre); 665 | cudaFree(x_unroll); 666 | 667 | //part2 668 | B_curr = 8000; 669 | B_pre = 4000; 670 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 671 | dim3 unrollGrid2(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 672 | dim3 unrollBlock2(TILE_WIDTH, TILE_WIDTH, 1); 673 | unroll_x<<>>(C, H, W, K, x.dptr_, x_unroll, B_curr, B_pre); 674 | 675 | dim3 tileGrid2((numCColumns-1)/TILE_SZ_B +1,(numCRows-1)/TILE_SZ_A+1,(B_curr-B_pre)); 676 | dim3 tileBlock2(1,TILE_SZ_A,1); 677 | reg_matrixMultiply<<>>(w.dptr_, x_unroll, y.dptr_, numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns,B_curr, B_pre); 678 | cudaFree(x_unroll); 679 | 680 | //part3 681 | B_curr = 10000; 682 | B_pre = 8000; 683 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 684 | dim3 unrollGrid3(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 685 | dim3 unrollBlock3(TILE_WIDTH, TILE_WIDTH, 1); 686 | unroll_x<<>>(C, H, W, K, x.dptr_, x_unroll, B_curr, B_pre); 687 | 688 | dim3 tileGrid3((numCColumns-1)/TILE_SZ_B +1,(numCRows-1)/TILE_SZ_A+1,(B_curr-B_pre)); 689 | dim3 tileBlock3(1,TILE_SZ_A,1); 690 | reg_matrixMultiply<<>>(w.dptr_, x_unroll, y.dptr_, numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns,B_curr, B_pre); 691 | cudaFree(x_unroll); 692 | */ 693 | /************************************************fusion unroll sharmemory multiplication*************************************************************************/ 694 | 695 | /* 696 | //parameters for matrix multiply 697 | int numARows; 698 | int numAColumns; 699 | int numBRows; 700 | int numBColumns; 701 | int numCRows; 702 | int numCColumns; 703 | 704 | numARows = M; 705 | numAColumns = C *K *K; 706 | 707 | numBRows = C *K * K; 708 | numBColumns = H_out * W_out; 709 | 710 | numCRows = numARows; 711 | numCColumns = numBColumns; 712 | 713 | 714 | //fusion 715 | //part1 716 | float *x_unroll; 717 | int B_curr = 4000; 718 | int B_pre = 0; 719 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 720 | dim3 unrollGrid(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 721 | dim3 unrollBlock(TILE_WIDTH, TILE_WIDTH, 1); 722 | dim3 fusionGrid(unrollGrid.x,unrollGrid.y, B_curr-B_pre); 723 | dim3 fusionBlock(TILE_WIDTH,TILE_WIDTH, 1); 724 | fusion<<>>(C, H, W, K, M, B_curr, B_pre, x.dptr_, y.dptr_, w.dptr_, x_unroll,numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns); 725 | cudaFree(x_unroll); 726 | 727 | //part2 728 | B_curr = 8000; 729 | B_pre = 4000; 730 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 731 | dim3 unrollGrid2(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 732 | dim3 unrollBlock2(TILE_WIDTH, TILE_WIDTH, 1); 733 | dim3 fusionGrid2(unrollGrid2.x,unrollGrid2.y, B_curr-B_pre); 734 | dim3 fusionBlock2(TILE_WIDTH,TILE_WIDTH, 1); 735 | fusion<<>>(C, H, W, K, M, B_curr, B_pre, x.dptr_, y.dptr_, w.dptr_, x_unroll,numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns); 736 | cudaFree(x_unroll); 737 | 738 | //part3 739 | //part3 
740 | B_curr = 10000; 741 | B_pre = 8000; 742 | cudaMalloc((void **)&x_unroll, (B_curr-B_pre) * K * K * C * H_out* W_out*sizeof(float)); 743 | dim3 unrollGrid3(ceil((float)C*H_out*W_out/TILE_WIDTH), ceil((float)(B_curr-B_pre)/TILE_WIDTH), 1); 744 | dim3 unrollBlock3(TILE_WIDTH, TILE_WIDTH, 1); 745 | dim3 fusionGrid3(unrollGrid3.x,unrollGrid3.y, B_curr-B_pre); 746 | dim3 fusionBlock3(TILE_WIDTH,TILE_WIDTH, 1); 747 | fusion<<>>(C, H, W, K, M, B_curr, B_pre, x.dptr_, y.dptr_, w.dptr_, x_unroll,numARows,numAColumns,numBRows,numBColumns,numCRows,numCColumns); 748 | cudaFree(x_unroll); 749 | MSHADOW_CUDA_CALL(cudaDeviceSynchronize()); 750 | */ 751 | } 752 | 753 | 754 | template 755 | void forward(mshadow::Tensor &y, const mshadow::Tensor &x, const mshadow::Tensor &w) 756 | { 757 | //CHECK_EQ(0,1) << "Remove this line and replace it with your implementation."; 758 | } 759 | } 760 | } 761 | 762 | #undef y4d 763 | #undef x4d 764 | #undef k4d 765 | #undef X_unroll 766 | #undef funroll2d 767 | #endif 768 | 769 | -------------------------------------------------------------------------------- /ece408_project/ece408_src/new-forward.h: -------------------------------------------------------------------------------- 1 | #ifndef MXNET_OPERATOR_NEW_FORWARD_H_ 2 | #define MXNET_OPERATOR_NEW_FORWARD_H_ 3 | 4 | #include 5 | 6 | namespace mxnet 7 | { 8 | namespace op 9 | { 10 | 11 | 12 | template 13 | void forward(mshadow::Tensor &y, const mshadow::Tensor &x, const mshadow::Tensor &k) 14 | { 15 | /* 16 | Modify this function to implement the forward pass described in Chapter 16. 17 | The code in 16 is for a single image. 18 | We have added an additional dimension to the tensors to support an entire mini-batch 19 | The goal here is to be correct, not fast (this is the CPU implementation.) 
20 | */ 21 | 22 | const int B = x.shape_[0]; 23 | const int M = y.shape_[1]; 24 | const int C = x.shape_[1]; 25 | const int H = x.shape_[2]; 26 | const int W = x.shape_[3]; 27 | const int K = k.shape_[3]; 28 | const int H_out = H - K + 1; const int W_out = W - K + 1; // output spatial dimensions 29 | for (int b = 0; b < B; ++b) { 30 | for(int m = 0; m < M; ++m){ 31 | for(int h = 0; h < H_out; ++h){ 32 | for(int w = 0; w < W_out; ++w){ 33 | y[b][m][h][w] = 0; 34 | for(int c = 0; c < C; ++c){ 35 | for(int p = 0; p < K; ++p){ 36 | for(int q = 0; q < K; ++q){ 37 | y[b][m][h][w] += x[b][c][h + p][w + q] * k[m][c][p][q]; 38 | } 39 | } 40 | } 41 | } 42 | } 43 | } 44 | } 45 | 46 | 47 | 48 | 49 | } 50 | } 51 | } 52 | 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /ece408_project/final.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import mxnet as mx 4 | import logging 5 | import sys 6 | from reader import load_mnist 7 | 8 | MODEL_DIR = "/models" 9 | model_prefix = "ece408" 10 | dataset_size = float("inf") 11 | 12 | if len(sys.argv) > 1: 13 | dataset_size = int(sys.argv[1]) 14 | if len(sys.argv) > 2: 15 | print "Usage:", sys.argv[0], "<dataset size>" 16 | print " <dataset size> = [0 - 10000]" 17 | sys.exit(-1) 18 | 19 | # Log to stdout for MXNet 20 | logging.getLogger().setLevel(logging.DEBUG) # logging to stdout 21 | 22 | print "Loading fashion-mnist data...", 23 | test_images, test_labels = load_mnist( 24 | path="/fashion-mnist", rows=70, cols=70, kind="t10k-70") 25 | print "done" 26 | 27 | # Reduce the size of the dataset, if desired 28 | dataset_size = max(0, min(dataset_size, 10000)) 29 | test_images = test_images[:dataset_size] 30 | test_labels = test_labels[:dataset_size] 31 | 32 | # Cap batch size at the size of our training data 33 | batch_size = len(test_images) 34 | 35 | # Get iterators that cover the dataset 36 | test_iter = mx.io.NDArrayIter( 37 | test_images, test_labels, batch_size) 38 | 39 | # Evaluate the network 40 | print "Loading model...", 41 | lenet_model = mx.mod.Module.load( 42 | prefix=MODEL_DIR + "/" + model_prefix, epoch=2, context=mx.gpu()) 43 | lenet_model.bind(data_shapes=test_iter.provide_data, 44 | label_shapes=test_iter.provide_label) 45 | print "done" 46 | 47 | print "New Inference" 48 | acc = mx.metric.Accuracy() 49 | lenet_model.score(test_iter, acc) 50 | print "Correctness:", acc.get()[1], "Model:", model_prefix 51 | -------------------------------------------------------------------------------- /ece408_project/m1.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import mxnet as mx 4 | import logging 5 | from reader import load_mnist 6 | 7 | # Log to stdout for MXNet 8 | logging.getLogger().setLevel(logging.DEBUG) # logging to stdout 9 | 10 | print "Loading fashion-mnist data...", 11 | test_images, test_labels = load_mnist( 12 | path="/fashion-mnist", rows=70, cols=70, kind="t10k-70") 13 | print "done" 14 | 15 | # Do everything in a single batch 16 | batch_size = len(test_images) 17 | 18 | # Get iterators that cover the dataset 19 | test_iter = mx.io.NDArrayIter( 20 | test_images, test_labels, batch_size) 21 | 22 | # Evaluate the network 23 | print "Loading model...", 24 | lenet_model = mx.mod.Module.load( 25 | prefix='/models/baseline', epoch=2, context=mx.cpu()) 26 | lenet_model.bind(data_shapes=test_iter.provide_data, 27 | label_shapes=test_iter.provide_label) 28 | print "done" 29 | 30 | print "New Inference" 31 | acc = mx.metric.Accuracy() 32 | lenet_model.score(test_iter, acc) 33 | 
print(acc) 34 | -------------------------------------------------------------------------------- /ece408_project/m1.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import mxnet as mx 4 | import logging 5 | from reader import load_mnist 6 | 7 | # Log to stdout for MXNet 8 | logging.getLogger().setLevel(logging.DEBUG) # logging to stdout 9 | 10 | print "Loading fashion-mnist data...", 11 | test_images, test_labels = load_mnist( 12 | path="/fashion-mnist", rows=70, cols=70, kind="t10k-70") 13 | print "done" 14 | 15 | # Do everything in a single batch 16 | batch_size = len(test_images) 17 | 18 | # Get iterators that cover the dataset 19 | test_iter = mx.io.NDArrayIter( 20 | test_images, test_labels, batch_size) 21 | 22 | # Evaluate the network 23 | print "Loading model...", 24 | lenet_model = mx.mod.Module.load( 25 | prefix='/models/baseline', epoch=2, context=mx.gpu()) 26 | lenet_model.bind(data_shapes=test_iter.provide_data, 27 | label_shapes=test_iter.provide_label) 28 | print "done" 29 | 30 | print "New Inference" 31 | acc = mx.metric.Accuracy() 32 | lenet_model.score(test_iter, acc) 33 | print(acc) 34 | -------------------------------------------------------------------------------- /ece408_project/m2.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import mxnet as mx 4 | import logging 5 | import sys 6 | from reader import load_mnist 7 | 8 | MODEL_DIR = "/models" 9 | model_prefix = "ece408" 10 | dataset_size = float("inf") 11 | 12 | if len(sys.argv) > 1: 13 | dataset_size = int(sys.argv[1]) 14 | if len(sys.argv) > 2: 15 | print "Usage:", sys.argv[0], "" 16 | print " = [0 - 10000]" 17 | sys.exit(-1) 18 | 19 | # Log to stdout for MXNet 20 | logging.getLogger().setLevel(logging.DEBUG) # logging to stdout 21 | 22 | print "Loading fashion-mnist data...", 23 | test_images, test_labels = load_mnist( 24 | path="/fashion-mnist", rows=70, cols=70, kind="t10k-70") 25 | print "done" 26 | 27 | # Reduce the size of the dataset, if desired 28 | dataset_size = max(0, min(dataset_size, 10000)) 29 | test_images = test_images[:dataset_size] 30 | test_labels = test_labels[:dataset_size] 31 | 32 | # Cap batch size at the size of our training data 33 | batch_size = len(test_images) 34 | 35 | # Get iterators that cover the dataset 36 | test_iter = mx.io.NDArrayIter( 37 | test_images, test_labels, batch_size) 38 | 39 | # Evaluate the network 40 | print "Loading model...", 41 | lenet_model = mx.mod.Module.load( 42 | prefix=MODEL_DIR + "/" + model_prefix, epoch=2, context=mx.cpu()) 43 | lenet_model.bind(data_shapes=test_iter.provide_data, 44 | label_shapes=test_iter.provide_label) 45 | print "done" 46 | 47 | print "New Inference" 48 | acc = mx.metric.Accuracy() 49 | lenet_model.score(test_iter, acc) 50 | print "Correctness:", acc.get()[1], "Model:", model_prefix 51 | -------------------------------------------------------------------------------- /ece408_project/m3.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import mxnet as mx 4 | import logging 5 | import sys 6 | from reader import load_mnist 7 | 8 | MODEL_DIR = "/models" 9 | model_prefix = "ece408" 10 | dataset_size = float("inf") 11 | 12 | if len(sys.argv) > 1: 13 | dataset_size = int(sys.argv[1]) 14 | if len(sys.argv) > 2: 15 | print "Usage:", sys.argv[0], "" 16 | print " = [0 - 10000]" 17 | sys.exit(-1) 18 | 19 | # Log to stdout for MXNet 
20 | logging.getLogger().setLevel(logging.DEBUG) # logging to stdout 21 | 22 | print "Loading fashion-mnist data...", 23 | test_images, test_labels = load_mnist( 24 | path="/fashion-mnist", rows=70, cols=70, kind="t10k-70") 25 | print "done" 26 | 27 | # Reduce the size of the dataset, if desired 28 | dataset_size = max(0, min(dataset_size, 10000)) 29 | test_images = test_images[:dataset_size] 30 | test_labels = test_labels[:dataset_size] 31 | 32 | # Cap batch size at the size of our training data 33 | batch_size = len(test_images) 34 | 35 | # Get iterators that cover the dataset 36 | test_iter = mx.io.NDArrayIter( 37 | test_images, test_labels, batch_size) 38 | 39 | # Evaluate the network 40 | print "Loading model...", 41 | lenet_model = mx.mod.Module.load( 42 | prefix=MODEL_DIR + "/" + model_prefix, epoch=2, context=mx.gpu()) 43 | lenet_model.bind(data_shapes=test_iter.provide_data, 44 | label_shapes=test_iter.provide_label) 45 | print "done" 46 | 47 | print "New Inference" 48 | acc = mx.metric.Accuracy() 49 | lenet_model.score(test_iter, acc) 50 | print "Correctness:", acc.get()[1], "Model:", model_prefix 51 | -------------------------------------------------------------------------------- /ece408_project/m4.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import mxnet as mx 4 | import logging 5 | import sys 6 | from reader import load_mnist 7 | 8 | MODEL_DIR = "/models" 9 | model_prefix = "ece408" 10 | dataset_size = float("inf") 11 | 12 | if len(sys.argv) > 1: 13 | dataset_size = int(sys.argv[1]) 14 | if len(sys.argv) > 2: 15 | print "Usage:", sys.argv[0], "" 16 | print " = [0 - 10000]" 17 | sys.exit(-1) 18 | 19 | # Log to stdout for MXNet 20 | logging.getLogger().setLevel(logging.DEBUG) # logging to stdout 21 | 22 | print "Loading fashion-mnist data...", 23 | test_images, test_labels = load_mnist( 24 | path="/fashion-mnist", rows=70, cols=70, kind="t10k-70") 25 | print "done" 26 | 27 | # Reduce the size of the dataset, if desired 28 | dataset_size = max(0, min(dataset_size, 10000)) 29 | test_images = test_images[:dataset_size] 30 | test_labels = test_labels[:dataset_size] 31 | 32 | # Cap batch size at the size of our training data 33 | batch_size = len(test_images) 34 | 35 | # Get iterators that cover the dataset 36 | test_iter = mx.io.NDArrayIter( 37 | test_images, test_labels, batch_size) 38 | 39 | # Evaluate the network 40 | print "Loading model...", 41 | lenet_model = mx.mod.Module.load( 42 | prefix=MODEL_DIR + "/" + model_prefix, epoch=2, context=mx.gpu()) 43 | lenet_model.bind(data_shapes=test_iter.provide_data, 44 | label_shapes=test_iter.provide_label) 45 | print "done" 46 | 47 | print "New Inference" 48 | acc = mx.metric.Accuracy() 49 | lenet_model.score(test_iter, acc) 50 | print "Correctness:", acc.get()[1], "Model:", model_prefix 51 | -------------------------------------------------------------------------------- /ece408_project/rai: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leo811121/UIUC-CS-483-Parallel-Programming/0aa3d6097073c4dd5de7f9a52b54e9d230a4df4a/ece408_project/rai -------------------------------------------------------------------------------- /ece408_project/rai_build.yml: -------------------------------------------------------------------------------- 1 | rai: 2 | version: 0.2 3 | image: illinoisimpact/ece408_mxnet_docker:amd64-gpu-latest-fa19 4 | resources: 5 | cpu: 6 | architecture: amd64 7 | gpu: 8 | 
architecture: volta 9 | count: 1 10 | network: false 11 | commands: 12 | build: 13 | - /bin/bash -c "cp -rv /ece408_src/* /mxnet/src/operator/custom" # copy golden files to mxnet source tree 14 | - /bin/bash -c "cp -rv /src/* /build" # copy the project folder to /build so everything appears in the upload 15 | - /bin/bash -c "for src in ece408_src/*; do cp -v $src /mxnet/src/operator/custom/.; done" # copy source files to mxnet tree 16 | - nice -n20 make -C /mxnet # build mxnet 17 | - pip2 install --user -e /mxnet/python # install python bindings 18 | #- /usr/bin/time python m1.1.py # execute code 19 | #- /usr/bin/time python m1.2.py # execute code 20 | #- /usr/bin/time python m2.1.py # execute code 21 | #- /usr/bin/time python m3.1.py # execute code 22 | #- /usr/bin/time python m4.1.py # execute code 23 | #- /usr/bin/time python final.py # execute code 24 | #- nvprof python m1.2.py 25 | #- python m3.1.py 26 | #- python m3.1.py 100 27 | #- nvprof -o timeline.nvprof python m3.1.py 28 | #- nvprof --kernels "::forward:1" --analysis-metrics -o forward1_analysis.nvprof python m3.1.py 29 | #- nvprof --kernels "::forward:2" --analysis-metrics -o forward2_analysis.nvprof python m3.1.py 30 | - python m4.1.py 10000 31 | #- nvprof python m4.1.py 10000 32 | #- nvprof -o timeline.nvprof python m4.1.py 10000 33 | #- nvprof --kernels "::unrolltoshare1:1" --analysis-metrics -o unrolltoshare1.nvprof python m4.1.py 10000 34 | #- nvprof --kernels "::unrolltoshare2:1" --analysis-metrics -o unrolltoshare2.nvprof python m4.1.py 10000 35 | -------------------------------------------------------------------------------- /ece408_project/reader.py: -------------------------------------------------------------------------------- 1 | def load_mnist(path, rows, cols, kind): 2 | import os 3 | import gzip 4 | import numpy as np 5 | 6 | filters = 1 7 | 8 | """Load MNIST data from `path`""" 9 | labels_path = os.path.join(path, 10 | '%s-labels-idx1-ubyte.gz' 11 | % kind) 12 | images_path = os.path.join(path, 13 | '%s-images-idx3-ubyte.gz' 14 | % kind) 15 | 16 | with gzip.open(labels_path, 'rb') as lbpath: 17 | labels = np.frombuffer(lbpath.read(), dtype=np.uint8, 18 | offset=8) 19 | labels.reshape(len(labels)) 20 | 21 | with gzip.open(images_path, 'rb') as imgpath: 22 | images = np.frombuffer(imgpath.read(), dtype=np.uint8, 23 | offset=16).reshape(len(labels), filters, rows, cols) 24 | 25 | return images, labels 26 | 27 | 28 | def store_mnist(path, images, labels, kind): 29 | import os 30 | import gzip 31 | import numpy as np 32 | import struct 33 | 34 | """Store data to `path`""" 35 | labels_path = os.path.join(path, 36 | '%s-labels-idx1-ubyte.gz' 37 | % kind) 38 | images_path = os.path.join(path, 39 | '%s-images-idx3-ubyte.gz' 40 | % kind) 41 | 42 | with gzip.open(labels_path, 'wb') as lbpath: 43 | lbpath.write(struct.pack("i", 0)) # magic 44 | lbpath.write(struct.pack("i", labels.size)) # number of items (32b) 45 | lbpath.write(labels.tobytes()) 46 | 47 | with gzip.open(images_path, 'wb') as imgpath: 48 | imgpath.write(struct.pack("i", 0)) # magic number 49 | # number of images (32b) 50 | imgpath.write(struct.pack("i", images.shape[0])) 51 | # number of rows (32b) 52 | imgpath.write(struct.pack("i", images.shape[1])) 53 | # number of cols (32b) 54 | imgpath.write(struct.pack("i", images.shape[2])) 55 | imgpath.write(images.tobytes()) 56 | 57 | return images, labels 58 | -------------------------------------------------------------------------------- /ece408_project/report.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/leo811121/UIUC-CS-483-Parallel-Programming/0aa3d6097073c4dd5de7f9a52b54e9d230a4df4a/ece408_project/report.pdf --------------------------------------------------------------------------------
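A minimal usage sketch for reader.py's load_mnist, assuming the gzipped fashion-mnist archives live under /fashion-mnist inside the project's docker image, as the m*.py scripts expect; the path, rows, cols, and kind arguments mirror those scripts, and the rest is illustrative:

```python
# Python 2, matching the project's scripts: load the 70x70 fashion-mnist test split.
from reader import load_mnist

test_images, test_labels = load_mnist(
    path="/fashion-mnist", rows=70, cols=70, kind="t10k-70")

# load_mnist returns images shaped (N, 1, 70, 70) and labels shaped (N,), both uint8.
print test_images.shape, test_labels.shape
```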