├── .gitignore ├── Part1 ├── CH2 │ ├── main.pdf │ └── main.tex ├── CH3 │ ├── code │ │ ├── matrix_multi_row.cu │ │ ├── matrixadd.cu │ │ └── q1.cu │ ├── main.pdf │ └── main.tex ├── CH4 │ ├── main.pdf │ └── main.tex ├── CH5 │ ├── main.pdf │ └── main.tex └── CH6 │ ├── code │ ├── MatMultiCorner.cu │ └── compare.cu │ ├── main.pdf │ └── main.tex ├── Part2 ├── CH10 │ ├── code │ │ └── reduction_basic.cu │ ├── main.pdf │ └── main.tex ├── CH11 │ ├── code │ │ └── prefix_sum.cu │ ├── main.pdf │ └── main.tex ├── CH12 │ ├── main.pdf │ └── main.tex ├── CH7 │ ├── code │ │ ├── convolution2D.cu │ │ └── convolution3D.cu │ ├── main.pdf │ └── main.tex ├── CH8 │ ├── main.pdf │ └── main.tex └── CH9 │ ├── mian.pdf │ └── mian.tex ├── Part3 ├── CH13 │ └── code │ │ ├── q4 │ │ └── merge.cu │ │ └── sort.cu └── CH14 │ ├── code │ └── sprase_matrix_computation.cu │ └── main.md ├── cuda └── main.cu ├── new └── test.tex ├── programming-massively-parallel-processors-a-hands-on-approach-4nbsped-9780323912310_dokumen.pub.pdf └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | # IDE and Editor folders 2 | .vscode/ 3 | .idea/ 4 | *.swp 5 | *.swo 6 | 7 | # CUDA编译生成的文件 8 | *.exe 9 | *.exp 10 | *.lib 11 | *.o 12 | *.obj 13 | *.out 14 | *.app 15 | *.i*86 16 | *.x86_64 17 | *.ptx 18 | 19 | # LaTeX生成的临时文件 20 | *.aux 21 | *.lof 22 | *.log 23 | *.lot 24 | *.fls 25 | *.out 26 | *.toc 27 | *.fmt 28 | *.fot 29 | *.cb 30 | *.cb2 31 | .*.lb 32 | *.dvi 33 | *.xdv 34 | *-converted-to.* 35 | *.ps 36 | *.eps 37 | *.tex.bak 38 | *.pdfsync 39 | *.synctex.gz 40 | *.fdb_latexmk 41 | *.snm 42 | *.nav 43 | *.vrb 44 | _*.data.minted 45 | 46 | # 构建目录 47 | build/ 48 | bin/ 49 | Debug/ 50 | Release/ 51 | x64/ 52 | x86/ 53 | 54 | # 操作系统生成的文件 55 | Thumbs.db 56 | .DS_Store 57 | *.tmp 58 | *.temp -------------------------------------------------------------------------------- /Part1/CH2/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoliboliWJY/Programming-Massively-Parallel-Processors-A-Hands-on-Approach-4th/ce6a2a4b4071f94ffcbe6aad4d43868782f71d8a/Part1/CH2/main.pdf -------------------------------------------------------------------------------- /Part1/CH2/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{listings}%for coding 3 | \begin{document} 4 | 5 | \section*{key points} 6 | 7 | Primary part of parallel concept. 8 | 9 | cudaMemcpy('desintation','source','size','type'): 10 | 11 | where type:cudaMemcpyHostToDevice,cudaMemcpyDeviceToDevice\dots 12 | 13 | cudaMalloc((void **) \&A, size): 14 | 15 | a variable's address and its size. 16 | 17 | cudaError\_t err='function' 18 | 19 | where 'function' is like cudaMemcpy,cudaMalloc\dots 20 | 21 | \section{question} 22 | 23 | Compare with your question: 24 | 25 | If we want to use each thread in a grid to calculate one output element of a vector addition, what would be the expression for mapping the thread/block indices to the data index (i)? (A) i=threadIdx.x + threadIdx.y; (B) i=blockIdx.x + threadIdx.x; (C) i=blockIdx.x blockDim.x + threadIdx.x; (D) i=blockIdx.x threadIdx.x; 26 | 27 | 1.C 28 | 29 | i = blockIdx.x $\cdot$ blockDim.x + threadIdx.x 30 | 31 | \textbf{blockIdx.x} gives the index of the current block. 32 | 33 | \textbf{blockDim.x} gives the number of threads per block. 34 | 35 | \textbf{threadIdx.x} gives the index of the current thread within its block. 
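A minimal vector-addition sketch showing this mapping in context (kernel and variable names here are illustrative, not taken from the book's listing):

\begin{lstlisting}[language=C++]
// Each thread computes one output element of C = A + B.
__global__ void vecAddKernel(const float* A, const float* B, float* C, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x; // global data index
    if (i < n) { // guard threads mapped past the end of the vectors
        C[i] = A[i] + B[i];
    }
}
// Possible launch: vecAddKernel<<<(n + 255) / 256, 256>>>(d_A, d_B, d_C, n);
\end{lstlisting}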
36 | 37 | 38 | 2.C 39 | 40 | Each thread hanles two elements, so it needs to multiply the overall thread index by 2 to get the index of the first element the thread will process. 41 | 42 | 43 | 44 | 3.D 45 | 46 | 2 $\cdot$ blockDim.x is two consecutive elements, and each thread processes one element from each section sequentially. Consider the entire block offset and the per-thread offset within the block. 47 | 48 | 49 | 4.C 50 | 51 | Number of blocks $= \frac{8000}{1024} \approx 7.8$ 52 | 53 | ceil(N) = 8 54 | 55 | Total threads = N $\times$ 1024 = 8192. 56 | 57 | 5.D 58 | 59 | To allocate an array of v integer elements in CUDA device global memory, you need to specify the total amount of memory required in bytes. Since each integer typically requires sizeof(int) bytes, the correct expression for the second argument of the cudaMalloc call would be: 60 | \begin{equation} 61 | v \times sizeof(int) 62 | \end{equation} 63 | 64 | 6.D 65 | 66 | When using cudaMalloc() to allocate memory on the device and have a pointer variable A\_d point to it, you need to pass the address of the pointer to cudaMalloc() so that it can modify the pointer to point to the allocated memory. 67 | 68 | (void**) \&A\_d 69 | 70 | 7.C 71 | 72 | cudaMemcpy(A\_d, A\_h, 3000, cudaMemcpyHostToDevice) 73 | 74 | explaination: 75 | 76 | \textbf{A\_d} as the destination pointer on the device. 77 | 78 | \textbf{A\_h} as the source pointer on the host. 79 | 80 | \textbf{3000} as the number of bytes to copy. 81 | 82 | \textbf{cudaMemcpyHostToDevice} as the direction of the copy. 83 | 84 | 8.C 85 | 86 | From the formula on book, it is: cudaError\_t err. 87 | 88 | 9.1:128 89 | 90 | The number of threads per block is specified in the kernel launch configuration. In line 9. 91 | 92 | 9.2:200064 93 | 94 | \begin{equation} 95 | Number of blocks = \frac{200000}{128} \approx 1563(ceil) 96 | \end{equation} 97 | 98 | so the total number of threads is: 99 | \begin{equation} 100 | Total threads = 1563 \times 128 = 200064 101 | \end{equation} 102 | 103 | 9.3:1563 104 | 105 | As calculated above. 106 | 107 | 9.4:200064 108 | 109 | All threads in the grid execute line 02, as it is part of the kernel's main body. Therefore, the number of threads is the same as the total number of threads in the grid 110 | 111 | 9.5:200000 112 | 113 | Line 04 is executed only if the condition in line 03 is true, i.e., i < N. Since i ranges from 0 to 200063, only the first 200000 threads meet this condition 114 | 115 | 10. 
He can use function like \_\_global\_\_ \_\_host\_\_ function{}\dots 116 | 117 | \begin{lstlisting}[language=C++] 118 | __host__ __device__ void function() { 119 | // Function implementation 120 | } 121 | \end{lstlisting} 122 | 123 | \end{document} -------------------------------------------------------------------------------- /Part1/CH3/code/matrix_multi_row.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "CudaMatrix.h" 3 | // |1 2 3| * |1 2| = |6 12| 4 | // |4 5 6| |1 2| |15 30| 5 | // |1 2| 6 | __global__ void matrix_multi_row(const float* A, const float* B, float* C,int A_rows, int A_cols, int B_cols){ 7 | //count in row order 8 | int row = blockIdx.x * blockDim.x + threadIdx.x; 9 | if (row < A_rows){ 10 | for(int col = 0; col < B_cols; col++){ 11 | float sum = 0; 12 | for(int mul = 0; mul < A_cols; mul++){ 13 | sum += A[row * A_cols + mul] * B[mul * B_cols + col]; 14 | } 15 | C[row * B_cols + col] = sum; 16 | } 17 | } 18 | } 19 | 20 | __global__ void matrix_multi_col(const float* A, const float* B, float* C, int A_rows, int A_cols, int B_cols){ 21 | //count in col order 22 | int col = blockIdx.x * blockDim.x + threadIdx.x; 23 | if (col < B_cols){ 24 | for(int row = 0; row < A_rows; row++){ 25 | float sum = 0; 26 | for(int mul = 0; mul < A_cols; mul++){ 27 | sum += A[row * A_cols + mul] * B[mul * B_cols + col]; 28 | } 29 | C[row * B_cols + col] = sum; 30 | } 31 | } 32 | } 33 | 34 | //print matrix 35 | void coutmatrix(const float* mat, int rows, int cols, const char* name){ 36 | std::cout << "Matrix " << name << ":\n"; 37 | for(int i = 0;i < rows; i++){ 38 | for(int j = 0; j < cols; j++){ 39 | std::cout << mat[i * cols + j] << "\t"; 40 | } 41 | std::cout << "\n"; 42 | } 43 | } 44 | 45 | int main(){ 46 | const int A_ROWS = 2; 47 | const int B_COLS = 2; 48 | float h_A[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; 49 | float h_B[] = {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}; 50 | const int A_COLS = sizeof(h_A) / sizeof(float) / A_ROWS; 51 | const int B_ROWS = sizeof(h_B) / sizeof(float) / B_COLS; 52 | // std::cout << COLS << std::endl; //output 3, correct 53 | size_t A_size = A_ROWS * A_COLS * sizeof(float); 54 | size_t B_size = B_ROWS * B_COLS * sizeof(float); 55 | size_t C_size = A_ROWS * B_COLS * sizeof(float); 56 | float h_C_row[A_ROWS * B_COLS]; 57 | float h_C_col[A_ROWS * B_COLS]; 58 | 59 | coutmatrix(h_A, A_ROWS, A_COLS, "A"); 60 | coutmatrix(h_B, B_ROWS, B_COLS, "B"); // row_A = col_B 61 | 62 | 63 | float *d_A, *d_B, *d_C_col, *d_C_row; 64 | cudaMalloc((void**)&d_A, A_size); 65 | cudaMalloc((void**)&d_B, B_size); 66 | cudaMalloc((void**)&d_C_col, C_size); 67 | cudaMalloc((void**)&d_C_row, C_size); 68 | 69 | cudaMemcpy(d_A, h_A, A_size, cudaMemcpyHostToDevice); 70 | cudaMemcpy(d_B, h_B, B_size, cudaMemcpyHostToDevice); 71 | 72 | int threadsPerBlock = 256; 73 | 74 | // calculate in row: 75 | int blocksPerGrid = (A_ROWS + threadsPerBlock - 1) / threadsPerBlock; 76 | matrix_multi_row<<>>(d_A, d_B, d_C_row, A_ROWS, A_COLS, B_COLS); 77 | 78 | // calculate in col: 79 | blocksPerGrid = (A_COLS + threadsPerBlock - 1) / threadsPerBlock; 80 | matrix_multi_col<<>>(d_A, d_B, d_C_col, A_ROWS, A_COLS, B_COLS); 81 | 82 | cudaError_t err = cudaGetLastError(); 83 | if (err != cudaSuccess){//check if error happened; 84 | std::cerr << "Cuda launch error: " << cudaGetErrorString(err) << std::endl; 85 | cudaFree(d_A); 86 | cudaFree(d_B); 87 | cudaFree(d_C_row); 88 | cudaFree(d_C_col); 89 | return -1; 90 | } 91 | 92 | cudaDeviceSynchronize(); 93 | 
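    // Note: cudaDeviceSynchronize() above waits for both kernels to finish; the
    // blocking cudaMemcpy calls below run on the default stream and would also
    // implicitly wait for those kernels before copying the results back.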
cudaMemcpy(h_C_row, d_C_row, C_size, cudaMemcpyDeviceToHost); 94 | cudaMemcpy(h_C_col, d_C_col, C_size, cudaMemcpyDeviceToHost); 95 | coutmatrix(h_C_row, A_ROWS, B_COLS, "C = A * B in row"); 96 | coutmatrix(h_C_col, A_ROWS, B_COLS, "C = A * B in col"); 97 | 98 | cudaFree(d_A); 99 | cudaFree(d_B); 100 | cudaFree(d_C_row); 101 | cudaFree(d_C_col); 102 | 103 | return 0; 104 | } -------------------------------------------------------------------------------- /Part1/CH3/code/matrixadd.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // #define ROWS 2 5 | // #define COLS 2 6 | //matrix add together 7 | __global__ void matrixadd(const float* A, const float* B, float* C,int rows, int cols){ 8 | //the pointers of the input matrics A and B, the pointer of the output matrix C, number of columns in matrices 9 | int row = blockIdx.x * blockDim.x + threadIdx.x;//global row index 10 | if (row < rows){ 11 | for (int col = 0; col < cols; col++){//iterate each row 12 | C[row * cols + col] = A[row * cols + col] + B[row * cols + col]; 13 | } 14 | } 15 | 16 | } 17 | 18 | //print matrix 19 | void coutmatrix(const float* mat, int rows, int cols, const char* name){ 20 | std::cout << "Matrix " << name << ":\n"; 21 | for(int i = 0;i < rows; i++){ 22 | for(int j = 0; j < cols; j++){ 23 | std::cout << mat[i * cols + j] << "\t"; 24 | } 25 | std::cout << "\n"; 26 | } 27 | } 28 | 29 | 30 | int main() { 31 | const int ROWS = 2; 32 | const int COLS = 2; 33 | size_t size = ROWS * COLS * sizeof(float); 34 | //initalize matrix A,B,C in host 35 | float h_A[ROWS * COLS] = {1.0f, 2.0f, 3.0f, 4.0f}; 36 | float h_B[ROWS * COLS] = {1.0f, 1.0f, 2.0f, 2.0f}; 37 | float h_C[ROWS * COLS]; 38 | 39 | // Print input matrices 40 | coutmatrix(h_A, ROWS, COLS, "A"); 41 | coutmatrix(h_B, ROWS, COLS, "B"); 42 | 43 | // set device memory 44 | float *d_A, *d_B, *d_C; 45 | cudaMalloc((void**)&d_A, size); 46 | cudaMalloc((void**)&d_B, size); 47 | cudaMalloc((void**)&d_C, size); 48 | 49 | //cpoy host data to device 50 | // cudaMemcpy(destination, source, size, direction); 51 | cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); 52 | cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); 53 | int threadsPerBlock = 256; 54 | int blocksPerGrid = (ROWS + threadsPerBlock - 1) / threadsPerBlock;//an extra block for not perfectly divisible by threadsPerBlock; 55 | matrixadd<<>>(d_A, d_B, d_C, ROWS, COLS);//activate kernel, <<<...>>>specifying the number of grid and block; 56 | 57 | cudaError_t err = cudaGetLastError(); 58 | if (err != cudaSuccess){//check if error happened; 59 | std::cerr << "Cuda launch error: " << cudaGetErrorString(err) << std::endl; 60 | cudaFree(d_A); 61 | cudaFree(d_B); 62 | cudaFree(d_C); 63 | return -1; 64 | } 65 | 66 | cudaDeviceSynchronize(); 67 | 68 | cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost); 69 | 70 | coutmatrix(h_C, ROWS, COLS, "C=A+B"); 71 | 72 | cudaFree(d_A); 73 | cudaFree(d_B); 74 | cudaFree(d_C); 75 | 76 | return 0; 77 | } 78 | -------------------------------------------------------------------------------- /Part1/CH3/code/q1.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | // Matrix dimensions 5 | #define M 1024 // Number of rows in A and C 6 | #define N 1024 // Number of columns in A and rows in B 7 | #define P 1024 // Number of columns in B and C 8 | 9 | // CUDA Kernel: Each thread computes one row of the output matrix C 10 | __global__ void matrixMulRowKernel(const float* 
A, const float* B, float* C, int N, int P) { 11 | // Calculate the row index this thread is responsible for 12 | int row = blockIdx.x * blockDim.x + threadIdx.x; 13 | 14 | // Boundary check to ensure we don't access out-of-bounds memory 15 | if (row < M) { 16 | for (int col = 0; col < P; ++col) { 17 | float value = 0.0f; 18 | for (int k = 0; k < N; ++k) { 19 | value += A[row * N + k] * B[k * P + col]; 20 | } 21 | C[row * P + col] = value; 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /Part1/CH3/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoliboliWJY/Programming-Massively-Parallel-Processors-A-Hands-on-Approach-4th/ce6a2a4b4071f94ffcbe6aad4d43868782f71d8a/Part1/CH3/main.pdf -------------------------------------------------------------------------------- /Part1/CH3/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{listings} 3 | \usepackage{geometry} 4 | \usepackage{array} 5 | \usepackage{ulem} 6 | \usepackage{float} 7 | \geometry{a4paper, margin=1in} 8 | \begin{document} 9 | 10 | 11 | \section*{key points} 12 | Blur image: 13 | blur\_size leads to the core of the blurring pixel that accumulates other pixels around it. 14 | 15 | Matrix multiplication: 16 | 17 | inner product: 18 | \begin{equation} 19 | P_{row,col} = \sum M_{row,k}\cdot N_{k,col} for k = 0,1,\cdots Width-1 20 | \end{equation} 21 | 22 | Col-major and row-major: 23 | two different ways of showing a matrix's index. A row-major counts in the same row consecutively (\( r \cdot Cols + c \)), while a col-major counts in columns (\( c \cdot Rows + r \)) . 24 | 25 | 26 | As the table \ref{Matrix} shows, the \textbf{Bold} is for row-major, \textit{Italic} is for col-major. 27 | \begin{table}[H] 28 | \centering 29 | \caption{\bf Detailed matrix index.} 30 | \label{Matrix} 31 | \begin{tabular}{llll} 32 | (0,0)\textbf{0}\textit{0} & (0,1)\textbf{1}\textit{3} & (0,2)\textbf{2}\textit{6} & (0,3)\textbf{3}\textit{9} \\ 33 | (1,0)\textbf{4}\textit{1} & (1,1)\textbf{5}\textit{4} & (1,2)\textbf{6}\textit{7} & (1,3)\textbf{7}\textit{10} \\ 34 | (2,0)\textbf{8}\textit{2} & (2,1)\textbf{9}\textit{5} & (2,2)\textbf{10}\textit{8} & (2,3)\textbf{11}\textit{11} 35 | \end{tabular} 36 | \end{table} 37 | 38 | \section{Solutions} 39 | \subsection{} 40 | (1) 41 | Input matrices are A$(M \times K)$ and B$\$(K \times N)$. 42 | Output matrix is C$(M \times N)$. 43 | 44 | Every thread is responsible for calculating one complete row of the output matrix C. 
45 | 46 | 47 | \begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true] 48 | 49 | __global__ void matmul_row(float* A, float* B, float* C, int M, int N, int K) { 50 | int col = blockIdx.x * blockDim.x + threadIdx.x; 51 | if (col < N) { 52 | for (int row = 0; row < M; ++row) { 53 | float sum = 0; 54 | for (int k = 0; k < K; ++k){ 55 | sum += A[row*K + k] * B[k*N + col]; 56 | } 57 | C[row * N + col] = sum; 58 | } 59 | } 60 | } 61 | \end{lstlisting} 62 | 63 | (2) 64 | 65 | \begin{lstlisting}[breaklines=true] 66 | __global__ void matmul_row(float* A, float* B,float* C, int M,int N,int K){ 67 | int row = blockIdx.x * blockDim.x + threadIdx.x; 68 | if (row < M) { 69 | for (int col = 0; col < N; ++col) { 70 | float sum = 0; 71 | for (int k = 0;k < K; ++k) { 72 | sum += A[row * K + k] * B[k * N + col]; 73 | } 74 | C[row * N + col] = sum; 75 | } 76 | } 77 | } 78 | \end{lstlisting} 79 | 80 | (3) 81 | 82 | For row-wise Kernel A and C have a better memory access, for col-wise Kernel B has a better memory access. 83 | 84 | This shall depend on the matrix dimensions. 85 | 86 | Further consideration: 87 | Both kernels can be further optimized using tiling to improve data locality and cache utilization. Shared memory and warp shuffling 88 | 89 | Explanation: 90 | \subsubsection{Tiling} 91 | \begin{lstlisting}[breaklines=true] 92 | __global__ void matmul_row_tiled(float* A, float* B, float* C, int M, int N, int K) { 93 | // Shared memory for tiles from A and B 94 | __shared__ float As[TILE_WIDTH][TILE_WIDTH]; 95 | __shared__ float Bs[TILE_WIDTH][TILE_WIDTH]; 96 | 97 | int row = blockIdx.x * blockDim.x + threadIdx.x; 98 | int col = blockIdx.y * blockDim.y + threadIdx.y; 99 | 100 | float sum = 0; 101 | 102 | // Loop over tiles, where K is a multiple of TILE_WIDTH 103 | for (int tile = 0; tile < K / TILE_WIDTH; ++tile) { 104 | // Load tiles from global to shared memory 105 | int tile_k = tile * TILE_WIDTH; 106 | As[threadIdx.y][threadIdx.x] = A[row * K + tile_k + threadIdx.x]; 107 | Bs[threadIdx.y][threadIdx.x] = B[(tile_k + threadIdx.y) * N + col]; 108 | 109 | // Synchronize threads within the block to ensure tiles are loaded 110 | __syncthreads(); 111 | 112 | // Compute partial sum for the current tile 113 | for (int k = 0; k < TILE_WIDTH; ++k) { 114 | sum += As[threadIdx.y][k] * Bs[k][threadIdx.x]; 115 | } 116 | 117 | // Synchronize again before loading the next tile 118 | __syncthreads(); 119 | } 120 | 121 | // Store the final result in the output matrix 122 | if (row < M && col < N) { 123 | C[row * N + col] = sum; 124 | } 125 | } 126 | \end{lstlisting} 127 | 128 | Compare: 129 | 130 | Without tiling: 131 | Each thread calculates one element of the output C, which should load rows of A and cols of B from global memory leading to a high number of global memory access. 132 | 133 | With tiling: 134 | Each thread block cooperatively computes a small tile of the output matrix. All threads within the block can access this data repeatedly from shared memory. 135 | 136 | Tiling helps decreasing exchange between global memory and local memory. 137 | 138 | 139 | 140 | 141 | \subsection{}%2 142 | Computes each element of the output vector as the dot product, manages memory allocation,data transfer, kernel invocation, and retrives the result. 
143 | 144 | \begin{lstlisting}[breaklines=true] 145 | #include 146 | #include 147 | #include 148 | 149 | // CUDA Kernel for Matrix-Vector Multiplication 150 | __global__ void matrixVectorMulKernel(const float* B, const float* C, float* A, int N) {// input matrix pointers and number of rows/columns in the square matrix. 151 | int row = blockIdx.x * blockDim.x + threadIdx.x; 152 | if (row < N) {// number of row that over N won't work. 153 | float sum = 0.0f; 154 | for (int j = 0; j < N; ++j) { 155 | sum += B[row * N + j] * C[j];//the dot product of the corresponding row in matrix B and vector C. 156 | } 157 | A[row] = sum; 158 | } 159 | //each thread computes one element of the output vector A. 160 | } 161 | 162 | // Host Function for Matrix-Vector Multiplication 163 | void matrixVectorMul(const float* h_B, const float* h_C, float* h_A, int N) {// host input matrix pointers and N. 164 | float *d_B = nullptr, *d_C = nullptr, *d_A = nullptr; 165 | size_t sizeMatrix = N * N * sizeof(float); 166 | size_t sizeVector = N * sizeof(float); 167 | // Allocate device memory 168 | //error checking. 169 | cudaError_t err = cudaMalloc((void**)&d_B, sizeMatrix); 170 | if (err != cudaSuccess) { 171 | std::cerr << "Failed to allocate device memory for matrix B (error code " 172 | << cudaGetErrorString(err) << ")!\n"; 173 | exit(EXIT_FAILURE); 174 | } 175 | 176 | err = cudaMalloc((void**)&d_C, sizeVector); 177 | if (err != cudaSuccess) { 178 | std::cerr << "Failed to allocate device memory for vector C (error code " 179 | << cudaGetErrorString(err) << ")!\n"; 180 | cudaFree(d_B); 181 | exit(EXIT_FAILURE); 182 | } 183 | 184 | err = cudaMalloc((void**)&d_A, sizeVector); 185 | if (err != cudaSuccess) { 186 | std::cerr << "Failed to allocate device memory for vector A (error code " 187 | << cudaGetErrorString(err) << ")!\n"; 188 | cudaFree(d_B); 189 | cudaFree(d_C); 190 | exit(EXIT_FAILURE); 191 | } 192 | 193 | // Copy data from host to device 194 | err = cudaMemcpy(d_B, h_B, sizeMatrix, cudaMemcpyHostToDevice); 195 | if (err != cudaSuccess) { 196 | std::cerr << "Failed to copy matrix B from host to device (error code " 197 | << cudaGetErrorString(err) << ")!\n"; 198 | cudaFree(d_B); 199 | cudaFree(d_C); 200 | cudaFree(d_A); 201 | exit(EXIT_FAILURE); 202 | } 203 | 204 | err = cudaMemcpy(d_C, h_C, sizeVector, cudaMemcpyHostToDevice); 205 | if (err != cudaSuccess) { 206 | std::cerr << "Failed to copy vector C from host to device (error code " 207 | << cudaGetErrorString(err) << ")!\n"; 208 | cudaFree(d_B); 209 | cudaFree(d_C); 210 | cudaFree(d_A); 211 | exit(EXIT_FAILURE); 212 | } 213 | 214 | // Launch the CUDA kernel 215 | int threadsPerBlock = 256; 216 | int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; 217 | matrixVectorMulKernel<<>>(d_B, d_C, d_A, N); 218 | 219 | // Check for any kernel launch errors 220 | err = cudaGetLastError(); 221 | if (err != cudaSuccess) { 222 | std::cerr << "Failed to launch matrixVectorMulKernel (error code " 223 | << cudaGetErrorString(err) << ")!\n"; 224 | cudaFree(d_B); 225 | cudaFree(d_C); 226 | cudaFree(d_A); 227 | exit(EXIT_FAILURE); 228 | } 229 | 230 | // Copy the result vector A back to host 231 | err = cudaMemcpy(h_A, d_A, sizeVector, cudaMemcpyDeviceToHost); 232 | 233 | if (err != cudaSuccess) { 234 | std::cerr << "Failed to copy vector A from device to host (error code " 235 | << cudaGetErrorString(err) << ")!\n"; 236 | cudaFree(d_B); 237 | cudaFree(d_C); 238 | cudaFree(d_A); 239 | exit(EXIT_FAILURE); 240 | } 241 | 242 | // Free device memory 243 | 
cudaFree(d_B); 244 | cudaFree(d_C); 245 | cudaFree(d_A); 246 | } 247 | 248 | int main() { 249 | // Define the size of the matrix and vectors 250 | int N = 1024; // Example size; can be modified as needed 251 | 252 | // Initialize host vectors and matrix 253 | std::vector h_B(N * N, 1.0f); // Initialize all elements to 1.0 254 | std::vector h_C(N, 1.0f); // Initialize all elements to 1.0 255 | std::vector h_A(N, 0.0f); // Output vector 256 | 257 | // Perform matrix-vector multiplication 258 | matrixVectorMul(h_B.data(), h_C.data(), h_A.data(), N); 259 | //A = sum_{j}(B[i][j]*C[i]) 260 | 261 | // Optional: Verify the result (since B and C are all ones(the initial value), A should be filled with N) 262 | bool correct = true; 263 | for (int i = 0; i < N; ++i) { 264 | if (h_A[i] != static_cast(N)) { 265 | correct = false; 266 | std::cerr << "Mismatch at index " << i << ": " << h_A[i] << " != " << N << "\n"; 267 | break; 268 | } 269 | } 270 | 271 | if (correct) { 272 | std::cout << "Matrix-vector multiplication successful. All elements are " << N << ".\n"; 273 | } else { 274 | std::cout << "Matrix-vector multiplication failed.\n"; 275 | } 276 | 277 | return 0; 278 | } 279 | \end{lstlisting} 280 | 281 | \subsection{} 282 | \textbf{I'm a foolish.} 283 | 284 | a.\sout{32} 512 285 | 286 | The number of threads per block is multiple of bd: 16$\cdot$32 = 512. 287 | Whole block contains M,N for mapping 2D matrix. 288 | 289 | b.\sout{16$\cdot$32} 48640 290 | 291 | gridDim.x = (N - 1)/16 + 1 = (300 - 1)/16 + 1 = 19. 292 | 293 | gridDim.y = (M - 1)/32 + 1 = 5. 294 | 295 | So the number of all blocks is 19$\cdot$5 = 95. All threads is 95*512 = 48640. 296 | 297 | c. \sout{$16\cdot32\cdot[(N-1)/16+1]\cdot[(M-1)/32+1]$}95 as b solved. 298 | 299 | d.150$\cdot$30 300 | 301 | Directly multiple M and N(only row= 5 - i\%3. 78 | 79 | 80 | \romannumeral2:\sout{129} 2 81 | 82 | As a result the number of divergent iteration is 2 for j = 3 and 4. 83 | 84 | \subsection{} 85 | 2048 86 | 87 | Each block can hold 512 threads, and each thread handles one element, it need enough blocks to cover all 2000 elements. 88 | 89 | Number of Blocks = \(\lceil2000/512\rceil = 4\) blocks. 90 | 91 | Total threads = \(4 blocks \time 512threads per block = 2048 threads\). 92 | 93 | \subsection{} 94 | 1 95 | 96 | Only warp 62(1984-2015) is divergence for 1984-1999 is valid and 2000-2015 is invalid. 97 | 98 | Other over 62 is invalid and lower 62 is valid. 99 | 100 | \subsection{} 101 | 17\% 102 | 103 | The maximum execution time = 3.0 microseconds. Waiting time is the minimum execution time = 1.9 microseconds. 104 | 105 | Total execution time = \(\sum{execution time per thread}/(8\cdot minimum execution time) \time 100\% \approx 17.08\%\) 106 | 107 | \subsection{} 108 | \sout{No, the number of warp is limited by the device, setting only 32 thread cause the use of warp increase. A limited number of warp can use fewer threads to compute resulting in a loss of efficiency.} 109 | 110 | \textbf{I copied GPT's answer:} 111 | 112 | No, omitting \_\_syncthreads() is not recommended even when each block has only 32 threads (one warp). While it's true that a single warp executes in lockstep, \_\_syncthreads() ensures memory operations are completed and visible to all threads within the block. Without it, you risk memory inconsistencies and race conditions. Additionally, using only 32 threads per block can limit the GPU's ability to hide latency and fully utilize its computational resources, potentially leading to reduced efficiency. 
113 | 114 | \textbf{Here's detailed reasons:} 115 | It is not suggested to leave this function out for the following reasons: 116 | 117 | 1.\_\_syncthreads() keeps all memory operations work before threads within the block are completed for keeping memory safe. 118 | 119 | 2. It can prevent reordering synchronization to maintaining the intended execution order. 120 | 121 | 3. Even if current block size is 32 threads, further changes might involve increasing block size for more robust and easier to maintain or scale. 122 | 123 | 4. Warp-level synchronization isn't sufficient, it also need block-level synchronization. 124 | 125 | \textbf{Correcting my answer:} 126 | 127 | A restrict number of block do exists, while the actual number is very high. FOr grid dimension.x:$2^{31}-1$, for gird dimension.y:$65535$, for grid dimension.z:$65535$. And the maximum number per block is constrained: 1024 thread per block. Accessible via the cudaDevideProp structure. 128 | 129 | What infect most is resource constraints. Each block consumes shared memory and registers causing lower occupance and performance degradation. 130 | 131 | 132 | \subsection{} 133 | C 134 | 135 | For option A, number of block:4. Total threads is \(128\cdot4=512\) threads. 136 | 137 | For option B, number of block:4. Total threads is \(256\cdot4=1024\) threads. 138 | 139 | For option C, number of block:3. Total threads is \(512\cdot3=1536\) threads. 140 | 141 | For option D, number of block:1. Total threads is \(1024\cdot1=1024\) threads. 142 | 143 | \subsection{} 144 | a,50\%,b,50\%,c,50\%,d,100\%,e,100\% 145 | 146 | All numbers of block are $\leq$ 64 and all numbers of threads are $\leq$ 2048. 147 | 148 | \subsection{} 149 | 150 | Y,N,N 151 | 152 | a. Yes: 2048/128 = 16 blocks needed, \(2048\cdot30=61440\) registers needed,both below the limits of 32 blocks and 65536 registers. 153 | 154 | b. No: 2048/32 = 64 blocks needed, \(2048\cdot29=59392\) registers needed, blocks are over the limit. 155 | 156 | c. No: 2048/256 = 8 blocks needed, \(2048\cdot34=69632\) registers needed, registers are over the limit. 157 | 158 | \subsection{} 159 | It may be not enough to multiply two matrices. Since multiplying two matrices takes not only the number of its elements. With \(512\cdot8\) threads is far from necessary threads. It would be impossible. 160 | 161 | \textbf{Not exactly:} 162 | 163 | The student's configuration exceeds the maximum allowed threads per block by a factor of 2. 164 | 165 | Total threads per SM: \(512\cdot8=4096\) threads/SM. 166 | 167 | Result matrix elements: \(1024\cdot1024=1048576\) elements. Insufficient to handle all elements efficiently. 168 | 169 | Revised approach: 170 | 171 | Adjust thread block size. Make it lower than 512 as the limitation. For example: setting \(16\times16=256\) threads per block. Total block needed is \((1024/16)\times(1024/16)=4096\) blocks. Up to 8 blocks per SM means no more than 8 active blocks, allowing the GPU to manage workload distribution effectively, not the maximnum number of block can be set. 
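A small launch-configuration sketch for this revised approach (matrixMulKernel is a placeholder name; the dimensions are those of the exercise):

\begin{lstlisting}[language=C++]
dim3 blockDim(16, 16); // 16 x 16 = 256 threads per block, within the 512-thread limit
dim3 gridDim((1024 + blockDim.x - 1) / blockDim.x,  // 64 blocks in x
             (1024 + blockDim.y - 1) / blockDim.y); // 64 blocks in y -> 4096 blocks total
matrixMulKernel<<<gridDim, blockDim>>>(d_M, d_N, d_P, 1024); // placeholder kernel
// Each SM runs at most 8 of these blocks at a time; the remaining blocks wait in the grid.
\end{lstlisting}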
172 | 173 | \end{document} -------------------------------------------------------------------------------- /Part1/CH5/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoliboliWJY/Programming-Massively-Parallel-Processors-A-Hands-on-Approach-4th/ce6a2a4b4071f94ffcbe6aad4d43868782f71d8a/Part1/CH5/main.pdf -------------------------------------------------------------------------------- /Part1/CH5/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{listings} 3 | \usepackage{geometry} 4 | \usepackage{array} 5 | \usepackage{ulem} 6 | \usepackage{float} 7 | \geometry{a4paper, margin=1in} 8 | \begin{document} 9 | 10 | \section*{key point} 11 | Allocate variables in their suitable memory position. 12 | Tiling can divide large matrices into smaller submatrices With data reusing and improving cache efficiency. 13 | 14 | For naive approach of matrices multiplies, A(M*K), B(K*N) and C(M*N). 15 | Each element of A is accessed N time and B is M times (for each column of B and each row if A). 16 | 17 | The tiling method is like the block matrix multiplication in Linear Algebra. Reducing the need of exchanging data from device to host since a smaller matrix needs less data and faster speed in computing. For example: Multilevel segmentation. 18 | 19 | For a \textbf{rectangle matrix}, we can add 0.0f to it and make it becomes a square matrix. Thus the x and y dimension would be a multiple of submatrices. This won't have bad influence on previous matrix. 20 | 21 | \section{solutions} 22 | \subsection{} 23 | No, it can't. For matrix addition, every element would be used for only one times. There's no need for reuse of the repeated elements. 24 | 25 | \textbf{Single use per element} and \textbf{no data reuse}. 26 | 27 | \subsection{} 28 | For a 2$\times$2 tiling, it takes 4$\times$4 blocks to compute the matrix, while a 4$\times$4 tiling takes 2$\times$2 blocks to compute the matrix. The amount of data exchange for the previous one is 4 times larger than the last one. \sout{So it is proportional to the dimension size of tiles.} 29 | 30 | The relationship is inversely proportional to the square of the tile dimension. 31 | 32 | \subsection{} 33 | If one forget the first function it may leading a mistake on the multiplies in Pvalues, since Mds and Nds haven't fully prepared for the next new calculation. 34 | 35 | \sout{While for the second function, it won't leading many errors because it just order all thread waiting for all other threads for the next iteration.} 36 | 37 | The function $\_\_syncthreads()$ serves as a barrier synchronization point, ensuring that all threads in the block reach the barrier before any thread proceeds beyond it. This guarantees that all shared memory operations (reads and writes) preceding the barrier are completed and visible to all threads. 38 | 39 | The first one ensures that all data is loaded into Mds and Nds before any thread begins computation. Lacking it leads to threads using incomplete data, resulting in correct multiplication results. 40 | 41 | The second one ensures that all threads have finished computing with the current tiles before they starts loading the next tiles. Lacking ut allows threads to start loading new data into shared memory before all threads have finished computing with the current data, causing data races and incorrect results. 
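A condensed excerpt of the chapter's tiled loop (variable names follow the book's kernel) marking what each barrier protects:

\begin{lstlisting}[language=C++]
for (int ph = 0; ph < Width/TILE_WIDTH; ++ph) {
    // cooperative load of one tile of M and one tile of N into shared memory
    Mds[ty][tx] = M[Row*Width + ph*TILE_WIDTH + tx];
    Nds[ty][tx] = N[(ph*TILE_WIDTH + ty)*Width + Col];
    __syncthreads(); // 1st barrier: tiles fully loaded before any thread reads them
    for (int k = 0; k < TILE_WIDTH; ++k)
        Pvalue += Mds[ty][k] * Nds[k][tx];
    __syncthreads(); // 2nd barrier: all reads finished before the next iteration overwrites the tiles
}
\end{lstlisting}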
42 | 43 | In conclusion, $\_\_syncthreads()$ can ensures data integrity for preventing race conditions and ensuring correct results in case that threads operate on incomplete or corrupted data, resulting in incorrect computations and unpredictable behavior. 44 | 45 | \subsection{} 46 | When we have lots of registers and shared memory, the top selection is shared memory since it can pass data quicker than registers. Registers need exchange data from \sout{warps to warps}, while shared memory can avoid this step. 47 | 48 | \textbf{Shared memory accessibility, synchronization and avoiding redundant loads}. 49 | 50 | Registers are private to each thread and cannot be directly accessed by other threads or warps. This require each thread to have its own copy, leading to redundancy. While shared memory allows all threads within a block to access the same data without the need for data exchange between warps. It reduces the need for redundant data storage and minimizes the overhead associated with data synchronization between threads. 51 | 52 | However, for a single thread, register can reaches highest speed than any other ways of expressing data. 53 | 54 | \subsection{} 55 | \sout{For about $\lfloor M\%32 \rfloor \times \lfloor N\%32 \rfloor$} 56 | 57 | By a factor of 32. 58 | 59 | Though for matrices where M or N are not multiples of 32, zero-padding is not required in considering it efficiency since the presence of some partial tile slightly decreases the overall reduction efficiency but does not change much of the asymptotic bandwidth reduction, which remains O(1/32). 60 | 61 | \subsection{} 62 | A kernel launched with 1000 thread blocks of 512 threads shall makes every single thread's register a copy. So the answer is $512\cdot1000$. 63 | 64 | \subsection{} 65 | For a variable placed in shared memory, every thread in the same block would use the shared memory together. So the answer is 1000. 66 | 67 | \subsection{} 68 | \subsubsection{} 69 | For 2*N times. 70 | 71 | \subsubsection{} 72 | \sout{For $\lceil 2*N/T \rceil$ times.} 73 | 74 | For $2 * \lceil N/T \rceil$ times. 75 | 76 | \subsection{} 77 | Since there are 36 floating-point operations and 7 * 32-bit global memory accesses/s. 36 FLOPs per thread, 7 accesses per thread and 7*32 bits = 224bits = 28bytes. 78 | 79 | So, the operational intensity is \(\frac{FLOPs}{Bytes} = \frac{36}{28} \approx 1.29 FLOPs/Byte\). 80 | 81 | For device configurations: 82 | \subsubsection{} 83 | \sout{For 200 GFLOPS and 100 GB/second. it doesn't reach the limit of the GFLOP line, it is compute-bound.} 84 | 85 | Compute capacity = 200 GFLOPS. 86 | Memory capacity in FLOPs = Memory Bandwidth * OI = 100 * 1.29 = 129 GFLOPS, 87 | 88 | So 129 < 200, it is memory-bound. 89 | 90 | \subsubsection{} 91 | \sout{It reaches almost the line, so it is both compute-bound and memory-bound.} 92 | Memory capacity in FLOPs = Memory Bandwidth * OI = 250 * 1.29 = 322.5 GFLOPS. 93 | 94 | So 322.5 > 300, it is compute-bound. 95 | 96 | \textbf{Roofline model, defined by compute capacity and memory capacity}. 97 | 98 | \subsection{} 99 | \subsubsection{} 100 | \sout{For those number that is smaller than block's width and height.} 101 | 102 | In thread block dimensions, CUDA allows 1024 threads per block and 32 block at least. So the limit of 20 is safe. 103 | 104 | Shared memory takes 20*20*sizeof(float) = 1600 bytes, lower than the common 48KB. 105 | 106 | So, all BLOCK\_WIDTH values from 1 to 20 will allow the kernel to execute correctly. 
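A quick numeric check of both limits at the largest value, BLOCK\_WIDTH $= 20$:
\begin{equation}
20 \times 20 = 400 \leq 1024 \ \mathrm{threads/block}, \qquad 20 \times 20 \times 4 \ \mathrm{B} = 1600 \ \mathrm{B} \ll 48 \ \mathrm{KB}.
\end{equation}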
107 | 108 | 109 | \subsubsection{} 110 | \sout{It didn't execute boundary check for the matrix. Since some BLOCK\_WIDTH may not be the multiple of the matrix. Padding zero to the matrix is necessary.} 111 | 112 | \romannumeral1: inconsistent variable naming: The BLOCK\_WIDTH and BLOCK\_SIZE are used interchangeable, it should be declared before compiling. 113 | 114 | \romannumeral2: Missing function \_\_syncthreads(), without a synchronization, it may cause unpredictable problems. 115 | 116 | 117 | \textbf{I totally misunderstand this question.} 118 | 119 | \subsection{} 120 | \subsubsection{} 121 | 8*128, since every thread in every block need a variable i. 122 | \subsubsection{} 123 | 8*128, same as a. 124 | \subsubsection{} 125 | 8, it is stored in shared memory. 126 | \subsubsection{} 127 | 8, same as c. 128 | \subsubsection{} 129 | 129*bytes(float).\textbf{=(1+128)$\times$4=512 bytes.} 130 | \subsubsection{} 131 | \textbf{I don't know how to solve it.} 132 | 133 | For the floating-point operations per byte (OP/B), we need to calculate: 134 | \romannumeral1: FLOPs: 135 | 136 | Multiplications: 137 | 138 | 2.5f * x[0]; 139 | 140 | 3.7f * x[1]; 141 | 142 | 6.3f * x[2]; 143 | 144 | 8.5f * x[3]; 145 | 146 | y\_s * b\_s[threadIdx.x]. 147 | 148 | Counts 5 149 | 150 | Additions: 151 | adding the results together, it takes 5.(For 6 numbers added together, it takes 5 steps.) 152 | 153 | For the step: y\_s = 7.4f, it is a data movement operation and does not constitute a floating-point operaton. 154 | 155 | \textbf{So, total FLOPs is 10.} 156 | 157 | \romannumeral2: Global Memory Accesses: 158 | 159 | Reads: 160 | 161 | x[j] = a[j*blockDim.x*gridDim.x + i]; → 4 reads from a[] 162 | 163 | b\_s[threadIdx.x] = b[i]; → 1 read from b[] 164 | 165 | Writes: 166 | 167 | b[i] = ...; → 1 write to b[] 168 | 169 | Total number per threads: 6. 170 | 171 | \textbf{Total Bytes accessed per thread: 6*size(float) = 6*4 = 24 bytes.} 172 | 173 | \romannumeral3: OP/B Calculation: 174 | 175 | The ratio = 10/24 \(\approx\) 0.4167 OP/B. 176 | 177 | \subsection{} 178 | \subsubsection{} 179 | The maximum number of threads is 32*64=2048, number of registers is 55296, and shared memory takes 128KB which is larger than 96KB. So it can't achieve full occupancy, the shared memory is the limiting factor. 180 | 181 | \subsubsection{} 182 | The maximum number of threads is 32*256=8192, much larger than 2048 threads/SM. 
So it can't achieve full occupancy.\textbf{Besides, both registers and shared memory also surpass their respective capacities.} 183 | 184 | \end{document} -------------------------------------------------------------------------------- /Part1/CH6/code/MatMultiCorner.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define tile_size 16 4 | 5 | __global__ void MatMulCornerTurn(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C, int row_A, int col_A, int col_B){ 6 | int row = blockIdx.y * tile_size + threadIdx.y; 7 | int col = blockIdx.x * tile_size + threadIdx.x; 8 | 9 | __shared__ float As[tile_size][tile_size]; 10 | __shared__ float Bs[tile_size][tile_size]; 11 | 12 | float Cvalue = 0.0f; 13 | 14 | for (int t = 0; t < (col_A + tile_size - 1) / tile_size; t++) { 15 | // Load shared memory for A 16 | if (row < row_A && (t * tile_size + threadIdx.x) < col_A) { 17 | As[threadIdx.y][threadIdx.x] = A[row * col_A + t * tile_size + threadIdx.x]; 18 | } else { 19 | As[threadIdx.y][threadIdx.x] = 0.0f; 20 | } 21 | 22 | // Load shared memory for B, col_A = row_B 23 | if ((t * tile_size + threadIdx.y) < col_A && col < col_B) { 24 | Bs[threadIdx.y][threadIdx.x] = B[(t * tile_size + threadIdx.y) * col_B + col]; 25 | } else { 26 | Bs[threadIdx.y][threadIdx.x] = 0.0f; 27 | } 28 | 29 | __syncthreads(); 30 | 31 | // Perform multiplication for the tile 32 | for (int i = 0; i < tile_size; i++) { 33 | Cvalue += As[threadIdx.y][i] * Bs[i][threadIdx.x]; 34 | } 35 | 36 | __syncthreads(); 37 | } 38 | 39 | // Write the result to C 40 | if (row < row_A && col < col_B) { 41 | C[row * col_B + col] = Cvalue; 42 | } 43 | } 44 | 45 | 46 | void matrixMultiply(const float* A, const float* B, float* C, int row_A, int col_A, int col_B){ 47 | float *d_A, *d_B, *d_C; 48 | size_t size_A = row_A * col_A * sizeof(float); 49 | size_t size_B = col_A * col_B * sizeof(float); 50 | size_t size_C = row_A * col_B * sizeof(float); 51 | 52 | cudaMalloc((void**)&d_A, size_A); 53 | cudaMalloc((void**)&d_B, size_B); 54 | cudaMalloc((void**)&d_C, size_C); 55 | 56 | cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice); 57 | cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice); 58 | 59 | dim3 dimBlock(tile_size, tile_size); 60 | dim3 dimGrid((col_B + tile_size - 1) / tile_size, (row_A + tile_size - 1) / tile_size); 61 | 62 | MatMulCornerTurn<<>>(d_A, d_B, d_C, row_A, col_A, col_B); 63 | 64 | cudaDeviceSynchronize(); 65 | 66 | cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost); 67 | 68 | cudaFree(d_A); 69 | cudaFree(d_B); 70 | cudaFree(d_C); 71 | } 72 | 73 | //print matrix 74 | void coutmatrix(const float* mat, int rows, int cols, const char* name){ 75 | std::cout << "Matrix " << name << ":\n"; 76 | for(int i = 0;i < rows; i++){ 77 | for(int j = 0; j < cols; j++){ 78 | std::cout << mat[i * cols + j] << "\t"; 79 | } 80 | std::cout << "\n"; 81 | } 82 | } 83 | 84 | int main(){ 85 | int row_A = 18;//also row_C 86 | int col_A = 6;//also row_B 87 | int col_B = 4;//also col_C 88 | float *h_A = (float*)malloc(row_A * col_A * sizeof(float)); 89 | float *h_B = (float*)malloc(col_A * col_B * sizeof(float)); 90 | float *h_C = (float*)malloc(row_A * col_B * sizeof(float)); 91 | 92 | float host_A[] = {1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, 93 | 4.0f, 5.0f, 6.0f, 4.0f, 5.0f, 6.0f, 94 | 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, 95 | 4.0f, 5.0f, 6.0f, 4.0f, 5.0f, 6.0f, 96 | 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, 97 | 4.0f, 5.0f, 6.0f, 4.0f, 5.0f, 6.0f, 98 | 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, 
99 | 4.0f, 5.0f, 6.0f, 4.0f, 5.0f, 6.0f, 100 | 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, 101 | 4.0f, 5.0f, 6.0f, 4.0f, 5.0f, 6.0f, 102 | 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, 103 | 4.0f, 5.0f, 6.0f, 4.0f, 5.0f, 6.0f, 104 | 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, 105 | 4.0f, 5.0f, 6.0f, 4.0f, 5.0f, 6.0f, 106 | 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, 107 | 4.0f, 5.0f, 6.0f, 4.0f, 5.0f, 6.0f, 108 | 1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f, 109 | 4.0f, 5.0f, 6.0f, 4.0f, 5.0f, 6.0f,}; 110 | 111 | float host_B[] = {1.0f, 2.0f,1.0f, 2.0f, 112 | 1.0f, 2.0f, 1.0f, 2.0f, 113 | 1.0f, 2.0f, 1.0f, 2.0f, 114 | 1.0f, 2.0f, 1.0f, 2.0f, 115 | 1.0f, 2.0f, 1.0f, 2.0f, 116 | 1.0f, 2.0f, 1.0f, 2.0f}; 117 | memcpy(h_A, host_A, sizeof(host_A)); 118 | memcpy(h_B, host_B, sizeof(host_B)); 119 | 120 | coutmatrix(h_A, row_A, col_A, "A"); 121 | coutmatrix(h_B, col_A, col_B, "B"); 122 | // constexpr int tile_size = 16; 123 | matrixMultiply(h_A, h_B, h_C, row_A, col_A, col_B); 124 | coutmatrix(h_C, row_A, col_B, "C"); 125 | 126 | free(h_A); 127 | free(h_B); 128 | free(h_C); 129 | return 0; 130 | } -------------------------------------------------------------------------------- /Part1/CH6/code/compare.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include // For memcpy 4 | 5 | #define TILE_SIZE 16 6 | 7 | // Naive Matrix Multiplication Kernel 8 | __global__ void MatMulNaive(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C, int row_A, int col_A, int col_B){ 9 | int row = blockIdx.y * blockDim.y + threadIdx.y; 10 | int col = blockIdx.x * blockDim.x + threadIdx.x; 11 | 12 | if(row < row_A && col < col_B){ 13 | float Cvalue = 0.0f; 14 | for(int k = 0; k < col_A; k++){ 15 | Cvalue += A[row * col_A + k] * B[k * col_B + col]; 16 | } 17 | C[row * col_B + col] = Cvalue; 18 | } 19 | } 20 | 21 | // Shared Memory Tiled Matrix Multiplication Kernel 22 | __global__ void MatMulCornerTurn(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C, int row_A, int col_A, int col_B){ 23 | int row = blockIdx.y * TILE_SIZE + threadIdx.y; 24 | int col = blockIdx.x * TILE_SIZE + threadIdx.x; 25 | 26 | __shared__ float As[TILE_SIZE][TILE_SIZE]; 27 | __shared__ float Bs[TILE_SIZE][TILE_SIZE]; 28 | 29 | float Cvalue = 0.0f; 30 | 31 | for (int t = 0; t < (col_A + TILE_SIZE - 1) / TILE_SIZE; t++) { 32 | // Load shared memory for A 33 | if (row < row_A && (t * TILE_SIZE + threadIdx.x) < col_A) { 34 | As[threadIdx.y][threadIdx.x] = A[row * col_A + t * TILE_SIZE + threadIdx.x]; 35 | } else { 36 | As[threadIdx.y][threadIdx.x] = 0.0f; 37 | } 38 | 39 | // Load shared memory for B, col_A = row_B 40 | if ((t * TILE_SIZE + threadIdx.y) < col_A && col < col_B) { 41 | Bs[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * col_B + col]; 42 | } else { 43 | Bs[threadIdx.y][threadIdx.x] = 0.0f; 44 | } 45 | 46 | __syncthreads(); 47 | 48 | // Perform multiplication for the tile 49 | for (int i = 0; i < TILE_SIZE; i++) { 50 | Cvalue += As[threadIdx.y][i] * Bs[i][threadIdx.x]; 51 | } 52 | 53 | __syncthreads(); 54 | } 55 | 56 | // Write the result to C 57 | if (row < row_A && col < col_B) { 58 | C[row * col_B + col] = Cvalue; 59 | } 60 | } 61 | 62 | // Host Function for Matrix Multiplication 63 | void matrixMultiply(const float* A, const float* B, float* C, int row_A, int col_A, int col_B, bool useSharedMemory){ 64 | float *d_A, *d_B, *d_C; 65 | size_t size_A = row_A * col_A * sizeof(float); 66 | size_t size_B = col_A * col_B * sizeof(float); 67 | size_t 
size_C = row_A * col_B * sizeof(float); 68 | 69 | // Allocate device memory 70 | cudaMalloc((void**)&d_A, size_A); 71 | cudaMalloc((void**)&d_B, size_B); 72 | cudaMalloc((void**)&d_C, size_C); 73 | 74 | // Copy data from host to device 75 | cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice); 76 | cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice); 77 | 78 | // Define block and grid dimensions 79 | dim3 dimBlock(TILE_SIZE, TILE_SIZE); 80 | dim3 dimGrid((col_B + TILE_SIZE - 1) / TILE_SIZE, (row_A + TILE_SIZE - 1) / TILE_SIZE); 81 | 82 | // Select kernel based on the flag 83 | if(useSharedMemory){ 84 | MatMulCornerTurn<<>>(d_A, d_B, d_C, row_A, col_A, col_B); 85 | } 86 | else{ 87 | MatMulNaive<<>>(d_A, d_B, d_C, row_A, col_A, col_B); 88 | } 89 | 90 | // Wait for GPU to finish before accessing on host 91 | cudaDeviceSynchronize(); 92 | 93 | // Copy the result from device to host 94 | cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost); 95 | 96 | // Free device memory 97 | cudaFree(d_A); 98 | cudaFree(d_B); 99 | cudaFree(d_C); 100 | } 101 | 102 | // Function to Print Matrices 103 | void coutmatrix(const float* mat, int rows, int cols, const char* name){ 104 | std::cout << "Matrix " << name << ":\n"; 105 | for(int i = 0;i < rows; i++){ 106 | for(int j = 0; j < cols; j++){ 107 | std::cout << mat[i * cols + j] << "\t"; 108 | } 109 | std::cout << "\n"; 110 | } 111 | } 112 | 113 | // Function to Initialize Matrices 114 | void initializeMatrices(float* h_A, float* h_B, int row_A, int col_A, int col_B){ 115 | // Initialize matrix A 116 | // Example: Initialize A with repeating sequences 117 | for(int i = 0; i < row_A; i++){ 118 | for(int j = 0; j < col_A; j++){ 119 | h_A[i * col_A + j] = static_cast((i * col_A + j) % 6 + 1) * 1.012; // Values between 1.0f and 6.0f 120 | } 121 | } 122 | 123 | // Initialize matrix B 124 | // Example: Initialize B with alternating 1s and 2s 125 | for(int i = 0; i < col_A; i++){ 126 | for(int j = 0; j < col_B; j++){ 127 | h_B[i * col_B + j] = (j % 2 == 0) ? 
1.0f : 2.0f; 128 | h_B[i * col_B + j] *= 1.023; 129 | } 130 | } 131 | } 132 | 133 | int main(){ 134 | // Define matrix dimensions 135 | int row_A = 1024*2; // Increased size for meaningful performance comparison 136 | int col_A = 1024*2; 137 | int col_B = 1024*2; 138 | 139 | // Allocate host memory 140 | float *h_A = (float*)malloc(row_A * col_A * sizeof(float)); 141 | float *h_B = (float*)malloc(col_A * col_B * sizeof(float)); 142 | float *h_C_naive = (float*)malloc(row_A * col_B * sizeof(float)); 143 | float *h_C_shared = (float*)malloc(row_A * col_B * sizeof(float)); 144 | 145 | // Initialize matrices 146 | initializeMatrices(h_A, h_B, row_A, col_A, col_B); 147 | 148 | // Uncomment the following lines if you want to print the matrices (Not recommended for large matrices) 149 | /* 150 | coutmatrix(h_A, row_A, col_A, "A"); 151 | coutmatrix(h_B, col_A, col_B, "B"); 152 | */ 153 | 154 | // Create CUDA events for timing 155 | cudaEvent_t start_naive, stop_naive; 156 | cudaEvent_t start_shared, stop_shared; 157 | cudaEventCreate(&start_naive); 158 | cudaEventCreate(&stop_naive); 159 | cudaEventCreate(&start_shared); 160 | cudaEventCreate(&stop_shared); 161 | 162 | // -------------------------- 163 | // Measure Naive Kernel 164 | // -------------------------- 165 | cudaEventRecord(start_naive); 166 | matrixMultiply(h_A, h_B, h_C_naive, row_A, col_A, col_B, false); 167 | cudaEventRecord(stop_naive); 168 | cudaEventSynchronize(stop_naive); 169 | 170 | float milliseconds_naive = 0; 171 | cudaEventElapsedTime(&milliseconds_naive, start_naive, stop_naive); 172 | 173 | // -------------------------- 174 | // Measure Shared Memory Kernel 175 | // -------------------------- 176 | cudaEventRecord(start_shared); 177 | matrixMultiply(h_A, h_B, h_C_shared, row_A, col_A, col_B, true); 178 | cudaEventRecord(stop_shared); 179 | cudaEventSynchronize(stop_shared); 180 | 181 | float milliseconds_shared = 0; 182 | cudaEventElapsedTime(&milliseconds_shared, start_shared, stop_shared); 183 | 184 | // -------------------------- 185 | // Compare Results (Optional) 186 | // -------------------------- 187 | bool correct = true; 188 | for(int i = 0; i < row_A * col_B; i++){ 189 | if(abs(h_C_naive[i] - h_C_shared[i]) > 1e-3){ 190 | correct = false; 191 | std::cout << "Mismatch at index " << i << ": Naive = " << h_C_naive[i] << ", Shared = " << h_C_shared[i] << "\n"; 192 | break; 193 | } 194 | } 195 | 196 | if(correct){ 197 | std::cout << "Both kernels produced the same results.\n"; 198 | } 199 | else{ 200 | std::cout << "Mismatch detected between kernel results!\n"; 201 | } 202 | 203 | // -------------------------- 204 | // Print Execution Times 205 | // -------------------------- 206 | std::cout << "Naive Kernel Execution Time: " << milliseconds_naive << " ms\n"; 207 | std::cout << "Shared Memory Kernel Execution Time: " << milliseconds_shared << " ms\n"; 208 | 209 | // -------------------------- 210 | // Clean Up 211 | // -------------------------- 212 | free(h_A); 213 | free(h_B); 214 | free(h_C_naive); 215 | free(h_C_shared); 216 | 217 | cudaEventDestroy(start_naive); 218 | cudaEventDestroy(stop_naive); 219 | cudaEventDestroy(start_shared); 220 | cudaEventDestroy(stop_shared); 221 | 222 | return 0; 223 | } 224 | -------------------------------------------------------------------------------- /Part1/CH6/main.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BoliboliWJY/Programming-Massively-Parallel-Processors-A-Hands-on-Approach-4th/ce6a2a4b4071f94ffcbe6aad4d43868782f71d8a/Part1/CH6/main.pdf -------------------------------------------------------------------------------- /Part1/CH6/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{amsmath} 3 | \usepackage{listings} 4 | \usepackage{geometry} 5 | \usepackage{array} 6 | \usepackage{ulem} 7 | \usepackage{float} 8 | \geometry{a4paper, margin=1in} 9 | \begin{document} 10 | 11 | \section*{key points} 12 | A summary for the beginner tutorial. 13 | 14 | The introduction of memory coalescing, helping memory program better. 15 | 16 | I have to admit that this chapter is pretty difficult, it should be pay great attention on the conclusion part. 17 | \section{} 18 | \subsection{} 19 | \begin{lstlisting} 20 | unsigned int row = blockIdx.y*blockDim.y + threadIdx.y; 21 | unsigned int col = blockIdx.x*blockDim.x + threadIdx.x; 22 | if (row < Width && col < Width){ 23 | float Pvalue = 0.0f; 24 | for(unsigned int k = 0;k < Width;++k){ 25 | Pvalue += N[row*Width + k]*M[col*Width + k]; 26 | } 27 | P[row*Width + col] = Pvalue; 28 | } 29 | \end{lstlisting} 30 | 31 | 32 | \textbf{Basically correct, while using shared memory, tilting and branching can make it perform better.} An optimized kernel with corner turning and shared memory: 33 | 34 | \begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true] 35 | #define TILE_WIDTH 16 36 | __global__ void matrixMulCornerTurning(float* N, float* M, float* P, int Width) { 37 | // Calculate the row and column index of the element 38 | int row = blockIdx.y * TILE_WIDTH + threadIdx.y; 39 | int col = blockIdx.x * TILE_WIDTH + threadIdx.x; 40 | 41 | // Allocate shared memory for tiles of N and M 42 | __shared__ float tileN[TILE_WIDTH][TILE_WIDTH]; 43 | __shared__ float tileM[TILE_WIDTH][TILE_WIDTH]; 44 | 45 | float Pvalue = 0.0f; 46 | 47 | // Loop over tiles 48 | for (int t = 0; t < (Width + TILE_WIDTH - 1) / TILE_WIDTH; ++t) { 49 | // Load elements into shared memory with corner turning 50 | if (row < Width && (t * TILE_WIDTH + threadIdx.x) < Width) 51 | tileN[threadIdx.y][threadIdx.x] = N[row * Width + t * TILE_WIDTH + threadIdx.x]; 52 | else 53 | tileN[threadIdx.y][threadIdx.x] = 0.0f; 54 | 55 | if (col < Width && (t * TILE_WIDTH + threadIdx.y) < Width) 56 | // Transpose M while loading to ensure coalesced access 57 | tileM[threadIdx.x][threadIdx.y] = M[col * Width + t * TILE_WIDTH + threadIdx.y]; 58 | else 59 | tileM[threadIdx.x][threadIdx.y] = 0.0f; 60 | 61 | __syncthreads(); 62 | 63 | // Multiply the two tiles 64 | for (int k = 0; k < TILE_WIDTH; ++k) { 65 | Pvalue += tileN[threadIdx.y][k] * tileM[threadIdx.x][k]; 66 | } 67 | 68 | __syncthreads(); 69 | } 70 | 71 | // Write the result to global memory 72 | if (row < Width && col < Width) { 73 | P[row * Width + col] = Pvalue; 74 | } 75 | } 76 | \end{lstlisting} 77 | 78 | \subsection{} 79 | The fit value of BLOCK\_SIZE could be the \sout{multiple} \textbf{divisor} number of the \sout{width} \textbf{warp width}. This shall makes it fully consider all elements without extra operation. 80 | 81 | Common values like 8,16 and 32 align with warp size considerations and hardware constraints.For ensuring coalesced memory accesses effectively. 82 | 83 | Using multiples of wap size can lead to inefficient resource utilization and exceed the maximum threads per block. 
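A small host-side sketch (device 0 assumed) for confirming the hardware limits that constrain BLOCK\_SIZE:

\begin{lstlisting}[language=C++]
#include <cstdio>
int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0); // query device 0
    // warpSize is 32 on current NVIDIA GPUs; maxThreadsPerBlock is typically 1024.
    printf("warp size = %d, max threads per block = %d\n",
           prop.warpSize, prop.maxThreadsPerBlock);
    // BLOCK_SIZE * BLOCK_SIZE must not exceed maxThreadsPerBlock, and the
    // divisor-of-warp choices above (8, 16, 32) keep each warp's accesses contiguous.
    return 0;
}
\end{lstlisting}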
84 | 85 | \subsection{} 86 | \subsubsection{} 87 | \sout{uncoalesced} coalesced, a[i] where i = BlockIdx.x * BlockDim.x + threadIdx.x. 88 | \subsubsection{} 89 | \sout{uncoalesced} NA, each thread accesses a unique index threadIdx.x, instead, bank conflicts are the primary concern. 90 | \subsubsection{} 91 | coalesced. 92 | \subsubsection{} 93 | \sout{NA} coalesced, for each j, threads access c[i*4+j]. Assuming i is consecutive across threads, the accessed address are also consecutive for each j. 94 | \subsubsection{} 95 | \sout{uncoalesced} NA, each thread accesses a unique index based on threadIdx.x and loop variable j. It should concern the bank conflicts rather than coalescing. 96 | \subsubsection{} 97 | \sout{coalesced} NA, each thread access a unique index based on threadIdx.x. 98 | \subsubsection{} 99 | \sout{NA} coalesced, for each i+8, if i is consecutive across threads, they are consecutive memory addresses. 100 | \subsubsection{} 101 | \sout{coalesced} NA, only one accesses for an index, based on threadIdx.x. 102 | \subsubsection{} 103 | NA 104 | 105 | coalesced access maximize menory throughput and minimize latency, leading to significant performance improvements.(For contiguous acces, aligned access and same memory space) 106 | 107 | uncoalesced access orrors when threads in a warp access memory loactions that are scattered or noncontiguous, leading to multiple memory transactions, increasing latency and reducing overall performance.(For strided access, random access and misaligned access) 108 | 109 | e.g.:B[idx] = A[4 * idx], index like 0,4,8,12$\cdots$ are not contiguous. 110 | 111 | coalescing not available pertains to certain types of memory accesses where coalescing is irrelevant or handled differently.(For shared memory access, registers and loacl memory, constant and texture memory) 112 | 113 | e.g.:\_\_shared\_\_ float sharedA[256];sharedA[idx] = A[idx];B[idx] = sharedA[idx] * 2.0f; where threads access sharedA[idx], which is shared memory. 114 | 115 | \subsection{} 116 | \subsubsection{} 117 | per element C[i][j], Multiplications: N, additions:N-1, total FLOPs $\approx$ 2N. Total FLOPs for entire matrix: \(N^2 \times 2N = 2N^3\). 118 | 119 | Memory access: matrix A: each element accessed once per C[i][j], total is N\^3(number of elements(N$^2$) * access per element(N) = N$^3$), matrix B is also N$^3$. Global memory writes for matrix C = N$^2$. 120 | 121 | So total memory accesses = N$^3$ + N$^3$ + N$^2$ = 2N$^3$ + N$^2$ $\approx$ 2N$^3$. 122 | 123 | Bytes = 2N$^3$ * 4 = 8N$^3$ bytes. 124 | 125 | So OP/B = \(\frac{2N^3}{8N^3} = \frac{1}{4} = 0.25 OP/B\). 126 | 127 | \subsubsection{} 128 | FLOPs is also 2N$^3$. Memory access is \(2 * \frac{N^3}{32} = \frac{N^3}{16}\). Bytes = N$^3$/4 bytes. 129 | 130 | So, OP/B = \(\frac{2N^3}{N^3/4}\). 131 | \subsubsection{} 132 | For a further step, coarsening is: \(\frac{\frac{N^3}{4}}{4(the thread coarsening factor)}\) = \(\frac{N^3}{16}\) bytes. 133 | So it should be \(\frac{2N^3}{\frac{N^3}{16}}\) = 32 OP/B., 134 | 135 | \textbf{Conclusion:} 136 | memory access ratio = $\frac{Total Floating Point Operations}{Bytes accessed}$ 137 | 138 | A tile can decrease N times for a N*N tile, and coalesce can decrease N time of the total Operations Per Byte (OP/B) (under ideal circumstance, depend on the memory access pattern and how ell the data accesses align with the GPU's memory architecture.). 
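Collecting the three results above into one comparison (same leading-order $N^3$ approximation):
\begin{equation}
\underbrace{\frac{2N^3}{8N^3}}_{\text{no tiling}} = 0.25, \qquad
\underbrace{\frac{2N^3}{N^3/4}}_{32\times 32\ \text{tiles}} = 8, \qquad
\underbrace{\frac{2N^3}{N^3/16}}_{\text{tiles}\ +\ 4\times\ \text{coarsening}} = 32 \quad \mathrm{OP/B}.
\end{equation}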
139 | 140 | \end{document} -------------------------------------------------------------------------------- /Part2/CH10/code/reduction_basic.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | void coutarray(const float* arr, int len, const char* name){ 4 | std::cout << "Array " << name << ":\n"; 5 | for (int i = 0; i < len; i++){ 6 | std::cout << arr[i] << "\t"; 7 | } 8 | std::cout << "\n"; 9 | } 10 | 11 | 12 | __global__ void reduction_squencial(float *In, float *Out, const int len){ 13 | __shared__ float sdata[256]; 14 | unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; 15 | 16 | float val = (idx < len) ? In[idx] : 0; 17 | sdata[threadIdx.x] = val; 18 | __syncthreads(); 19 | for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1){ 20 | if (threadIdx.x < stride){ 21 | sdata[threadIdx.x] += sdata[threadIdx.x + stride]; 22 | } 23 | __syncthreads(); 24 | } 25 | if (threadIdx.x == 0){ 26 | Out[blockIdx.x] = sdata[0]; 27 | } 28 | } 29 | 30 | __global__ void reduction_reverse(float *In, float *Out, const int len){ 31 | __shared__ float sdata[256]; 32 | unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; 33 | float val = (idx < len) ? In[idx] : 0; 34 | sdata[threadIdx.x] = val; 35 | __syncthreads(); 36 | for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1){ 37 | if (threadIdx.x >= stride){ 38 | sdata[threadIdx.x] += sdata[threadIdx.x - stride]; 39 | } 40 | __syncthreads(); 41 | } 42 | if (threadIdx.x == (blockDim.x - 1)){ 43 | Out[blockIdx.x] = sdata[threadIdx.x]; 44 | } 45 | } 46 | 47 | #define COARSE_FACTOR 3 48 | __global__ void reduction_coarsened_sum(float *In, float *Out, int len){ 49 | const int BLOCKDIM = 256; 50 | __shared__ float input_s[BLOCKDIM]; 51 | unsigned int segment = COARSE_FACTOR * 2 * blockDim.x * blockIdx.x; 52 | unsigned int i = segment + threadIdx.x; 53 | unsigned int t = threadIdx.x; 54 | float sum = 0.0f; 55 | if (i < len) sum += In[i]; 56 | for (unsigned int tile = 1; tile < COARSE_FACTOR * 2; tile++){ 57 | unsigned int idx = i + tile * BLOCKDIM; 58 | if (idx < len) sum += In[i + tile * BLOCKDIM]; 59 | 60 | } 61 | input_s[t] = sum; 62 | for (unsigned int stride = blockDim.x / 2; stride >= 1; stride >>= 1){ 63 | __syncthreads(); 64 | if (t < stride){ 65 | input_s[t] += input_s[t + stride]; 66 | } 67 | } 68 | __syncthreads(); 69 | if (t == 0){ 70 | atomicAdd(Out, input_s[0]); 71 | } 72 | } 73 | 74 | __device__ float atomicMaxFloat(float *address, float val) { 75 | //used to compare with float type data and return the max data 76 | int *address_as_int = (int*) address; 77 | int old = *address_as_int, assumed; 78 | 79 | do { 80 | assumed = old; 81 | float old_val = __int_as_float(assumed); 82 | float max_val = fmaxf(old_val, val); 83 | int new_val_int = __float_as_int(max_val); 84 | old = atomicCAS(address_as_int, assumed, new_val_int); 85 | } while (assumed != old); 86 | 87 | return __int_as_float(old); 88 | } 89 | 90 | __global__ void reduction_coarsened_max(float *In, float *Out, int len){ 91 | const int BLOCKDIM = 256; 92 | __shared__ float input_s[BLOCKDIM]; 93 | unsigned int segment = COARSE_FACTOR * 2 * blockDim.x * blockIdx.x; 94 | unsigned int i = segment + threadIdx.x; 95 | unsigned int t = threadIdx.x; 96 | float max_num = -FLT_MAX; 97 | if (i < len){ 98 | max_num = In[i]; 99 | } 100 | for (unsigned int tile = 1; tile < COARSE_FACTOR * 2; tile++){ 101 | unsigned int idx = i + tile * BLOCKDIM; 102 | if (idx < len) if(max_num < In[i + tile * BLOCKDIM]) 
max_num = In[i + tile * BLOCKDIM]; 103 | 104 | } 105 | input_s[t] = max_num; 106 | for (unsigned int stride = blockDim.x / 2; stride >= 1; stride >>= 1){ 107 | __syncthreads(); 108 | if (t < stride){ 109 | if (input_s[t] < input_s[t + stride]) input_s[t] = input_s[t + stride]; 110 | } 111 | } 112 | __syncthreads(); 113 | if (t == 0){ 114 | atomicMaxFloat(Out, input_s[0]); 115 | } 116 | } 117 | 118 | void reduction(const float *In, float *Out, const int len){ 119 | float *d_In, *d_Out; 120 | cudaMalloc((void**)&d_In, len * sizeof(float)); 121 | 122 | cudaMemcpy(d_In, In, len * sizeof(float), cudaMemcpyHostToDevice); 123 | 124 | int threadsPerBlock = 256; 125 | int blocksPerGrid = (len + threadsPerBlock - 1) / threadsPerBlock; 126 | cudaMalloc((void**)&d_Out, blocksPerGrid * sizeof(float)); 127 | 128 | reduction_squencial<<>>(d_In, d_Out, len); 129 | int size = blocksPerGrid; 130 | while (size > 1){ 131 | int newBlocks = (size + threadsPerBlock - 1) / threadsPerBlock; 132 | reduction_squencial<<>>(d_Out, d_Out, size); 133 | size = newBlocks; 134 | } 135 | // std::cout << len << std::endl; 136 | cudaMemcpy(Out, d_Out, sizeof(float), cudaMemcpyDeviceToHost); 137 | coutarray(Out, 1, "Output_squencial"); 138 | 139 | cudaFree(d_Out); 140 | reduction_reverse<<>>(d_In, d_Out, len); 141 | cudaMemcpy(Out, d_Out, sizeof(float), cudaMemcpyDeviceToHost); 142 | coutarray(Out, 1, "Output_reverse"); 143 | size = blocksPerGrid; 144 | while (size > 1){ 145 | int newBlocks = (size + threadsPerBlock - 1) / threadsPerBlock; 146 | reduction_reverse<<>>(d_Out, d_Out, size); 147 | size = newBlocks; 148 | } 149 | 150 | cudaFree(d_Out); 151 | reduction_coarsened_sum<<>>(d_In, d_Out, len); 152 | cudaMemcpy(Out, d_Out, sizeof(float), cudaMemcpyDeviceToHost); 153 | coutarray(Out, 1, "Output_tiled_sum"); 154 | 155 | cudaFree(d_Out); 156 | float initVal = -FLT_MAX; 157 | cudaMalloc((void**)&d_Out, sizeof(float));//reallocate the data 158 | cudaMemcpy(d_Out, &initVal, sizeof(float), cudaMemcpyHostToDevice); //initialize the regional number 159 | reduction_coarsened_max<<>>(d_In, d_Out, len); 160 | cudaMemcpy(Out, d_Out, sizeof(float), cudaMemcpyDeviceToHost); 161 | coutarray(Out, 1, "Output_tiled_max"); 162 | 163 | std::cout<< "For a len that is not the multiple of coarse factor, just use set the exceeded number as 0." 
<< std::endl; 164 | 165 | cudaFree(d_In); 166 | cudaFree(d_Out); 167 | 168 | } 169 | 170 | int main(){ 171 | const float h_In[] = {1,2,3,4,5,6,7,8,9,0,1,2,3,4,5}; 172 | const int len = sizeof(h_In) / sizeof(float); 173 | // const int size_In = len; 174 | // std::cout << size_In << "\n"; 175 | float h_Out; 176 | 177 | coutarray(h_In, len, "Input"); 178 | reduction(h_In, &h_Out, len); 179 | 180 | return 0; 181 | } -------------------------------------------------------------------------------- /Part2/CH10/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoliboliWJY/Programming-Massively-Parallel-Processors-A-Hands-on-Approach-4th/ce6a2a4b4071f94ffcbe6aad4d43868782f71d8a/Part2/CH10/main.pdf -------------------------------------------------------------------------------- /Part2/CH10/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{amsmath} 3 | \usepackage{listings} 4 | \usepackage{geometry} 5 | \usepackage{array} 6 | \usepackage{ulem} 7 | \usepackage{float} 8 | \geometry{a4paper, margin=1in} 9 | \begin{document} 10 | 11 | \section*{key points} 12 | Reduction: make a series of reduction to parallel calculation. 13 | 14 | reduction tree: takes $log_{2}N$ times rather than N-1 times. the maximum number of calcualtor in the same step is N/2, the average number is \(\frac{N-1}{log_{2}N}\) 15 | 16 | shared memory for can decrease the times of reading and writing data. 17 | 18 | for low number of blocks, utilizing coarsening factor can help this problem. This will take less parallel workers, which should be paid attention to. 19 | 20 | \section{} 21 | \subsection{} 22 | \sout{that is \(\sum_{i=1}^{5}(1024/(2*32*2^{i-1}))\)} 23 | \(\frac{1024}{2^5} = 32 active warps\) 24 | 25 | \subsection{} 26 | that is 1 warp 27 | 28 | \subsection{} 29 | \begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true] 30 | __global__ void ConvergentSumReductionKernel(float* input, float* output){ 31 | unsigned int i = threadIdx.x; 32 | for (unsigned int stride = blockDim.x; stride >= 1; stride /= 2) { 33 | if (threadIdx.x > stride && threadIdx.x < stride*2){ 34 | input[i] += input[i-stride]; 35 | } 36 | __syncthreads(); 37 | } 38 | if(threadIdx.x == 0) { 39 | *output = input[blockDim.x];//this order help decrease competition between different blocks. 
40 | } 41 | } 42 | \end{lstlisting} 43 | 44 | \subsection{} 45 | \begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true] 46 | __global__ CoarsenedMaxReductionKernel(const float* input, float* output){ 47 | __shared__ float input_s[BLOCK_DIM]; 48 | unsigned int segment = COARSE_FACTOR*2*blockDim.x*blockIdx.x; 49 | unsigned int i = segment + threadIdx.x; 50 | unsigned int t = threadIdx.x; 51 | float maximum = input[i]; 52 | for(unsigned int tile = 1; tile < COARSE_FACTOR*2; ++tile) { 53 | maximum = max(maximum, input[i+tile*BLOCK_DIM]); 54 | } 55 | input_s[t] = maximum; 56 | for (unsigned int stride = blockDim.x/2; stride >= 1; stride /= 2){ 57 | __syncthreads(); 58 | if ( t < stride) { 59 | input_s[t] = max(input_s[t], input_s[t+stride]); 60 | } 61 | } 62 | if (t == 0){ 63 | output[blockIdx.x] = input_s[0]; 64 | } 65 | } 66 | \end{lstlisting} 67 | 68 | \subsection{} 69 | \subsubsection{} 70 | for each iteration, it should be: 71 | 8 11 13 4 72 | 19 17 73 | 36 74 | 75 | \subsubsection{} 76 | it should be: 77 | 11 10 10 5 78 | 21 15 79 | 36 80 | \end{document} -------------------------------------------------------------------------------- /Part2/CH11/code/prefix_sum.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void coutarray(const float* arr, int len, const char* name){ 4 | std::cout << "Array " << name << ":\n"; 5 | for (int i = 0; i < len; i++){ 6 | std::cout << arr[i] << "\t"; 7 | } 8 | std::cout << "\n"; 9 | } 10 | 11 | #define SECTION_SIZE 256 12 | 13 | __global__ void Kogge_Stone_scan_kernel(const float *X, float *Y, const int len){ 14 | __shared__ float XY[SECTION_SIZE]; 15 | unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 16 | if (i < len){ 17 | XY[threadIdx.x] = X[i]; 18 | } else{ 19 | XY[threadIdx.x] = 0.0f; 20 | } 21 | for (unsigned int stride = 1; stride < blockDim.x; stride *= 2){ 22 | __syncthreads(); 23 | float tmp; 24 | if(threadIdx.x >= stride){ 25 | tmp = XY[threadIdx.x] + XY[threadIdx.x - stride]; 26 | XY[threadIdx.x] = tmp; 27 | } 28 | __syncthreads(); 29 | } 30 | if (i < len){ 31 | Y[i] = XY[threadIdx.x]; 32 | } 33 | } 34 | 35 | __global__ void Kogge_Stone_scan_kernel_double_buffer(const float *X, float *Y, const int len){ 36 | __shared__ float XY[SECTION_SIZE]; 37 | __shared__ float XY2[SECTION_SIZE]; 38 | 39 | unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 40 | 41 | if ( i < len){ 42 | XY[threadIdx.x] = X[i]; 43 | } else{ 44 | XY[threadIdx.x] = 0.0f; 45 | } 46 | for (unsigned int stride = 1; stride < blockDim.x; stride *= 2){ 47 | __syncthreads(); 48 | if(threadIdx.x >= stride){ 49 | float tmp = XY[threadIdx.x] + XY[threadIdx.x - stride]; 50 | XY2[threadIdx.x] = tmp; 51 | } else{ 52 | XY2[threadIdx.x] = XY[threadIdx.x]; 53 | } 54 | __syncthreads(); 55 | for (int j = 0; j < blockDim.x; j++){ 56 | XY[j] = XY2[j]; 57 | } 58 | } 59 | __syncthreads(); 60 | if (i < len+1){ 61 | Y[i] = XY[threadIdx.x]; 62 | } 63 | } 64 | 65 | __global__ void Kogge_Stone_scan_kernel_exclusive(const float *X, float *Y, const int len){ 66 | __shared__ float XY[SECTION_SIZE+1]; 67 | unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 68 | if (i == 0){ 69 | XY[threadIdx.x] = 0.0f; 70 | } else if (i < len + 1){ 71 | XY[threadIdx.x] = X[i - 1]; 72 | } else{ 73 | XY[threadIdx.x] = 0.0f; 74 | } 75 | for (unsigned int stride = 1; stride < blockDim.x; stride *= 2){ 76 | __syncthreads(); 77 | float tmp; 78 | if(threadIdx.x >= stride){ 79 | tmp = XY[threadIdx.x] + XY[threadIdx.x - stride]; 80 | XY[threadIdx.x] = tmp; 
81 | } 82 | __syncthreads(); 83 | } 84 | if (i < len + 1){ 85 | Y[i] = XY[threadIdx.x]; 86 | } 87 | } 88 | 89 | 90 | // __global__ void Kogge_Stone_hierarchical_kernel(const float *X, float *Y, const int len){ 91 | // int i = blockDim.x * blockIdx.x + threadIdx.x; 92 | // if (i % HIERARCHICAL_SIZE == 0 && i < len){ 93 | // Y[i / HIERARCHICAL_SIZE] = X[i]; 94 | // for (unsigned int stride = 1; stride < HIERARCHICAL_SIZE; stride++){ 95 | // __syncthreads(); 96 | // if (i + stride < len){ 97 | // Y[i / HIERARCHICAL_SIZE] += X[i + stride]; 98 | // } 99 | // } 100 | // } 101 | // } 102 | 103 | // __global__ void hierarchical_sum(const float *X, float *Y, const int len){ 104 | // int i = blockDim.x * blockIdx.x + threadIdx.x; 105 | // if (i > HIERARCHICAL_SIZE){ 106 | // Y[i] += X[i / HIERARCHICAL_SIZE]; 107 | // } 108 | // } 109 | 110 | #define HIERARCHICAL_SIZE 2 111 | __global__ void segemnt_sum(const float *X, float *Y, int len){ 112 | unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 113 | __shared__ float sData[HIERARCHICAL_SIZE]; 114 | if (threadIdx.x < HIERARCHICAL_SIZE && i < len) { 115 | sData[threadIdx.x] = X[i]; 116 | } else if (threadIdx.x < HIERARCHICAL_SIZE) { 117 | sData[threadIdx.x] = 0.0f; 118 | } 119 | __syncthreads(); 120 | if (threadIdx.x % HIERARCHICAL_SIZE != 0){ 121 | sData[threadIdx.x] += sData[threadIdx.x - 1]; 122 | } 123 | __syncthreads(); 124 | if (i < len){ 125 | Y[i] = sData[threadIdx.x]; 126 | } 127 | } 128 | 129 | __global__ void featured_sum(const float *X, float *Y, int len) { 130 | unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 131 | extern __shared__ float sData[]; 132 | 133 | 134 | } 135 | 136 | void prefix_sum(const float *In, float *Out, float *Out_exclusive, float *Out_Inner, int len){ 137 | float *d_In, *d_Out; 138 | int size_In = sizeof(float) * len; 139 | cudaMalloc((void**)&d_In, size_In); 140 | cudaMalloc((void**)&d_Out, size_In); 141 | cudaMemcpy(d_In, In, size_In, cudaMemcpyHostToDevice); 142 | 143 | int threadsPerBlock = 256; 144 | int blocksPerGrid = (len + threadsPerBlock - 1) / threadsPerBlock; 145 | // Kogge_Stone_scan_kernel<<>>(d_In, d_Out, len); 146 | // cudaMemcpy(Out, d_Out, size_In, cudaMemcpyDeviceToHost); 147 | // coutarray(Out, len, "Prefix sum result"); 148 | 149 | // cudaFree(d_Out); 150 | // cudaMalloc((void**)&d_Out, size_In); 151 | // Kogge_Stone_scan_kernel_double_buffer<<>>(d_In, d_Out, len); 152 | // cudaMemcpy(Out, d_Out, size_In, cudaMemcpyDeviceToHost); 153 | // coutarray(Out, len, "Prefix sum double-buffer"); 154 | 155 | // cudaFree(d_Out); 156 | // cudaMalloc((void**)&d_Out, size_In + sizeof(float)); 157 | // int exclusive_blocksPerGrid = (len + 1 + threadsPerBlock - 1) / threadsPerBlock; 158 | // Kogge_Stone_scan_kernel_exclusive<<>>(d_In, d_Out, len); 159 | // // std::cout << sizeof(d_Out) << std::endl; 160 | // cudaMemcpy(Out_exclusive, d_Out, size_In + sizeof(float), cudaMemcpyDeviceToHost); 161 | // coutarray(Out_exclusive, len+1, "Prefix sum in exclusive method"); 162 | 163 | // cudaFree(d_Out); 164 | 165 | cudaFree(d_Out); 166 | int threadsHier = HIERARCHICAL_SIZE; 167 | int blocksHier = (len + threadsHier - 1) / threadsHier; 168 | cudaMalloc((void**)&d_Out, len * sizeof(float)); 169 | segemnt_sum<<>>(d_In, d_Out, len); 170 | cudaMemcpy(Out, d_Out, len * sizeof(float), cudaMemcpyDeviceToHost); 171 | coutarray(Out, len, "Step 1: sum the segement array"); 172 | 173 | float *d_Inner; 174 | int size_inner = (len + HIERARCHICAL_SIZE - 1) / HIERARCHICAL_SIZE; 175 | cudaMalloc((void**)&d_Inner, size_inner * 
sizeof(float)); 176 | int threadsFeature = 256; 177 | int blocksFeature = (size_inner + threadsFeature - 1) / threadsFeature; 178 | size_t sharedMemSize = size_inner * sizeof(float); 179 | std::cout << size_inner << std::endl; 180 | featured_sum<<>>(d_Inner, d_Out, size_inner); 181 | cudaMemcpy(Out_Inner, d_Inner, size_inner * sizeof(float), cudaMemcpyDeviceToHost); 182 | coutarray(Out_Inner, size_inner, "Step 2: featrued data"); 183 | 184 | 185 | // int threadsHier = 256; 186 | // int hierarchical_len = (len + HIERARCHICAL_SIZE - 1) / (HIERARCHICAL_SIZE); 187 | // int blocksHier = (hierarchical_len + threadsHier - 1) / threadsHier; 188 | // cudaMalloc((void**)&d_Out, hierarchical_len * sizeof(float)); 189 | // Kogge_Stone_hierarchical_kernel<<>>(d_In, d_Out, len); 190 | // // std::cout << (len + HIERARCHICAL_SIZE - 1) / (HIERARCHICAL_SIZE) << std::endl; 191 | // cudaMemcpy(Out, d_Out, hierarchical_len * sizeof(float), cudaMemcpyDeviceToHost); 192 | // std::cout << "In segment of: " << HIERARCHICAL_SIZE << std::endl; 193 | // coutarray(Out, hierarchical_len, "First step: hierarchical scan sum"); 194 | 195 | // // std::cout << hierarchical_len << std::endl; 196 | // float *new_Out; 197 | // cudaMalloc((void**)&new_Out, hierarchical_len * sizeof(float)); 198 | // Kogge_Stone_scan_kernel<<>>(d_Out, new_Out, hierarchical_len); 199 | // // std::cout << sizeof(d_Out) / sizeof(float) << std::endl; 200 | // cudaMemcpy(Out_Inner, new_Out, hierarchical_len * sizeof(float), cudaMemcpyDeviceToHost); 201 | // coutarray(Out_Inner, hierarchical_len, "Second step: prefix sum of hierarical scan"); 202 | 203 | // hierarchical_sum<<>>(new_Out, d_Out, len); 204 | // cudaMemcpy(Out, d_Out, size_In, cudaMemcpyDeviceToHost); 205 | // coutarray(Out, len, "Third step: final sum result"); 206 | 207 | 208 | 209 | 210 | cudaFree(d_In); 211 | cudaFree(d_Out); 212 | } 213 | 214 | int main(){ 215 | const float h_In[] = {1,2,3,4,5,6,7,8,9,0}; 216 | // const float h_In[] = {1,2,3,1,-1,1,-1,1,-1,1}; 217 | const int len = sizeof(h_In) / sizeof(float); 218 | float h_Out[len]; 219 | float h_Out_exclusize[len+1]; 220 | float h_Out_Inner[(len + HIERARCHICAL_SIZE - 1) / HIERARCHICAL_SIZE]; 221 | 222 | coutarray(h_In, len, "Input"); 223 | prefix_sum(h_In, h_Out, h_Out_exclusize, h_Out_Inner, len); 224 | 225 | } -------------------------------------------------------------------------------- /Part2/CH11/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoliboliWJY/Programming-Massively-Parallel-Processors-A-Hands-on-Approach-4th/ce6a2a4b4071f94ffcbe6aad4d43868782f71d8a/Part2/CH11/main.pdf -------------------------------------------------------------------------------- /Part2/CH11/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{amsmath} 3 | \usepackage{listings} 4 | \usepackage{geometry} 5 | \usepackage{array} 6 | \usepackage{ulem} 7 | \usepackage{float} 8 | \geometry{a4paper, margin=1in} 9 | \begin{document} 10 | \begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true] 11 | \end{lstlisting} 12 | 13 | 14 | \section*{key points} 15 | help decrease computing $O(N^{2})$ 16 | 17 | double-buffering: use two buffers of the shared memory, thus separate reading and writing section so that it can decrease one time of \_\_syncthread(). 
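A minimal sketch of the double-buffering idea (illustrative, not the book's code; SECTION\_SIZE stands for the scan section size used in this chapter):

\begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true]
__shared__ float bufA[SECTION_SIZE];
__shared__ float bufB[SECTION_SIZE];
float* src = bufA;   // read from src ...
float* dst = bufB;   // ... write to dst, so one step never reads and writes the same buffer
for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
    __syncthreads();                              // single barrier per iteration
    if (threadIdx.x >= stride)
        dst[threadIdx.x] = src[threadIdx.x] + src[threadIdx.x - stride];
    else
        dst[threadIdx.x] = src[threadIdx.x];
    float* tmp = src; src = dst; dst = tmp;       // swap roles for the next iteration
}
// after the loop, the most recent partial sums are in src
\end{lstlisting}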
18 | 19 | \section{} 20 | \subsection{} 21 | for the array: [4 6 7 1 2 8 5 2] 22 | the step in progress is [10 13 8 3 10 13 7],[4 10 14 18 16 18 16 17] 23 | 24 | so the final result should be:[4 10 14 18 20 28 29 30] 25 | 26 | \subsection{} 27 | \begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true] 28 | __global__ void Kogge_Stone_scan_kernel(float *X, float *Y, unsigned int N){ 29 | __shared__ float XY[SECTION_SIZE]; 30 | __shared__ float XY_1[SECTION_SIZE]; 31 | if(i < N) { 32 | XY[threadIdx.x] = X[i]; 33 | XY_1[threadIdx.x] = X[i]; 34 | } else { 35 | XY[threadIdx.x] = 0.0f; 36 | XY_1[threadIdx.x] = 0.0f; 37 | } 38 | bool choose = true; 39 | for(unsigned int stride = 1; stride < blockDim.x; stride *= 2){ 40 | __syncthreads(); 41 | if(choose){ 42 | if(threadIdx.x >= stride){ 43 | XY_1[threadIdx.x] = XY[threadIdx.x] + XY[threadIdx.x - stride]; 44 | } 45 | choose = false; 46 | } else { 47 | if(threadIdx.x >= stride){ 48 | XY[threadIdx.x] = XY_1[threadIdx.x] + XY_1[threadIdx.x-stride]; 49 | } 50 | choose = true; 51 | } 52 | if(i < N){ 53 | if(choose){ 54 | Y[i] = XY[threadIdx.x]; 55 | } else { 56 | Y[i] = XY_1[threadIdx.x]; 57 | } 58 | } 59 | } 60 | } 61 | \end{lstlisting} 62 | 63 | \subsection{} 64 | for different stride values n, it will calculate from (2k-1)n to (2k)n. 65 | 66 | \subsection{} 67 | it is about $2048*log_{2}2048$ 68 | 69 | \subsection{} 70 | \sout{for each step, the array is: [10 8 10 7], [18 17],[25],and [20], [17 10 15], so the final result should be [4,10,18,18,20,28,33,35]} 71 | [4 10 7 8 2 10 5 7],[4 10 18 8 2 18 5 15],[4 10 18 8 20 18 28 15],[4 10 18 20 28 28 33],[4 10 18 18 20 28 33 35],[4 10 18 18 20 28 33 35] 72 | 73 | for stride from 1,2,4(Upsweep phase) to 4,2,1(DownSweep phase). 74 | 75 | \subsection{} 76 | \(\sum_{i=1}^{log_{2}n}\frac{n}{2^{i}}\) 77 | 78 | \subsection{} 79 | \begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true] 80 | __global__ void Kogge_Stone_scan_kernel(float *X, float *Y, unsigned int N){ 81 | unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; 82 | __shared__ float XY[blockDim.x+1]; 83 | if(i > 0 && i < N){ 84 | XY[threadIdx.x] = X[i-1];// Shift by 1 for exclusive behavior 85 | } else { 86 | XY[threadIdx.x] = 0.0f; 87 | } 88 | for(unsigned int stride = 1; stride < blockDim.x; stride *= 2){ 89 | __syncthreads(); 90 | float temp; 91 | if(threadIdx.x >= stride){ 92 | temp = XY[threadIdx.x] + XY[threadIdx.x - stride]; 93 | } 94 | __syncthreads(); 95 | if(threadIdx.x >= stride){ 96 | XY[threadIdx.x] = temp; 97 | } 98 | } 99 | if(i < N){ 100 | Y[i] = XY[threadIdx.x]; 101 | } 102 | } 103 | \end{lstlisting} 104 | 105 | \subsection{} 106 | \begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true] 107 | #include 108 | #include 109 | 110 | __global__ void reduceKernel(float* input, float* output, int* flags, int n) { 111 | extern __shared__ float sdata[]; 112 | int tid = threadIdx.x; 113 | int i = blockIdx.x * blockDim.x + tid; 114 | 115 | if (i < n) { 116 | sdata[tid] = input[i]; 117 | } else { 118 | sdata[tid] = 0; 119 | } 120 | __syncthreads(); 121 | 122 | // Up-sweep phase 123 | for (int stride = 1; stride < blockDim.x; stride *= 2) { 124 | int index = (tid + 1) * stride * 2 - 1; 125 | if (index < blockDim.x && i < n && flags[index] == flags[index - stride]) { 126 | sdata[index] += sdata[index - stride]; 127 | } 128 | __syncthreads(); 129 | } 130 | 131 | // Write the block-level result to output 132 | if (tid == blockDim.x - 1 && i < n) { 133 | output[blockIdx.x] = sdata[tid]; 134 | } 135 | } 136 | 137 | __global__ void 
propagateKernel(float* output, int* flags, int n) { 138 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 139 | 140 | if (tid >= n) return; 141 | 142 | // Segmented scan at block level 143 | for (int stride = 1; stride < n; stride *= 2) { 144 | __syncthreads(); 145 | if (tid >= stride && flags[tid] == flags[tid - stride]) { 146 | output[tid] += output[tid - stride]; 147 | } 148 | } 149 | } 150 | 151 | __global__ void downsweepKernel(float* input, float* output, int* flags, int n) { 152 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 153 | 154 | if (tid >= n) return; 155 | 156 | float val = output[tid]; 157 | for (int i = 0; i < n; i++) { 158 | if (flags[i] == flags[tid]) { 159 | output[tid] += input[i]; 160 | } 161 | } 162 | } 163 | 164 | void segmentedParallelScan(float* h_input, float* h_output, int* h_flags, int n) { 165 | // Allocate device memory 166 | float *d_input, *d_output; 167 | int *d_flags; 168 | cudaMalloc(&d_input, n * sizeof(float)); 169 | cudaMalloc(&d_output, n * sizeof(float)); 170 | cudaMalloc(&d_flags, n * sizeof(int)); 171 | 172 | // Copy input data to device 173 | cudaMemcpy(d_input, h_input, n * sizeof(float), cudaMemcpyHostToDevice); 174 | cudaMemcpy(d_flags, h_flags, n * sizeof(int), cudaMemcpyHostToDevice); 175 | 176 | // Launch reduce kernel 177 | int threadsPerBlock = 256; 178 | int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; 179 | reduceKernel<<>>(d_input, d_output, d_flags, n); 180 | 181 | // Launch propagate kernel 182 | propagateKernel<<>>(d_output, d_flags, n); 183 | 184 | // Launch downsweep kernel 185 | downsweepKernel<<>>(d_input, d_output, d_flags, n); 186 | 187 | // Copy output data back to host 188 | cudaMemcpy(h_output, d_output, n * sizeof(float), cudaMemcpyDeviceToHost); 189 | 190 | // Free device memory 191 | cudaFree(d_input); 192 | cudaFree(d_output); 193 | cudaFree(d_flags); 194 | } 195 | 196 | \end{lstlisting} 197 | 198 | \end{document} -------------------------------------------------------------------------------- /Part2/CH12/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoliboliWJY/Programming-Massively-Parallel-Processors-A-Hands-on-Approach-4th/ce6a2a4b4071f94ffcbe6aad4d43868782f71d8a/Part2/CH12/main.pdf -------------------------------------------------------------------------------- /Part2/CH12/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{amsmath} 3 | \usepackage{listings} 4 | \usepackage{geometry} 5 | \usepackage{array} 6 | \usepackage{ulem} 7 | \usepackage{float} 8 | \geometry{a4paper, margin=1in} 9 | \begin{document} 10 | \begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true] 11 | \end{lstlisting} 12 | 13 | 14 | \section*{key points} 15 | use different ways to define the split of data. Like equal\_sized partitioning, range-based partitioning and dynamic partitioning. 16 | 17 | 18 | \section{} 19 | \subsection{} 20 | That is 12, which is B[3]. So the co-rank is (5,4). 21 | 22 | \subsection{} 23 | the co-rank is (4,2) 24 | 25 | the calculation for thread 2: 26 | it start at position k = 6. so the for co-rank, it should be: i+j = 6. 27 | The possible can be (2, 4),(3, 3),(4, 2),(5, 1). 28 | 29 | After comparing B[1], A[4] and A[5], we get the result of (4,2). 
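A sketch of the binary-search style co-rank computation used in these answers (one common formulation, not necessarily the chapter's exact code):

\begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true]
__device__ int co_rank(int k, const int* A, int m, const int* B, int n) {
    int i = k < m ? k : m;                 // candidate: take i elements from A
    int j = k - i;                         // and the remaining j = k - i from B
    int i_low = (k - n > 0) ? (k - n) : 0;
    int j_low = (k - m > 0) ? (k - m) : 0;
    while (true) {
        if (i > 0 && j < n && A[i - 1] > B[j]) {          // took too many from A
            int delta = (i - i_low + 1) >> 1;
            j_low = j; j += delta; i -= delta;
        } else if (j > 0 && i < m && B[j - 1] >= A[i]) {  // took too many from B
            int delta = (j - j_low + 1) >> 1;
            i_low = i; i += delta; j -= delta;
        } else {
            return i;   // first i elements of A and j = k - i of B form the first k of C
        }
    }
}
\end{lstlisting}

For the thread-2 example above (k = 6), such a search settles on the same co-rank (4, 2).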
30 | 31 | \subsection{} 32 | \begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true] 33 | int counter = 0; 34 | int C_length = C_next - C_curr; 35 | int A_length = A_next - A_curr; 36 | int B_length = B_next - B_curr; 37 | int total_iteration = ceilf((float)C_length / tile_size); 38 | int C_completed = 0; 39 | int A_consumed = 0; 40 | int B_consumed = 0; 41 | while(counter < total_iteration){ 42 | for(int i = 0; i < tile_size; i += blockDim.x){ 43 | int idx = A_curr + A_consumed + i + threadIdx.x; 44 | if(i + threadIdx.x < A_length - A_consumed) { 45 | int coranked_idx = corank(idx); // Apply corank to adjust the index 46 | A_S[i + threadIdx.x] = A[coranked_idx]; 47 | } 48 | } 49 | for(int i = 0; i < tile_size; i += blockDim.x) { 50 | int idx = B_curr + B_consumed + i + threadIdx.x; 51 | if(i + threadIdx.x < B_length - B_consumed) { 52 | int coranked_idx = corank(idx); // Apply corank to adjust the index 53 | B_S[i + threadIdx.x] = B[coranked_idx]; 54 | } 55 | } 56 | __syncthreads(); 57 | A_consumed += tile_size; 58 | B_consumed += tile_size; 59 | C_completed += tile_size; 60 | counter++; 61 | } 62 | \end{lstlisting} 63 | 64 | \subsection{} 65 | \subsubsection{} 66 | total number elements is: 1030400 + 608000 = 1638400 elements. 67 | 68 | \(\text{Total Threads} = \frac{\text{Total Elements to Merge}}{\text{Elements per Thread}} = \frac{1,638,400}{8} = 204,800 \text{threads}\) 69 | 70 | \subsubsection{} 71 | it is $\frac{1638400}{8} = 204800$ threads devided by thread block size: $\frac{204800}{1024} = 200$ thread blocks. 72 | 73 | For each block 2 binary seaches is in needed, so the resilt is 200 * 2 = 400 times. 74 | 75 | \subsubsection{} 76 | in shared memory each one executes once, so the answer is 200 times. 77 | 78 | \end{document} -------------------------------------------------------------------------------- /Part2/CH7/code/convolution2D.cu: -------------------------------------------------------------------------------- 1 | #include 2 | //print matrix 3 | 4 | #define MAX_KERNEL_RADIUS 1 5 | __constant__ float const_conv_kernel[(2 * MAX_KERNEL_RADIUS + 1) * (2 * MAX_KERNEL_RADIUS + 1)]; 6 | 7 | #define BLOCK_SIZE 16 8 | 9 | 10 | void coutmatrix(const float* mat, int rows, int cols, const char* name){ 11 | std::cout << "Matrix " << name << ":\n"; 12 | for(int i = 0;i < rows; i++){ 13 | for(int j = 0; j < cols; j++){ 14 | std::cout << mat[i * cols + j] << "\t"; 15 | } 16 | std::cout << "\n"; 17 | } 18 | } 19 | __global__ void conv2D_basic_boundary_check(const float *A, const float *B, float *C, int row, int col, int r){ 20 | int outCol = blockIdx.x * blockDim.x + threadIdx.x; 21 | int outRow = blockIdx.y * blockDim.y + threadIdx.y; 22 | if (outCol >= col || outRow >= row) return; 23 | float Pvalue = 0.0f; 24 | for (int fRow = 0; fRow < 2 * r + 1; fRow++){ 25 | for (int fCol = 0; fCol < 2 * r + 1; fCol++){ 26 | int inRow = outRow - r + fRow; 27 | int inCol = outCol - r + fCol; 28 | if (inRow >= 0 && inRow < row && inCol >= 0 && inCol < col){ 29 | Pvalue += B[fRow* (2 * r + 1) + fCol] * A[inRow * col + inCol]; 30 | } 31 | } 32 | } 33 | C[outRow * col + outCol] = Pvalue; 34 | } 35 | __global__ void conv2D_basic(const float *A, const float *B, float *C, int row, int col, int r){ 36 | //without boundary check, thus it will be smaller, row and col reduced by 2 * r; 37 | int outCol = blockDim.x * blockIdx.x + threadIdx.x; 38 | int outRow = blockDim.y * blockIdx.y + threadIdx.y; 39 | if (outCol < r || outCol >= col - r || outRow < r || outRow >= row - r) return; 40 | float Pvalue = 
0.0f; 41 | for (int fRow = 0; fRow < 2 * r + 1; fRow++){ 42 | for (int fCol = 0; fCol < 2 * r + 1; fCol++){ 43 | int inRow = outRow - r + fRow; 44 | int inCol = outCol - r + fCol; 45 | Pvalue += B[fRow * (2 * r + 1) + fCol] * A[inRow * col + inCol]; 46 | } 47 | } 48 | C[(outRow - r) * (col - 2 * r) + (outCol - r)] = Pvalue; 49 | } 50 | __global__ void conv2D_constant_mem(const float *A, float *C, int row, int col){ 51 | int outCol = blockIdx.x * blockDim.x + threadIdx.x; 52 | int outRow = blockIdx.y * blockDim.y + threadIdx.y; 53 | if (outCol >= col || outRow >= row) return; 54 | float Pvalue = 0.0f; 55 | for (int fRow = 0; fRow < 2 * MAX_KERNEL_RADIUS + 1; fRow++){ 56 | for (int fCol = 0; fCol < 2 * MAX_KERNEL_RADIUS + 1; fCol++){\ 57 | int inRow = outRow - MAX_KERNEL_RADIUS + fRow; 58 | int inCol = outCol - MAX_KERNEL_RADIUS + fCol; 59 | if (inRow >= 0 && inRow < row && inCol >= 0 && inCol < col){ 60 | Pvalue += const_conv_kernel[fRow * (2 * MAX_KERNEL_RADIUS + 1) + fCol] * A[inRow * col + inCol]; 61 | } 62 | } 63 | } 64 | C[outRow * col + outCol] = Pvalue; 65 | } 66 | 67 | __global__ void conv2D_constant_mem_tiled(const float *A, float *C, int row, int col){ 68 | __shared__ float sharedMem[BLOCK_SIZE + 2 * MAX_KERNEL_RADIUS][BLOCK_SIZE + 2 * MAX_KERNEL_RADIUS]; 69 | int outCol = blockIdx.x * blockDim.x + threadIdx.x; 70 | int outRow = blockIdx.y * blockDim.y + threadIdx.y; 71 | 72 | for (int y = threadIdx.y; y < BLOCK_SIZE + 2 * MAX_KERNEL_RADIUS; y += BLOCK_SIZE){ 73 | for (int x = threadIdx.x; x < BLOCK_SIZE + 2 * MAX_KERNEL_RADIUS; x += BLOCK_SIZE){ 74 | int sharedMemRow = y; 75 | int sharedMemCol = x; 76 | int globalRow = blockIdx.y * blockDim.y + y - MAX_KERNEL_RADIUS; 77 | int globalCol = blockIdx.x * blockDim.x + x - MAX_KERNEL_RADIUS; 78 | 79 | if (globalRow >= 0 && globalRow < row && globalCol >= 0 && globalCol < col){ 80 | sharedMem[sharedMemRow][sharedMemCol] = A[globalRow * col + globalCol]; 81 | } else{ 82 | sharedMem[sharedMemRow][sharedMemCol] = 0.0f; 83 | } 84 | } 85 | } 86 | 87 | __syncthreads(); 88 | 89 | if (outRow < row && outCol < col){ 90 | float Pvalue = 0.0f; 91 | for (int i = 0; i < 2 * MAX_KERNEL_RADIUS + 1; i ++){ 92 | for (int j = 0; j < 2 * MAX_KERNEL_RADIUS + 1; j++){ 93 | Pvalue += const_conv_kernel[i * (2 * MAX_KERNEL_RADIUS + 1) + j] * sharedMem[threadIdx.y + i][threadIdx.x + j]; 94 | } 95 | } 96 | C[outRow * col + outCol] = Pvalue; 97 | } 98 | } 99 | 100 | void conv2D(const float *A, const float *B, float *C, int row, int col, int r){ 101 | float *d_A, *d_B, *d_C; 102 | size_t size_A = row * col * sizeof(float); 103 | size_t size_B = (2 * r + 1) * (2 * r + 1) * sizeof(float); 104 | 105 | cudaMalloc((void**)&d_A, size_A); 106 | cudaMalloc((void**)&d_B, size_B); 107 | cudaMalloc((void**)&d_C, size_A);//size C is the same as the size A 108 | 109 | cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice); 110 | cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice); 111 | 112 | dim3 blockDim(16,16); 113 | dim3 gridDim((col + blockDim.x - 1) / blockDim.x, (row + blockDim.y - 1) / blockDim.y); 114 | 115 | conv2D_basic_boundary_check<<>>(d_A, d_B, d_C, row, col, r); 116 | cudaDeviceSynchronize(); 117 | cudaMemcpy(C, d_C, size_A, cudaMemcpyDeviceToHost); 118 | coutmatrix(C, row, col, "Result_boundary"); 119 | 120 | size_t size_C = (row - r * 2) * (col - r * 2) * sizeof(float); 121 | cudaFree(d_C); 122 | cudaMalloc((void**)&d_C, size_C);//smaller than the size A with r 123 | dim3 grid((col - 2 * r + blockDim.x - 1) / blockDim.x, 124 | (row - 2 * r + blockDim.y - 1) / 
blockDim.y); 125 | conv2D_basic<<>>(d_A, d_B, d_C, row, col, r); 126 | cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost); 127 | coutmatrix(C, row - r * 2, col - r * 2, "Result_no_boundary"); 128 | 129 | cudaMemcpyToSymbol(const_conv_kernel, B, size_B); 130 | cudaFree(d_C); 131 | cudaMalloc((void**)&d_C, size_A); 132 | conv2D_constant_mem<<>>(d_A, d_C, row, col); 133 | cudaMemcpy(C, d_C, size_A, cudaMemcpyDeviceToHost); 134 | coutmatrix(C, row, col, "Constant_memory_res"); 135 | 136 | cudaFree(d_C); 137 | cudaMalloc((void**)&d_C, size_A); 138 | conv2D_constant_mem_tiled<<>>(d_A, d_C, row, col); 139 | cudaMemcpy(C, d_C, size_A, cudaMemcpyDeviceToHost); 140 | coutmatrix(C, row, col, "Constant_tiled_memory_res"); 141 | 142 | 143 | 144 | cudaFree(d_A); 145 | cudaFree(d_B); 146 | cudaFree(d_C); 147 | } 148 | 149 | int main(){ 150 | const int row_A = 5; 151 | const int col_A = 5; 152 | const int r = 1; 153 | float h_A[] = { 154 | 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 155 | 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 156 | 1.0f, 2.0f, 3.0f, 2.0f, 1.0f, 157 | 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 158 | 1.0f, 1.0f, 1.0f, 1.0f, 1.0f 159 | }; 160 | float h_B[] = { 161 | 0, 0, 0, 162 | 0, 5, 0, 163 | 0, 0, 0 164 | }; 165 | // std::cout<< r/2 * 2 << std::endl; 166 | coutmatrix(h_A, row_A, col_A, "A"); 167 | coutmatrix(h_B, 2*r + 1, 2 * r + 1, "conv_kernel"); 168 | 169 | float* h_C = new float[row_A * col_A];//same size as A 170 | conv2D(h_A, h_B, h_C, row_A, col_A, r); 171 | 172 | 173 | delete[] h_C; 174 | return 0; 175 | } -------------------------------------------------------------------------------- /Part2/CH7/code/convolution3D.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | // |depth 4 | // | 5 | // | 6 | // |________width 7 | // / 8 | // / 9 | // / 10 | // /height 11 | 12 | #define KERNEL_RADIUS 1 13 | __constant__ float const_conv_kernel_3D[(2 * KERNEL_RADIUS + 1) * (2 * KERNEL_RADIUS + 1) * (2 * KERNEL_RADIUS + 1)]; 14 | 15 | #define BLOCK_SIZE 16 16 | 17 | 18 | void coutmatrix3D(const float* mat, int width, int height, int depth, const char* name){ 19 | std::cout << "Matrix " << name << ":\n"; 20 | for (int k = 0; k < depth; k++){ 21 | std::cout << "Layer" << k << ":\n"; 22 | for(int i = 0;i < height; i++){ 23 | for(int j = 0; j < width; j++){ 24 | std::cout << mat[k * height * width + i * width + j] << "\t"; 25 | } 26 | std::cout << "\n"; 27 | } 28 | } 29 | 30 | } 31 | 32 | __global__ void conv3D_basic_boundary_check(const float *A, const float *B, float *C, int width, int height, int depth, int r){ 33 | int outDep = blockIdx.z * blockDim.z + threadIdx.z; 34 | int outHei = blockIdx.y * blockDim.y + threadIdx.y; 35 | int outWid = blockIdx.x * blockDim.x + threadIdx.x; 36 | if (outDep >= depth || outHei >= height || outWid >= width) return; 37 | float Pvalue = 0.0f; 38 | for (int fDep = 0; fDep < 2 * r + 1; fDep++){ 39 | for (int fHei = 0; fHei < 2 * r + 1; fHei++){ 40 | for (int fWid = 0; fWid < 2 * r + 1;fWid++){ 41 | int inDep = outDep - r + fDep; 42 | int inHei = outHei - r + fHei; 43 | int inWid = outWid - r + fWid; 44 | if (inDep >= 0 && inHei >= 0 && inWid >= 0 && inDep < depth && inHei < height && inWid < width){ 45 | Pvalue += A[inDep * height * width + inHei * width + inWid] * B[fDep * (2 * r + 1) * (2 * r + 1) + fHei * (2 * r + 1) + fWid]; 46 | } 47 | } 48 | } 49 | } 50 | C[outDep * height * width + outHei * width + outWid] = Pvalue; 51 | } 52 | 53 | __global__ void conv3D_constant_mem(float *A, float *C, int width, int height, int depth){ 54 | int outDep 
= blockDim.z * blockIdx.z + threadIdx.z; 55 | int outHei = blockDim.y * blockIdx.y + threadIdx.y; 56 | int outWid = blockDim.x * blockIdx.x + threadIdx.x; 57 | if (outDep >= depth || outHei >= height || outWid >= width) return; 58 | float Pvalue = 0.0f; 59 | for (int fDep = 0; fDep < 2 * KERNEL_RADIUS + 1; fDep++){ 60 | for (int fHei = 0; fHei < 2 * KERNEL_RADIUS + 1; fHei++){ 61 | for (int fWid = 0; fWid < 2 * KERNEL_RADIUS + 1;fWid++){ 62 | int inDep = outDep - KERNEL_RADIUS + fDep; 63 | int inHei = outHei - KERNEL_RADIUS + fHei; 64 | int inWid = outWid - KERNEL_RADIUS + fWid; 65 | if (inDep >= 0 && inHei >= 0 && inWid >= 0 && inDep < depth && inHei < height && inWid < width){ 66 | Pvalue += A[inDep * height * width + inHei * width + inWid] * const_conv_kernel_3D[fDep * (2 * KERNEL_RADIUS + 1) * (2 * KERNEL_RADIUS + 1) + fHei * (2 * KERNEL_RADIUS + 1) + fWid]; 67 | } 68 | } 69 | } 70 | } 71 | C[outDep * height * width + outHei * width + outWid] = Pvalue; 72 | } 73 | 74 | __global__ void conv3D_constant_mem_tiled(const float *A, float *C, int width, int height, int depth){ 75 | __shared__ float sharedMem[BLOCK_SIZE + 2 * KERNEL_RADIUS][BLOCK_SIZE + 2 * KERNEL_RADIUS][BLOCK_SIZE + 2 * KERNEL_RADIUS]; 76 | int outDep = blockDim.z * blockIdx.z + threadIdx.z; 77 | int outHei = blockDim.y * blockIdx.y + threadIdx.y; 78 | int outWid = blockDim.x * blockIdx.x + threadIdx.x; 79 | 80 | for (int z = threadIdx.z; z < BLOCK_SIZE + 2 * KERNEL_RADIUS; z += BLOCK_SIZE){ 81 | int globalDep = blockIdx.z * blockDim.z + z - KERNEL_RADIUS; 82 | for (int y = threadIdx.y; y < BLOCK_SIZE + 2 * KERNEL_RADIUS; y += BLOCK_SIZE){ 83 | int globalHei = blockIdx.y * blockDim.y + y - KERNEL_RADIUS; 84 | for (int x = threadIdx.x; x < BLOCK_SIZE + 2 * KERNEL_RADIUS; x += BLOCK_SIZE){ 85 | int globalWid = blockIdx.x * blockDim.x + x - KERNEL_RADIUS; 86 | if (globalDep >=0 && globalHei >= 0 && globalWid >= 0 && globalDep < depth && globalHei < height && globalWid < width){ 87 | sharedMem[z][y][x] = A[globalDep * height * width + globalHei * width + globalWid]; 88 | } else{ 89 | sharedMem[z][y][x] = 0.0f; 90 | } 91 | } 92 | } 93 | } 94 | __syncthreads(); 95 | 96 | if (outDep >= depth || outHei >= height || outWid >= width) return; 97 | float Pvalue = 0.0f; 98 | for (int i = 0; i < 2 * KERNEL_RADIUS + 1; i++){ 99 | for (int j = 0; j < 2 * KERNEL_RADIUS + 1; j++){ 100 | for (int k = 0; k < 2 * KERNEL_RADIUS + 1; k++){ 101 | Pvalue += sharedMem[i + threadIdx.z][j + threadIdx.y][k + threadIdx.x] * const_conv_kernel_3D[i * (2 * KERNEL_RADIUS + 1) * (2 * KERNEL_RADIUS + 1) + j * (2 * KERNEL_RADIUS + 1) + k]; 102 | } 103 | } 104 | } 105 | C[outDep * height * width + outHei * width + outWid] = Pvalue; 106 | } 107 | 108 | void conv3D(const float *A, const float *B, float *C, int width, int height, int depth, int r){ 109 | float *d_A, *d_B, *d_C; 110 | size_t size_A = width * height * depth * sizeof(float); 111 | size_t size_B = (2 * r + 1) * (2 * r + 1) * (2 * r + 1) * sizeof(float); 112 | 113 | cudaMalloc((void**)&d_A, size_A); 114 | cudaMalloc((void**)&d_B, size_B); 115 | cudaMalloc((void**)&d_C, size_A);//size C is the same as the size A 116 | 117 | cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice); 118 | cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice); 119 | 120 | dim3 blockDim(8,8,8); 121 | dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y, (depth + blockDim.z - 1) / blockDim.z); 122 | conv3D_basic_boundary_check<<>>(d_A, d_B, d_C, width, height, depth, r); 123 | cudaMemcpy(C, d_C, 
size_A, cudaMemcpyDeviceToHost); 124 | coutmatrix3D(C, width, height, depth, "C_3D"); 125 | 126 | cudaFree(d_C); 127 | cudaMemcpyToSymbol(const_conv_kernel_3D, B, size_B); 128 | conv3D_constant_mem<<>>(d_A, d_C, width, height, depth); 129 | cudaMemcpy(C, d_C, size_A, cudaMemcpyDeviceToHost); 130 | coutmatrix3D(C, width, height, depth, "C_3D_constant_mem"); 131 | 132 | cudaFree(d_C); 133 | conv3D_constant_mem_tiled<<>>(d_A, d_C, width, height, depth); 134 | cudaMemcpy(C, d_C, size_A, cudaMemcpyDeviceToHost); 135 | coutmatrix3D(C, width, height, depth, "C_3D_constant_mem_tile"); 136 | 137 | cudaFree(d_A); 138 | cudaFree(d_B); 139 | cudaFree(d_C); 140 | } 141 | 142 | int main(){ 143 | const int width_A = 5; 144 | const int heigh_A = 5; 145 | const int depth_A = 5; 146 | const int r = 1; 147 | float h_A_layer[] = { 148 | 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 149 | 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 150 | 1.0f, 2.0f, 3.0f, 2.0f, 1.0f, 151 | 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 152 | 1.0f, 1.0f, 1.0f, 1.0f, 1.0f 153 | }; 154 | const int size_A = width_A * heigh_A * depth_A; 155 | float h_A[size_A]; 156 | for (int d = 0; d < depth_A; d++){ 157 | for (int i = 0;i < width_A * heigh_A; i++){ 158 | h_A[d * heigh_A * width_A + i] = h_A_layer[i]; 159 | } 160 | } 161 | 162 | // float h_B_layer[] = { 163 | // 0, 0, 0, 164 | // 0, 5, 0, 165 | // 0, 0, 0 166 | // }; 167 | // const int size_B = (2 * r + 1) * (2 * r + 1) * (2 * r + 1); 168 | // float h_B[size_B]; 169 | // for (int d = 0; d < 2 * r + 1; d++){ 170 | // for (int i = 0;i < (2 * r + 1) * (2 * r + 1);i++){ 171 | // h_B[d * (2 * r + 1) * (2 * r + 1) + i] = h_B_layer[i]; 172 | // } 173 | // } 174 | 175 | const int size_B = (2 * r + 1) * (2 * r + 1) * (2 * r + 1); 176 | float h_B[size_B]; 177 | for (int i = 0; i < size_B; i++) { 178 | h_B[i] = 0; 179 | } 180 | h_B[(size_B - 1) / 2] = 5; 181 | 182 | 183 | coutmatrix3D(h_A, width_A, heigh_A, depth_A, "A_3D"); 184 | coutmatrix3D(h_B, 2*r + 1, 2 * r + 1, 2 * r + 1, "conv_kernel_3D"); 185 | 186 | // float* h_C = new float[row_A * col_A];//same size as A 187 | float h_C[size_A]; 188 | conv3D(h_A, h_B, h_C, width_A, heigh_A, depth_A, r); 189 | 190 | 191 | // delete[] h_C; 192 | return 0; 193 | } -------------------------------------------------------------------------------- /Part2/CH7/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoliboliWJY/Programming-Massively-Parallel-Processors-A-Hands-on-Approach-4th/ce6a2a4b4071f94ffcbe6aad4d43868782f71d8a/Part2/CH7/main.pdf -------------------------------------------------------------------------------- /Part2/CH7/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{amsmath} 3 | \usepackage{listings} 4 | \usepackage{geometry} 5 | \usepackage{array} 6 | \usepackage{ulem} 7 | \usepackage{float} 8 | \geometry{a4paper, margin=1in} 9 | \begin{document} 10 | 11 | \section*{key points} 12 | For convolution, pay attention on memory access is quiet important. Here, using the constant memory for storing the filter kernel can greatly speed up its efficiency with the function \_\_constant\_\_. As a constant variable, its accessing speed is faster than global variable. 13 | 14 | Then, data can be transferred form host to device with: cudaMemcpyToSymbol(dest, src, size). 
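For example (a minimal sketch; \texttt{FILTER\_RADIUS} and the host array \texttt{F\_h} are illustrative names):

\begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true]
#define FILTER_RADIUS 1
__constant__ float F_c[(2 * FILTER_RADIUS + 1) * (2 * FILTER_RADIUS + 1)];

// host code: copy the filter into constant memory once, before launching kernels
cudaMemcpyToSymbol(F_c, F_h,
                   (2 * FILTER_RADIUS + 1) * (2 * FILTER_RADIUS + 1) * sizeof(float));
\end{lstlisting}

Kernels can then read \texttt{F\_c} directly, with no pointer argument for the filter.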
15 | 16 | Convolution with tiling: 17 | 18 | Halo + tile = reginal data, tile = output data, where halo = filter radius, so that those over tile's data can be accessed form local memory instead of global memory. 19 | 20 | The arithmetic-to-global memory access ratio for the tiled kernel is 21 | \begin{equation*} 22 | \frac{OUT\_TILE\_DIM^2*(2*FILTER\_RADIUS+1)^2*2}{(OUT\_TILE\_DIM+2*FILTER\_RADIUS)^2*4}. 23 | \end{equation*} 24 | 25 | Pay attention on telling difference between Valid convolution(smaller), same convolution(same) and full convolution(larger)! 26 | 27 | \section{} 28 | \subsection{} 29 | 51, calculated form 40+6+5=51. 30 | 31 | \subsection 32 | 8,21,13,20, same as 1. 33 | 34 | \subsection{} 35 | \subsubsection{} 36 | original number. 37 | 38 | \subsubsection{} 39 | All move to left for a number. 40 | 41 | \subsubsection{} 42 | All move to right for a number. 43 | 44 | \subsubsection{} 45 | \sout{calculate the difference between the current number} computes the central difference, which approximates the first derivative of the signal. 46 | 47 | \subsubsection{} 48 | The average of the nearby number. 49 | 50 | 51 | \subsection{} 52 | \subsubsection{} 53 | \sout{\(\frac{M-1}{2}\)} M-1 54 | 55 | \subsubsection{} 56 | \(N\cdot M\) 57 | 58 | \subsubsection{} 59 | \sout{\((N-(M-1))\cdot M+2\cdot \sum_{i=1}^{\frac{M-1}{2}}(i+1)\)} 60 | 61 | 62 | get it from \((N-(M-1))\times M - \frac{(M-1)(M+1)}{4}\) 63 | 64 | which actually is \(N\times M - \frac{(M-1)(M+1)}{4}\) 65 | 66 | \subsection{} 67 | \subsubsection{} 68 | \((N+M-1)^2-N^2\) 69 | 70 | \subsubsection{} 71 | \sout{\(M^2 - N^2\)} \[[(N + M - 1)^2 - N^2] \times M^2\] 72 | 73 | \subsubsection{} 74 | \sout{\(N^2\cdot N^2 - \sum_{i = 1}^{\frac{M-1}{2}}[(N+i)^2-N^2]\)} \[N^2 \times M^2\] 75 | 76 | \subsection{} 77 | \subsubsection{} 78 | \((N_1+M_1-1)(N_2+M_2-1) - N_1\cdot N_2\)\ 79 | 80 | \subsubsection{} 81 | \sout{\(M_1M_2\cdot N_1N_2\)} \((N_1 + M_1 - 1) \times (N_2 + M_2 - 1) \times (M_1 \times M_2)\) 82 | 83 | \subsubsection{} 84 | \sout{\(M_1M_2\cdot N_1N_2 - \sum_{i = 1}^{\frac{M_1-1}{2}}\sum_{i = 1}^{\frac{M_2-1}{2}}[[(N_i+i)(M_2+j)-N_1N_2]]\)} \(N_1 \times N_2 \times (M_1 \times M_2)\) 85 | 86 | \subsection{} 87 | \subsubsection{} 88 | \(\lceil N/T\rceil ^2\) 89 | 90 | \subsubsection{} 91 | \((T+M-1)^2\) 92 | 93 | \subsubsection{} 94 | \(4*(T+M-1)^2\) 95 | 96 | \subsubsection{} 97 | \(\lceil N/T\rceil ^2\),\((T+M-1+halo)^2\),\(4*(T+M-1+halo)^2\) 98 | 99 | \subsection{} 100 | \begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true] 101 | __global__ void convolution_3D_basic_kernel(const float *N, const float *F, float *P, int r, int width, int height, int depth) { 102 | int outCol = blockIdx.x*blockDim.x + threadIdx.x; 103 | int outRow = blockIdx.y*blockDim.y + threadIdx.y; 104 | int outDep = blockIdx.z*blockDim.z + threadIdx.z; 105 | float Pvalue = 0.0f; 106 | // Ensure the thread is within the output bounds 107 | if (outCol < width && outRow < height && outDep < depth) { 108 | for (int fRow = 0; fRow < 2*r+1; fRow++){ 109 | for (int fCol = 0; fCol < 2*r+1; fCol++){ 110 | for (int fDep = 0; fDep < 2*r+1; fDep++){ 111 | int inRow = outRow - r + fRow; 112 | int inCol = outCol - r + fCol; 113 | int inDep = outDep - r + fDep; 114 | if (inRow >= 0 && inRow < height && 115 | inCol >= 0 && inCol < width && 116 | inDep >= 0 && inDep < depth) 117 | { 118 | int filterIdx = fRow * (2*r+1)*(2*r+1)+fCol*(2*r +1)+ fDep; 119 | Pvalue += F[filterIdx] * N[inRow*width*depth +inCol*depth+inDep]; 120 | } 121 | } 122 | } 123 | } 124 | 
int outputIdx = outRow * width * depth + outCol * depth + outDep; 125 | P[outputIdx] = Pvalue; 126 | } 127 | } 128 | \end{lstlisting} 129 | \textbf{Attention: F and P is actually a pointer(a 1D array), so use single indices for accessing it.} 130 | 131 | \subsection{} 132 | \begin{lstlisting}[basicstyle=\small\ttfamily, breaklines = true] 133 | #define MAX_FILTER_SIZE //based on the maximum expected 'r'. 134 | __constant__ float F[MAX_FILTER_SIZE]; 135 | __global__ void convolution_3D_const_mem_kernel(const float *N, float *P, int r, int width, int height, int depth){ 136 | int outCol = blockIdx.x*blockDim.x + threadIdx.x; 137 | int outRow = blockIdx.y*blockDim.y + threadIdx.y; 138 | int outDep = blockIdx.z*blockDim.z + threadIdx.z; 139 | float Pvalue = 0.0f; 140 | if (outCol < width && outRow < height && outDep < depth){ 141 | for (int fRow = 0; fRow < 2*r+1; fRow++){ 142 | for (int fCol = 0; fCol < 2*r+1; fCol++){ 143 | for (int fDep = 0; fDep < 2*r+1; fDep++){ 144 | int inRow = outRow - r + fRow; 145 | int inCol = outCol - r + fCol; 146 | int inDep = outDep - r + fDep; 147 | if (inRow >= 0 && inRow < height && 148 | inCol >= 0 && inCol < width && 149 | inDep >= 0 && inDep < depth) 150 | { 151 | int filterIdx = fRow * (2*r+1)*(2*r+1)+fCol*(2*r +1)+ fDep; 152 | Pvalue += F[filterIdx] * N[inRow*width*depth +inCol*depth+inDep]; 153 | } 154 | } 155 | } 156 | } 157 | P[outRow*width*depth+outCol*depth+outDep] = Pvalue; 158 | } 159 | 160 | } 161 | \end{lstlisting} 162 | 163 | \textbf{Compared to last problem, the variable F changed into a const variable} 164 | 165 | const VS \_\_constant\_\_: 166 | 167 | when data is large and doesn't fit into constant memory, access patterns are non-uniform, or different threads access different data shall use the function const. While \_\_constant\_\_ has faster access when all threads use the same filter coefficients, reduced global memory bandwidth. But it is limited by the size of constant memory and mot suitable for very large filters. 
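A side-by-side sketch of the two approaches (signatures only, mirroring the kernels in the previous two answers):

\begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true]
// Filter passed as a regular (const) global-memory pointer:
// works for filters of any size; reads go through the normal cache hierarchy.
__global__ void convolution_3D_basic_kernel(const float *N, const float *F, float *P,
                                            int r, int width, int height, int depth);

// Filter held in the __constant__ symbol F declared above:
// no filter argument, broadcast-friendly reads, but limited by the size of
// constant memory (typically 64 KB).
__global__ void convolution_3D_const_mem_kernel(const float *N, float *P,
                                                int r, int width, int height, int depth);
\end{lstlisting}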
168 | 169 | \subsection{} 170 | \begin{lstlisting}[basicstyle=\small\ttfamily, breaklines = true] 171 | #define IN_TILE_DIM 32 172 | #define OUT_TILE_DIM ((IN_TILE_DIM) - 2*(FILTER_RADIUS)) 173 | __constant__ float F_c[2*FILTER_RADIUS+1][2*FILTER_RADIUS+1][2*FILTER_RADIUS+1]; 174 | __global__ void convolution_tiled_3D_const_mem_kernel(const float *N, float *P, int width, int height, int depth) { 175 | int col = blockIdx.x*OUT_TILE_DIM + threadIdx.x - FILTER_RADIUS; 176 | int row = blockIdx.y*OUT_TILE_DIM + threadIdx.y - FILTER_RADIUS; 177 | int dep = blockIdx.z*OUT_TILE_DIM + threadIdx.z - FILTER_RADIUS; 178 | __shared__ float N_s[IN_TILE_DIM][IN_TILE_DIM][IN_TILE_DIM]; 179 | if (col < width && row < height && dep < depth){ 180 | N_s[threadIdx.z][threadIdx.y][threadIdx.x] = N[row*width*depth+col*depth+dep]; 181 | }else{ 182 | N_s[threadIdx.z][threadIdx.y][threadIdx.x] = 0; 183 | } 184 | __syncthreads(); 185 | int tileCol = threadIdx.x - FILTER_RADIUS; 186 | int tileRow = threadIdx.y - FILTER_RADIUS; 187 | int tileDep = threadIdx.z - FILTER_RADIUS; 188 | if (col < width && row < height && dep < depth){ 189 | if (tileCol >=0 && tileCol < OUT_TILE_DIM && tileRow >=0 && tileRow=0 && tileDep < OUT_TILE_DIM){ 190 | float Pvalue = 0.0f; 191 | for (int fRow = 0; fRow < 2*FILTER_RADIUS+1; fRow++){ 192 | for (int fCol = 0; fCol < 2*FILTER_RADIUS+1; fCol++){ 193 | for (int fDep = 0; fDep < 2*FILTER_RADIUS+1; fDep++){ 194 | Pvalue += F_c[fRow][fCol][fDep]*N_s[tileRow+fRow][tileCol+fCol][tileDep+fDep]; 195 | } 196 | } 197 | } 198 | P[row*width*depth+col*depth+dep] = Pvalue; 199 | } 200 | } 201 | } 202 | \end{lstlisting} 203 | 204 | The use effect of N\_s: 205 | 206 | Act as original data for a faster reading speed. 207 | 208 | \end{document} -------------------------------------------------------------------------------- /Part2/CH8/main.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BoliboliWJY/Programming-Massively-Parallel-Processors-A-Hands-on-Approach-4th/ce6a2a4b4071f94ffcbe6aad4d43868782f71d8a/Part2/CH8/main.pdf -------------------------------------------------------------------------------- /Part2/CH8/main.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{amsmath} 3 | \usepackage{listings} 4 | \usepackage{geometry} 5 | \usepackage{array} 6 | \usepackage{ulem} 7 | \usepackage{float} 8 | \geometry{a4paper, margin=1in} 9 | \begin{document} 10 | 11 | \section*{key points} 12 | sencil is harder than convolution, since it multiple different pattern. 13 | 14 | register tiling optimizes memory access by minizing index calculations and leveraging faster register storage. 15 | coarening enchangces computational efficiency by enabling threads to handle multiple targets concrrently, thereby increasing parallelism(if per thread is evenly distributed). 16 | 17 | \section{} 18 | \subsection{} 19 | \subsubsection{} 20 | the number is 118*118*118 21 | 22 | \subsubsection{} 23 | \sout{that is 10*10*10} 24 | Effective grid size: 118*118*118, block size:8*8*8. Number of blocks along each dimension:\(\lceil \frac{118}{8}\rceil = 15\). So the total number of thread blocks is 15*15*15. 25 | 26 | \subsubsection{} 27 | \sout{the number is 8*8*8} 28 | Shared memory optimizes memory access but not change the number of thread blocks required for covering the grid. 
29 | So it is the same as b.: 15*15*15
30 | 
31 | \subsubsection{}
32 | \sout{the number is 3*32*32}
33 | \(\lceil \frac{118}{32}\rceil = 4\); for covering the three layers, the answer is 4*4*3.
34 | 
35 | \subsection{}
36 | \subsubsection{}
37 | \sout{32*32*16} 32*32*18; the z-dimension needs to account for 2 additional halo layers.
38 | 
39 | \subsubsection{}
40 | \sout{32*32*3} 32*32*16; in the z-dimension 16 consecutive output planes are calculated together, so 16 layers are processed.
41 | 
42 | \subsubsection{}
43 | For memory accesses, that is 32*32*16*4 bytes. For FLOPs, it is 7 (per point) * 16 (elements/thread) = 112 FLOP.
44 | 
45 | Read operations: 7 reads * 4 bytes; write operations: 1 write * 4 bytes.
46 | 
47 | So in total (28+4=32)*16 = 512 bytes.
48 | 
49 | As a result the answer should be 112/512 $\approx$ 0.21875 FLOP/Byte.
50 | 
51 | \textbf{For allowing significant data reuse.}
52 | 
53 | \subsubsection{}
54 | \sout{32*32*16*4 bytes} This is per thread rather than for the whole block over its lifetime, so the number should be the 3 layers prev, curr and next.
55 | 
56 | \subsubsection{}
57 | 32*32*3*4 bytes
58 | 
59 | The use of register tiling won't affect the amount of shared memory needed.
60 | 
61 | 
62 | 
63 | 
64 | 
65 | 
66 | \end{document}
--------------------------------------------------------------------------------
/Part2/CH9/mian.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BoliboliWJY/Programming-Massively-Parallel-Processors-A-Hands-on-Approach-4th/ce6a2a4b4071f94ffcbe6aad4d43868782f71d8a/Part2/CH9/mian.pdf
--------------------------------------------------------------------------------
/Part2/CH9/mian.tex:
--------------------------------------------------------------------------------
1 | \documentclass{article}
2 | \usepackage{amsmath}
3 | \usepackage{listings}
4 | \usepackage{geometry}
5 | \usepackage{array}
6 | \usepackage{ulem}
7 | \usepackage{float}
8 | \geometry{a4paper, margin=1in}
9 | \begin{document}
10 | 
11 | \section*{key points}
12 | Histogram: for calculating the accumulated counts of data values.
13 | 
14 | Atomic operation: each thread works on a single datum. A read-modify-write sequence is made an undivided operation by locking out other operations, using the function int atomicAdd(int* address, int val); which replaces code like *address++;
15 | 
16 | To speed up memory access, placing data in the last-level cache (shared among SMs) helps decrease latency from hundreds of cycles to tens of cycles (compared with global memory).
17 | 
18 | Privatization: avoid traffic on a single target by changing the target to a private copy in shared memory, thus increasing access speed; moreover, don't forget \_\_syncthreads();
19 | 
20 | Coarsening: committing too many private copies from private to public memory (one per block) may take a lot of time, so coarsening helps decrease the number of private copies. Interleaved vs. non-interleaved partitioning: the former offers improved cache utilization and reduced memory latency due to better coalescing.
21 | 
22 | Aggregation: Consecutive similar data enhances memory access patterns and reduces contention, leading to more effective aggregation. Conversely, randomly sorted data can hinder performance due to poor memory coalescing and increased contention on shared resources.
23 | 
24 | 
25 | \section{}
26 | \subsection{}
27 | \sout{1} To express maximum throughput, represent it in operations per second, which is 10 million operations per second.
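As a quick check (assuming the 100 ns global-memory atomic latency used in the next answer):
\begin{equation*}
\frac{1}{100\,\mathrm{ns}} = \frac{1}{100\times 10^{-9}\,\mathrm{s}} = 10^{7}\ \text{atomic operations per second.}
\end{equation*}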
28 | 29 | \subsection{} 30 | \sout{for about 0.9*1/4ns+0.1*1/100ns * 1e9 per second} 31 | in real process, L2 and global aren't executed pipelined, they actually serialized, it should be thought together: \(\frac{1}{(0.9*4ns)+(0.1*100ns)}=\frac{1}{3.6+10ns}\approx 7.35\times10^7\) operations/second. 32 | 33 | \subsection{} 34 | it is 5*10 million bytes(50MFLOPs/s), the amount of throughput is the number of floating-point operations per second. 35 | 36 | \subsection{} 37 | it should be about \(\frac{1}{1ns}/1.1*5\) 38 | 39 | \subsection{} 40 | d 41 | 42 | the use of this function is: atomicAdd(address, value); 43 | 44 | \subsection{} 45 | \subsubsection{} 46 | \sout{524288/1024 times.} Total atmiuc ioerations is just 524288, instead of the number block. 47 | 48 | \subsubsection{} 49 | 52488/(52488/1024) times. 50 | 51 | \subsubsection{}\ 52 | 52488/[(52488/1024)*4] times. 53 | 54 | 55 | 56 | \end{document} -------------------------------------------------------------------------------- /Part3/CH13/code/q4/merge.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // Define constants for block sizes 8 | #define BLOCK_SIZE 256 9 | #define MAX_CHUNK_SIZE 1024 // Max chunk size for local sorting 10 | 11 | // Forward declarations of device functions 12 | __device__ int countLE(int* arr, int start, int end, int val); 13 | __device__ int findPosition(int* A, int startA, int endA, int* B, int startB, int endB, int target); 14 | 15 | // Local sort kernel - sorts small chunks of data 16 | __global__ void localSort(int* input, int* output, int n, int chunkSize) { 17 | extern __shared__ int sharedMem[]; 18 | int tid = threadIdx.x; 19 | int gid = blockIdx.x * blockDim.x + threadIdx.x; 20 | int chunkStart = blockIdx.x * chunkSize; 21 | 22 | // Load data into shared memory 23 | for (int i = tid; i < chunkSize && (chunkStart + i) < n; i += blockDim.x) { 24 | sharedMem[i] = input[chunkStart + i]; 25 | } 26 | __syncthreads(); 27 | 28 | // Sort data in shared memory using insertion sort (good for small chunks) 29 | for (int i = 1; i < chunkSize && (chunkStart + i) < n; i++) { 30 | int key = sharedMem[i]; 31 | int j = i - 1; 32 | while (j >= 0 && sharedMem[j] > key) { 33 | sharedMem[j + 1] = sharedMem[j]; 34 | j--; 35 | } 36 | sharedMem[j + 1] = key; 37 | __syncthreads(); // Ensure sorting step is complete before continuing 38 | } 39 | 40 | // Write sorted data back to global memory 41 | for (int i = tid; i < chunkSize && (chunkStart + i) < n; i += blockDim.x) { 42 | output[chunkStart + i] = sharedMem[i]; 43 | } 44 | } 45 | 46 | // Parallel merge implementation from Chapter 12 47 | __global__ void parallelMerge(int* A, int* B, int* C, int sizeA, int sizeB) { 48 | int gid = blockIdx.x * blockDim.x + threadIdx.x; 49 | int totalThreads = gridDim.x * blockDim.x; 50 | int totalSize = sizeA + sizeB; 51 | int elemsPerThread = (totalSize + totalThreads - 1) / totalThreads; 52 | int start = min(gid * elemsPerThread, totalSize); 53 | int end = min(start + elemsPerThread, totalSize); 54 | 55 | if (start >= totalSize) return; 56 | 57 | // get start position 58 | int i = max(0, findPosition(A, 0, sizeA, B, 0, sizeB, start)); 59 | int j = max(0, start - i); // ensure j is not negative 60 | 61 | // merge process 62 | int k = start; 63 | while (k < end) { 64 | if (j >= sizeB || (i < sizeA && A[i] <= B[j])) { 65 | C[k] = A[i]; 66 | i++; 67 | } else { 68 | C[k] = B[j]; 69 | j++; 70 | } 71 | k++; 72 | } 73 | } 74 | 75 | // 
Count elements less than or equal to given value 76 | __device__ int countLE(int* arr, int start, int end, int val) { 77 | int left = start; 78 | int right = end; 79 | 80 | while (left < right) { 81 | int mid = (left + right) / 2; 82 | 83 | if (arr[mid] <= val) { 84 | left = mid + 1; 85 | } else { 86 | right = mid; 87 | } 88 | } 89 | 90 | return left - start; 91 | } 92 | 93 | // Binary search to find position - core of parallel merge 94 | __device__ int findPosition(int* A, int startA, int endA, int* B, int startB, int endB, int target) { 95 | // boundary check 96 | if (target <= 0) return startA; 97 | if (target >= (endA - startA) + (endB - startB)) return endA; 98 | 99 | int left = startA; 100 | int right = min(endA, startA + target + 1); // limit search range 101 | 102 | while (left < right) { 103 | int mid = left + (right - left) / 2; 104 | 105 | // A[mid] 106 | int aVal = A[mid]; 107 | 108 | // count elements in B <= aVal 109 | int countB = 0; 110 | int bLeft = startB; 111 | int bRight = endB; 112 | while (bLeft < bRight) { 113 | int bMid = bLeft + (bRight - bLeft) / 2; 114 | if (B[bMid] <= aVal) 115 | bLeft = bMid + 1; 116 | else 117 | bRight = bMid; 118 | } 119 | countB = bLeft - startB; 120 | 121 | // count elements in A <= aVal 122 | int countA = mid - startA + 1; 123 | 124 | // total countA + countB elements <= aVal 125 | if (countA + countB <= target) 126 | left = mid + 1; 127 | else 128 | right = mid; 129 | } 130 | 131 | return left; 132 | } 133 | 134 | // Main sorting function 135 | void mergeSortParallel(int* input, int* output, int n) { 136 | int* d_input; 137 | int* d_output; 138 | int* d_temp; 139 | 140 | // Allocate device memory 141 | cudaMalloc((void**)&d_input, n * sizeof(int)); 142 | cudaMalloc((void**)&d_output, n * sizeof(int)); 143 | cudaMalloc((void**)&d_temp, n * sizeof(int)); 144 | 145 | // Copy input data to device 146 | cudaMemcpy(d_input, input, n * sizeof(int), cudaMemcpyHostToDevice); 147 | 148 | // Phase 1: Local sorting 149 | int chunkSize = MAX_CHUNK_SIZE; 150 | int numChunks = (n + chunkSize - 1) / chunkSize; 151 | int numThreadsPerChunk = min(chunkSize, BLOCK_SIZE); 152 | 153 | localSort<<>>( 154 | d_input, d_output, n, chunkSize); 155 | 156 | // Phase 2: Recursively merge sorted chunks 157 | int* d_in = d_output; 158 | int* d_out = d_temp; 159 | 160 | for (int currSize = chunkSize; currSize < n; currSize *= 2) { 161 | int blocksNeeded = (n + 2 * currSize - 1) / (2 * currSize); 162 | 163 | for (int i = 0; i < blocksNeeded; i++) { 164 | int start = i * 2 * currSize; 165 | int mid = min(start + currSize, n); 166 | int end = min(start + 2 * currSize, n); 167 | 168 | // Use parallel merge from Chapter 12 169 | dim3 gridDim((end - start + BLOCK_SIZE - 1) / BLOCK_SIZE); 170 | dim3 blockDim(BLOCK_SIZE); 171 | 172 | parallelMerge<<>>( 173 | d_in + start, d_in + mid, 174 | d_out + start, mid - start, end - mid); 175 | } 176 | 177 | // Swap input and output arrays 178 | int* temp = d_in; 179 | d_in = d_out; 180 | d_out = temp; 181 | } 182 | 183 | // Copy results back to host 184 | cudaMemcpy(output, d_in, n * sizeof(int), cudaMemcpyDeviceToHost); 185 | 186 | // Free device memory 187 | cudaFree(d_input); 188 | cudaFree(d_output); 189 | cudaFree(d_temp); 190 | } 191 | 192 | // Print a sample of the array contents 193 | void printArraySample(int* arr, int size, const char* label) { 194 | printf("%s: \n", label); 195 | 196 | // Print first 10 elements 197 | printf("First 10 elements: "); 198 | int frontCount = min(10, size); 199 | for (int i = 0; i < frontCount; 
i++) { 200 | printf("%d ", arr[i]); 201 | } 202 | printf("\n"); 203 | 204 | // Print middle 10 elements (if size is sufficient) 205 | if (size > 20) { 206 | printf("Middle 10 elements: "); 207 | int midStart = size / 2 - 5; 208 | for (int i = 0; i < 10; i++) { 209 | printf("%d ", arr[midStart + i]); 210 | } 211 | printf("\n"); 212 | } 213 | 214 | // Print last 10 elements (if size is sufficient) 215 | if (size > 10) { 216 | printf("Last 10 elements: "); 217 | int backCount = min(10, size); 218 | for (int i = size - backCount; i < size; i++) { 219 | printf("%d ", arr[i]); 220 | } 221 | printf("\n"); 222 | } 223 | 224 | printf("\n"); 225 | } 226 | 227 | // Main function 228 | int main() { 229 | const int N = 1024 * 1024; // One million elements 230 | int* input = new int[N]; 231 | int* output = new int[N]; 232 | 233 | // Initialize random number generator 234 | srand(time(NULL)); 235 | 236 | // Initialize array with random data 237 | for (int i = 0; i < N; i++) { 238 | input[i] = rand() % 10000; 239 | } 240 | 241 | // Display input array sample 242 | printf("======= BEFORE SORTING =======\n"); 243 | printArraySample(input, N, "Input array"); 244 | 245 | // Record sort start time 246 | clock_t start_time = clock(); 247 | 248 | // Sort 249 | mergeSortParallel(input, output, N); 250 | 251 | // Record sort end time 252 | clock_t end_time = clock(); 253 | double time_spent = (double)(end_time - start_time) / CLOCKS_PER_SEC; 254 | 255 | // Display sorted result sample 256 | printf("======= AFTER SORTING =======\n"); 257 | printArraySample(output, N, "Sorted array"); 258 | 259 | // Validate sorting result 260 | bool sorted = true; 261 | for (int i = 1; i < N; i++) { 262 | if (output[i] < output[i-1]) { 263 | sorted = false; 264 | printf("Sort error: output[%d]=%d > output[%d]=%d\n", 265 | i-1, output[i-1], i, output[i]); 266 | break; 267 | } 268 | } 269 | 270 | if (sorted) { 271 | printf("Sort successful!\n"); 272 | } 273 | 274 | // Display sorting time 275 | printf("Sorting time: %.4f seconds\n", time_spent); 276 | 277 | // Clean up 278 | delete[] input; 279 | delete[] output; 280 | 281 | return 0; 282 | } -------------------------------------------------------------------------------- /Part3/CH13/code/sort.cu: -------------------------------------------------------------------------------- 1 | // q1 2 | __global__ void radix_sort_iter_memory_coalescing(unsigned int* input, unsigned int* output, unsigned int* bits, unsigned int N, unsigned int iter) { 3 | unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 4 | unsigned int tid = threadIdx.x; 5 | 6 | // shared memory 7 | extern __shared__ unsigned int s_mem[]; 8 | unsigned int* s_data = s_mem; 9 | unsigned int* s_bits = &s_data[blockDim.x]; 10 | 11 | unsigned int key = 0, bit = 0; 12 | if(i < N) { 13 | key = input[i]; 14 | s_data[tid] = key; 15 | bit = (key >> iter) & 1; 16 | s_bits[tid] = bit; 17 | bits[i] = bit; 18 | } 19 | __syncthreads(); 20 | 21 | exclusiveScan(bits, N); 22 | 23 | if(i < N) { 24 | unsigned int numOnesBefore = bits[i], numOnesTotal = bits[N]; 25 | unsigned int dst; 26 | if (bit == 0) { 27 | dst = i - numOnesBefore; 28 | } else { 29 | dst = N - numOnesTotal - (i - numOnesBefore); 30 | } 31 | output[dst] = s_data[tid]; // load data from shared memory 32 | } 33 | } 34 | 35 | // q2 36 | __global__ void radix_sort_iter_multibit(unsigned int* input, unsigned int* output, unsigned int*bits, unsigned int N, unsigned int iter, unsigned int numBits) { 37 | unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 38 | unsigned int tid 
= threadIdx.x; 39 | 40 | // shared memory 41 | extern __shared__ unsigned int s_mem[]; 42 | unsigned int* s_data = s_mem; 43 | unsigned int* s_digits = &s_data[blockDim.x]; 44 | 45 | // mask 46 | unsigned int mask = (1 << numBits) - 1; // when numBits = 2, mask = 0b11 47 | unsigned int digit = 0; 48 | unsigned int key = 0; 49 | 50 | if(i < N) { 51 | key = input[i]; 52 | s_data[tid] = key; 53 | digit = (key >> iter) & mask; // get more digits 54 | s_digits[tid] = digit; 55 | } 56 | __syncthreads(); 57 | 58 | if(i < N) { 59 | unsigned int dst = atomicAdd(&counters[digit], 1); 60 | output[dst] = s_data[tid]; 61 | } 62 | } 63 | 64 | // q3 65 | __global__ void radix_sort_iter_thread_coarsening(unsigned int* input, unsigned int* output, unsigned int* bits, unsigned int N, unsigned int iter, unsigned int numBits, unsigned int elements_per_thread) { 66 | unsigned int i_base = blockIdx.x * blockDim.x + threadIdx.x; 67 | unsigned int tid = threadIdx.x; 68 | 69 | // shared memory 70 | extern __shared__ unsigned int s_mem[]; 71 | unsigned int* s_data = s_mem; 72 | unsigned int* s_digits = &s_data[blockDim.x * elements_per_thread]; // menory size should be elements_per_thread larger 73 | 74 | // mask 75 | unsigned int mask = (1 << numBits) - 1; 76 | 77 | #pragma unroll 78 | for (int e = 0; e < elements_per_thread; e++) { 79 | unsigned int i = i_base + e * blockDim.x; 80 | 81 | if (i < N) { 82 | unsigned int key = input[i]; 83 | s_data[tid + e * blockDim.x] = key; 84 | 85 | unsigned int digit = (key >> iter) & mask; 86 | s_digits[tid + e * blockDim.x] = digit; 87 | 88 | bits[i] = digit; 89 | } 90 | } 91 | __syncthreads(); 92 | 93 | exclusiveScan(bits, N); 94 | 95 | #pragma unroll 96 | for (int e = 0; e < elements_per_thread; e++) { 97 | unsigned int i = i_base + e * blockDim.x; 98 | 99 | if (i < N) { 100 | unsigned int digit = s_digits[tid + e * blockDim.x]; 101 | unsigned int numOnesBefore = bits[i], numOnesTotal = bits[N]; 102 | unsigned int dst; 103 | if (digit == 0) { 104 | dst = i - numOnesBefore; 105 | } else { 106 | dst = N - numOnesTotal - (i - numOnesBefore); 107 | } 108 | output[dst] = s_data[tid + e * blockDim.x]; 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /Part3/CH14/code/sprase_matrix_computation.cu: -------------------------------------------------------------------------------- 1 | // q3 2 | __global__ void computeHistogram(int* rowIdx, int* rowNnz, int nnz) { 3 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 4 | 5 | if (tid < nnz) { 6 | int row = rowIdx[tid]; 7 | atomicAdd(&rowNnz[row], 1); 8 | } 9 | } 10 | 11 | __global__ void exclusiveScan(int* rowNnz, int* rowPtrs, int numRows) { 12 | extern __shared__ int temp[]; 13 | 14 | int tid = threadIdx.x; 15 | int idx = blockIdx.x * blockDim.x + tid; 16 | 17 | if (idx < numRows) { 18 | temp[tid] = rowNnz[idx]; 19 | } else { 20 | temp[tid] = 0; 21 | } 22 | __syncthreads(); 23 | 24 | // prefix sum 25 | for (int stride = 1; stride < blockDim.x; stride *= 2) { 26 | int index = (tid + 1) * stride * 2 - 1; 27 | if (index < blockDim.x) { 28 | temp[index] += temp[index - stride]; 29 | } 30 | __syncthreads(); 31 | } 32 | 33 | if (idx < numRows) { 34 | if (tid > 0) { 35 | rowPtrs[idx] = temp[tid - 1]; 36 | } else { 37 | rowPtrs[idx] = 0; // first row pointer is 0 38 | } 39 | } 40 | } 41 | 42 | __global__ void reorderElements(int* cooRowIdx, int* cooColIdx, float* cooValues, int* csrRowPtrs, int* csrColIdx, float* csrValues, int* rowOffsets, int nnz) { 43 | int tid = blockIdx.x * 
blockDim.x + threadIdx.x; 44 | 45 | if (tid < nnz) { 46 | int row = cooRowIdx[tid]; 47 | int offset = atomicAdd(&rowOffsets[row], 1); 48 | int pos = csrRowPtrs[row] + offset; 49 | 50 | csrColIdx[pos] = cooColIdx[tid]; 51 | csrValues[pos] = cooValues[tid]; 52 | } 53 | } 54 | 55 | // q4 56 | __global__ void spmvELL(int numRows, int maxColsPerRow, int* colIdx, float* values, float* x, float* y) { 57 | int row = blockIdx.x * blockDim.x + threadIdx.x; 58 | 59 | if (row < numRows) { 60 | float sum = 0.0f; 61 | 62 | for (int i = 0; i < maxColsPerRow; i++) { 63 | int idx = i * numRows + row; 64 | int col = colIdx[idx]; 65 | 66 | // ignore elements filled with -1 67 | if (col >= 0) { 68 | sum += values[idx] * x[col]; 69 | } 70 | } 71 | 72 | y[row] = sum; 73 | } 74 | } 75 | 76 | // q5 77 | __global__ void spmvJDS(int numRows, int *rowPerm, int *jdsRowPtrs, int numJdsDiagonals, int *colIdx, float *values, float *x, float *y) { 78 | int tid = blockIdx.x * blockDim.x + threadIdx.x; 79 | 80 | if (tid < numRows) { 81 | float sum = 0.0f; 82 | 83 | for (int j = 0; j < numJdsDiagonals; j++) { 84 | if (tid < jdsRowPtrs[j+1] - jdsRowPtrs[j]) { // this row has an element in the j-th jagged diagonal 85 | int idx = jdsRowPtrs[j] + tid; 86 | int col = colIdx[idx]; 87 | 88 | if (col >= 0) { 89 | sum += values[idx] * x[col]; 90 | } 91 | } 92 | } 93 | 94 | int originalRow = rowPerm[tid]; // map the sorted row back to its original index 95 | y[originalRow] = sum; 96 | } 97 | } -------------------------------------------------------------------------------- /Part3/CH14/main.md: -------------------------------------------------------------------------------- 1 | # Sparse matrix computation 2 | 3 | ## q1 4 | 5 | ### COO 6 | 7 | | idx | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 8 | |:----:|:-:|:-:|:-:|:-:|:-:|:-:|:-:| 9 | | rowIdx | 0 | 0 | 1 | 2 | 2 | 3 | 3 | 10 | | colIdx | 0 | 2 | 2 | 1 | 2 | 0 | 3 | 11 | | value | 1 | 7 | 8 | 4 | 3 | 2 | 1 | 12 | 13 | ### CSR 14 | 15 | | idx | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 16 | |:----:|:-:|:-:|:-:|:-:|:-:|:-:|:-:| 17 | | rowPtrs | 0 | 2 | 3 | 5 | 7 | 18 | | colIdx | 0 | 2 | 2 | 1 | 2 | 0 | 3 | 19 | | value | 1 | 7 | 8 | 4 | 3 | 2 | 1 | 20 | 21 | ### ELL 22 | 23 | | idx | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 24 | |:---:|:-:|:-:|:-:|:-:|:-:|:-:|:-:|:-:| 25 | | colIdx | 0 | 2 | 1 | 0 | 2 | * | 2 | 3 | 26 | | value | 1 | 8 | 4 | 2 | 7 | * | 3 | 1 | 27 | 28 | ### JDS 29 | 30 | | idx | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 31 | |:--------:|:-:|:-:|:-:|:-:|:-:|:-:|:-:| 32 | | iterPtr | 0 | 4 | 7 | | | | | 33 | | colIdx | 0 | 1 | 0 | 2 | 2 | 2 | 3 | 34 | | value | 1 | 4 | 2 | 8 | 7 | 3 | 1 | 35 | 36 | ## q2 37 | 38 | ### COO 39 | 40 | Z * 3 values (the rowIdx, colIdx and value arrays, each of length Z) 41 | 42 | ### CSR 43 | 44 | (m + 1) + 2 * Z 45 | 46 | ### ELL 47 | 48 | The maximum number of nonzeros in a row is not given. 49 | 50 | Let B be that maximum; then the storage is B * m * 2. 51 | 52 | ### JDS 53 | 54 | The maximum number of nonzeros in a row is not given. 55 | 56 | Let B be that maximum; then the storage is B + 2 * Z + m. 57 | 58 | ## q3 - q5 59 | 60 | See Part3\CH14\code\sprase_matrix_computation.cu -------------------------------------------------------------------------------- /cuda/main.cu: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | 3 | __global__ void helloFromGPU() { 4 | printf("Hello World from GPU!\n"); 5 | } 6 | 7 | int main() { 8 | helloFromGPU<<<1, 1>>>(); 9 | cudaDeviceSynchronize(); 10 | return 0; 11 | } 12 | -------------------------------------------------------------------------------- /new/test.tex: -------------------------------------------------------------------------------- 1 
| \documentclass{article} 2 | \usepackage{listings} 3 | \usepackage{geometry} 4 | \usepackage{array} 5 | \usepackage{ulem} 6 | \usepackage{float} 7 | \geometry{a4paper, margin=1in} 8 | \begin{document} 9 | 10 | 11 | \section*{key points} 12 | Blur image: 13 | blur\_size leads to the core of the blurring pixel that accumulates other pixels around it. 14 | 15 | Matrix multiplication: 16 | 17 | inner product: 18 | \begin{equation} 19 | P_{row,col} = \sum M_{row,k}\cdot N_{k,col} for k = 0,1,\cdots Width-1 20 | \end{equation} 21 | 22 | Col-major and row-major: 23 | two different ways of showing a matrix's index. A row-major counts in the same row consecutively (\( r \cdot Cols + c \)), while a col-major counts in columns (\( c \cdot Rows + r \)) . 24 | 25 | 26 | As the table \ref{Matrix} shows, the \textbf{Bold} is for row-major, \textit{Italic} is for col-major. 27 | \begin{table}[H] 28 | \centering 29 | \caption{\bf Detailed matrix index.} 30 | \label{Matrix} 31 | \begin{tabular}{llll} 32 | (0,0)\textbf{0}\textit{0} & (0,1)\textbf{1}\textit{3} & (0,2)\textbf{2}\textit{6} & (0,3)\textbf{3}\textit{9} \\ 33 | (1,0)\textbf{4}\textit{1} & (1,1)\textbf{5}\textit{4} & (1,2)\textbf{6}\textit{7} & (1,3)\textbf{7}\textit{10} \\ 34 | (2,0)\textbf{8}\textit{2} & (2,1)\textbf{9}\textit{5} & (2,2)\textbf{10}\textit{8} & (2,3)\textbf{11}\textit{11} 35 | \end{tabular} 36 | \end{table} 37 | 38 | \section{Solutions} 39 | \subsection{} 40 | (1) 41 | Input matrices are A$(M \times K)$ and B$\$(K \times N)$. 42 | Output matrix is C$(M \times N)$. 43 | 44 | Every thread is responsible for calculating one complete row of the output matrix C. 45 | 46 | 47 | \begin{lstlisting}[basicstyle=\small\ttfamily, breaklines=true] 48 | 49 | __global__ void matmul_row(float* A, float* B, float* C, int M, int N, int K) { 50 | int col = blockIdx.x * blockDim.x + threadIdx.x; 51 | if (col < N) { 52 | for (int row = 0; row < M; ++row) { 53 | float sum = 0; 54 | for (int k = 0; k < K; ++k){ 55 | sum += A[row*K + k] * B[k*N + col]; 56 | } 57 | C[row * N + col] = sum; 58 | } 59 | } 60 | } 61 | \end{lstlisting} 62 | 63 | (2) 64 | 65 | \begin{lstlisting}[breaklines=true] 66 | __global__ void matmul_row(float* A, float* B,float* C, int M,int N,int K){ 67 | int row = blockIdx.x * blockDim.x + threadIdx.x; 68 | if (row < M) { 69 | for (int col = 0; col < N; ++col) { 70 | float sum = 0; 71 | for (int k = 0;k < K; ++k) { 72 | sum += A[row * K + k] * B[k * N + col]; 73 | } 74 | C[row * N + col] = sum; 75 | } 76 | } 77 | } 78 | \end{lstlisting} 79 | 80 | (3) 81 | 82 | For row-wise Kernel A and C have a better memory access, for col-wise Kernel B has a better memory access. 83 | 84 | This shall depend on the matrix dimensions. 85 | 86 | Further consideration: 87 | Both kernels can be further optimized using tiling to improve data locality and cache utilization. 
Shared memory and warp shuffling are further options. 88 | 89 | Explanation: 90 | \subsubsection{Tiling} 91 | \begin{lstlisting}[breaklines=true] 92 | __global__ void matmul_row_tiled(float* A, float* B, float* C, int M, int N, int K) { 93 | // Shared memory for tiles from A and B (assumes #define TILE_WIDTH and a TILE_WIDTH x TILE_WIDTH thread block) 94 | __shared__ float As[TILE_WIDTH][TILE_WIDTH]; 95 | __shared__ float Bs[TILE_WIDTH][TILE_WIDTH]; 96 | 97 | int row = blockIdx.y * TILE_WIDTH + threadIdx.y; 98 | int col = blockIdx.x * TILE_WIDTH + threadIdx.x; 99 | 100 | float sum = 0; 101 | 102 | // Loop over tiles, where K is a multiple of TILE_WIDTH 103 | for (int tile = 0; tile < K / TILE_WIDTH; ++tile) { 104 | // Load tiles from global to shared memory 105 | int tile_k = tile * TILE_WIDTH; 106 | As[threadIdx.y][threadIdx.x] = A[row * K + tile_k + threadIdx.x]; 107 | Bs[threadIdx.y][threadIdx.x] = B[(tile_k + threadIdx.y) * N + col]; 108 | 109 | // Synchronize threads within the block to ensure tiles are loaded 110 | __syncthreads(); 111 | 112 | // Compute partial sum for the current tile 113 | for (int k = 0; k < TILE_WIDTH; ++k) { 114 | sum += As[threadIdx.y][k] * Bs[k][threadIdx.x]; 115 | } 116 | 117 | // Synchronize again before loading the next tile 118 | __syncthreads(); 119 | } 120 | 121 | // Store the final result in the output matrix 122 | if (row < M && col < N) { 123 | C[row * N + col] = sum; 124 | } 125 | } 126 | \end{lstlisting} 127 | 128 | Compare: 129 | 130 | Without tiling: 131 | Each thread calculates one element of the output C, so it must load an entire row of A and an entire column of B from global memory, leading to a high number of global memory accesses. 132 | 133 | With tiling: 134 | Each thread block cooperatively loads tiles of A and B into shared memory and computes a small tile of the output matrix; all threads within the block then reuse this data repeatedly from shared memory. 135 | 136 | Tiling helps decrease the data exchanged between global memory and on-chip memory. 137 | 138 | 139 | 140 | 141 | \subsection{}%2 142 | Each element of the output vector is computed as the dot product of a matrix row with the vector; the host code manages memory allocation, data transfer, kernel invocation, and retrieves the result. 143 | 144 | \begin{lstlisting}[breaklines=true] 145 | #include <iostream> 146 | #include <vector> 147 | #include <cstdlib> 148 | #include <cuda_runtime.h> 149 | // CUDA Kernel for Matrix-Vector Multiplication 150 | __global__ void matrixVectorMulKernel(const float* B, const float* C, float* A, int N) {// input matrix/vector pointers and the number of rows/columns of the square matrix. 151 | int row = blockIdx.x * blockDim.x + threadIdx.x; 152 | if (row < N) {// rows beyond N do no work. 153 | float sum = 0.0f; 154 | for (int j = 0; j < N; ++j) { 155 | sum += B[row * N + j] * C[j];// the dot product of the corresponding row in matrix B and vector C. 156 | } 157 | A[row] = sum; 158 | } 159 | // each thread computes one element of the output vector A. 160 | } 161 | 162 | // Host Function for Matrix-Vector Multiplication 163 | void matrixVectorMul(const float* h_B, const float* h_C, float* h_A, int N) {// host input/output pointers and N. 164 | float *d_B = nullptr, *d_C = nullptr, *d_A = nullptr; 165 | size_t sizeMatrix = N * N * sizeof(float); 166 | size_t sizeVector = N * sizeof(float); 167 | // Allocate device memory 168 | // error checking.
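// Note: the pattern repeated below for every CUDA API call is: capture the
// cudaError_t return value, report cudaGetErrorString(err) when it is not
// cudaSuccess, free any device buffers that were already allocated, and exit.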
169 | cudaError_t err = cudaMalloc((void**)&d_B, sizeMatrix); 170 | if (err != cudaSuccess) { 171 | std::cerr << "Failed to allocate device memory for matrix B (error code " 172 | << cudaGetErrorString(err) << ")!\n"; 173 | exit(EXIT_FAILURE); 174 | } 175 | 176 | err = cudaMalloc((void**)&d_C, sizeVector); 177 | if (err != cudaSuccess) { 178 | std::cerr << "Failed to allocate device memory for vector C (error code " 179 | << cudaGetErrorString(err) << ")!\n"; 180 | cudaFree(d_B); 181 | exit(EXIT_FAILURE); 182 | } 183 | 184 | err = cudaMalloc((void**)&d_A, sizeVector); 185 | if (err != cudaSuccess) { 186 | std::cerr << "Failed to allocate device memory for vector A (error code " 187 | << cudaGetErrorString(err) << ")!\n"; 188 | cudaFree(d_B); 189 | cudaFree(d_C); 190 | exit(EXIT_FAILURE); 191 | } 192 | 193 | // Copy data from host to device 194 | err = cudaMemcpy(d_B, h_B, sizeMatrix, cudaMemcpyHostToDevice); 195 | if (err != cudaSuccess) { 196 | std::cerr << "Failed to copy matrix B from host to device (error code " 197 | << cudaGetErrorString(err) << ")!\n"; 198 | cudaFree(d_B); 199 | cudaFree(d_C); 200 | cudaFree(d_A); 201 | exit(EXIT_FAILURE); 202 | } 203 | 204 | err = cudaMemcpy(d_C, h_C, sizeVector, cudaMemcpyHostToDevice); 205 | if (err != cudaSuccess) { 206 | std::cerr << "Failed to copy vector C from host to device (error code " 207 | << cudaGetErrorString(err) << ")!\n"; 208 | cudaFree(d_B); 209 | cudaFree(d_C); 210 | cudaFree(d_A); 211 | exit(EXIT_FAILURE); 212 | } 213 | 214 | // Launch the CUDA kernel 215 | int threadsPerBlock = 256; 216 | int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; 217 | matrixVectorMulKernel<<>>(d_B, d_C, d_A, N); 218 | 219 | // Check for any kernel launch errors 220 | err = cudaGetLastError(); 221 | if (err != cudaSuccess) { 222 | std::cerr << "Failed to launch matrixVectorMulKernel (error code " 223 | << cudaGetErrorString(err) << ")!\n"; 224 | cudaFree(d_B); 225 | cudaFree(d_C); 226 | cudaFree(d_A); 227 | exit(EXIT_FAILURE); 228 | } 229 | 230 | // Copy the result vector A back to host 231 | err = cudaMemcpy(h_A, d_A, sizeVector, cudaMemcpyDeviceToHost); 232 | 233 | if (err != cudaSuccess) { 234 | std::cerr << "Failed to copy vector A from device to host (error code " 235 | << cudaGetErrorString(err) << ")!\n"; 236 | cudaFree(d_B); 237 | cudaFree(d_C); 238 | cudaFree(d_A); 239 | exit(EXIT_FAILURE); 240 | } 241 | 242 | // Free device memory 243 | cudaFree(d_B); 244 | cudaFree(d_C); 245 | cudaFree(d_A); 246 | } 247 | 248 | int main() { 249 | // Define the size of the matrix and vectors 250 | int N = 1024; // Example size; can be modified as needed 251 | 252 | // Initialize host vectors and matrix 253 | std::vector h_B(N * N, 1.0f); // Initialize all elements to 1.0 254 | std::vector h_C(N, 1.0f); // Initialize all elements to 1.0 255 | std::vector h_A(N, 0.0f); // Output vector 256 | 257 | // Perform matrix-vector multiplication 258 | matrixVectorMul(h_B.data(), h_C.data(), h_A.data(), N); 259 | //A = sum_{j}(B[i][j]*C[i]) 260 | 261 | // Optional: Verify the result (since B and C are all ones(the initial value), A should be filled with N) 262 | bool correct = true; 263 | for (int i = 0; i < N; ++i) { 264 | if (h_A[i] != static_cast(N)) { 265 | correct = false; 266 | std::cerr << "Mismatch at index " << i << ": " << h_A[i] << " != " << N << "\n"; 267 | break; 268 | } 269 | } 270 | 271 | if (correct) { 272 | std::cout << "Matrix-vector multiplication successful. 
All elements are " << N << ".\n"; 273 | } else { 274 | std::cout << "Matrix-vector multiplication failed.\n"; 275 | } 276 | 277 | return 0; 278 | } 279 | \end{lstlisting} 280 | 281 | \subsection{} 282 | \textbf{I made careless mistakes here.} 283 | 284 | a. \sout{32} 512 285 | 286 | The number of threads per block is the product of the block dimensions: 16$\cdot$32 = 512. 287 | The two block dimensions map onto the N and M directions of the 2D matrix. 288 | 289 | b. \sout{16$\cdot$32} 48640 290 | 291 | gridDim.x = (N - 1)/16 + 1 = (300 - 1)/16 + 1 = 19. 292 | 293 | gridDim.y = (M - 1)/32 + 1 = (150 - 1)/32 + 1 = 5. 294 | 295 | So the total number of blocks is 19$\cdot$5 = 95, and the total number of threads is 95*512 = 48640. 296 | 297 | c. \sout{$16\cdot32\cdot[(N-1)/16+1]\cdot[(M-1)/32+1]$} 95, as computed in (b). 298 | 299 | d. 150$\cdot$300 = 45000 300 | 301 | Directly multiply M and N (only row