├── .gitignore
├── PMPP
    ├── data
    │   └── 1.jpg
    ├── CMakeLists.txt
    ├── Histogram.cu
    ├── MergeSort.cu
    ├── Convolutional.cu
    ├── SparseMatrix.cu
    ├── Prefixsum.cu
    └── GraphSearch.cu
├── CMakeLists.txt
└── Readme.md


/.gitignore:
--------------------------------------------------------------------------------
1 | # Project exclude paths
2 | /cmake-build-debug/


--------------------------------------------------------------------------------
/PMPP/data/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Syencil/Programming_Massively_Parallel_Processors/HEAD/PMPP/data/1.jpg


--------------------------------------------------------------------------------
/PMPP/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.5)
 2 | 
 3 | cuda_add_executable(Convolutional Convolutional.cu)
 4 | set_property(TARGET Convolutional PROPERTY FOLDER PMPP)
 5 | target_link_libraries(Convolutional cudart.so  libopencv_core.so libopencv_imgproc.so libopencv_imgcodecs.so)
 6 | 
 7 | cuda_add_executable(Prefixsum Prefixsum.cu)
 8 | set_property(TARGET Prefixsum PROPERTY FOLDER PMPP)
 9 | target_link_libraries(Prefixsum cudart.so)
10 | 
11 | cuda_add_executable(Histogram Histogram.cu)
12 | set_property(TARGET Histogram PROPERTY FOLDER PMPP)
13 | target_link_libraries(Histogram cudart.so)
14 | 
15 | cuda_add_executable(SparseMatrix SparseMatrix.cu)
16 | set_property(TARGET SparseMatrix PROPERTY FOLDER PMPP)
17 | target_link_libraries(SparseMatrix cudart.so)
18 | 
19 | cuda_add_executable(MergeSort MergeSort.cu)
20 | set_property(TARGET MergeSort PROPERTY FOLDER PMPP)
21 | target_link_libraries(MergeSort cudart.so)
22 | 
23 | cuda_add_executable(GraphSearch GraphSearch.cu)
24 | set_property(TARGET GraphSearch PROPERTY FOLDER PMPP)
25 | target_link_libraries(GraphSearch cudart.so)


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.5)
 2 | project(ParallOptimization)
 3 | 
 4 | set_property(GLOBAL PROPERTY USE_FOLDERS on)
 5 | 
 6 | # output
 7 | set(EXECUTABLE_OUTPUT_PATH "${PROJECT_BINARY_DIR}/bin")
 8 | message(STATUS "Project_binary_dir : ${PROJECT_BINARY_DIR}")
 9 | # c++ 11
10 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
11 | 
12 | # find opencv
13 | find_package(OpenCV REQUIRED)
14 | include_directories(${OpenCV_INCLUDE_DIRS})
15 | if(NOT OpenCV_LIBRARY_DIRS)
16 |     set(OpenCV_LIBRARY_DIRS /usr/local/lib)
17 |     message(WARING " Can not find opencv lib. It will use the default path => ${OpenCV_LIBRARY_DIRS}")
18 | endif()
19 | link_directories(${OpenCV_LIBRARY_DIRS})
20 | message(STATUS "OpenCV_INCLUDE_DIRS => ${OpenCV_INCLUDE_DIRS}")
21 | message(STATUS "OpenCV_LIBRARY_DIRS => ${OpenCV_LIBRARY_DIRS}")
22 | 
23 | if(NOT OpenCV_FOUND)
24 |     message(ERROR "OpenCV not found!")
25 | endif(NOT OpenCV_FOUND)
26 | 
27 | # find cuda
28 | find_package(CUDA)
29 | find_package(CUDA REQUIRED)
30 | 
31 | include_directories(/usr/local/cuda/include)
32 | if(NOT CUDA_LIBRARY_DIRS)
33 |     set(CUDA_LIBRARY_DIRS /usr/local/cuda/lib64)
34 |     message(WARING " Can not find CUDA lib. It will use the default path => ${CUDA_LIBRARY_DIRS}")
35 | endif()
36 | link_directories(${CUDA_LIBRARY_DIRS})
37 | message(STATUS "CUDA_INCLUDE_DIRS : ${CUDA_INCLUDE_DIRS}")
38 | message(STATUS "CUDA_LIBRARY_DIRS : ${CUDA_LIBRARY_DIRS}")
39 | 
40 | if (CUDA_FOUND)
41 |     add_subdirectory(PMPP)
42 | else()
43 |     message("CUDA not found!")
44 | endif()


--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
 1 | # Programming Massively Parallel Processors
 2 | ## 介绍
 3 | 此项目为学习《Programming Massively Parallel Processors》时的代码记录，主要是第7~12章的并行计算模式代码。
 4 | 主要包含Convolution, Prefix Sum, Histogram Computation, Sparse Matrix Computation, Merge Sort, Graph Search 6大计算模式。
 5 | 由于书中没有完整代码，所以此项目中代码并不一定是最优解，只是按照书中讲解自我推理得来。
 6 | <br>
 7 | ## 简单笔记
 8 | ### Convolution
 9 | * 卷积核体积小，初始化后不会变，适合使用__constant__保存
10 | * 图像体积大，需要使用tile策略，同一个像素点可能被block中多个thread使用，适合读入__shared__中
11 | * 图像数据读入__shared__中时，尽量保证coalescing的方式读入
12 | * 对于halo cells可以不用特殊处理，因为其他block在将其载入__shared__中时L2 Cache会将其缓存起来，
13 | ### Prefix Sum
14 | * 浮点数计算存在大数吃小数的情况，可以使用Kahan算法进行一定程度的补偿
15 | * 按照Brent Kung和Kogge Stone累加算法，读取数据效率不高，故采用corner tuning的方式，将数据先以coalescing的方式载入__shared__
16 | * 通常情况下Kogge Stone算法直观性更好，而Brent Kung则是在速度和前者差不多的情况下功耗更低
17 | * 对于任意长度的累加序列，可以采用三段法。但是由于要多次读写全局内存，效率十分底下，可以采用Single-Pass的方式。
18 | 起难点在于需要实现block之间的通信和同步，采用全局内存+原子操作解决。
19 | ### Histogram
20 | * 直方图统计需要用到atomicAdd原子操作，而原子操作是否受限于内存读取速率，因为同一个内存地址同时只能进行一个原子操作，效率十分低下。而Cache也能一定程度的缓解这个问题
21 | * 对于超大序列的统计需要用到多个block并行，而太多的请求即使在Cache中也难以解决，故考虑私有化，每个block中先用__shared__进行统计，最后再汇总
22 | * 对于某些统计，例如图像直方图统计中，存在多个连续相同地址的atomicAdd操作请求，故考虑聚合，将多次请求合并为一次，提升效率
23 | * 书中关于Aggregation的代码Figure.9.11有错误，每次累加的位置应当是prev_index坐标的地址。
24 | ### Sparse Matrix
25 | * 稀疏矩阵如果按照密集矩阵存储计算，则会浪费许多存储空间以及计算资源。CSR，ELL，COO为三种常用的存储方式。
26 | * 通常来说CSR存储空间最少，但是对于老式CUDA显卡，存在divergence和non-coalescing。
27 | * ELL采用Padding的方式，以保证每次数据读取的一致性，但是当某一行的非零元素特别多时，整体存储空间就会变得特别大，效率低。
28 | * COO类似于CSR，采用data，col_idx，row_idx来存储，具有非常好的灵活性，但是只有当稀疏性<1/3才能有压缩的性能。
29 | * Hybrid（ELL + COO）是一种不错的选择，将某行特别长的元素用COO存储，剩下的用ELL存储。
30 | * JDS是一种按row划分，并按照非零元素长度进行排序的算法。排序好后适合CUDA按照block划分处理，每一个block中的threads处理的数据长度比较类似。
31 | * 老式CUDA设备上JDS-ELL效果更好，新式CUDA设备上对于对齐的要求比较低，JDS-CSR效果更好
32 | * 通常来说，完全随机=>ELL，完全随机但行方差大=>ELL+COO，近似对角=>JDS，极度稀疏=>COO
33 | ### Merge Sort
34 | * 这一部分主要是讲解合并两个有序序列，也就是归并排序的核心部分。我的代码只实现了non-coalescing方式，使用shared memory的部分实在太复杂了，以后有空再更
35 | * 假设输入A，B，合并之后为C，长度分别为m，n，m + n，那么对于C中任意位置k，必定是从A[i]]或者B[j]得来，并且k = i + j。并且可知A[i-1] <= B[j] && A[i] > B[j-1]
36 | * 由上可知，对于C的任意位置k都可以确定对应的i和j，于是可以将合并算法并行化。
37 | * 由k确定i和j位置的算法称为co_rank，采用二分查找。书中采用控制i和j的low-bound，但是实际上k = i + j我们只需要控制i就可以了，所以我的代码里面采用的是标准的二分查找
38 | * 此时数据读取方式是stride的方式而不是coalescing，故采用shared memory的方式。每一个block采用tiled进行循环解决任意长度的序列。每次先将AB tiled长度数据读入共享内存中，再合并，再读取
39 | * 上述的方法读取数据方式coalescing，但是每次只使用了tiled个数据，还有tiled个数据未被使用又重新读入，整体上来说浪费了一半的带宽。
40 | * 解决此问题的方式是引入一个变量记录每次消耗共享内存之后的Index，下一次迭代时候只读取被消耗长度的数据并填入到已经消耗的部分，此时共享内存类似一个循环队列。
41 | * 上述方法可以完美节省带宽，提升效率，但是代码复杂度会大大提高，并且会大幅增加寄存器使用量
42 | ### Graph Search
43 | * 图搜索中的广度优先遍历BFS可以改写为并行编程模式。其核心部分在于遍历当前队列中所有的节点以便于寻找下一级别节点的部分，可以用多线程完成。
44 | * 由于整体需要用到大量的原子操作，会拖慢程序速度，故可以考虑私有化的方式，而同一warp中仍然存在竞争关系，可以考虑采用3级缓存的方式，即使用sub-queue
45 | * 对压缩的稀疏矩阵CSR遍历时，row_ptr访问不规则，可以用纹理内存解决no-coalescing的问题
46 | * 当cur_frontier队列中的元素少时，启动核函数开销过大，可以采用设置阈值的方式自动选择序列化求解或者并行求解（我代码里面没有加，因为和普通的重复了，而且很简单）
47 | * data load imbalance的问题可以用动态编程的方式解决


--------------------------------------------------------------------------------
/PMPP/Histogram.cu:
--------------------------------------------------------------------------------
  1 | // Created by luozhiwang (luozw1994@outlook.com)
  2 | // Date: 2019/12/27
  3 | 
  4 | #include <cuda.h>
  5 | #include <random>
  6 | 
  7 | static void HandleError(cudaError_t err, const char *file, int line ) {
  8 |     if (err != cudaSuccess) {
  9 |         printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
 10 |                 file, line );
 11 |         exit( EXIT_FAILURE );
 12 |     }
 13 | }
 14 | #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
 15 | 
 16 | void histogram_cpu(const unsigned char *array, unsigned int *hist, unsigned int max_len){
 17 |     int idx;
 18 |     for (int i = 0; i < max_len; ++i){
 19 |         idx = array[i] - 'a';
 20 |         if (0 <= idx && idx < 26){
 21 |             hist[idx] += 1;
 22 |         }
 23 |     }
 24 | }
 25 | 
 26 | bool is_equal(const unsigned int *hist1, const unsigned int *hist2, const int &max_len){
 27 |     for (unsigned int i = 0; i < max_len; ++i){
 28 |         if (hist1[i] != hist2[i]){
 29 |             return false;
 30 |         }
 31 |     }
 32 |     return true;
 33 | }
 34 | 
 35 | // 一些参数
 36 | const int length = 80;
 37 | const int thread_num = 32;
 38 | const int per_thread = 2;
 39 | const int hist_num = 26;
 40 | const int block_num = (length + thread_num * per_thread - 1) / (thread_num * per_thread);
 41 | 
 42 | __device__ unsigned int hist_global[hist_num] = {0};
 43 | 
 44 | __global__ void histogram(const unsigned char *array, unsigned int max_len){
 45 |     __shared__ float hist_ds[hist_num];
 46 |     int pos;
 47 |     int pre = -1;
 48 |     int acc = 0;
 49 |     int tx = threadIdx.x;
 50 |     int bidx = blockIdx.x;
 51 |     int bdx = blockDim.x;
 52 |     int idx = tx + bidx * bdx * per_thread;
 53 |     for (unsigned int i = tx; i < hist_num; i += bdx){
 54 |         hist_ds[i] = 0u;
 55 |     }
 56 |     __syncthreads();
 57 |     for (unsigned int i = idx; i < (bidx+1) * bdx * per_thread && i < max_len; i += bdx){
 58 |         pos = array[i] - 'a';
 59 |         if (pre != pos){
 60 |             if (0 <= pre && pre < hist_num){
 61 |                 atomicAdd(&hist_ds[pre], acc);
 62 |             }
 63 |             acc = 1;
 64 |             pre = pos;
 65 |         }else{
 66 |             acc +=1;
 67 |         }
 68 |     }
 69 |     if (0 <= pre && pre < hist_num){
 70 |         atomicAdd(&hist_ds[pre], acc);
 71 |     }
 72 |     __syncthreads();
 73 |     for (unsigned int i = tx; i < hist_num; i += bdx){
 74 |         atomicAdd(&hist_global[i], hist_ds[i]);
 75 |     }
 76 | }
 77 | 
 78 | int main(int args, char **argv){
 79 |     printf("Block num is %d\nThread num is %d\n",block_num, thread_num);
 80 |     // Definition
 81 |     float elapsed_time;
 82 |     char tmp[26];
 83 |     unsigned char *array_host = new unsigned char[length];
 84 |     unsigned int *hist_host = new unsigned int [hist_num];
 85 |     unsigned int *hist_cpu = new unsigned int [hist_num];
 86 |     unsigned char *array_dev;
 87 |     cudaEvent_t start, stop;
 88 |     HANDLE_ERROR(cudaMalloc((void**)&array_dev, sizeof(char) * length));
 89 |     HANDLE_ERROR(cudaEventCreate(&start));
 90 |     HANDLE_ERROR(cudaEventCreate(&stop));
 91 |     // Init Host ====> Dev
 92 |     for (int i = 0; i < 26; ++i){
 93 |         tmp[i] = 'a' + i;
 94 |     }
 95 |     std::default_random_engine e;
 96 |     std::uniform_int_distribution<int> distribution(0, 26);
 97 |     for (int i = 0; i < length; ++i){
 98 |         array_host[i] = tmp[distribution(e)];
 99 |     }
100 |     HANDLE_ERROR(cudaMemcpy(array_dev, array_host, sizeof(char) * length, cudaMemcpyHostToDevice));
101 |     // launch kernel
102 |     HANDLE_ERROR(cudaEventRecord(start, 0));
103 |     histogram<<<block_num, thread_num>>>(array_dev,  length);
104 |     printf("Histogram \n");
105 |     HANDLE_ERROR(cudaEventRecord(stop, 0));
106 |     // elapsed time
107 |     HANDLE_ERROR(cudaEventSynchronize(stop));
108 |     HANDLE_ERROR(cudaEventElapsedTime(&elapsed_time ,start, stop));
109 |     printf("Elapsed Time is %f \n",elapsed_time);
110 |     // Dev ====> Host
111 |     HANDLE_ERROR(cudaMemcpyFromSymbol(hist_host, hist_global, sizeof(int) * hist_num));
112 |     // verify the output
113 |     histogram_cpu(array_host, hist_cpu, length);
114 |     if (is_equal(hist_host, hist_cpu, hist_num)){
115 |         printf("Answer is Correct\n");
116 |     }else{
117 |         printf("Answer is Wrong\n");
118 |         for (int i = 0; i < hist_num; ++i){
119 |             printf("%d  %d  %d  \n", i, hist_host[i], hist_cpu[i]);
120 |         }
121 |     }
122 |     // Destroy
123 |     HANDLE_ERROR(cudaEventDestroy(start));
124 |     HANDLE_ERROR(cudaEventDestroy(stop));
125 |     HANDLE_ERROR(cudaFree(array_dev));
126 |     free(array_host);
127 |     free(hist_host);
128 |     free(hist_cpu);
129 | }


--------------------------------------------------------------------------------
/PMPP/MergeSort.cu:
--------------------------------------------------------------------------------
  1 | // Created by luozhiwang (luozw1994@outlook.com)
  2 | // Date: 2020/1/3
  3 | 
  4 | #include <cuda.h>
  5 | #include <random>
  6 | 
  7 | const int m = 1000;
  8 | const int n = 1048;
  9 | 
 10 | static void HandleError(cudaError_t err, const char *file, int line ) {
 11 |     if (err != cudaSuccess) {
 12 |         printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
 13 |                 file, line );
 14 |         exit( EXIT_FAILURE );
 15 |     }
 16 | }
 17 | #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
 18 | 
 19 | bool verify_output(int *array1, int *array2, int len){
 20 |     for (int i = 0; i < len; ++i){
 21 |         if (array1[i] != array2[i]){
 22 |             return false;
 23 |         }
 24 |     }
 25 |     return true;
 26 | }
 27 | 
 28 | __device__ __host__
 29 | void merge(int *array1, int len1, int *array2, int len2, int *output){
 30 |     int i{0};
 31 |     int j{0};
 32 |     int k{0};
 33 |     while(i < len1 && j < len2){
 34 |         if (array1[i] <= array2[j]){
 35 |             output[k++] = array1[i++];
 36 |         }else{
 37 |             output[k++] = array2[j++];
 38 |         }
 39 |     }
 40 |     if (i == len1){
 41 |         while (j < len2){
 42 |             output[k++] = array2[j++];
 43 |         }
 44 |     }else{
 45 |         while (i < len1){
 46 |             output[k++] = array1[i++];
 47 |         }
 48 |     }
 49 | }
 50 | 
 51 | // 书中所使用的算法，每次都移动下限
 52 | __device__ __host__
 53 | int co_rank_aux(int k, int *A, int m, int *B, int n){
 54 |     int i = k < m ? k : m ;
 55 |     int j = k - i;
 56 |     int i_low = (k - n) < 0 ? 0 : k - n;
 57 |     int j_low = (k - m) < 0 ? 0 : k - m;
 58 |     int delta;
 59 |     while (true){
 60 |         if(i > 0 && j < n && A[i-1]>B[j]){
 61 |             delta = (i - i_low + 1) >> 1;
 62 |             j_low = j;
 63 |             i -= delta;
 64 |             j += delta;
 65 |         }else if(j > 0 && i < m && A[i] <= B[j-1]){
 66 |             delta = (j - j_low + 1) >> 1;
 67 |             i_low = i;
 68 |             i += delta;
 69 |             j -= delta;
 70 |         }else{
 71 |             break;
 72 |         }
 73 |     }
 74 |     return i;
 75 | }
 76 | 
 77 | // 书中每次用了一个delta来控制，每次移动都是选择移动i或者j的下限，感觉没必要，因为k=i+j，我就直接使用标准的二分查找了
 78 | __device__ __host__
 79 | int co_rank(int k, int *A, int m, int *B, int n){
 80 |     int i_max = k < m - 1? k : m - 1;
 81 |     int i_min = k < n ? 0 : k - n ;
 82 |     while (i_min < i_max){
 83 |         int i = (i_max + i_min + 1) / 2;
 84 |         int j = k - i;
 85 |         if (i > 0 && j < n && A[i - 1] > B[j]){
 86 |             i_max = i - 1;
 87 |         }else if (j > 0 && i < m && A[i] <= B[j - 1]){
 88 |             i_min = i + 1;
 89 |         }else{
 90 |             break;
 91 |         }
 92 |     }
 93 |     return (i_max + i_min + 1) / 2;
 94 | }
 95 | 
 96 | __global__ void merge_co_rank(int *array1, int m, int *array2, int n, int *output){
 97 |     int tid = threadIdx.x + blockDim.x * blockIdx.x;
 98 |     int section_size = (m + n - 1) / (blockDim.x * gridDim.x) + 1;
 99 |     int start_k = tid * section_size;
100 |     int end_k = min((tid + 1) * section_size, m + n);
101 |     int start_i = co_rank(start_k, array1, m, array2, n);
102 |     int end_i = co_rank(end_k, array1, m, array2, n);
103 |     int start_j = start_k - start_i;
104 |     int end_j = end_k - end_i;
105 |     merge(&array1[start_i], end_i - start_i, &array2[start_j], end_j - start_j, &output[start_k]);
106 | }
107 | 
108 | void show(int *array, int num, std::string str=""){
109 |     printf("%s\n", str.c_str());
110 |     for(int i = 0; i < num; ++i){
111 |         printf("%d ", array[i]);
112 |     }
113 |     printf("\n");
114 | }
115 | 
116 | void init_order(int *array, int num, int seed = 1){
117 |     std::default_random_engine e;
118 |     e.seed(seed);
119 |     std::uniform_real_distribution<float> prob(0, 1);
120 |     int i = 0;
121 |     int count = 0;
122 |     while (i < num){
123 |         if (prob(e) < 0.5){
124 |             array[i++] = count;
125 |         }
126 |         ++count;
127 |     }
128 | }
129 | 
130 | int main(int args, char **argv){
131 |     int *array1 = new int [m];
132 |     int *array2 = new int [n];
133 |     int *merge_cpu = new int [m + n];
134 |     int *output_cpu = new int [m + n];
135 | 
136 |     init_order(array1, m, 1);
137 |     init_order(array2, n,2);
138 | 
139 |     int *array1_dev, *array2_dev, *output_dev;
140 | 
141 |     cudaEvent_t start, end;
142 |     HANDLE_ERROR(cudaEventCreate(&start));
143 |     HANDLE_ERROR(cudaEventCreate(&end));
144 | 
145 |     HANDLE_ERROR(cudaMalloc((void**)&array1_dev, sizeof(int) * m));
146 |     HANDLE_ERROR(cudaMalloc((void**)&array2_dev, sizeof(int) * n));
147 |     HANDLE_ERROR(cudaMalloc((void**)&output_dev, sizeof(int) * (m + n)));
148 |     HANDLE_ERROR(cudaMemcpy(array1_dev, array1,  sizeof(int) * m, cudaMemcpyHostToDevice));
149 |     HANDLE_ERROR(cudaMemcpy(array2_dev, array2,  sizeof(int) * n, cudaMemcpyHostToDevice));
150 | 
151 |     dim3 grid(2);
152 |     dim3 block(16);
153 |     merge(array1, m, array2, n, merge_cpu);
154 | 
155 |     HANDLE_ERROR(cudaEventRecord(start, 0));
156 |     merge_co_rank<<<grid, block>>>(array1_dev, m, array2_dev, n, output_dev);
157 | //    merge_co_rank<<<grid, block>>>(array2_dev, n, array1_dev, m, output_dev);
158 |     HANDLE_ERROR(cudaEventRecord(end, 0));
159 |     HANDLE_ERROR(cudaEventSynchronize(end));
160 |     float elapsed_time;
161 |     HANDLE_ERROR(cudaEventElapsedTime(&elapsed_time, start, end));
162 |     printf("Elapsed Time is %f \n",elapsed_time);
163 | 
164 |     show(array1, m,"array1 ===>" );
165 |     show(array2, n,"array2 ===>");
166 | 
167 |     HANDLE_ERROR(cudaMemcpy(output_cpu, output_dev, sizeof(int) * (m+n), cudaMemcpyDeviceToHost));
168 |     if (verify_output(output_cpu, merge_cpu, m + n)){
169 |         printf("Answer is Correct\n");
170 |     } else{
171 |         printf("Answer is Wrong\n");
172 |         show(merge_cpu, m+n, "output_cpu ===>");
173 |         show(output_cpu, m+n, "output_device ===>");
174 | 
175 |     }
176 | 
177 |     delete []array1;
178 |     delete []array2;
179 |     delete []output_cpu;
180 |     delete []merge_cpu;
181 |     HANDLE_ERROR(cudaFree(array1_dev));
182 |     HANDLE_ERROR(cudaFree(array2_dev));
183 |     HANDLE_ERROR(cudaFree(output_dev));
184 |     HANDLE_ERROR(cudaEventDestroy(start));
185 |     HANDLE_ERROR(cudaEventDestroy(end));
186 | 
187 | 
188 | }


--------------------------------------------------------------------------------
/PMPP/Convolutional.cu:
--------------------------------------------------------------------------------
  1 | // Created by luozhiwang (luozw1994@outlook.com)
  2 | // Date: 2019/12/23
  3 | 
  4 | #include <cuda.h>
  5 | #include <cuda_runtime.h>
  6 | #include <vector>
  7 | #include <memory>
  8 | #include <opencv2/core/core.hpp>
  9 | #include <opencv2/imgproc/imgproc.hpp>
 10 | #include <opencv2/imgcodecs.hpp>
 11 | #include <iostream>
 12 | 
 13 | // 此项目实现多通道卷积，主要涉及const memory，shared memory，cache，corner tuning策略，coalescing读取
 14 | // 但是由于水平有限，CHW的一维模式coalescing读取到共享内存暂时不知道怎么处理
 15 | 
 16 | // 需要的一些函数
 17 | static void HandleError(cudaError_t err, const char *file, int line ) {
 18 |     if (err != cudaSuccess) {
 19 |         printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
 20 |                 file, line );
 21 |         exit( EXIT_FAILURE );
 22 |     }
 23 | }
 24 | #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
 25 | 
 26 | cv::Mat ImageRead(const std::string &image_path, const int &height, const int &width){
 27 |     cv::Mat original_image = cv::imread(image_path, cv::IMREAD_COLOR);
 28 |     cv::Mat resized_image(height, width, CV_8UC3);
 29 |     cv::resize(original_image, resized_image, cv::Size(width, height));
 30 |     resized_image.convertTo(resized_image, CV_32FC3);
 31 |     return resized_image;
 32 | }
 33 | 
 34 | // 定义常量内存 以及一些常量参数
 35 | const int mask_width = 3;
 36 | const int image_channel = 3;
 37 | const int image_height = 448;
 38 | const int image_width = 448;
 39 | const int thread_num = 32;
 40 | const int tiled_width = thread_num;
 41 | 
 42 | __constant__ float mask_device[image_channel * mask_width *mask_width];
 43 | 
 44 | // 定义kernel
 45 | __global__ void convolutional_2D(float *image, float *image_conv, const int image_h, const int image_w){
 46 | 
 47 |     int tx = threadIdx.x;
 48 |     int ty = threadIdx.y;
 49 |     int tidx = threadIdx.x + blockDim.x * blockIdx.x;
 50 |     int tidy = threadIdx.y + blockDim.y * blockIdx.y;
 51 |     int timg = tidx + tidy * image_w;
 52 |     // 3通道图像 以blockDim为tile大小 CHW 读入shared memory
 53 |     __shared__ float image_ds[image_channel * tiled_width * tiled_width];
 54 |     // not coalescing =====> total 0.046s
 55 |     for (int c = 0; c < image_channel; ++c){
 56 |         if (0 <= tidx && tidx < image_w && 0 <= tidy && tidy <=image_h){
 57 |             image_ds[(tx + ty * tiled_width) * 3 + c] = image[timg * 3 + c ];
 58 |         }else{
 59 |             image_ds[(c * tiled_width * tiled_width) + tx + ty * tiled_width ] = 0.0f;
 60 |         }
 61 |     }
 62 |     __syncthreads();
 63 |     // 计算
 64 |     float output = 0.0f;
 65 |     int tile_x_start = blockIdx.x * blockDim.x;
 66 |     int tile_x_end = (blockIdx.x + 1) * blockDim.x;
 67 |     int tile_y_start = blockIdx.y * blockDim.y;
 68 |     int tile_y_end = (blockIdx.y + 1) * blockDim.y;
 69 |     int halo = mask_width / 2;
 70 | 
 71 |     for (int i = 0; i < mask_width; ++i){
 72 |         int x_idx = tidx - halo + i;
 73 |         for (int j = 0; j < mask_width; ++j){
 74 |             int y_idx = tidy - halo + j;
 75 |             for (int c=0; c<image_channel; ++c){
 76 |                 if (0 <= x_idx && x_idx < image_w && 0 <= y_idx && y_idx <=image_h){
 77 |                     if (tile_x_start <= x_idx && x_idx < tile_x_end && tile_y_start <= y_idx && y_idx < tile_y_end){
 78 |                         output += mask_device[c * mask_width * mask_width + j * mask_width + i] * image_ds[(((ty - halo + j) * tiled_width + tx - halo + i)) * 3 + c] ;
 79 |                     }else{
 80 |                         output += mask_device[c * mask_width * mask_width + j * mask_width + i] * image[(x_idx + y_idx * image_w) * image_channel + c] ;
 81 |                     }
 82 |                 }
 83 |             }
 84 |         }
 85 |     }
 86 |     if (0 <= tidx && tidx < image_w && 0 <= tidy && tidy <=image_h){
 87 |         image_conv[timg] = output;
 88 |     }
 89 | }
 90 | 
 91 | 
 92 | int main(int args, char **argv){
 93 |     // 声明变量
 94 |     float *mask_host = new float[mask_width*mask_width*image_channel];
 95 |     float *image_device, *image_output_device;
 96 |     cudaEvent_t start;
 97 |     cudaEvent_t end;
 98 | 
 99 |     // 初始化事件
100 |     HANDLE_ERROR(cudaEventCreate(&start));
101 |     HANDLE_ERROR(cudaEventCreate(&end));
102 | 
103 |     // 初始化const memory
104 |     for (int i = 0; i < mask_width*mask_width*image_channel; ++i){
105 |         mask_host[i] = 1.0/(mask_width*mask_width*image_channel);
106 |     }
107 | 
108 |     HANDLE_ERROR(cudaMemcpyToSymbol(mask_device, mask_host, image_channel * mask_width * mask_width * sizeof(float)));
109 | 
110 |     // 读取图像数据，载入Device中
111 |     std::string image_path = "./PMPP/data/1.jpg";
112 |     cv::Mat image_host = ImageRead(image_path, image_height, image_width);
113 | 
114 | 
115 |     HANDLE_ERROR(cudaMalloc((void**)&image_device, sizeof(float)*image_host.channels() * image_host.rows * image_host.cols));
116 |     HANDLE_ERROR(cudaMalloc((void**)&image_output_device, sizeof(float) * image_host.rows * image_host.cols));
117 |     HANDLE_ERROR(cudaMemcpy(image_device, image_host.data, sizeof(float)*image_host.channels() * image_host.rows * image_host.cols, cudaMemcpyHostToDevice));
118 | 
119 |     // 开始记录时间
120 |     HANDLE_ERROR(cudaEventRecord(start, 0));
121 | 
122 |     // 启动核函数，计算block所需要数量
123 |     int blockDim_x = (image_width+thread_num-1) / thread_num;
124 |     int blockDim_y = (image_height+thread_num-1) / thread_num;
125 |     dim3 block(thread_num, thread_num);
126 |     dim3 grid(blockDim_x, blockDim_y);
127 |     convolutional_2D<<<grid,block>>>(image_device, image_output_device, image_height, image_width);
128 |     std::cout<<"convolution"<<std::endl;
129 | 
130 |     // 读取计算时间
131 |     HANDLE_ERROR(cudaEventRecord(end, 0));
132 |     float elapsed_time;
133 |     HANDLE_ERROR(cudaEventSynchronize(end));
134 | 
135 |     HANDLE_ERROR(cudaEventElapsedTime(&elapsed_time, start, end));
136 |     std::cout<< "Execution time is "<<elapsed_time << std::endl;
137 | 
138 |     // 将数据读取出来
139 |     // 验证计算结果
140 |     cv::Mat render_image = cv::Mat::ones(image_height, image_width, CV_32FC1);
141 |     HANDLE_ERROR(cudaMemcpy(render_image.data, image_output_device, sizeof(float)*image_host.rows * image_host.cols, cudaMemcpyDeviceToHost));
142 |     cv::imwrite("./PMPP/data/render.jpg", render_image);
143 | 
144 |     // destory
145 |     HANDLE_ERROR(cudaEventDestroy(start));
146 |     HANDLE_ERROR(cudaEventDestroy(end));
147 |     HANDLE_ERROR(cudaFree(image_device));
148 |     HANDLE_ERROR(cudaFree(image_output_device));
149 |     return 0;
150 | }


--------------------------------------------------------------------------------
/PMPP/SparseMatrix.cu:
--------------------------------------------------------------------------------
  1 | // Created by luozhiwang (luozw1994@outlook.com)
  2 | // Date: 2020/1/2
  3 | 
  4 | #include <cuda.h>
  5 | #include <vector>
  6 | #include <random>
  7 | 
  8 | // 此章节主要是关于稀疏矩阵计算，对应不同类型的稀疏矩阵有不同的存储格式。
  9 | // 主要是介绍为主，没什么代码。此处就是Dense-Matrix转CSR，ELL，COO格式
 10 | 
 11 | class Matrix{
 12 | public:
 13 |     int row;
 14 |     int column;
 15 |     int num;
 16 |     std::vector<std::vector<float>> data;
 17 | 
 18 |     Matrix(const std::vector<std::vector<float>> &data){
 19 |         this->row = data.size();
 20 |         this->column = data[0].size();
 21 |         for (int r = 0; r < data.size(); ++r){
 22 |             std::vector<float> tmp;
 23 |             for (int c = 0; c < data[0].size(); ++c){
 24 |                 tmp.push_back(data[r][c]);
 25 |             }
 26 |             this->data.push_back(tmp);
 27 |         }
 28 |     }
 29 | 
 30 |     void show(){
 31 |         printf(" ===================   Origin Matrix   ===================>\n");
 32 |         for (int r = 0; r < this->row; ++r){
 33 |             for(int c = 0; c < this->column; ++c){
 34 |                 printf("%.3f ", data[r][c]);
 35 |             }
 36 |             printf("\n");
 37 |         }
 38 |         printf("\n");
 39 |     }
 40 | };
 41 | 
 42 | class CSR{
 43 | public:
 44 |     int column;
 45 |     int row;
 46 |     std::vector<int> col_idx;
 47 |     std::vector<int> row_ptr;
 48 |     std::vector<float> data;
 49 | 
 50 |     CSR(const Matrix &matrix){
 51 |         this->column =  matrix.data[0].size();
 52 |         this->row = matrix.data.size();
 53 | 
 54 |         int count = 0;
 55 |         row_ptr.push_back(0);
 56 |         for (int r = 0; r < this->row; ++r){
 57 |             for (int c = 0; c < this->column; ++c){
 58 |                 float tmp = matrix.data[r][c];
 59 |                 if (tmp != 0){
 60 |                     ++count;
 61 |                     data.push_back(tmp);
 62 |                     col_idx.push_back(c);
 63 |                 }
 64 |             }
 65 |             row_ptr.push_back(count);
 66 |         }
 67 |     }
 68 | 
 69 |     void show(){
 70 |         printf(" ===================   CSR   ===================>\n");
 71 |         printf("CSR data ===> ");
 72 |         for (int i = 0; i < data.size(); ++i){
 73 |             printf("%.3f ", data[i]);
 74 |         }
 75 |         printf("\nCSR col_idx ===> ");
 76 |         for (int i = 0; i < col_idx.size(); ++i){
 77 |             printf("%d ", col_idx[i]);
 78 |         }
 79 |         printf("\nCSR row_ptr ===> ");
 80 |         for (int i = 0; i < row_ptr.size(); ++i){
 81 |             printf("%d ", row_ptr[i]);
 82 |         }
 83 |         printf("\n\n");
 84 |     }
 85 | };
 86 | 
 87 | class COO{
 88 | public:
 89 |     int column;
 90 |     int row;
 91 |     std::vector<int> col_idx;
 92 |     std::vector<int> row_idx;
 93 |     std::vector<float> data;
 94 | 
 95 |     COO(const Matrix &matrix){
 96 |         this->column = matrix.column;
 97 |         this->row = matrix.row;
 98 | 
 99 |         for (int r = 0; r < this->row; ++r){
100 |             for (int c = 0; c < this->column; ++c){
101 |                 float tmp = matrix.data[r][c];
102 |                 if (tmp != 0){
103 |                     data.push_back(tmp);
104 |                     col_idx.push_back(c);
105 |                     row_idx.push_back(r);
106 |                 }
107 |             }
108 |         }
109 |     }
110 | 
111 |     void show(){
112 |         printf(" ===================   COO   ===================>\n");
113 |         printf("COO data ===> ");
114 |         for (int i = 0; i < data.size(); ++i){
115 |             printf("%.3f ", data[i]);
116 |         }
117 |         printf("\nCOO col_idx ===> ");
118 |         for (int i = 0; i < col_idx.size(); ++i){
119 |             printf("%d ", col_idx[i]);
120 |         }
121 |         printf("\nCOO row_ptr ===> ");
122 |         for (int i = 0; i < row_idx.size(); ++i){
123 |             printf("%d ", row_idx[i]);
124 |         }
125 |         printf("\n\n");
126 |     }
127 | };
128 | 
129 | class ELL{
130 | public:
131 |     std::vector<std::vector<float>> data;
132 |     std::vector<std::vector<int>> col_idx;
133 | 
134 |     ELL(const Matrix &matrix){
135 |         int max_len = 0;
136 |         for (int r = 0; r < matrix.row; ++r){
137 |             std::vector<int> tmp_col;
138 |             std::vector<float> tmp_data;
139 |             for (int c = 0; c < matrix.column; ++c){
140 |                 float tmp = matrix.data[r][c];
141 |                 if (tmp != 0){
142 |                     printf("%d ", c);
143 |                     tmp_col.push_back(c);
144 |                     tmp_data.push_back(tmp);
145 |                 }
146 |             }
147 |             if(max_len < tmp_data.size()){
148 |                 max_len = tmp_data.size();
149 |             }
150 |             data.push_back(tmp_data);
151 |             col_idx.push_back(tmp_col);
152 |         }
153 |         for (int r = 0; r <  data.size(); ++r){
154 |             for (int c = data[r].size(); c < max_len; ++c){
155 |                 data[r].push_back(0);
156 |                 col_idx[r].push_back(0);
157 |             }
158 |         }
159 | 
160 |     }
161 | 
162 |     void show(){
163 |         printf(" ===================   ELL   ===================>\n");
164 |         for (int r = 0; r < data.size(); ++r){
165 |             for (int c = 0; c < data[0].size(); ++c){
166 |                 printf("%.3f ", data[r][c]);
167 |             }
168 |             printf("       ");
169 |             for (int c = 0; c < col_idx[0].size(); ++c){
170 |                 printf("%d ", col_idx[r][c]);
171 | //                printf("%d ", c);
172 |             }
173 |             printf("\n");
174 |         }
175 |         printf("\n");
176 |     }
177 | };
178 | 
179 | const int ROW = 10;
180 | const int COL = 10;
181 | 
182 | int main(int args, char **argv){
183 |     // 构建稀疏矩阵
184 |     std::default_random_engine e;
185 |     std::uniform_real_distribution<float > probability(0, 1);
186 |     std::uniform_real_distribution<float > number(0, 10);
187 |     std::vector<std::vector<float>> data;
188 |     for (int i=0; i<ROW; ++i){
189 |         std::vector<float> tmp;
190 |         for (int j = 0; j < COL; ++j){
191 |             if(probability(e) < 0.1){
192 |                 tmp.push_back(number(e));
193 |             }else{
194 |                 tmp.push_back(0);
195 |             }
196 |         }
197 |         data.push_back(tmp);
198 |     }
199 |     Matrix matrix{data};
200 |     matrix.show();
201 |     CSR csr{matrix};
202 |     csr.show();
203 |     COO coo{matrix};
204 |     coo.show();
205 |     ELL ell(matrix);
206 |     ell.show();
207 | 
208 | }


--------------------------------------------------------------------------------
/PMPP/Prefixsum.cu:
--------------------------------------------------------------------------------
  1 | // Created by luozhiwang (luozw1994@outlook.com)
  2 | // Date: 2019/12/25
  3 | 
  4 | #include <cuda.h>
  5 | #include <random>
  6 | #include <iostream>
  7 | 
  8 | // 此项目分为两个部分，kogge_stone进行少量数据的prefix sum，brent_kung进行任意规模数据量的prefix sum
  9 | // kogge_stone和cpu计算部分使用kahan浮点数累加算法提升精度
 10 | // brent_kung被用在任意规模数量的single-pass prefix sum算法中。并利用global memory和atomic operation以及__threadfence()实现block之间的通信。
 11 | 
 12 | // CPU进行计算的函数以及验证结果函数
 13 | static void HandleError(cudaError_t err, const char *file, int line ) {
 14 |     if (err != cudaSuccess) {
 15 |         printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
 16 |                 file, line );
 17 |         exit( EXIT_FAILURE );
 18 |     }
 19 | }
 20 | #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
 21 | 
 22 | template <typename T>
 23 | void prefixsum_cpu( T *array,  T *output, int length){
 24 |     T sum = 0;
 25 |     for (int i =0; i < length; ++i){
 26 |         sum += array[i];
 27 |         output[i] = sum;
 28 |     }
 29 | }
 30 | 
 31 | template <typename T>
 32 | void prefixsum_cpu_kahan( T *array,  T *output, int length){
 33 |     T sum = 0;
 34 |     float c = 0;
 35 |     float tmp ;
 36 |     for (int i =0; i < length; ++i){
 37 |         tmp = array[i] - c;
 38 |         c = sum + tmp - sum - tmp;
 39 |         sum += tmp;
 40 |         output[i] = sum;
 41 |     }
 42 | }
 43 | 
 44 | template <class T>
 45 | bool is_equale(const T *array1, const T *array2, int length){
 46 |     for (int i = 0; i< length; ++i){
 47 |         if(abs(array1[i] - array2[i])> 0.0001){
 48 |             return false;
 49 |         }
 50 |     }
 51 |     return true;
 52 | }
 53 | 
 54 | // 一些参数设置
 55 | const unsigned int length = 2048;
 56 | const unsigned int section = 32;
 57 | const int thread_num = section;
 58 | const int block_num = (length + thread_num - 1) / thread_num;
 59 | 
 60 | 
 61 | __device__ float aux[block_num] = {0};
 62 | __device__ int flags[block_num + 1] = {1};
 63 | 
 64 | //length不大 可以用一个block处理
 65 | __global__ void  kogge_stone(float *array, float *output, const unsigned int max_len){
 66 |     extern __shared__ float array_ds[];
 67 |     // 读入shared memory
 68 |     int tx = threadIdx.x;
 69 |     if (tx < max_len){
 70 |         array_ds[tx] = array[tx];
 71 |     }
 72 | 
 73 |     // Kahan浮点数加法
 74 |     float c = 0.0;
 75 |     for (int stride = 1; stride < max_len; stride *= 2){
 76 |         __syncthreads();
 77 |         if (tx >= stride && tx < max_len){
 78 |             float tmp = array_ds[tx] - c;
 79 |             c = array_ds[tx- stride] + tmp - array_ds[tx- stride] - tmp;
 80 |             array_ds[tx] = tmp + array_ds[tx - stride];
 81 |         }
 82 |     }
 83 |     if (tx < max_len){
 84 |         output[tx] = array_ds[tx];
 85 |     }
 86 | }
 87 | 
 88 | // 假设length 非常大 shared memory都存不下， 此时只处理block里面的数据，使用aux数组
 89 | __global__ void brent_kung(float *array, float *output, const unsigned int max_len){
 90 |     extern __shared__ float array_ds[];
 91 |     // 读入shared memory
 92 |     int bidx = blockIdx.x;
 93 |     int bdx = blockDim.x;
 94 |     int tx = threadIdx.x;
 95 |     int idx = 2 * bidx * bdx + tx;
 96 |     int pidx;
 97 | 
 98 |     if (idx < max_len){
 99 |         array_ds[tx] = array[idx];
100 |     }
101 |     if (idx + bdx < max_len){
102 |         array_ds[tx + bdx] = array[idx + bdx];
103 |     }
104 | 
105 |     // stage 1
106 |     for (unsigned int stride = 1; stride < max_len ; stride*=2){
107 |         __syncthreads();
108 |         pidx = (tx + 1) * stride * 2 - 1;
109 |         if ( pidx < section ){
110 |             array_ds[pidx] += array_ds[pidx - stride];
111 |         }
112 |     }
113 | 
114 |     // stage 2 reversed tree
115 |     for (unsigned int stride = section / 2; stride > 0; stride/=2){
116 |         __syncthreads();
117 |         pidx = (tx + 1) * stride * 2 - 1;
118 |         if ( pidx + stride < section){
119 |             array_ds[pidx + stride] += array_ds[pidx];
120 |         }
121 |     }
122 |     __syncthreads();
123 | 
124 |     // 进行block间通信
125 |     __shared__ float val;
126 |     if (tx == bdx - 1){
127 |         while (atomicAdd(&flags[bidx], 0) == 0){
128 | 
129 |         }
130 |         val = aux[bidx];
131 |         aux[bidx + 1] = val + array_ds[section - 1];
132 |         // 保证在执行atomicAdd之前aux数组更新完成了
133 |         __threadfence();
134 |         atomicAdd(&flags[bidx + 1], 1);
135 |     }
136 |     __syncthreads();
137 |     idx = 2 * bidx * bdx + tx;
138 |     if (idx < max_len){
139 |         output[idx] = array_ds[tx] + val ;
140 |     }
141 |     if (idx + bdx < max_len){
142 |         output[idx+ bdx] = array_ds[tx + bdx] + val;
143 |     }
144 | 
145 | }
146 | 
147 | 
148 | int main(int args, char **argv){
149 | 
150 |     printf("Block num is %d\nThread num is %d\n",block_num, thread_num);
151 | 
152 |     // 声明
153 |     float *array_host = new float[length];
154 |     float *output_host = new float[length];
155 |     float *output_cpu = new float[length];
156 | 
157 |     float *array_device, *output_device ;
158 | 
159 |     cudaEvent_t start, stop;
160 |     HANDLE_ERROR(cudaEventCreate(&start));
161 |     HANDLE_ERROR(cudaEventCreate(&stop));
162 | 
163 |     // 初始化array以及分配空间
164 |     std::default_random_engine e;
165 |     std::uniform_real_distribution<float> distribution(-10, 10);
166 |     for (int i = 0; i < length; ++i){
167 |        array_host[i] = distribution(e);
168 |     }
169 | 
170 |     HANDLE_ERROR(cudaMalloc((void**)&array_device, sizeof(float) * length));
171 |     HANDLE_ERROR(cudaMalloc((void**)&output_device, sizeof(float) * length));
172 |     HANDLE_ERROR(cudaMemcpy(array_device, array_host, sizeof(float) * length, cudaMemcpyHostToDevice));
173 | 
174 |     // 记录时间并启动kernel，同时记录结束时间
175 |     HANDLE_ERROR(cudaEventRecord(start, 0));
176 | 
177 |     //    kogge_stone<<<1, thread_num, section * sizeof(float)>>>(array_device, output_device, length);
178 |     //    std:: cout << "kogge_stone"<< std::endl;
179 | 
180 |     brent_kung<<<block_num, (thread_num+1) / 2, section * sizeof(float)>>>(array_device, output_device, length);
181 |     std:: cout << "brent_kung"<< std::endl;
182 | 
183 | 
184 |     HANDLE_ERROR(cudaEventRecord(stop, 0));
185 |     HANDLE_ERROR(cudaEventSynchronize(stop));
186 |     float elapsed_time;
187 |     HANDLE_ERROR(cudaEventElapsedTime(&elapsed_time, start, stop));
188 |     std::cout << "Elapsed Time is "<< elapsed_time << std::endl;
189 | 
190 |     // 将数据拷贝出来
191 |     HANDLE_ERROR(cudaMemcpy(output_host, output_device, sizeof(float) * length, cudaMemcpyDeviceToHost));
192 | 
193 |     // 验证结果
194 |     prefixsum_cpu_kahan(array_host, output_cpu, length);
195 | 
196 |     if (is_equale(output_cpu, output_host, length)){
197 |         std::cout << "Answer is Correct"<< std::endl;
198 |     }else{
199 |         std::cout << "Answer is Wrong"<< std::endl;
200 |         for (int i = 0; i < length; ++i){
201 |             printf("%d  %f  %f  %f \n",i, array_host[i], output_cpu[i], output_host[i]);
202 |         }
203 |     }
204 | 
205 |     // Destroy
206 |     HANDLE_ERROR(cudaEventDestroy(start));
207 |     HANDLE_ERROR(cudaEventDestroy(stop));
208 |     HANDLE_ERROR(cudaFree(array_device));
209 |     HANDLE_ERROR(cudaFree(output_device));
210 |     delete[] array_host;
211 |     delete[] output_cpu;
212 |     delete[] output_host;
213 | }


--------------------------------------------------------------------------------
/PMPP/GraphSearch.cu:
--------------------------------------------------------------------------------
  1 | // Created by luozhiwang (luozw1994@outlook.com)
  2 | // Date: 2019/12/27
  3 | 
  4 | #include <cuda.h>
  5 | #include <random>
  6 | 
  7 | // 稀疏矩阵那一章里面是用的vector来实现的，这一次试一下堆上的动态数组
  8 | // 配置一些参数 做BFS求节点路径
  9 | const int SOURCE_VERTEX = 0;
 10 | const int MAX_VERTEX = 15;
 11 | const int BLOCK_QUEUE_NUM = 4;
 12 | const int BLOCK_QUEUE_SIZE = 16;
 13 | const int BLOCK_SIZE = BLOCK_QUEUE_SIZE * BLOCK_QUEUE_NUM;
 14 | texture<int, 1> row_ptr_dev;
 15 | 
 16 | 
 17 | static void HandleError(cudaError_t err, const char *file, int line ) {
 18 |     if (err != cudaSuccess) {
 19 |         printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
 20 |                 file, line );
 21 |         exit( EXIT_FAILURE );
 22 |     }
 23 | }
 24 | #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
 25 | 
 26 | class Graph{
 27 | private:
 28 |     const  int vertex_num;
 29 |      int edge_num = 0;
 30 |      int **joint_matrix;
 31 |      int *dest;
 32 |      int *row_ptr;
 33 | private:
 34 |     void show_dense_matrix() const{
 35 |         printf(" ===================   Origin Matrix   ===================>\n");
 36 |         for (int r = 0; r < this->vertex_num; ++r){
 37 |             for(int c = 0; c < this->vertex_num; ++c){
 38 |                 printf("%.1d ", this -> joint_matrix[r][c]);
 39 |             }
 40 |             printf("\n");
 41 |         }
 42 |         printf("\n");
 43 |     }
 44 |     void show_csr_matrix() const{
 45 |         printf(" ===================   CSR   ===================>\n");
 46 |         printf("\nCSR Dest ===> ");
 47 |         for (int i = 0; i < this->edge_num; ++i){
 48 |             printf("%d ", this-> dest[i]);
 49 |         }
 50 |         printf("\nCSR Row_ptr ===> ");
 51 |         for (int i = 0; i < this -> vertex_num + 1; ++i){
 52 |             printf("%d ", this -> row_ptr[i]);
 53 |         }
 54 |         printf("\n\n");
 55 |     }
 56 | 
 57 | public:
 58 |     Graph(const  int &vertex_num, const float &sparse_ratio): vertex_num(vertex_num){
 59 |         std::default_random_engine e;
 60 |         std::uniform_real_distribution<float> prob(0, 1);
 61 | 
 62 |         this -> joint_matrix = new  int*[this->vertex_num];
 63 |         for(int i = 0; i < this->vertex_num; ++i){
 64 |             this -> joint_matrix[i] = new  int[this -> vertex_num];
 65 |         }
 66 | 
 67 |         // Dense joint matrix
 68 |         for (int i = 0; i < vertex_num; ++i){
 69 |             for (int j = 0; j < vertex_num; ++j){
 70 |                 // 自己和自己没有路径
 71 |                 if (prob(e) <= sparse_ratio && i != j){
 72 |                     this -> joint_matrix[i][j] = 1;
 73 |                     ++edge_num;
 74 |                 } else{
 75 |                     this -> joint_matrix[i][j] = 0;
 76 |                 }
 77 |             }
 78 |         }
 79 | 
 80 |         // CSR
 81 |         dest = new  int[this -> edge_num];
 82 |         row_ptr = new  int[this -> vertex_num + 1];
 83 |         int count = 0;
 84 |         row_ptr[0] = 0;
 85 |         for (int i = 0; i < vertex_num; ++i){
 86 |             for (int j = 0; j < vertex_num; ++j){
 87 |                 if (this -> joint_matrix[i][j] != 0){
 88 |                     dest[count] = j;
 89 |                     ++count;
 90 |                 }
 91 |             }
 92 |             row_ptr[i + 1] = count;
 93 |         }
 94 |     }
 95 | 
 96 |     ~Graph(){
 97 |         delete []row_ptr;
 98 |         delete []dest;
 99 |         for (int i = 0; i < this-> vertex_num; ++i){
100 |             delete []joint_matrix[i];
101 |         }
102 |     }
103 | 
104 |     void show(int type = 0) const{
105 |         switch (type){
106 |             case 0: this -> show_dense_matrix();
107 |                 break;
108 |             case 1: this -> show_csr_matrix();
109 |                 break;
110 |             default:
111 |                 break;
112 |         }
113 |     }
114 | 
115 |      int get_edge_num() const{
116 |         return edge_num;
117 |     }
118 |      int** get_joint_matrix() const{
119 |         return joint_matrix;
120 |     }
121 |      int* get_dest() const{
122 |         return dest;
123 |     }
124 |      int* get_row_ptr() const{
125 |         return row_ptr;
126 |     }
127 | };
128 | 
129 | bool verify_output(int *array1, int *array2, int len){
130 |     bool is_right = true;
131 |     for (int i = 0; i < len; ++i){
132 |         if (array1[i] != array2[i]){
133 |             is_right = false;
134 |             printf("wrong %d, %d\n", array1[i], array2[2]);
135 |             break;
136 |         }
137 |     }
138 |     if (is_right){
139 |         printf("Answer is Correct\n");
140 |     }else{
141 |         printf("Answer is Wrong\n");
142 |         for (int i = 0; i< len; ++i){
143 |             printf("%d ", array1[i]);
144 |         }
145 |         printf("\n");
146 |         for (int i = 0; i< len; ++i){
147 |             printf("%d ", array2[i]);
148 |         }
149 |         printf("\n");
150 |     }
151 | }
152 | 
153 | void insert_into_dist(int source, int *frontier, int *frontier_size){
154 |     frontier[(*frontier_size)++] = source;
155 | }
156 | 
157 | void BFS_sequential(const int &source, const int *row_ptr, const int *dest, int *dist){
158 |     int frontier[2][MAX_VERTEX];
159 |     int *pre_froniter = &frontier[0][0];
160 |     int *cur_frontier = &frontier[1][0];
161 |     int pre_size = 0;
162 |     int cur_size = 0;
163 |     // 初始化配置
164 |     insert_into_dist(source, pre_froniter, &pre_size);
165 |     dist[source] = 0;
166 |     while (pre_size > 0){
167 |         // 遍历所有存储的节点
168 |         for (int i = 0; i < pre_size; ++i){
169 |             int cur_vertex = pre_froniter[i];
170 |             // 遍历当前节点中的所有分支
171 |            for (int j = row_ptr[cur_vertex]; j < row_ptr[cur_vertex+1]; ++j){
172 |                 if (dist[dest[j]] == -1){
173 |                     insert_into_dist(dest[j], cur_frontier, &cur_size);
174 |                     dist[dest[j]] = dist[cur_vertex] + 1;
175 |                 }
176 |             }
177 |         }
178 |         // cur赋值给pre，重置cur
179 |         std::swap(pre_froniter, cur_frontier);
180 |         pre_size = cur_size;
181 |         cur_size = 0;
182 |     }
183 | }
184 | 
185 | __global__ void BFS_Bqueue_kernel( int *pre_frontier,  int *pre_size,  int *cur_frontier,
186 |                                    int *cur_size,  int *dest,  int *dist, int *visited){
187 |     // 3级队列缓存优化
188 |     // shared memory 分别存level 3的cur_frontier，对应的大小，level 2的cur_frontier，合并时对应的idx
189 |     __shared__ int sub_queue_sd[BLOCK_QUEUE_NUM][BLOCK_QUEUE_SIZE];
190 |     __shared__  int sub_queue_size[BLOCK_QUEUE_NUM];
191 |     __shared__  int block_queue[BLOCK_QUEUE_NUM * BLOCK_QUEUE_SIZE];
192 |     __shared__  int block_queue_insert_idx;
193 |     __shared__  int sub_queue_total_size;
194 |     const  int tx = threadIdx.x;
195 |     const  int tid = tx + blockDim.x * blockIdx.x;
196 |     const  int queue_idx = tx % BLOCK_QUEUE_NUM;
197 |     if (tx < BLOCK_QUEUE_NUM){
198 |         sub_queue_size[tx] = 0;
199 |         if (tx == 0){
200 |             sub_queue_total_size = 0;
201 |         }
202 |     }
203 |     __syncthreads();
204 |     // 开始遍历
205 |     if (tid < *pre_size){
206 |         const  int cur_vertex = pre_frontier[tid];
207 |         for( int i = tex1D(row_ptr_dev, cur_vertex); i < tex1D(row_ptr_dev, cur_vertex + 1); ++i){
208 |             const  int was_visited = atomicExch(&visited[dest[i]], 1);
209 |             if (!was_visited){
210 |                 dist[dest[i]] = dist[cur_vertex] + 1;
211 |                 const  int cur_sub_size = atomicAdd(&sub_queue_size[queue_idx], 1);
212 |                 if (cur_sub_size < BLOCK_QUEUE_SIZE){
213 |                     sub_queue_sd[queue_idx][cur_sub_size] = dest[i];
214 |                 }else{
215 |                     // overflow 直接放入global memory中
216 |                     sub_queue_size[queue_idx] = BLOCK_QUEUE_SIZE;
217 |                     const  int global_idx = atomicAdd(cur_size, 1);
218 |                     cur_frontier[global_idx] = dest[i];
219 |                 }
220 |             }
221 |         }
222 |     }
223 |     __syncthreads();
224 |     // 开始执行合并操作
225 |     // level 3 ===> level 2 cur_frontier
226 |     for ( int i = 0; i < BLOCK_QUEUE_NUM; ++i){
227 |         for ( int idx = tx; idx < sub_queue_size[i]; idx += blockDim.x){
228 |             block_queue[idx + i * sub_queue_size[i]] = sub_queue_sd[i][idx];
229 |         }
230 |     }
231 |     // level 3 ===> level 2 cur_size
232 |     for ( int i = tx; i < BLOCK_QUEUE_NUM; i += blockDim.x){
233 |         atomicAdd(&sub_queue_total_size, sub_queue_size[i]);
234 |     }
235 |     __syncthreads();
236 | 
237 |     // level 2 ===> level 1 cur_frontier，cur_size
238 |     if (tx ==0){
239 |         block_queue_insert_idx = atomicAdd(cur_size, sub_queue_total_size);
240 |     }
241 |     __syncthreads();
242 |     for ( int i = tx; i < sub_queue_total_size; i += blockDim.x){
243 |         cur_frontier[block_queue_insert_idx + i] = block_queue[i];
244 |     }
245 | }
246 | 
247 | void BFS_Bqueue(const int &source,  int *dest,  int *row_ptr,  int *dist,  int edge_num){
248 |     // 初始化host
249 |     int frontier[2][MAX_VERTEX];
250 |     int *pre_froniter = &frontier[0][0];
251 |     int *cur_frontier = &frontier[1][0];
252 |     int visited[MAX_VERTEX] = {0};
253 |     int pre_size = 1;
254 |     int cur_size = 0;
255 | 
256 |     int *dist_output = new  int[MAX_VERTEX];
257 |     for (int i = 0; i < MAX_VERTEX; ++i){
258 |         dist[i] = -1;
259 |         dist_output[i] = -1;
260 |     }
261 | 
262 |     pre_froniter[0] = source;
263 |     visited[source] = 1;
264 |     dist[source] = 0;
265 | 
266 |     // 初始化 dev
267 |     int *cur_size_dev, *pre_size_dev;
268 |     int *dest_dev, *dist_dev, *visited_dev;
269 |     int *cur_frontier_dev, *pre_frontier_dev;
270 | 
271 |     cudaEvent_t start, end;
272 | 
273 |     HANDLE_ERROR(cudaEventCreate(&start));
274 |     HANDLE_ERROR(cudaEventCreate(&end));
275 | 
276 |     HANDLE_ERROR(cudaMalloc((void**)&cur_size_dev, sizeof(int)));
277 |     HANDLE_ERROR(cudaMalloc((void**)&pre_size_dev, sizeof(int)));
278 |     HANDLE_ERROR(cudaMalloc((void**)&dest_dev, sizeof(int) * edge_num));
279 |     HANDLE_ERROR(cudaMalloc((void**)&dist_dev, sizeof(int) * MAX_VERTEX));
280 |     HANDLE_ERROR(cudaMalloc((void**)&cur_frontier_dev, sizeof(int) * (MAX_VERTEX)));
281 |     HANDLE_ERROR(cudaMalloc((void**)&pre_frontier_dev, sizeof(int) * (MAX_VERTEX)));
282 |     HANDLE_ERROR(cudaMalloc((void**)&visited_dev, sizeof(int) * MAX_VERTEX));
283 |     HANDLE_ERROR(cudaMemcpy(cur_size_dev, &cur_size, sizeof(int), cudaMemcpyHostToDevice));
284 |     HANDLE_ERROR(cudaMemcpy(pre_size_dev, &pre_size, sizeof(int), cudaMemcpyHostToDevice));
285 |     HANDLE_ERROR(cudaMemcpy(dest_dev, dest, sizeof(int) * edge_num, cudaMemcpyHostToDevice));
286 |     HANDLE_ERROR(cudaMemcpy(dist_dev, dist, sizeof(int) * MAX_VERTEX, cudaMemcpyHostToDevice));
287 |     HANDLE_ERROR(cudaMemcpy(cur_frontier_dev, cur_frontier, sizeof(int) * (MAX_VERTEX), cudaMemcpyHostToDevice));
288 |     HANDLE_ERROR(cudaMemcpy(pre_frontier_dev, pre_froniter, sizeof(int) * (MAX_VERTEX), cudaMemcpyHostToDevice));
289 |     HANDLE_ERROR(cudaMemcpy(visited_dev, visited, sizeof(int) * (MAX_VERTEX), cudaMemcpyHostToDevice));
290 | 
291 |     cudaArray *t_array = 0;
292 |     cudaChannelFormatDesc desc = cudaCreateChannelDesc<int>();
293 |     HANDLE_ERROR(cudaMallocArray(&t_array, &desc, (MAX_VERTEX + 1)));
294 |     HANDLE_ERROR(cudaMemcpyToArray(t_array, 0, 0, row_ptr, sizeof(int) * (MAX_VERTEX + 1), cudaMemcpyHostToDevice));
295 |     HANDLE_ERROR(cudaBindTextureToArray(row_ptr_dev, t_array));
296 | 
297 |     HANDLE_ERROR(cudaEventRecord(start, 0));
298 |     while (pre_size > 0){
299 |         // 求解出cur_frontier
300 |         int BLOCK_NUM = (pre_size - 1) / BLOCK_SIZE + 1;
301 |         BFS_Bqueue_kernel<<<BLOCK_NUM, BLOCK_SIZE>>>(pre_frontier_dev, pre_size_dev, cur_frontier_dev, cur_size_dev, dest_dev, dist_dev, visited_dev);
302 |         // dev ===> host
303 |         std::swap(pre_frontier_dev, cur_frontier_dev);
304 |         HANDLE_ERROR(cudaMemcpy(pre_size_dev, cur_size_dev, sizeof(int), cudaMemcpyDeviceToDevice));
305 |         HANDLE_ERROR(cudaMemset(cur_size_dev, 0, sizeof(int)));
306 |         HANDLE_ERROR(cudaMemcpy(&pre_size, pre_size_dev, sizeof(int), cudaMemcpyDeviceToHost));
307 |     }
308 |     // Kernel launch
309 |     HANDLE_ERROR(cudaEventRecord(end, 0));
310 |     HANDLE_ERROR(cudaEventSynchronize(end));
311 |     float elapsed_time;
312 |     HANDLE_ERROR(cudaEventElapsedTime(&elapsed_time, start, end));
313 |     printf("Elapsed Time is %f \n",elapsed_time);
314 | 
315 |     // 验证结果
316 |     HANDLE_ERROR(cudaMemcpy(dist, dist_dev, sizeof(int) * MAX_VERTEX, cudaMemcpyDeviceToHost));
317 |     BFS_sequential(SOURCE_VERTEX, row_ptr, dest, dist_output);
318 |     verify_output(dist_output, dist, MAX_VERTEX);
319 | 
320 |     // destroy
321 |     delete []dist_output;
322 |     HANDLE_ERROR(cudaUnbindTexture(row_ptr_dev));
323 |     HANDLE_ERROR(cudaEventDestroy(start));
324 |     HANDLE_ERROR(cudaEventDestroy(end));
325 |     HANDLE_ERROR(cudaFree(dest_dev));
326 |     HANDLE_ERROR(cudaFree(dist_dev));
327 | }
328 | 
329 | int main(int args, char** argv){
330 | 
331 |     // 先打印一下graph 验证一下是否正确
332 |     Graph graph = Graph(MAX_VERTEX, 0.2);
333 |     graph.show(0);
334 |     graph.show(1);
335 | 
336 |     // 初始化 host
337 |      int *dest = graph.get_dest();
338 |      int *row_ptr = graph.get_row_ptr();
339 |      int *dist = new  int[MAX_VERTEX];
340 | 
341 |     BFS_Bqueue(SOURCE_VERTEX, dest, row_ptr, dist, graph.get_edge_num());
342 | 
343 |     delete []dist;
344 |     return 0;
345 | 
346 | }


--------------------------------------------------------------------------------