├── .gitignore ├── CMakeLists.txt ├── README.md ├── include ├── scan.cuh └── utils.h └── src ├── main.cpp ├── scan.cu └── utils.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | build -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.9) 2 | project(scan_test) 3 | enable_language(CUDA) 4 | set(CMAKE_CXX_STANDARD 11) 5 | set(CMAKE_CUDA_STANDARD 11) 6 | find_package(CUDA REQUIRED) 7 | 8 | set(INCLUDE_DIRS 9 | ${PROJECT_SOURCE_DIR}/include) 10 | 11 | set(CPP_SRC 12 | ${PROJECT_SOURCE_DIR}/src/main.cpp 13 | ${PROJECT_SOURCE_DIR}/src/utils.cpp) 14 | 15 | set(CUDA_SRC 16 | ${PROJECT_SOURCE_DIR}/src/scan.cu) 17 | 18 | cuda_add_library(scan STATIC ${INCLUDE_DIRS} ${CUDA_SRC}) 19 | 20 | include_directories(${INCLUDE_DIRS}) 21 | add_executable(scan_test ${CPP_SRC}) 22 | target_link_libraries(scan_test scan) 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Parallel Prefix Sum (Scan) with CUDA 2 | [CUDA练手小项目——Parallel Prefix Sum (Scan)](https://zhuanlan.zhihu.com/p/661460705) 3 | My implementation of parallel exclusive scan in CUDA, following [this NVIDIA paper](https://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/scan/doc/scan.pdf). 
4 | ## Achievements 5 | - Block scanning 6 | - Full scan for large vectors (support for multi-layers scan) 7 | - Bank conflict avoidance optimization (BCAO) 8 | ## Result 9 | i5-11400 @ 2.60GHz + GeForce RTX 2060 Super 10 | ``` 11 | -------------------------- N = 1000 -------------------------- 12 | scan_cpu - total: 0.00147 ms 13 | sequential_scan_gpu - total: 0.16394 ms kernel: 0.05120 ms 14 | parallel_block_scan_gpu - total: 0.12372 ms kernel: 0.01190 ms 15 | parallel_block_scan_gpu with bcao - total: 0.11636 ms kernel: 0.01027 ms 16 | parallel_large_scan_gpu - total: 0.13093 ms kernel: 0.02202 ms 17 | parallel_large_scan_gpu with bcao - total: 0.12363 ms kernel: 0.01696 ms 18 | 19 | -------------------------- N = 2048 -------------------------- 20 | scan_cpu - total: 0.00403 ms 21 | sequential_scan_gpu - total: 0.20442 ms kernel: 0.09626 ms 22 | parallel_block_scan_gpu - total: 0.12439 ms kernel: 0.01395 ms 23 | parallel_block_scan_gpu with bcao - total: 0.12183 ms kernel: 0.01360 ms 24 | parallel_large_scan_gpu - total: 0.12436 ms kernel: 0.02048 ms 25 | parallel_large_scan_gpu with bcao - total: 0.12137 ms kernel: 0.01638 ms 26 | 27 | -------------------------- N = 100000 -------------------------- 28 | scan_cpu - total: 0.25345 ms 29 | sequential_scan_gpu - total: 4.93468 ms kernel: 4.60474 ms 30 | parallel_large_scan_gpu - total: 0.30275 ms kernel: 0.05891 ms 31 | parallel_large_scan_gpu with bcao - total: 0.26996 ms kernel: 0.04157 ms 32 | 33 | -------------------------- N = 10000000 -------------------------- 34 | scan_cpu - total: 27.09050 ms 35 | sequential_scan_gpu - total: 484.60391 ms kernel: 469.34097 ms 36 | parallel_large_scan_gpu - total: 10.31124 ms kernel: 1.15578 ms 37 | parallel_large_scan_gpu with bcao - total: 9.54029 ms kernel: 0.89962 ms 38 | ``` 39 | 40 | ## Acknowledgements 41 | https://github.com/mattdean1/cuda 42 | https://github.com/TVycas/CUDA-Parallel-Prefix-Sum 
// ============================================================================
// include/scan.cuh
// ============================================================================
#pragma once

// Every kernel below processes two elements per thread, so one block of
// MAX_THREADS_PER_BLOCK threads scans at most MAX_ELEMENTS_PER_BLOCK elements.
#define MAX_THREADS_PER_BLOCK 1024
#define MAX_ELEMENTS_PER_BLOCK (MAX_THREADS_PER_BLOCK * 2)

#include <tuple> // restored: angle-bracket content was stripped in the dump

// All scan routines write the EXCLUSIVE prefix sum of data[0..N) into
// prefix_sum. scan_cpu returns total elapsed ms; the GPU variants return
// {total ms (incl. H2D/D2H + alloc), kernel-only ms}.
void warm_up();
float scan_cpu(int *data, int *prefix_sum, int N);
std::tuple<float, float> sequential_scan_gpu(int *data, int *prefix_sum, int N);
std::tuple<float, float> parallel_block_scan_gpu(int *data, int *prefix_sum, int N, bool bcao);
std::tuple<float, float> parallel_large_scan_gpu(int *data, int *prefix_sum, int N, bool bcao);

// ============================================================================
// include/utils.h
// ============================================================================
#pragma once

void data_init(int *data, int N);          // fill data[0..N) with random ints
void results_check(int *a, int *b, int N); // exit(1) on first mismatch
void print_int_arr(int *a, int N);         // debug helper
int next_power_of_two(int x);              // smallest power of two >= x

// ============================================================================
// src/main.cpp
// ============================================================================
#include "scan.cuh"
#include "utils.h"
#include <cstdio>  // restored includes: the dump stripped the <...> names;
#include <cstdlib> // cstdio/cstdlib/tuple cover printf, malloc/free, std::tie
#include <tuple>

// Benchmark driver: for each problem size, compute a CPU reference scan and
// verify every GPU variant against it, printing total and kernel-only times.
int main(int argc, char **argv)
{
    warm_up();
    int nums[] = {1000, 2048, 100000, 10000000};
    int len = sizeof(nums) / sizeof(int);
    for (int i = 0; i < len; i++)
    {
        int N = nums[i];
        size_t arr_size = N * sizeof(int);
        int *data = (int *)malloc(arr_size);
        int *prefix_sum_cpu = (int *)malloc(arr_size);
        int *prefix_sum_gpu = (int *)malloc(arr_size);
        float total_cost, kernel_cost;
        data_init(data, N);
        printf("-------------------------- N = %d --------------------------\n", N);

        total_cost = scan_cpu(data, prefix_sum_cpu, N);
        printf("%35s - total: %10.5f ms\n", "scan_cpu", total_cost);

        std::tie(total_cost, kernel_cost) = sequential_scan_gpu(data, prefix_sum_gpu, N);
        results_check(prefix_sum_cpu, prefix_sum_gpu, N);
        printf("%35s - total: %10.5f ms kernel: %10.5f ms\n", "sequential_scan_gpu", total_cost, kernel_cost);

        // Single-block kernels only handle up to MAX_ELEMENTS_PER_BLOCK.
        if (N <= MAX_ELEMENTS_PER_BLOCK)
        {
            std::tie(total_cost, kernel_cost) = parallel_block_scan_gpu(data, prefix_sum_gpu, N, false);
            results_check(prefix_sum_cpu, prefix_sum_gpu, N);
            printf("%35s - total: %10.5f ms kernel: %10.5f ms\n", "parallel_block_scan_gpu", total_cost,
                   kernel_cost);

            std::tie(total_cost, kernel_cost) = parallel_block_scan_gpu(data, prefix_sum_gpu, N, true);
            results_check(prefix_sum_cpu, prefix_sum_gpu, N);
            printf("%35s - total: %10.5f ms kernel: %10.5f ms\n", "parallel_block_scan_gpu with bcao", total_cost,
                   kernel_cost);
        }

        std::tie(total_cost, kernel_cost) = parallel_large_scan_gpu(data, prefix_sum_gpu, N, false);
        results_check(prefix_sum_cpu, prefix_sum_gpu, N);
        printf("%35s - total: %10.5f ms kernel: %10.5f ms\n", "parallel_large_scan_gpu", total_cost, kernel_cost);

        std::tie(total_cost, kernel_cost) = parallel_large_scan_gpu(data, prefix_sum_gpu, N, true);
        results_check(prefix_sum_cpu, prefix_sum_gpu, N);
        printf("%35s - total: %10.5f ms kernel: %10.5f ms\n", "parallel_large_scan_gpu with bcao", total_cost,
               kernel_cost);

        free(data);
        free(prefix_sum_cpu);
        free(prefix_sum_gpu);
        printf("\n");
    }
    return 0; // explicit success exit (was implicit)
}

// ============================================================================
// src/scan.cu (part 1: helpers, timers, single-block scans, large-scan tile kernel)
// ============================================================================
#include "scan.cuh"
#include "utils.h"
#include <chrono>  // restored includes (stripped in the dump): chrono for the
#include <cstdio>  // host timer, cstdio/cstdlib for printf/exit/malloc
#include <cstdlib>

// Shared memory has 32 four-byte banks; CONFLICT_FREE_OFFSET pads an index so
// that the tree-scan access pattern avoids bank conflicts (BCAO, per the
// NVIDIA scan paper / GPU Gems 3 ch. 39).
#define NUM_BANKS 32
#define LOG_NUM_BANKS 5
#define ZERO_BANK_CONFLICTS
#ifdef ZERO_BANK_CONFLICTS
#define CONFLICT_FREE_OFFSET(n) (((n) >> LOG_NUM_BANKS) + ((n) >> (2 * LOG_NUM_BANKS)))
#else
#define CONFLICT_FREE_OFFSET(n) ((n) >> LOG_NUM_BANKS)
#endif
// Static shared-memory size needed for a padded full tile.
#define MAX_SHARE_SIZE (MAX_ELEMENTS_PER_BLOCK + CONFLICT_FREE_OFFSET(MAX_ELEMENTS_PER_BLOCK - 1))

// Abort with file/line context on any CUDA runtime error.
#define CUDA_CHECK(call)                                        \
    do                                                          \
    {                                                           \
        cudaError_t err = call;                                 \
        if (err != cudaSuccess)                                 \
        {                                                       \
            printf("CUDA Error: \n");                           \
            printf(" File: %s\n", __FILE__);                    \
            printf(" Line: %d\n", __LINE__);                    \
            printf(" Error Code: %d\n", err);                   \
            printf(" Error Text: %s\n", cudaGetErrorString(err)); \
            exit(1);                                            \
        }                                                       \
    } while (0)

// Trivial kernel used only to spin up the device/context before timing.
__global__ void warm_up_kernel(int *data)
{
    int tid = threadIdx.x;
    data[tid] += tid;
}

// Run a few throwaway alloc/copy/launch cycles so later timings exclude
// one-time CUDA context initialization cost.
void warm_up()
{
    int N = 512;
    size_t arr_size = N * sizeof(int);
    int *data = (int *)malloc(arr_size);
    data_init(data, N);

    for (int i = 0; i < 10; i++)
    {
        int *d_data;
        CUDA_CHECK(cudaMalloc(&d_data, arr_size));
        CUDA_CHECK(cudaMemcpy(d_data, data, arr_size, cudaMemcpyHostToDevice));

        warm_up_kernel<<<1, N>>>(d_data);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaDeviceSynchronize());

        CUDA_CHECK(cudaMemcpy(data, d_data, arr_size, cudaMemcpyDeviceToHost));
        CUDA_CHECK(cudaFree(d_data));
    }

    free(data);
}

// Wall-clock timer for end-to-end host measurements (milliseconds).
class TotalTimer
{
private:
    std::chrono::high_resolution_clock::time_point m_start_point, m_end_point;

public:
    void start()
    {
        m_start_point = std::chrono::high_resolution_clock::now();
    };
    void end()
    {
        m_end_point = std::chrono::high_resolution_clock::now();
    };
    float cost()
    {
        // restored template args (stripped in the dump): float milliseconds,
        // matching the "%10.5f ms" output format
        std::chrono::duration<float, std::milli> dur = m_end_point - m_start_point;
        return dur.count();
    };
};

// CUDA-event timer for device-side (kernel) measurements (milliseconds).
class KernelTimer
{
private:
    cudaEvent_t m_start_event, m_end_event;

public:
    KernelTimer()
    {
        CUDA_CHECK(cudaEventCreate(&m_start_event));
        CUDA_CHECK(cudaEventCreate(&m_end_event));
    };
    ~KernelTimer()
    {
        CUDA_CHECK(cudaEventDestroy(m_start_event));
        CUDA_CHECK(cudaEventDestroy(m_end_event));
    };
    void start()
    {
        CUDA_CHECK(cudaEventRecord(m_start_event));
    };
    void end()
    {
        CUDA_CHECK(cudaEventRecord(m_end_event));
        CUDA_CHECK(cudaEventSynchronize(m_end_event));
    };
    float cost()
    {
        float kernel_cost;
        CUDA_CHECK(cudaEventElapsedTime(&kernel_cost, m_start_event, m_end_event));
        return kernel_cost;
    };
};

// CPU reference: exclusive scan, prefix_sum[i] = sum(data[0..i)).
// Returns elapsed time in ms.
float scan_cpu(int *data, int *prefix_sum, int N)
{
    TotalTimer total_timer;
    total_timer.start();

    if (N > 0) // guard: N <= 0 must not touch prefix_sum[0]
    {
        prefix_sum[0] = 0;
        for (int i = 1; i < N; i++)
        {
            prefix_sum[i] = prefix_sum[i - 1] + data[i - 1];
        }
    }

    total_timer.end();
    return total_timer.cost();
}

// Deliberately serial single-thread GPU scan — a baseline, not a fast path.
__global__ void sequential_scan_kernel(int *data, int *prefix_sum, int N)
{
    prefix_sum[0] = 0;
    for (int i = 1; i < N; i++)
    {
        prefix_sum[i] = prefix_sum[i - 1] + data[i - 1];
    }
}

// Baseline GPU scan: one thread does all the work. Returns {total, kernel} ms.
std::tuple<float, float> sequential_scan_gpu(int *data, int *prefix_sum, int N)
{
    TotalTimer total_timer;
    total_timer.start();

    int *d_data, *d_prefix_sum;
    size_t arr_size = N * sizeof(int);
    CUDA_CHECK(cudaMalloc(&d_data, arr_size));
    CUDA_CHECK(cudaMalloc(&d_prefix_sum, arr_size));
    CUDA_CHECK(cudaMemcpy(d_data, data, arr_size, cudaMemcpyHostToDevice));

    KernelTimer kernel_timer;
    kernel_timer.start();

    sequential_scan_kernel<<<1, 1>>>(d_data, d_prefix_sum, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    kernel_timer.end();
    float kernel_cost = kernel_timer.cost();

    CUDA_CHECK(cudaMemcpy(prefix_sum, d_prefix_sum, arr_size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(d_prefix_sum));

    total_timer.end();
    float total_cost = total_timer.cost();

    return {total_cost, kernel_cost};
}

// Blelloch work-efficient exclusive scan of up to blockDim.x*2 elements in a
// single block. Launch: <<<1, padding_N/2, padding_N*sizeof(int)>>> where
// padding_N = next power of two >= N. Out-of-range leaves are zero-padded.
__global__ void parallel_block_scan_kernel(int *data, int *prefix_sum, int N)
{
    extern __shared__ int tmp[];
    int tid = threadIdx.x;
    int leaf_num = blockDim.x * 2; // equals length of tmp

    // Each thread loads two elements; pad the tail with zeros.
    tmp[tid * 2] = tid * 2 < N ? data[tid * 2] : 0;
    tmp[tid * 2 + 1] = tid * 2 + 1 < N ? data[tid * 2 + 1] : 0;
    __syncthreads();

    // Up-sweep (reduce): build partial sums up the tree.
    int offset = 1;
    for (int d = leaf_num >> 1; d > 0; d >>= 1)
    {
        if (tid < d)
        {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            tmp[bi] += tmp[ai];
        }
        offset *= 2;
        __syncthreads(); // barrier outside the divergent branch — all threads reach it
    }

    // Clear the root to make the scan exclusive.
    if (tid == 0)
    {
        tmp[leaf_num - 1] = 0;
    }
    __syncthreads();

    // Down-sweep: propagate prefix sums back down the tree.
    for (int d = 1; d < leaf_num; d *= 2)
    {
        offset >>= 1;
        if (tid < d)
        {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;

            int v = tmp[ai];
            tmp[ai] = tmp[bi];
            tmp[bi] += v;
        }
        __syncthreads();
    }

    // Write back only the in-range results.
    if (tid * 2 < N)
    {
        prefix_sum[tid * 2] = tmp[tid * 2];
    }
    if (tid * 2 + 1 < N)
    {
        prefix_sum[tid * 2 + 1] = tmp[tid * 2 + 1];
    }
}

// Same Blelloch scan but with bank-conflict-avoidance padding: every shared
// memory index is displaced by CONFLICT_FREE_OFFSET so warp accesses land in
// distinct banks. Shared memory must be padding_N + CONFLICT_FREE_OFFSET(padding_N-1) ints.
__global__ void parallel_block_scan_bcao_kernel(int *data, int *prefix_sum, int N)
{
    extern __shared__ int tmp[];
    int tid = threadIdx.x;
    int leaf_num = blockDim.x * 2; // NOT equal to length of tmp (tmp is padded)

    // Thread tid loads elements tid and tid + leaf_num/2 (coalesced halves).
    int ai = tid;
    int bi = tid + (leaf_num >> 1);
    int offset_ai = CONFLICT_FREE_OFFSET(ai);
    int offset_bi = CONFLICT_FREE_OFFSET(bi);

    tmp[ai + offset_ai] = ai < N ? data[ai] : 0;
    tmp[bi + offset_bi] = bi < N ? data[bi] : 0;
    __syncthreads();

    // Up-sweep with padded indices.
    int offset = 1;
    for (int d = leaf_num >> 1; d > 0; d >>= 1)
    {
        if (tid < d)
        {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            ai += CONFLICT_FREE_OFFSET(ai);
            bi += CONFLICT_FREE_OFFSET(bi);
            tmp[bi] += tmp[ai];
        }
        offset *= 2;
        __syncthreads();
    }

    if (tid == 0)
    {
        tmp[leaf_num - 1 + CONFLICT_FREE_OFFSET(leaf_num - 1)] = 0;
    }
    __syncthreads();

    // Down-sweep with padded indices.
    for (int d = 1; d < leaf_num; d *= 2)
    {
        offset >>= 1;
        if (tid < d)
        {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            ai += CONFLICT_FREE_OFFSET(ai);
            bi += CONFLICT_FREE_OFFSET(bi);

            int v = tmp[ai];
            tmp[ai] = tmp[bi];
            tmp[bi] += v;
        }
        __syncthreads();
    }

    if (ai < N)
    {
        prefix_sum[ai] = tmp[ai + offset_ai];
    }
    if (bi < N)
    {
        prefix_sum[bi] = tmp[bi + offset_bi];
    }
}

// Single-block scan host wrapper (N must be <= MAX_ELEMENTS_PER_BLOCK).
// Returns {total, kernel} ms.
std::tuple<float, float> parallel_block_scan_gpu(int *data, int *prefix_sum, int N, bool bcao)
{
    TotalTimer total_timer;
    total_timer.start();

    int *d_data, *d_prefix_sum;
    size_t arr_size = N * sizeof(int);
    CUDA_CHECK(cudaMalloc(&d_data, arr_size));
    CUDA_CHECK(cudaMalloc(&d_prefix_sum, arr_size));
    CUDA_CHECK(cudaMemcpy(d_data, data, arr_size, cudaMemcpyHostToDevice));

    KernelTimer kernel_timer;
    kernel_timer.start();

    int padding_N = next_power_of_two(N);
    // fix: N == 1 gives padding_N / 2 == 0, which is an invalid launch config
    int thread_num = padding_N > 1 ? padding_N / 2 : 1;
    if (bcao)
    {
        int share_mem_size = (padding_N + CONFLICT_FREE_OFFSET(padding_N - 1)) * sizeof(int);
        parallel_block_scan_bcao_kernel<<<1, thread_num, share_mem_size>>>(d_data, d_prefix_sum, N);
    }
    else
    {
        int share_mem_size = padding_N * sizeof(int);
        parallel_block_scan_kernel<<<1, thread_num, share_mem_size>>>(d_data, d_prefix_sum, N);
    }
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    kernel_timer.end();
    float kernel_cost = kernel_timer.cost();

    CUDA_CHECK(cudaMemcpy(prefix_sum, d_prefix_sum, arr_size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(d_prefix_sum));

    total_timer.end();
    float total_cost = total_timer.cost();

    return {total_cost, kernel_cost};
}

// Tile kernel for the multi-block scan: block `bid` scans its own
// MAX_ELEMENTS_PER_BLOCK slice and stores the slice total into sums[bid] so
// the host can recursively scan the block totals.
__global__ void parallel_large_scan_kernel(int *data, int *prefix_sum, int N, int *sums)
{
    __shared__ int tmp[MAX_ELEMENTS_PER_BLOCK];
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int block_offset = bid * MAX_ELEMENTS_PER_BLOCK;
    int leaf_num = MAX_ELEMENTS_PER_BLOCK;

    tmp[tid * 2] = tid * 2 + block_offset < N ? data[tid * 2 + block_offset] : 0;
    tmp[tid * 2 + 1] = tid * 2 + 1 + block_offset < N ? data[tid * 2 + 1 + block_offset] : 0;
    __syncthreads();

    // Up-sweep.
    int offset = 1;
    for (int d = leaf_num >> 1; d > 0; d >>= 1)
    {
        if (tid < d)
        {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            tmp[bi] += tmp[ai];
        }
        offset *= 2;
        __syncthreads();
    }

    // Save the tile total before zeroing the root (exclusive scan).
    if (tid == 0)
    {
        sums[bid] = tmp[leaf_num - 1];
        tmp[leaf_num - 1] = 0;
    }
    __syncthreads();

    // Down-sweep.
    for (int d = 1; d < leaf_num; d *= 2)
    {
        offset >>= 1;
        if (tid < d)
        {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;

            int v = tmp[ai];
            tmp[ai] = tmp[bi];
            tmp[bi] += v;
        }
        __syncthreads();
    }

    if (tid * 2 + block_offset < N)
    {
        prefix_sum[tid * 2 + block_offset] = tmp[tid * 2];
    }
    if (tid * 2 + 1 + block_offset < N)
    {
        prefix_sum[tid * 2 + 1 + block_offset] = tmp[tid * 2 + 1];
    }
}

// BCAO variant of the tile kernel: identical tree scan but every shared-memory
// index is displaced by CONFLICT_FREE_OFFSET to avoid bank conflicts; the
// padded tile needs MAX_SHARE_SIZE ints of static shared memory.
__global__ void parallel_large_scan_bcao_kernel(int *data, int *prefix_sum, int N, int *sums)
{
    __shared__ int tmp[MAX_SHARE_SIZE];
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int block_offset = bid * MAX_ELEMENTS_PER_BLOCK;
    int leaf_num = MAX_ELEMENTS_PER_BLOCK;

    // Thread tid loads elements tid and tid + leaf_num/2 of this block's tile.
    int ai = tid;
    int bi = tid + (leaf_num >> 1);
    int offset_ai = CONFLICT_FREE_OFFSET(ai);
    int offset_bi = CONFLICT_FREE_OFFSET(bi);

    tmp[ai + offset_ai] = ai + block_offset < N ? data[ai + block_offset] : 0;
    tmp[bi + offset_bi] = bi + block_offset < N ? data[bi + block_offset] : 0;
    __syncthreads();

    // Up-sweep with padded indices.
    int offset = 1;
    for (int d = leaf_num >> 1; d > 0; d >>= 1)
    {
        if (tid < d)
        {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            ai += CONFLICT_FREE_OFFSET(ai);
            bi += CONFLICT_FREE_OFFSET(bi);
            tmp[bi] += tmp[ai];
        }
        offset *= 2;
        __syncthreads(); // all threads reach the barrier; the branch above is guarded work only
    }

    // Save the tile total into sums[bid], then clear the root for exclusivity.
    if (tid == 0)
    {
        int last_idx = leaf_num - 1 + CONFLICT_FREE_OFFSET(leaf_num - 1);
        sums[bid] = tmp[last_idx];
        tmp[last_idx] = 0;
    }
    __syncthreads();

    // Down-sweep with padded indices.
    for (int d = 1; d < leaf_num; d *= 2)
    {
        offset >>= 1;
        if (tid < d)
        {
            int ai = offset * (2 * tid + 1) - 1;
            int bi = offset * (2 * tid + 2) - 1;
            ai += CONFLICT_FREE_OFFSET(ai);
            bi += CONFLICT_FREE_OFFSET(bi);

            int v = tmp[ai];
            tmp[ai] = tmp[bi];
            tmp[bi] += v;
        }
        __syncthreads();
    }

    if (ai + block_offset < N)
    {
        prefix_sum[ai + block_offset] = tmp[ai + offset_ai];
    }
    if (bi + block_offset < N)
    {
        prefix_sum[bi + block_offset] = tmp[bi + offset_bi];
    }
}

// Adds values[bid] (the scanned block totals) to every element of block bid's
// tile. Each thread updates two elements, mirroring the scan kernels' layout.
// (fix: parameter was misspelled "valus")
__global__ void add_kernel(int *prefix_sum, int *values, int N)
{
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int block_offset = bid * MAX_ELEMENTS_PER_BLOCK;
    int ai = tid + block_offset;
    int bi = tid + (MAX_ELEMENTS_PER_BLOCK >> 1) + block_offset;

    if (ai < N)
    {
        prefix_sum[ai] += values[bid];
    }
    if (bi < N)
    {
        prefix_sum[bi] += values[bid];
    }
}

// Multi-level scan: scan each tile, recursively scan the per-block totals,
// then add each scanned total back onto its tile. Recursion depth is
// log_{2048}(N), so even huge N needs only a few levels.
void recursive_scan(int *d_data, int *d_prefix_sum, int N, bool bcao)
{
    // ceil(N / MAX_ELEMENTS_PER_BLOCK) blocks.
    int block_num = N / MAX_ELEMENTS_PER_BLOCK;
    if (N % MAX_ELEMENTS_PER_BLOCK != 0)
    {
        block_num += 1;
    }
    int *d_sums, *d_sums_prefix_sum;
    CUDA_CHECK(cudaMalloc(&d_sums, block_num * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_sums_prefix_sum, block_num * sizeof(int)));

    // Launch configs restored (stripped in the dump): each thread handles two
    // of the tile's MAX_ELEMENTS_PER_BLOCK elements, hence
    // MAX_THREADS_PER_BLOCK threads per block — verify against upstream repo.
    if (bcao)
    {
        parallel_large_scan_bcao_kernel<<<block_num, MAX_THREADS_PER_BLOCK>>>(d_data, d_prefix_sum, N, d_sums);
    }
    else
    {
        parallel_large_scan_kernel<<<block_num, MAX_THREADS_PER_BLOCK>>>(d_data, d_prefix_sum, N, d_sums);
    }
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    if (block_num != 1)
    {
        recursive_scan(d_sums, d_sums_prefix_sum, block_num, bcao);
        add_kernel<<<block_num, MAX_THREADS_PER_BLOCK>>>(d_prefix_sum, d_sums_prefix_sum, N);
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaDeviceSynchronize());
    }

    CUDA_CHECK(cudaFree(d_sums));
    CUDA_CHECK(cudaFree(d_sums_prefix_sum));
}

// Full-size scan host wrapper for arbitrary N. Returns {total, kernel} ms.
// Note: the "kernel" time includes recursive_scan's internal cudaMalloc/free.
std::tuple<float, float> parallel_large_scan_gpu(int *data, int *prefix_sum, int N, bool bcao)
{
    TotalTimer total_timer;
    total_timer.start();

    int *d_data, *d_prefix_sum;
    size_t arr_size = N * sizeof(int);
    CUDA_CHECK(cudaMalloc(&d_data, arr_size));
    CUDA_CHECK(cudaMalloc(&d_prefix_sum, arr_size));
    CUDA_CHECK(cudaMemcpy(d_data, data, arr_size, cudaMemcpyHostToDevice));

    KernelTimer kernel_timer;
    kernel_timer.start();

    recursive_scan(d_data, d_prefix_sum, N, bcao);

    kernel_timer.end();
    float kernel_cost = kernel_timer.cost();

    CUDA_CHECK(cudaMemcpy(prefix_sum, d_prefix_sum, arr_size, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(d_data));
    CUDA_CHECK(cudaFree(d_prefix_sum));

    total_timer.end();
    float total_cost = total_timer.cost();

    return {total_cost, kernel_cost};
}

// ============================================================================
// src/utils.cpp
// ============================================================================
#include "utils.h"
#include <cstdio>  // restored includes (stripped in the dump): printf/exit,
#include <cstdlib> // time() seed, and the <random> generators used below
#include <ctime>
#include <random>

// Fill data[0..N) with random ints in [-10, 100] (negatives exercise the scan
// with mixed signs). Engine is re-seeded from wall clock on every call.
void data_init(int *data, int N)
{
    std::uniform_int_distribution<> int_generator(-10, 100);
    std::default_random_engine rand_engine(time(nullptr));
    for (int i = 0; i < N; i++)
    {
        data[i] = int_generator(rand_engine);
    }
}

// Element-wise comparison; aborts the whole program on the first mismatch.
void results_check(int *a, int *b, int N)
{
    for (int i = 0; i < N; i++)
    {
        if (a[i] != b[i])
        {
            printf("results_check fail\n");
            exit(1);
        }
    }
}

// Debug helper: print N ints separated by spaces, then a newline.
void print_int_arr(int *a, int N)
{
    for (int i = 0; i < N; i++)
    {
        printf("%d ", a[i]);
    }
    printf("\n");
}

// Smallest power of two >= x (returns 1 for x <= 1).
// NOTE(review): overflows to undefined behavior for x > 2^30 on 32-bit int —
// fine for this benchmark's N <= MAX_ELEMENTS_PER_BLOCK usage.
int next_power_of_two(int x)
{
    int power = 1;
    while (power < x)
    {
        power *= 2;
    }
    return power;
}