├── .gitignore ├── CMakeLists.txt ├── README.md ├── include ├── CudaAllocator.h ├── helper_cuda.h ├── helper_string.h ├── ticktock.h └── wangsrng.h ├── main.cu └── run.sh /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | GNUmakefile 3 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.12) 2 | 3 | set(CMAKE_CXX_STANDARD 17) 4 | set(CMAKE_CUDA_STANDARD 17) 5 | if (NOT CMAKE_BUILD_TYPE) 6 | set(CMAKE_BUILD_TYPE Release) 7 | endif() 8 | # 如果需要指定显卡版本号的话： 9 | # set(CMAKE_CUDA_ARCHITECTURES 52) 10 | 11 | project(hellocmake LANGUAGES CXX CUDA) 12 | 13 | add_executable(main main.cu) 14 | target_include_directories(main PUBLIC include) 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 高性能并行编程与优化 - 第08讲的回家作业 2 | 3 | 通过 pull request 提交作业。会批分数，但是： 4 | 5 | 没有结业证书，回家作业仅仅作为评估学习效果和巩固知识的手段，不必为分数感到紧张 :) 6 | 量力而行，只要能在本课中，学到昨天的自己不懂的知识，就是胜利，没必要和别人攀比。 7 | 注意不要偷看别人的作业哦！ 8 | 9 | - 课件：https://github.com/parallel101/course 10 | - 录播：https://space.bilibili.com/263032155 11 | 12 | 作业提交时间不限 :) 即使完结了还想交的话我也会看的~ 不过最好在下一讲开播前完成。 13 | 14 | - 如何开 pull request：https://zhuanlan.zhihu.com/p/51199833 15 | - 如何设置 https 代理：https://www.jianshu.com/p/b481d2a42274 16 | 17 | ## 评分规则 18 | 19 | - 完成作业基本要求 50 分（详见下方"作业要求"） 20 | - 能够在PR 描述中用自己的话解释 25 分 21 | - 代码格式规范、能够跨平台 5 分 22 | - 有自己独特的创新点 20 分 23 | - 明显抄袭现象 -100 分 24 | 25 | ## 作业要求 26 | 27 | 修改 main.cu，改良其中的各个核函数，回答注释中的问题，并通过 `main()` 函数中的基本测试。 28 | 测试的结果和你的优化思路，可以直接写在注释里，也可以写在 PR 描述里。 29 | 30 | 温馨提示：如果用了 IDE，记得统一开启 Release 模式来比较性能。 31 | 32 | ## 关于内卷 33 | 34 | 如果你把 `filter_positive` 改成了基于 BLS 优化的 `exclusive_scan` 的，或是用了 `thrust` 的 `vector` 和模板函数： 35 | 只要是在 **满足作业要求的基础** 上，这是件好事！ 36 | 
老师会酌情加分，视为“独特的创新点”，但最多不超过 20 分。 37 | -------------------------------------------------------------------------------- /include/CudaAllocator.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include "helper_cuda.h" 7 | 8 | template 9 | struct CudaAllocator { 10 | using value_type = T; 11 | 12 | T *allocate(size_t size) { 13 | T *ptr = nullptr; 14 | checkCudaErrors(cudaMallocManaged(&ptr, size * sizeof(T))); 15 | return ptr; 16 | } 17 | 18 | void deallocate(T *ptr, size_t size = 0) { 19 | checkCudaErrors(cudaFree(ptr)); 20 | } 21 | 22 | template 23 | void construct(T *p, Args &&...args) { 24 | if constexpr (!(sizeof...(Args) == 0 && std::is_pod_v)) 25 | ::new((void *)p) T(std::forward(args)...); 26 | } 27 | 28 | //**** CIHOU SHABI WENDOUS **** 29 | template 30 | constexpr CudaAllocator(const CudaAllocator<_Other>&) noexcept {} 31 | 32 | constexpr bool operator==(CudaAllocator const &other) const { 33 | return this == &other; 34 | } 35 | }; 36 | -------------------------------------------------------------------------------- /include/helper_cuda.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 
9 | * 10 | */ 11 | 12 | //////////////////////////////////////////////////////////////////////////////// 13 | // These are CUDA Helper functions for initialization and error checking 14 | 15 | #ifndef COMMON_HELPER_CUDA_H_ 16 | #define COMMON_HELPER_CUDA_H_ 17 | 18 | #pragma once 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include "helper_string.h" 26 | 27 | #ifndef EXIT_WAIVED 28 | #define EXIT_WAIVED 2 29 | #endif 30 | 31 | // Note, it is required that your SDK sample to include the proper header 32 | // files, please refer the CUDA examples for examples of the needed CUDA 33 | // headers, which may change depending on which CUDA functions are used. 34 | 35 | // CUDA Runtime error messages 36 | #ifdef __DRIVER_TYPES_H__ 37 | static const char *_cudaGetErrorEnum(cudaError_t error) { 38 | return cudaGetErrorName(error); 39 | } 40 | #endif 41 | 42 | #ifdef CUDA_DRIVER_API 43 | // CUDA Driver API errors 44 | static const char *_cudaGetErrorEnum(CUresult error) { 45 | static char unknown[] = ""; 46 | const char *ret = NULL; 47 | cuGetErrorName(error, &ret); 48 | return ret ? 
ret : unknown; 49 | } 50 | #endif 51 | 52 | #ifdef CUBLAS_API_H_ 53 | // cuBLAS API errors 54 | static const char *_cudaGetErrorEnum(cublasStatus_t error) { 55 | switch (error) { 56 | case CUBLAS_STATUS_SUCCESS: 57 | return "CUBLAS_STATUS_SUCCESS"; 58 | 59 | case CUBLAS_STATUS_NOT_INITIALIZED: 60 | return "CUBLAS_STATUS_NOT_INITIALIZED"; 61 | 62 | case CUBLAS_STATUS_ALLOC_FAILED: 63 | return "CUBLAS_STATUS_ALLOC_FAILED"; 64 | 65 | case CUBLAS_STATUS_INVALID_VALUE: 66 | return "CUBLAS_STATUS_INVALID_VALUE"; 67 | 68 | case CUBLAS_STATUS_ARCH_MISMATCH: 69 | return "CUBLAS_STATUS_ARCH_MISMATCH"; 70 | 71 | case CUBLAS_STATUS_MAPPING_ERROR: 72 | return "CUBLAS_STATUS_MAPPING_ERROR"; 73 | 74 | case CUBLAS_STATUS_EXECUTION_FAILED: 75 | return "CUBLAS_STATUS_EXECUTION_FAILED"; 76 | 77 | case CUBLAS_STATUS_INTERNAL_ERROR: 78 | return "CUBLAS_STATUS_INTERNAL_ERROR"; 79 | 80 | case CUBLAS_STATUS_NOT_SUPPORTED: 81 | return "CUBLAS_STATUS_NOT_SUPPORTED"; 82 | 83 | case CUBLAS_STATUS_LICENSE_ERROR: 84 | return "CUBLAS_STATUS_LICENSE_ERROR"; 85 | } 86 | 87 | return ""; 88 | } 89 | #endif 90 | 91 | #ifdef _CUFFT_H_ 92 | // cuFFT API errors 93 | static const char *_cudaGetErrorEnum(cufftResult error) { 94 | switch (error) { 95 | case CUFFT_SUCCESS: 96 | return "CUFFT_SUCCESS"; 97 | 98 | case CUFFT_INVALID_PLAN: 99 | return "CUFFT_INVALID_PLAN"; 100 | 101 | case CUFFT_ALLOC_FAILED: 102 | return "CUFFT_ALLOC_FAILED"; 103 | 104 | case CUFFT_INVALID_TYPE: 105 | return "CUFFT_INVALID_TYPE"; 106 | 107 | case CUFFT_INVALID_VALUE: 108 | return "CUFFT_INVALID_VALUE"; 109 | 110 | case CUFFT_INTERNAL_ERROR: 111 | return "CUFFT_INTERNAL_ERROR"; 112 | 113 | case CUFFT_EXEC_FAILED: 114 | return "CUFFT_EXEC_FAILED"; 115 | 116 | case CUFFT_SETUP_FAILED: 117 | return "CUFFT_SETUP_FAILED"; 118 | 119 | case CUFFT_INVALID_SIZE: 120 | return "CUFFT_INVALID_SIZE"; 121 | 122 | case CUFFT_UNALIGNED_DATA: 123 | return "CUFFT_UNALIGNED_DATA"; 124 | 125 | case CUFFT_INCOMPLETE_PARAMETER_LIST: 126 | return 
"CUFFT_INCOMPLETE_PARAMETER_LIST"; 127 | 128 | case CUFFT_INVALID_DEVICE: 129 | return "CUFFT_INVALID_DEVICE"; 130 | 131 | case CUFFT_PARSE_ERROR: 132 | return "CUFFT_PARSE_ERROR"; 133 | 134 | case CUFFT_NO_WORKSPACE: 135 | return "CUFFT_NO_WORKSPACE"; 136 | 137 | case CUFFT_NOT_IMPLEMENTED: 138 | return "CUFFT_NOT_IMPLEMENTED"; 139 | 140 | case CUFFT_LICENSE_ERROR: 141 | return "CUFFT_LICENSE_ERROR"; 142 | 143 | case CUFFT_NOT_SUPPORTED: 144 | return "CUFFT_NOT_SUPPORTED"; 145 | } 146 | 147 | return ""; 148 | } 149 | #endif 150 | 151 | #ifdef CUSPARSEAPI 152 | // cuSPARSE API errors 153 | static const char *_cudaGetErrorEnum(cusparseStatus_t error) { 154 | switch (error) { 155 | case CUSPARSE_STATUS_SUCCESS: 156 | return "CUSPARSE_STATUS_SUCCESS"; 157 | 158 | case CUSPARSE_STATUS_NOT_INITIALIZED: 159 | return "CUSPARSE_STATUS_NOT_INITIALIZED"; 160 | 161 | case CUSPARSE_STATUS_ALLOC_FAILED: 162 | return "CUSPARSE_STATUS_ALLOC_FAILED"; 163 | 164 | case CUSPARSE_STATUS_INVALID_VALUE: 165 | return "CUSPARSE_STATUS_INVALID_VALUE"; 166 | 167 | case CUSPARSE_STATUS_ARCH_MISMATCH: 168 | return "CUSPARSE_STATUS_ARCH_MISMATCH"; 169 | 170 | case CUSPARSE_STATUS_MAPPING_ERROR: 171 | return "CUSPARSE_STATUS_MAPPING_ERROR"; 172 | 173 | case CUSPARSE_STATUS_EXECUTION_FAILED: 174 | return "CUSPARSE_STATUS_EXECUTION_FAILED"; 175 | 176 | case CUSPARSE_STATUS_INTERNAL_ERROR: 177 | return "CUSPARSE_STATUS_INTERNAL_ERROR"; 178 | 179 | case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: 180 | return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; 181 | } 182 | 183 | return ""; 184 | } 185 | #endif 186 | 187 | #ifdef CUSOLVER_COMMON_H_ 188 | // cuSOLVER API errors 189 | static const char *_cudaGetErrorEnum(cusolverStatus_t error) { 190 | switch (error) { 191 | case CUSOLVER_STATUS_SUCCESS: 192 | return "CUSOLVER_STATUS_SUCCESS"; 193 | case CUSOLVER_STATUS_NOT_INITIALIZED: 194 | return "CUSOLVER_STATUS_NOT_INITIALIZED"; 195 | case CUSOLVER_STATUS_ALLOC_FAILED: 196 | return 
"CUSOLVER_STATUS_ALLOC_FAILED"; 197 | case CUSOLVER_STATUS_INVALID_VALUE: 198 | return "CUSOLVER_STATUS_INVALID_VALUE"; 199 | case CUSOLVER_STATUS_ARCH_MISMATCH: 200 | return "CUSOLVER_STATUS_ARCH_MISMATCH"; 201 | case CUSOLVER_STATUS_MAPPING_ERROR: 202 | return "CUSOLVER_STATUS_MAPPING_ERROR"; 203 | case CUSOLVER_STATUS_EXECUTION_FAILED: 204 | return "CUSOLVER_STATUS_EXECUTION_FAILED"; 205 | case CUSOLVER_STATUS_INTERNAL_ERROR: 206 | return "CUSOLVER_STATUS_INTERNAL_ERROR"; 207 | case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: 208 | return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; 209 | case CUSOLVER_STATUS_NOT_SUPPORTED: 210 | return "CUSOLVER_STATUS_NOT_SUPPORTED "; 211 | case CUSOLVER_STATUS_ZERO_PIVOT: 212 | return "CUSOLVER_STATUS_ZERO_PIVOT"; 213 | case CUSOLVER_STATUS_INVALID_LICENSE: 214 | return "CUSOLVER_STATUS_INVALID_LICENSE"; 215 | } 216 | 217 | return ""; 218 | } 219 | #endif 220 | 221 | #ifdef CURAND_H_ 222 | // cuRAND API errors 223 | static const char *_cudaGetErrorEnum(curandStatus_t error) { 224 | switch (error) { 225 | case CURAND_STATUS_SUCCESS: 226 | return "CURAND_STATUS_SUCCESS"; 227 | 228 | case CURAND_STATUS_VERSION_MISMATCH: 229 | return "CURAND_STATUS_VERSION_MISMATCH"; 230 | 231 | case CURAND_STATUS_NOT_INITIALIZED: 232 | return "CURAND_STATUS_NOT_INITIALIZED"; 233 | 234 | case CURAND_STATUS_ALLOCATION_FAILED: 235 | return "CURAND_STATUS_ALLOCATION_FAILED"; 236 | 237 | case CURAND_STATUS_TYPE_ERROR: 238 | return "CURAND_STATUS_TYPE_ERROR"; 239 | 240 | case CURAND_STATUS_OUT_OF_RANGE: 241 | return "CURAND_STATUS_OUT_OF_RANGE"; 242 | 243 | case CURAND_STATUS_LENGTH_NOT_MULTIPLE: 244 | return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; 245 | 246 | case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: 247 | return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; 248 | 249 | case CURAND_STATUS_LAUNCH_FAILURE: 250 | return "CURAND_STATUS_LAUNCH_FAILURE"; 251 | 252 | case CURAND_STATUS_PREEXISTING_FAILURE: 253 | return "CURAND_STATUS_PREEXISTING_FAILURE"; 254 
| 255 | case CURAND_STATUS_INITIALIZATION_FAILED: 256 | return "CURAND_STATUS_INITIALIZATION_FAILED"; 257 | 258 | case CURAND_STATUS_ARCH_MISMATCH: 259 | return "CURAND_STATUS_ARCH_MISMATCH"; 260 | 261 | case CURAND_STATUS_INTERNAL_ERROR: 262 | return "CURAND_STATUS_INTERNAL_ERROR"; 263 | } 264 | 265 | return ""; 266 | } 267 | #endif 268 | 269 | #ifdef NVJPEGAPI 270 | // nvJPEG API errors 271 | static const char *_cudaGetErrorEnum(nvjpegStatus_t error) { 272 | switch (error) { 273 | case NVJPEG_STATUS_SUCCESS: 274 | return "NVJPEG_STATUS_SUCCESS"; 275 | 276 | case NVJPEG_STATUS_NOT_INITIALIZED: 277 | return "NVJPEG_STATUS_NOT_INITIALIZED"; 278 | 279 | case NVJPEG_STATUS_INVALID_PARAMETER: 280 | return "NVJPEG_STATUS_INVALID_PARAMETER"; 281 | 282 | case NVJPEG_STATUS_BAD_JPEG: 283 | return "NVJPEG_STATUS_BAD_JPEG"; 284 | 285 | case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: 286 | return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; 287 | 288 | case NVJPEG_STATUS_ALLOCATOR_FAILURE: 289 | return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; 290 | 291 | case NVJPEG_STATUS_EXECUTION_FAILED: 292 | return "NVJPEG_STATUS_EXECUTION_FAILED"; 293 | 294 | case NVJPEG_STATUS_ARCH_MISMATCH: 295 | return "NVJPEG_STATUS_ARCH_MISMATCH"; 296 | 297 | case NVJPEG_STATUS_INTERNAL_ERROR: 298 | return "NVJPEG_STATUS_INTERNAL_ERROR"; 299 | } 300 | 301 | return ""; 302 | } 303 | #endif 304 | 305 | #ifdef NV_NPPIDEFS_H 306 | // NPP API errors 307 | static const char *_cudaGetErrorEnum(NppStatus error) { 308 | switch (error) { 309 | case NPP_NOT_SUPPORTED_MODE_ERROR: 310 | return "NPP_NOT_SUPPORTED_MODE_ERROR"; 311 | 312 | case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: 313 | return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; 314 | 315 | case NPP_RESIZE_NO_OPERATION_ERROR: 316 | return "NPP_RESIZE_NO_OPERATION_ERROR"; 317 | 318 | case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: 319 | return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; 320 | 321 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 322 | 323 | case 
NPP_BAD_ARG_ERROR: 324 | return "NPP_BAD_ARGUMENT_ERROR"; 325 | 326 | case NPP_COEFF_ERROR: 327 | return "NPP_COEFFICIENT_ERROR"; 328 | 329 | case NPP_RECT_ERROR: 330 | return "NPP_RECTANGLE_ERROR"; 331 | 332 | case NPP_QUAD_ERROR: 333 | return "NPP_QUADRANGLE_ERROR"; 334 | 335 | case NPP_MEM_ALLOC_ERR: 336 | return "NPP_MEMORY_ALLOCATION_ERROR"; 337 | 338 | case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: 339 | return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; 340 | 341 | case NPP_INVALID_INPUT: 342 | return "NPP_INVALID_INPUT"; 343 | 344 | case NPP_POINTER_ERROR: 345 | return "NPP_POINTER_ERROR"; 346 | 347 | case NPP_WARNING: 348 | return "NPP_WARNING"; 349 | 350 | case NPP_ODD_ROI_WARNING: 351 | return "NPP_ODD_ROI_WARNING"; 352 | #else 353 | 354 | // These are for CUDA 5.5 or higher 355 | case NPP_BAD_ARGUMENT_ERROR: 356 | return "NPP_BAD_ARGUMENT_ERROR"; 357 | 358 | case NPP_COEFFICIENT_ERROR: 359 | return "NPP_COEFFICIENT_ERROR"; 360 | 361 | case NPP_RECTANGLE_ERROR: 362 | return "NPP_RECTANGLE_ERROR"; 363 | 364 | case NPP_QUADRANGLE_ERROR: 365 | return "NPP_QUADRANGLE_ERROR"; 366 | 367 | case NPP_MEMORY_ALLOCATION_ERR: 368 | return "NPP_MEMORY_ALLOCATION_ERROR"; 369 | 370 | case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: 371 | return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; 372 | 373 | case NPP_INVALID_HOST_POINTER_ERROR: 374 | return "NPP_INVALID_HOST_POINTER_ERROR"; 375 | 376 | case NPP_INVALID_DEVICE_POINTER_ERROR: 377 | return "NPP_INVALID_DEVICE_POINTER_ERROR"; 378 | #endif 379 | 380 | case NPP_LUT_NUMBER_OF_LEVELS_ERROR: 381 | return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; 382 | 383 | case NPP_TEXTURE_BIND_ERROR: 384 | return "NPP_TEXTURE_BIND_ERROR"; 385 | 386 | case NPP_WRONG_INTERSECTION_ROI_ERROR: 387 | return "NPP_WRONG_INTERSECTION_ROI_ERROR"; 388 | 389 | case NPP_NOT_EVEN_STEP_ERROR: 390 | return "NPP_NOT_EVEN_STEP_ERROR"; 391 | 392 | case NPP_INTERPOLATION_ERROR: 393 | return "NPP_INTERPOLATION_ERROR"; 394 | 395 | case NPP_RESIZE_FACTOR_ERROR: 396 | return 
"NPP_RESIZE_FACTOR_ERROR"; 397 | 398 | case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: 399 | return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; 400 | 401 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 402 | 403 | case NPP_MEMFREE_ERR: 404 | return "NPP_MEMFREE_ERR"; 405 | 406 | case NPP_MEMSET_ERR: 407 | return "NPP_MEMSET_ERR"; 408 | 409 | case NPP_MEMCPY_ERR: 410 | return "NPP_MEMCPY_ERROR"; 411 | 412 | case NPP_MIRROR_FLIP_ERR: 413 | return "NPP_MIRROR_FLIP_ERR"; 414 | #else 415 | 416 | case NPP_MEMFREE_ERROR: 417 | return "NPP_MEMFREE_ERROR"; 418 | 419 | case NPP_MEMSET_ERROR: 420 | return "NPP_MEMSET_ERROR"; 421 | 422 | case NPP_MEMCPY_ERROR: 423 | return "NPP_MEMCPY_ERROR"; 424 | 425 | case NPP_MIRROR_FLIP_ERROR: 426 | return "NPP_MIRROR_FLIP_ERROR"; 427 | #endif 428 | 429 | case NPP_ALIGNMENT_ERROR: 430 | return "NPP_ALIGNMENT_ERROR"; 431 | 432 | case NPP_STEP_ERROR: 433 | return "NPP_STEP_ERROR"; 434 | 435 | case NPP_SIZE_ERROR: 436 | return "NPP_SIZE_ERROR"; 437 | 438 | case NPP_NULL_POINTER_ERROR: 439 | return "NPP_NULL_POINTER_ERROR"; 440 | 441 | case NPP_CUDA_KERNEL_EXECUTION_ERROR: 442 | return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; 443 | 444 | case NPP_NOT_IMPLEMENTED_ERROR: 445 | return "NPP_NOT_IMPLEMENTED_ERROR"; 446 | 447 | case NPP_ERROR: 448 | return "NPP_ERROR"; 449 | 450 | case NPP_SUCCESS: 451 | return "NPP_SUCCESS"; 452 | 453 | case NPP_WRONG_INTERSECTION_QUAD_WARNING: 454 | return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; 455 | 456 | case NPP_MISALIGNED_DST_ROI_WARNING: 457 | return "NPP_MISALIGNED_DST_ROI_WARNING"; 458 | 459 | case NPP_AFFINE_QUAD_INCORRECT_WARNING: 460 | return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; 461 | 462 | case NPP_DOUBLE_SIZE_WARNING: 463 | return "NPP_DOUBLE_SIZE_WARNING"; 464 | 465 | case NPP_WRONG_INTERSECTION_ROI_WARNING: 466 | return "NPP_WRONG_INTERSECTION_ROI_WARNING"; 467 | 468 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 469 | /* These are 6.0 or higher */ 470 | case 
NPP_LUT_PALETTE_BITSIZE_ERROR: 471 | return "NPP_LUT_PALETTE_BITSIZE_ERROR"; 472 | 473 | case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: 474 | return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; 475 | 476 | case NPP_QUALITY_INDEX_ERROR: 477 | return "NPP_QUALITY_INDEX_ERROR"; 478 | 479 | case NPP_CHANNEL_ORDER_ERROR: 480 | return "NPP_CHANNEL_ORDER_ERROR"; 481 | 482 | case NPP_ZERO_MASK_VALUE_ERROR: 483 | return "NPP_ZERO_MASK_VALUE_ERROR"; 484 | 485 | case NPP_NUMBER_OF_CHANNELS_ERROR: 486 | return "NPP_NUMBER_OF_CHANNELS_ERROR"; 487 | 488 | case NPP_COI_ERROR: 489 | return "NPP_COI_ERROR"; 490 | 491 | case NPP_DIVISOR_ERROR: 492 | return "NPP_DIVISOR_ERROR"; 493 | 494 | case NPP_CHANNEL_ERROR: 495 | return "NPP_CHANNEL_ERROR"; 496 | 497 | case NPP_STRIDE_ERROR: 498 | return "NPP_STRIDE_ERROR"; 499 | 500 | case NPP_ANCHOR_ERROR: 501 | return "NPP_ANCHOR_ERROR"; 502 | 503 | case NPP_MASK_SIZE_ERROR: 504 | return "NPP_MASK_SIZE_ERROR"; 505 | 506 | case NPP_MOMENT_00_ZERO_ERROR: 507 | return "NPP_MOMENT_00_ZERO_ERROR"; 508 | 509 | case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: 510 | return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; 511 | 512 | case NPP_THRESHOLD_ERROR: 513 | return "NPP_THRESHOLD_ERROR"; 514 | 515 | case NPP_CONTEXT_MATCH_ERROR: 516 | return "NPP_CONTEXT_MATCH_ERROR"; 517 | 518 | case NPP_FFT_FLAG_ERROR: 519 | return "NPP_FFT_FLAG_ERROR"; 520 | 521 | case NPP_FFT_ORDER_ERROR: 522 | return "NPP_FFT_ORDER_ERROR"; 523 | 524 | case NPP_SCALE_RANGE_ERROR: 525 | return "NPP_SCALE_RANGE_ERROR"; 526 | 527 | case NPP_DATA_TYPE_ERROR: 528 | return "NPP_DATA_TYPE_ERROR"; 529 | 530 | case NPP_OUT_OFF_RANGE_ERROR: 531 | return "NPP_OUT_OFF_RANGE_ERROR"; 532 | 533 | case NPP_DIVIDE_BY_ZERO_ERROR: 534 | return "NPP_DIVIDE_BY_ZERO_ERROR"; 535 | 536 | case NPP_RANGE_ERROR: 537 | return "NPP_RANGE_ERROR"; 538 | 539 | case NPP_NO_MEMORY_ERROR: 540 | return "NPP_NO_MEMORY_ERROR"; 541 | 542 | case NPP_ERROR_RESERVED: 543 | return "NPP_ERROR_RESERVED"; 544 | 545 | case NPP_NO_OPERATION_WARNING: 546 | 
return "NPP_NO_OPERATION_WARNING"; 547 | 548 | case NPP_DIVIDE_BY_ZERO_WARNING: 549 | return "NPP_DIVIDE_BY_ZERO_WARNING"; 550 | #endif 551 | 552 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000 553 | /* These are 7.0 or higher */ 554 | case NPP_OVERFLOW_ERROR: 555 | return "NPP_OVERFLOW_ERROR"; 556 | 557 | case NPP_CORRUPTED_DATA_ERROR: 558 | return "NPP_CORRUPTED_DATA_ERROR"; 559 | #endif 560 | } 561 | 562 | return ""; 563 | } 564 | #endif 565 | 566 | template 567 | void check(T result, char const *const func, const char *const file, 568 | int const line) { 569 | if (result) { 570 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, 571 | static_cast(result), _cudaGetErrorEnum(result), func); 572 | exit(EXIT_FAILURE); 573 | } 574 | } 575 | 576 | #ifdef __DRIVER_TYPES_H__ 577 | // This will output the proper CUDA error strings in the event 578 | // that a CUDA host call returns an error 579 | #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) 580 | 581 | // This will output the proper error string when calling cudaGetLastError 582 | #define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) 583 | 584 | inline void __getLastCudaError(const char *errorMessage, const char *file, 585 | const int line) { 586 | cudaError_t err = cudaGetLastError(); 587 | 588 | if (cudaSuccess != err) { 589 | fprintf(stderr, 590 | "%s(%i) : getLastCudaError() CUDA error :" 591 | " %s : (%d) %s.\n", 592 | file, line, errorMessage, static_cast(err), 593 | cudaGetErrorString(err)); 594 | exit(EXIT_FAILURE); 595 | } 596 | } 597 | 598 | // This will only print the proper error string when calling cudaGetLastError 599 | // but not exit program incase error detected. 
600 | #define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) 601 | 602 | inline void __printLastCudaError(const char *errorMessage, const char *file, 603 | const int line) { 604 | cudaError_t err = cudaGetLastError(); 605 | 606 | if (cudaSuccess != err) { 607 | fprintf(stderr, 608 | "%s(%i) : getLastCudaError() CUDA error :" 609 | " %s : (%d) %s.\n", 610 | file, line, errorMessage, static_cast(err), 611 | cudaGetErrorString(err)); 612 | } 613 | } 614 | #endif 615 | 616 | #ifndef MAX 617 | #define MAX(a, b) (a > b ? a : b) 618 | #endif 619 | 620 | // Float To Int conversion 621 | inline int ftoi(float value) { 622 | return (value >= 0 ? static_cast(value + 0.5) 623 | : static_cast(value - 0.5)); 624 | } 625 | 626 | // Beginning of GPU Architecture definitions 627 | inline int _ConvertSMVer2Cores(int major, int minor) { 628 | // Defines for GPU Architecture types (using the SM version to determine 629 | // the # of cores per SM 630 | typedef struct { 631 | int SM; // 0xMm (hexidecimal notation), M = SM Major version, 632 | // and m = SM minor version 633 | int Cores; 634 | } sSMtoCores; 635 | 636 | sSMtoCores nGpuArchCoresPerSM[] = { 637 | {0x30, 192}, 638 | {0x32, 192}, 639 | {0x35, 192}, 640 | {0x37, 192}, 641 | {0x50, 128}, 642 | {0x52, 128}, 643 | {0x53, 128}, 644 | {0x60, 64}, 645 | {0x61, 128}, 646 | {0x62, 128}, 647 | {0x70, 64}, 648 | {0x72, 64}, 649 | {0x75, 64}, 650 | {0x80, 64}, 651 | {0x86, 128}, 652 | {0x87, 128}, 653 | {-1, -1}}; 654 | 655 | int index = 0; 656 | 657 | while (nGpuArchCoresPerSM[index].SM != -1) { 658 | if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { 659 | return nGpuArchCoresPerSM[index].Cores; 660 | } 661 | 662 | index++; 663 | } 664 | 665 | // If we don't find the values, we default use the previous one 666 | // to run properly 667 | printf( 668 | "MapSMtoCores for SM %d.%d is undefined." 
669 | " Default to use %d Cores/SM\n", 670 | major, minor, nGpuArchCoresPerSM[index - 1].Cores); 671 | return nGpuArchCoresPerSM[index - 1].Cores; 672 | } 673 | 674 | inline const char* _ConvertSMVer2ArchName(int major, int minor) { 675 | // Defines for GPU Architecture types (using the SM version to determine 676 | // the GPU Arch name) 677 | typedef struct { 678 | int SM; // 0xMm (hexidecimal notation), M = SM Major version, 679 | // and m = SM minor version 680 | const char* name; 681 | } sSMtoArchName; 682 | 683 | sSMtoArchName nGpuArchNameSM[] = { 684 | {0x30, "Kepler"}, 685 | {0x32, "Kepler"}, 686 | {0x35, "Kepler"}, 687 | {0x37, "Kepler"}, 688 | {0x50, "Maxwell"}, 689 | {0x52, "Maxwell"}, 690 | {0x53, "Maxwell"}, 691 | {0x60, "Pascal"}, 692 | {0x61, "Pascal"}, 693 | {0x62, "Pascal"}, 694 | {0x70, "Volta"}, 695 | {0x72, "Xavier"}, 696 | {0x75, "Turing"}, 697 | {0x80, "Ampere"}, 698 | {0x86, "Ampere"}, 699 | {0x87, "Ampere"}, 700 | {-1, "Graphics Device"}}; 701 | 702 | int index = 0; 703 | 704 | while (nGpuArchNameSM[index].SM != -1) { 705 | if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) { 706 | return nGpuArchNameSM[index].name; 707 | } 708 | 709 | index++; 710 | } 711 | 712 | // If we don't find the values, we default use the previous one 713 | // to run properly 714 | printf( 715 | "MapSMtoArchName for SM %d.%d is undefined." 
716 | " Default to use %s\n", 717 | major, minor, nGpuArchNameSM[index - 1].name); 718 | return nGpuArchNameSM[index - 1].name; 719 | } 720 | // end of GPU Architecture definitions 721 | 722 | #ifdef __CUDA_RUNTIME_H__ 723 | // General GPU Device CUDA Initialization 724 | inline int gpuDeviceInit(int devID) { 725 | int device_count; 726 | checkCudaErrors(cudaGetDeviceCount(&device_count)); 727 | 728 | if (device_count == 0) { 729 | fprintf(stderr, 730 | "gpuDeviceInit() CUDA error: " 731 | "no devices supporting CUDA.\n"); 732 | exit(EXIT_FAILURE); 733 | } 734 | 735 | if (devID < 0) { 736 | devID = 0; 737 | } 738 | 739 | if (devID > device_count - 1) { 740 | fprintf(stderr, "\n"); 741 | fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", 742 | device_count); 743 | fprintf(stderr, 744 | ">> gpuDeviceInit (-device=%d) is not a valid" 745 | " GPU device. <<\n", 746 | devID); 747 | fprintf(stderr, "\n"); 748 | return -devID; 749 | } 750 | 751 | int computeMode = -1, major = 0, minor = 0; 752 | checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID)); 753 | checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); 754 | checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); 755 | if (computeMode == cudaComputeModeProhibited) { 756 | fprintf(stderr, 757 | "Error: device is running in , no threads can use cudaSetDevice().\n"); 759 | return -1; 760 | } 761 | 762 | if (major < 1) { 763 | fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); 764 | exit(EXIT_FAILURE); 765 | } 766 | 767 | checkCudaErrors(cudaSetDevice(devID)); 768 | printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor)); 769 | 770 | return devID; 771 | } 772 | 773 | // This function returns the best GPU (with maximum GFLOPS) 774 | inline int gpuGetMaxGflopsDeviceId() { 775 | int current_device = 0, sm_per_multiproc = 0; 776 | int 
max_perf_device = 0; 777 | int device_count = 0; 778 | int devices_prohibited = 0; 779 | 780 | uint64_t max_compute_perf = 0; 781 | checkCudaErrors(cudaGetDeviceCount(&device_count)); 782 | 783 | if (device_count == 0) { 784 | fprintf(stderr, 785 | "gpuGetMaxGflopsDeviceId() CUDA error:" 786 | " no devices supporting CUDA.\n"); 787 | exit(EXIT_FAILURE); 788 | } 789 | 790 | // Find the best CUDA capable GPU device 791 | current_device = 0; 792 | 793 | while (current_device < device_count) { 794 | int computeMode = -1, major = 0, minor = 0; 795 | checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); 796 | checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device)); 797 | checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device)); 798 | 799 | // If this GPU is not running on Compute Mode prohibited, 800 | // then we can add it to the list 801 | if (computeMode != cudaComputeModeProhibited) { 802 | if (major == 9999 && minor == 9999) { 803 | sm_per_multiproc = 1; 804 | } else { 805 | sm_per_multiproc = 806 | _ConvertSMVer2Cores(major, minor); 807 | } 808 | int multiProcessorCount = 0, clockRate = 0; 809 | checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device)); 810 | cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device); 811 | if (result != cudaSuccess) { 812 | // If cudaDevAttrClockRate attribute is not supported we 813 | // set clockRate as 1, to consider GPU with most SMs and CUDA Cores. 
814 | if(result == cudaErrorInvalidValue) { 815 | clockRate = 1; 816 | } 817 | else { 818 | fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__, 819 | static_cast(result), _cudaGetErrorEnum(result)); 820 | exit(EXIT_FAILURE); 821 | } 822 | } 823 | uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate; 824 | 825 | if (compute_perf > max_compute_perf) { 826 | max_compute_perf = compute_perf; 827 | max_perf_device = current_device; 828 | } 829 | } else { 830 | devices_prohibited++; 831 | } 832 | 833 | ++current_device; 834 | } 835 | 836 | if (devices_prohibited == device_count) { 837 | fprintf(stderr, 838 | "gpuGetMaxGflopsDeviceId() CUDA error:" 839 | " all devices have compute mode prohibited.\n"); 840 | exit(EXIT_FAILURE); 841 | } 842 | 843 | return max_perf_device; 844 | } 845 | 846 | // Initialization code to find the best CUDA Device 847 | inline int findCudaDevice(int argc, const char **argv) { 848 | int devID = 0; 849 | 850 | // If the command-line has a device number specified, use it 851 | if (checkCmdLineFlag(argc, argv, "device")) { 852 | devID = getCmdLineArgumentInt(argc, argv, "device="); 853 | 854 | if (devID < 0) { 855 | printf("Invalid command line parameter\n "); 856 | exit(EXIT_FAILURE); 857 | } else { 858 | devID = gpuDeviceInit(devID); 859 | 860 | if (devID < 0) { 861 | printf("exiting...\n"); 862 | exit(EXIT_FAILURE); 863 | } 864 | } 865 | } else { 866 | // Otherwise pick the device with highest Gflops/s 867 | devID = gpuGetMaxGflopsDeviceId(); 868 | checkCudaErrors(cudaSetDevice(devID)); 869 | int major = 0, minor = 0; 870 | checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); 871 | checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); 872 | printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", 873 | devID, _ConvertSMVer2ArchName(major, minor), major, minor); 874 | 875 | } 876 | 877 | return devID; 878 | } 
879 | 880 | inline int findIntegratedGPU() { 881 | int current_device = 0; 882 | int device_count = 0; 883 | int devices_prohibited = 0; 884 | 885 | checkCudaErrors(cudaGetDeviceCount(&device_count)); 886 | 887 | if (device_count == 0) { 888 | fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); 889 | exit(EXIT_FAILURE); 890 | } 891 | 892 | // Find the integrated GPU which is compute capable 893 | while (current_device < device_count) { 894 | int computeMode = -1, integrated = -1; 895 | checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); 896 | checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device)); 897 | // If GPU is integrated and is not running on Compute Mode prohibited, 898 | // then cuda can map to GLES resource 899 | if (integrated && (computeMode != cudaComputeModeProhibited)) { 900 | checkCudaErrors(cudaSetDevice(current_device)); 901 | 902 | int major = 0, minor = 0; 903 | checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device)); 904 | checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device)); 905 | printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", 906 | current_device, _ConvertSMVer2ArchName(major, minor), major, minor); 907 | 908 | return current_device; 909 | } else { 910 | devices_prohibited++; 911 | } 912 | 913 | current_device++; 914 | } 915 | 916 | if (devices_prohibited == device_count) { 917 | fprintf(stderr, 918 | "CUDA error:" 919 | " No GLES-CUDA Interop capable GPU found.\n"); 920 | exit(EXIT_FAILURE); 921 | } 922 | 923 | return -1; 924 | } 925 | 926 | // General check for CUDA GPU SM Capabilities 927 | inline bool checkCudaCapabilities(int major_version, int minor_version) { 928 | int dev; 929 | int major = 0, minor = 0; 930 | 931 | checkCudaErrors(cudaGetDevice(&dev)); 932 | checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, 
dev)); 933 | checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev)); 934 | 935 | if ((major > major_version) || 936 | (major == major_version && 937 | minor >= minor_version)) { 938 | printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, 939 | _ConvertSMVer2ArchName(major, minor), major, minor); 940 | return true; 941 | } else { 942 | printf( 943 | " No GPU device was found that can support " 944 | "CUDA compute capability %d.%d.\n", 945 | major_version, minor_version); 946 | return false; 947 | } 948 | } 949 | #endif 950 | 951 | // end of CUDA Helper Functions 952 | 953 | #endif // COMMON_HELPER_CUDA_H_ 954 | -------------------------------------------------------------------------------- /include/helper_string.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. 3 | * 4 | * Please refer to the NVIDIA end user license agreement (EULA) associated 5 | * with this source code for terms and conditions that govern your use of 6 | * this software. Any use, reproduction, disclosure, or distribution of 7 | * this software and related documentation outside the terms of the EULA 8 | * is strictly prohibited. 
9 | * 10 | */ 11 | 12 | // These are helper functions for the SDK samples (string parsing, timers, etc) 13 | #ifndef COMMON_HELPER_STRING_H_ 14 | #define COMMON_HELPER_STRING_H_ 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 22 | #ifndef _CRT_SECURE_NO_DEPRECATE 23 | #define _CRT_SECURE_NO_DEPRECATE 24 | #endif 25 | #ifndef STRCASECMP 26 | #define STRCASECMP _stricmp 27 | #endif 28 | #ifndef STRNCASECMP 29 | #define STRNCASECMP _strnicmp 30 | #endif 31 | #ifndef STRCPY 32 | #define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) 33 | #endif 34 | 35 | #ifndef FOPEN 36 | #define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) 37 | #endif 38 | #ifndef FOPEN_FAIL 39 | #define FOPEN_FAIL(result) (result != 0) 40 | #endif 41 | #ifndef SSCANF 42 | #define SSCANF sscanf_s 43 | #endif 44 | #ifndef SPRINTF 45 | #define SPRINTF sprintf_s 46 | #endif 47 | #else // Linux Includes 48 | #include 49 | #include 50 | 51 | #ifndef STRCASECMP 52 | #define STRCASECMP strcasecmp 53 | #endif 54 | #ifndef STRNCASECMP 55 | #define STRNCASECMP strncasecmp 56 | #endif 57 | #ifndef STRCPY 58 | #define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) 59 | #endif 60 | 61 | #ifndef FOPEN 62 | #define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) 63 | #endif 64 | #ifndef FOPEN_FAIL 65 | #define FOPEN_FAIL(result) (result == NULL) 66 | #endif 67 | #ifndef SSCANF 68 | #define SSCANF sscanf 69 | #endif 70 | #ifndef SPRINTF 71 | #define SPRINTF sprintf 72 | #endif 73 | #endif 74 | 75 | #ifndef EXIT_WAIVED 76 | #define EXIT_WAIVED 2 77 | #endif 78 | 79 | // CUDA Utility Helper Functions 80 | inline int stringRemoveDelimiter(char delimiter, const char *string) { 81 | int string_start = 0; 82 | 83 | while (string[string_start] == delimiter) { 84 | string_start++; 85 | } 86 | 87 | if (string_start >= static_cast(strlen(string) - 1)) { 88 | 
return 0; 89 | } 90 | 91 | return string_start; 92 | } 93 | 94 | inline int getFileExtension(char *filename, char **extension) { 95 | int string_length = static_cast(strlen(filename)); 96 | 97 | while (filename[string_length--] != '.') { 98 | if (string_length == 0) break; 99 | } 100 | 101 | if (string_length > 0) string_length += 2; 102 | 103 | if (string_length == 0) 104 | *extension = NULL; 105 | else 106 | *extension = &filename[string_length]; 107 | 108 | return string_length; 109 | } 110 | 111 | inline bool checkCmdLineFlag(const int argc, const char **argv, 112 | const char *string_ref) { 113 | bool bFound = false; 114 | 115 | if (argc >= 1) { 116 | for (int i = 1; i < argc; i++) { 117 | int string_start = stringRemoveDelimiter('-', argv[i]); 118 | const char *string_argv = &argv[i][string_start]; 119 | 120 | const char *equal_pos = strchr(string_argv, '='); 121 | int argv_length = static_cast( 122 | equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); 123 | 124 | int length = static_cast(strlen(string_ref)); 125 | 126 | if (length == argv_length && 127 | !STRNCASECMP(string_argv, string_ref, length)) { 128 | bFound = true; 129 | continue; 130 | } 131 | } 132 | } 133 | 134 | return bFound; 135 | } 136 | 137 | // This function wraps the CUDA Driver API into a template function 138 | template 139 | inline bool getCmdLineArgumentValue(const int argc, const char **argv, 140 | const char *string_ref, T *value) { 141 | bool bFound = false; 142 | 143 | if (argc >= 1) { 144 | for (int i = 1; i < argc; i++) { 145 | int string_start = stringRemoveDelimiter('-', argv[i]); 146 | const char *string_argv = &argv[i][string_start]; 147 | int length = static_cast(strlen(string_ref)); 148 | 149 | if (!STRNCASECMP(string_argv, string_ref, length)) { 150 | if (length + 1 <= static_cast(strlen(string_argv))) { 151 | int auto_inc = (string_argv[length] == '=') ? 
1 : 0; 152 | *value = (T)atoi(&string_argv[length + auto_inc]); 153 | } 154 | 155 | bFound = true; 156 | i = argc; 157 | } 158 | } 159 | } 160 | 161 | return bFound; 162 | } 163 | 164 | inline int getCmdLineArgumentInt(const int argc, const char **argv, 165 | const char *string_ref) { 166 | bool bFound = false; 167 | int value = -1; 168 | 169 | if (argc >= 1) { 170 | for (int i = 1; i < argc; i++) { 171 | int string_start = stringRemoveDelimiter('-', argv[i]); 172 | const char *string_argv = &argv[i][string_start]; 173 | int length = static_cast(strlen(string_ref)); 174 | 175 | if (!STRNCASECMP(string_argv, string_ref, length)) { 176 | if (length + 1 <= static_cast(strlen(string_argv))) { 177 | int auto_inc = (string_argv[length] == '=') ? 1 : 0; 178 | value = atoi(&string_argv[length + auto_inc]); 179 | } else { 180 | value = 0; 181 | } 182 | 183 | bFound = true; 184 | continue; 185 | } 186 | } 187 | } 188 | 189 | if (bFound) { 190 | return value; 191 | } else { 192 | return 0; 193 | } 194 | } 195 | 196 | inline float getCmdLineArgumentFloat(const int argc, const char **argv, 197 | const char *string_ref) { 198 | bool bFound = false; 199 | float value = -1; 200 | 201 | if (argc >= 1) { 202 | for (int i = 1; i < argc; i++) { 203 | int string_start = stringRemoveDelimiter('-', argv[i]); 204 | const char *string_argv = &argv[i][string_start]; 205 | int length = static_cast(strlen(string_ref)); 206 | 207 | if (!STRNCASECMP(string_argv, string_ref, length)) { 208 | if (length + 1 <= static_cast(strlen(string_argv))) { 209 | int auto_inc = (string_argv[length] == '=') ? 
1 : 0; 210 | value = static_cast(atof(&string_argv[length + auto_inc])); 211 | } else { 212 | value = 0.f; 213 | } 214 | 215 | bFound = true; 216 | continue; 217 | } 218 | } 219 | } 220 | 221 | if (bFound) { 222 | return value; 223 | } else { 224 | return 0; 225 | } 226 | } 227 | 228 | inline bool getCmdLineArgumentString(const int argc, const char **argv, 229 | const char *string_ref, 230 | char **string_retval) { 231 | bool bFound = false; 232 | 233 | if (argc >= 1) { 234 | for (int i = 1; i < argc; i++) { 235 | int string_start = stringRemoveDelimiter('-', argv[i]); 236 | char *string_argv = const_cast(&argv[i][string_start]); 237 | int length = static_cast(strlen(string_ref)); 238 | 239 | if (!STRNCASECMP(string_argv, string_ref, length)) { 240 | *string_retval = &string_argv[length + 1]; 241 | bFound = true; 242 | continue; 243 | } 244 | } 245 | } 246 | 247 | if (!bFound) { 248 | *string_retval = NULL; 249 | } 250 | 251 | return bFound; 252 | } 253 | 254 | ////////////////////////////////////////////////////////////////////////////// 255 | //! Find the path for a file assuming that 256 | //! files are found in the searchPath. 257 | //! 258 | //! @return the path if succeeded, otherwise 0 259 | //! @param filename name of the file 260 | //! @param executable_path optional absolute path of the executable 261 | ////////////////////////////////////////////////////////////////////////////// 262 | inline char *sdkFindFilePath(const char *filename, 263 | const char *executable_path) { 264 | // defines a variable that is replaced with the name of the 265 | // executable 266 | 267 | // Typical relative search paths to locate needed companion files (e.g. 
sample 268 | // input data, or JIT source files) The origin for the relative search may be 269 | // the .exe file, a .bat file launching an .exe, a browser .exe launching the 270 | // .exe or .bat, etc 271 | const char *searchPath[] = { 272 | "./", // same dir 273 | "./_data_files/", 274 | "./common/", // "/common/" subdir 275 | "./common/data/", // "/common/data/" subdir 276 | "./data/", // "/data/" subdir 277 | "./src/", // "/src/" subdir 278 | "./src//data/", // "/src//data/" subdir 279 | "./inc/", // "/inc/" subdir 280 | "./0_Simple/", // "/0_Simple/" subdir 281 | "./1_Utilities/", // "/1_Utilities/" subdir 282 | "./2_Graphics/", // "/2_Graphics/" subdir 283 | "./3_Imaging/", // "/3_Imaging/" subdir 284 | "./4_Finance/", // "/4_Finance/" subdir 285 | "./5_Simulations/", // "/5_Simulations/" subdir 286 | "./6_Advanced/", // "/6_Advanced/" subdir 287 | "./7_CUDALibraries/", // "/7_CUDALibraries/" subdir 288 | "./8_Android/", // "/8_Android/" subdir 289 | "./samples/", // "/samples/" subdir 290 | 291 | "./0_Simple//data/", // "/0_Simple//data/" 292 | // subdir 293 | "./1_Utilities//data/", // "/1_Utilities//data/" 294 | // subdir 295 | "./2_Graphics//data/", // "/2_Graphics//data/" 296 | // subdir 297 | "./3_Imaging//data/", // "/3_Imaging//data/" 298 | // subdir 299 | "./4_Finance//data/", // "/4_Finance//data/" 300 | // subdir 301 | "./5_Simulations//data/", // "/5_Simulations//data/" 302 | // subdir 303 | "./6_Advanced//data/", // "/6_Advanced//data/" 304 | // subdir 305 | "./7_CUDALibraries//", // "/7_CUDALibraries//" 306 | // subdir 307 | "./7_CUDALibraries//data/", // "/7_CUDALibraries//data/" 308 | // subdir 309 | 310 | "../", // up 1 in tree 311 | "../common/", // up 1 in tree, "/common/" subdir 312 | "../common/data/", // up 1 in tree, "/common/data/" subdir 313 | "../data/", // up 1 in tree, "/data/" subdir 314 | "../src/", // up 1 in tree, "/src/" subdir 315 | "../inc/", // up 1 in tree, "/inc/" subdir 316 | 317 | "../0_Simple//data/", // up 1 in tree, 
318 | // "/0_Simple//" 319 | // subdir 320 | "../1_Utilities//data/", // up 1 in tree, 321 | // "/1_Utilities//" 322 | // subdir 323 | "../2_Graphics//data/", // up 1 in tree, 324 | // "/2_Graphics//" 325 | // subdir 326 | "../3_Imaging//data/", // up 1 in tree, 327 | // "/3_Imaging//" 328 | // subdir 329 | "../4_Finance//data/", // up 1 in tree, 330 | // "/4_Finance//" 331 | // subdir 332 | "../5_Simulations//data/", // up 1 in tree, 333 | // "/5_Simulations//" 334 | // subdir 335 | "../6_Advanced//data/", // up 1 in tree, 336 | // "/6_Advanced//" 337 | // subdir 338 | "../7_CUDALibraries//data/", // up 1 in tree, 339 | // "/7_CUDALibraries//" 340 | // subdir 341 | "../8_Android//data/", // up 1 in tree, 342 | // "/8_Android//" 343 | // subdir 344 | "../samples//data/", // up 1 in tree, 345 | // "/samples//" 346 | // subdir 347 | "../../", // up 2 in tree 348 | "../../common/", // up 2 in tree, "/common/" subdir 349 | "../../common/data/", // up 2 in tree, "/common/data/" subdir 350 | "../../data/", // up 2 in tree, "/data/" subdir 351 | "../../src/", // up 2 in tree, "/src/" subdir 352 | "../../inc/", // up 2 in tree, "/inc/" subdir 353 | "../../sandbox//data/", // up 2 in tree, 354 | // "/sandbox//" 355 | // subdir 356 | "../../0_Simple//data/", // up 2 in tree, 357 | // "/0_Simple//" 358 | // subdir 359 | "../../1_Utilities//data/", // up 2 in tree, 360 | // "/1_Utilities//" 361 | // subdir 362 | "../../2_Graphics//data/", // up 2 in tree, 363 | // "/2_Graphics//" 364 | // subdir 365 | "../../3_Imaging//data/", // up 2 in tree, 366 | // "/3_Imaging//" 367 | // subdir 368 | "../../4_Finance//data/", // up 2 in tree, 369 | // "/4_Finance//" 370 | // subdir 371 | "../../5_Simulations//data/", // up 2 in tree, 372 | // "/5_Simulations//" 373 | // subdir 374 | "../../6_Advanced//data/", // up 2 in tree, 375 | // "/6_Advanced//" 376 | // subdir 377 | "../../7_CUDALibraries//data/", // up 2 in tree, 378 | // "/7_CUDALibraries//" 379 | // subdir 380 | 
"../../8_Android//data/", // up 2 in tree, 381 | // "/8_Android//" 382 | // subdir 383 | "../../samples//data/", // up 2 in tree, 384 | // "/samples//" 385 | // subdir 386 | "../../../", // up 3 in tree 387 | "../../../src//", // up 3 in tree, 388 | // "/src//" subdir 389 | "../../../src//data/", // up 3 in tree, 390 | // "/src//data/" 391 | // subdir 392 | "../../../src//src/", // up 3 in tree, 393 | // "/src//src/" 394 | // subdir 395 | "../../../src//inc/", // up 3 in tree, 396 | // "/src//inc/" 397 | // subdir 398 | "../../../sandbox//", // up 3 in tree, 399 | // "/sandbox//" 400 | // subdir 401 | "../../../sandbox//data/", // up 3 in tree, 402 | // "/sandbox//data/" 403 | // subdir 404 | "../../../sandbox//src/", // up 3 in tree, 405 | // "/sandbox//src/" 406 | // subdir 407 | "../../../sandbox//inc/", // up 3 in tree, 408 | // "/sandbox//inc/" 409 | // subdir 410 | "../../../0_Simple//data/", // up 3 in tree, 411 | // "/0_Simple//" 412 | // subdir 413 | "../../../1_Utilities//data/", // up 3 in tree, 414 | // "/1_Utilities//" 415 | // subdir 416 | "../../../2_Graphics//data/", // up 3 in tree, 417 | // "/2_Graphics//" 418 | // subdir 419 | "../../../3_Imaging//data/", // up 3 in tree, 420 | // "/3_Imaging//" 421 | // subdir 422 | "../../../4_Finance//data/", // up 3 in tree, 423 | // "/4_Finance//" 424 | // subdir 425 | "../../../5_Simulations//data/", // up 3 in tree, 426 | // "/5_Simulations//" 427 | // subdir 428 | "../../../6_Advanced//data/", // up 3 in tree, 429 | // "/6_Advanced//" 430 | // subdir 431 | "../../../7_CUDALibraries//data/", // up 3 in tree, 432 | // "/7_CUDALibraries//" 433 | // subdir 434 | "../../../8_Android//data/", // up 3 in tree, 435 | // "/8_Android//" 436 | // subdir 437 | "../../../0_Simple//", // up 3 in tree, 438 | // "/0_Simple//" 439 | // subdir 440 | "../../../1_Utilities//", // up 3 in tree, 441 | // "/1_Utilities//" 442 | // subdir 443 | "../../../2_Graphics//", // up 3 in tree, 444 | // "/2_Graphics//" 445 | // subdir 
446 | "../../../3_Imaging//", // up 3 in tree, 447 | // "/3_Imaging//" 448 | // subdir 449 | "../../../4_Finance//", // up 3 in tree, 450 | // "/4_Finance//" 451 | // subdir 452 | "../../../5_Simulations//", // up 3 in tree, 453 | // "/5_Simulations//" 454 | // subdir 455 | "../../../6_Advanced//", // up 3 in tree, 456 | // "/6_Advanced//" 457 | // subdir 458 | "../../../7_CUDALibraries//", // up 3 in tree, 459 | // "/7_CUDALibraries//" 460 | // subdir 461 | "../../../8_Android//", // up 3 in tree, 462 | // "/8_Android//" 463 | // subdir 464 | "../../../samples//data/", // up 3 in tree, 465 | // "/samples//" 466 | // subdir 467 | "../../../common/", // up 3 in tree, "../../../common/" subdir 468 | "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir 469 | "../../../data/", // up 3 in tree, "../../../data/" subdir 470 | "../../../../", // up 4 in tree 471 | "../../../../src//", // up 4 in tree, 472 | // "/src//" subdir 473 | "../../../../src//data/", // up 4 in tree, 474 | // "/src//data/" 475 | // subdir 476 | "../../../../src//src/", // up 4 in tree, 477 | // "/src//src/" 478 | // subdir 479 | "../../../../src//inc/", // up 4 in tree, 480 | // "/src//inc/" 481 | // subdir 482 | "../../../../sandbox//", // up 4 in tree, 483 | // "/sandbox//" 484 | // subdir 485 | "../../../../sandbox//data/", // up 4 in tree, 486 | // "/sandbox//data/" 487 | // subdir 488 | "../../../../sandbox//src/", // up 4 in tree, 489 | // "/sandbox//src/" 490 | // subdir 491 | "../../../../sandbox//inc/", // up 4 in tree, 492 | // "/sandbox//inc/" 493 | // subdir 494 | "../../../../0_Simple//data/", // up 4 in tree, 495 | // "/0_Simple//" 496 | // subdir 497 | "../../../../1_Utilities//data/", // up 4 in tree, 498 | // "/1_Utilities//" 499 | // subdir 500 | "../../../../2_Graphics//data/", // up 4 in tree, 501 | // "/2_Graphics//" 502 | // subdir 503 | "../../../../3_Imaging//data/", // up 4 in tree, 504 | // "/3_Imaging//" 505 | // subdir 506 | 
"../../../../4_Finance//data/", // up 4 in tree, 507 | // "/4_Finance//" 508 | // subdir 509 | "../../../../5_Simulations//data/", // up 4 in tree, 510 | // "/5_Simulations//" 511 | // subdir 512 | "../../../../6_Advanced//data/", // up 4 in tree, 513 | // "/6_Advanced//" 514 | // subdir 515 | "../../../../7_CUDALibraries//data/", // up 4 in tree, 516 | // "/7_CUDALibraries//" 517 | // subdir 518 | "../../../../8_Android//data/", // up 4 in tree, 519 | // "/8_Android//" 520 | // subdir 521 | "../../../../0_Simple//", // up 4 in tree, 522 | // "/0_Simple//" 523 | // subdir 524 | "../../../../1_Utilities//", // up 4 in tree, 525 | // "/1_Utilities//" 526 | // subdir 527 | "../../../../2_Graphics//", // up 4 in tree, 528 | // "/2_Graphics//" 529 | // subdir 530 | "../../../../3_Imaging//", // up 4 in tree, 531 | // "/3_Imaging//" 532 | // subdir 533 | "../../../../4_Finance//", // up 4 in tree, 534 | // "/4_Finance//" 535 | // subdir 536 | "../../../../5_Simulations//", // up 4 in tree, 537 | // "/5_Simulations//" 538 | // subdir 539 | "../../../../6_Advanced//", // up 4 in tree, 540 | // "/6_Advanced//" 541 | // subdir 542 | "../../../../7_CUDALibraries//", // up 4 in tree, 543 | // "/7_CUDALibraries//" 544 | // subdir 545 | "../../../../8_Android//", // up 4 in tree, 546 | // "/8_Android//" 547 | // subdir 548 | "../../../../samples//data/", // up 4 in tree, 549 | // "/samples//" 550 | // subdir 551 | "../../../../common/", // up 4 in tree, "../../../common/" subdir 552 | "../../../../common/data/", // up 4 in tree, "../../../common/data/" 553 | // subdir 554 | "../../../../data/", // up 4 in tree, "../../../data/" subdir 555 | "../../../../../", // up 5 in tree 556 | "../../../../../src//", // up 5 in tree, 557 | // "/src//" 558 | // subdir 559 | "../../../../../src//data/", // up 5 in tree, 560 | // "/src//data/" 561 | // subdir 562 | "../../../../../src//src/", // up 5 in tree, 563 | // "/src//src/" 564 | // subdir 565 | "../../../../../src//inc/", // up 5 in 
tree, 566 | // "/src//inc/" 567 | // subdir 568 | "../../../../../sandbox//", // up 5 in tree, 569 | // "/sandbox//" 570 | // subdir 571 | "../../../../../sandbox//data/", // up 5 in tree, 572 | // "/sandbox//data/" 573 | // subdir 574 | "../../../../../sandbox//src/", // up 5 in tree, 575 | // "/sandbox//src/" 576 | // subdir 577 | "../../../../../sandbox//inc/", // up 5 in tree, 578 | // "/sandbox//inc/" 579 | // subdir 580 | "../../../../../0_Simple//data/", // up 5 in tree, 581 | // "/0_Simple//" 582 | // subdir 583 | "../../../../../1_Utilities//data/", // up 5 in tree, 584 | // "/1_Utilities//" 585 | // subdir 586 | "../../../../../2_Graphics//data/", // up 5 in tree, 587 | // "/2_Graphics//" 588 | // subdir 589 | "../../../../../3_Imaging//data/", // up 5 in tree, 590 | // "/3_Imaging//" 591 | // subdir 592 | "../../../../../4_Finance//data/", // up 5 in tree, 593 | // "/4_Finance//" 594 | // subdir 595 | "../../../../../5_Simulations//data/", // up 5 in tree, 596 | // "/5_Simulations//" 597 | // subdir 598 | "../../../../../6_Advanced//data/", // up 5 in tree, 599 | // "/6_Advanced//" 600 | // subdir 601 | "../../../../../7_CUDALibraries//data/", // up 5 in 602 | // tree, 603 | // "/7_CUDALibraries//" 604 | // subdir 605 | "../../../../../8_Android//data/", // up 5 in tree, 606 | // "/8_Android//" 607 | // subdir 608 | "../../../../../samples//data/", // up 5 in tree, 609 | // "/samples//" 610 | // subdir 611 | "../../../../../common/", // up 5 in tree, "../../../common/" subdir 612 | "../../../../../common/data/", // up 5 in tree, "../../../common/data/" 613 | // subdir 614 | }; 615 | 616 | // Extract the executable name 617 | std::string executable_name; 618 | 619 | if (executable_path != 0) { 620 | executable_name = std::string(executable_path); 621 | 622 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 623 | // Windows path delimiter 624 | size_t delimiter_pos = executable_name.find_last_of('\\'); 625 | 
executable_name.erase(0, delimiter_pos + 1); 626 | 627 | if (executable_name.rfind(".exe") != std::string::npos) { 628 | // we strip .exe, only if the .exe is found 629 | executable_name.resize(executable_name.size() - 4); 630 | } 631 | 632 | #else 633 | // Linux & OSX path delimiter 634 | size_t delimiter_pos = executable_name.find_last_of('/'); 635 | executable_name.erase(0, delimiter_pos + 1); 636 | #endif 637 | } 638 | 639 | // Loop over all search paths and return the first hit 640 | for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) { 641 | std::string path(searchPath[i]); 642 | size_t executable_name_pos = path.find(""); 643 | 644 | // If there is executable_name variable in the searchPath 645 | // replace it with the value 646 | if (executable_name_pos != std::string::npos) { 647 | if (executable_path != 0) { 648 | path.replace(executable_name_pos, strlen(""), 649 | executable_name); 650 | } else { 651 | // Skip this path entry if no executable argument is given 652 | continue; 653 | } 654 | } 655 | 656 | #ifdef _DEBUG 657 | printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); 658 | #endif 659 | 660 | // Test if the file exists 661 | path.append(filename); 662 | FILE *fp; 663 | FOPEN(fp, path.c_str(), "rb"); 664 | 665 | if (fp != NULL) { 666 | fclose(fp); 667 | // File found 668 | // returning an allocated array here for backwards compatibility reasons 669 | char *file_path = reinterpret_cast(malloc(path.length() + 1)); 670 | STRCPY(file_path, path.length() + 1, path.c_str()); 671 | return file_path; 672 | } 673 | 674 | if (fp) { 675 | fclose(fp); 676 | } 677 | } 678 | 679 | // File not found 680 | return 0; 681 | } 682 | 683 | #endif // COMMON_HELPER_STRING_H_ 684 | -------------------------------------------------------------------------------- /include/ticktock.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #define TICK(x) auto 
bench_##x = std::chrono::steady_clock::now();
#define TOCK(x) printf("%s: %lfs\n", #x, std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - bench_##x).count());
--------------------------------------------------------------------------------
/include/wangsrng.h:
--------------------------------------------------------------------------------
#pragma once

#include <cstddef>
#include <cstdint>

// Wang's Hash
// https://www.reedbeta.com/blog/quick-and-easy-gpu-random-numbers-in-d3d11/
// generates deterministic random numbers depending on argument
// unlike the single-threaded std::rand, wangshash can be trivially parallelized
// very useful in Path Tracing etc., where seed can be the pixel coordinates
struct wangsrng {
    uint32_t seed;

    constexpr wangsrng(uint32_t seed = 0) : seed(seed) {}

    // Combine two coordinates into one seed.
    constexpr wangsrng(uint32_t seedx, uint32_t seedy)
        : wangsrng(seedx ^ randomize(seedy)) {}

    // Combine three coordinates into one seed.
    constexpr wangsrng(uint32_t seedx, uint32_t seedy, uint32_t seedz)
        : wangsrng(seedx ^ randomize(seedy ^ randomize(seedz))) {}

    // One round of Wang's integer hash.
    constexpr static uint32_t randomize(uint32_t i) {
        i = (i ^ 61) ^ (i >> 16);
        i *= 9;
        i ^= i << 4;
        i *= 0x27d4eb2d;
        i ^= i >> 15;
        return i;
    }

    // Advances the state by one hash round and returns the new state.
    constexpr uint32_t operator()() {
        seed = randomize(seed);
        return seed;
    }

    constexpr uint32_t next_uint32() {
        return operator()();
    }

    constexpr int32_t next_int32() {
        return (int32_t)next_uint32();
    }

    constexpr uint16_t next_uint16() {
        return uint16_t(next_uint32() & 0xffff);
    }

    constexpr int16_t next_int16() {
        return (int16_t)next_uint16();
    }

    constexpr uint8_t next_uint8() {
        return uint8_t(next_uint32() & 0xff);
    }

    constexpr int8_t next_int8() {
        return (int8_t)next_uint8();
    }

    constexpr bool next_bool() {
        return next_uint32() & 1;
    }

    // Two draws: the first fills the low 32 bits, the second the high 32 bits.
    // The draws are sequenced in separate statements because the operands of
    // `|` may be evaluated in either order, which would make the result
    // compiler-dependent.
    constexpr uint64_t next_uint64() {
        const uint64_t low = next_uint32();
        const uint64_t high = next_uint32();
        return low | (high << 32);
    }

    constexpr int64_t next_int64() {
        return (int64_t)next_uint64();
    }

    constexpr uintptr_t next_uintptr() {
        if constexpr (sizeof(uintptr_t) == sizeof(uint32_t))
            return (uintptr_t)next_uint32();
        else
            return (uintptr_t)next_uint64();
    }

    constexpr intptr_t next_intptr() {
        if constexpr (sizeof(intptr_t) == sizeof(int32_t))
            return (intptr_t)next_int32();
        else
            return (intptr_t)next_int64();
    }

    // Scales a 32-bit draw by 1 / UINT32_MAX.
    constexpr float next_float() {
        return next_uint32() * (1.0f / UINT32_MAX);
    }

    // Scales a 64-bit draw by 1 / UINT64_MAX.
    constexpr double next_double() {
        return next_uint64() * (1.0 / UINT64_MAX);
    }
};
--------------------------------------------------------------------------------
/main.cu:
--------------------------------------------------------------------------------
#include <cstdio>
#include <cuda_runtime.h>
#include "CudaAllocator.h"
#include "helper_cuda.h"
#include <cmath>
#include <vector>
// #include <thrust/device_vector.h>  // using thrust would be fine as well

// Writes sinf(i) into arr[i] for every i in [0, n).
// arr is an int array, so the float result is truncated toward zero on store.
// Launch: 1-D grid of any size; the grid-stride loop makes the kernel correct
// for every grid/block shape, including n that is not a multiple of the block
// size.
__global__ void fill_sin(int *arr, int n) {
    // 64-bit index so that i += stride cannot overflow for large n.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n; i += stride) {
        arr[i] = sinf((float)i);
    }
}

// Copies every element of arr[0, n) that is >= 0 into res (in unspecified
// order) and adds the number of copied elements to *counter.
// *counter must be initialised by the caller; res must hold at least n ints.
// Launch: 1-D grid of any size (grid-stride loop).
__global__ void filter_positive(int *counter, int *res, int const *arr, int n) {
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < n; i += stride) {
        const int value = arr[i];
        if (value >= 0) {
            // A plain read-modify-write of *counter races between threads.
            // atomicAdd returns the previous value, which gives this thread a
            // slot in res that no other thread receives.
            const int loc = atomicAdd(counter, 1);
            res[loc] = value;
        }
    }
}

int main() {
    constexpr int n = 1 << 24;
    std::vector<int, CudaAllocator<int>> arr(n);
    std::vector<int, CudaAllocator<int>> res(n);
    std::vector<int, CudaAllocator<int>> counter(1);

    // CudaAllocator::construct skips value-initialisation for POD element
    // types, so the counter has to be zeroed explicitly.
    counter[0] = 0;

    // With grid-stride kernels the launch shape no longer has to cover n
    // exactly: round the block count up and cap it; each thread then loops
    // over several elements.
    constexpr int block_dim = 256;
    constexpr int max_grid_dim = 1024;
    constexpr int needed_blocks = (n + block_dim - 1) / block_dim;
    constexpr int grid_dim =
        needed_blocks < max_grid_dim ? needed_blocks : max_grid_dim;

    fill_sin<<<grid_dim, block_dim>>>(arr.data(), n);
    checkCudaErrors(cudaGetLastError());

    filter_positive<<<grid_dim, block_dim>>>(counter.data(), res.data(),
                                             arr.data(), n);
    checkCudaErrors(cudaGetLastError());

    // Kernel launches are asynchronous: wait for the GPU to finish before the
    // CPU reads the managed buffers.
    checkCudaErrors(cudaDeviceSynchronize());

    if (counter[0] <= n / 50) {
        printf("Result too short! %d <= %d\n", counter[0], n / 50);
        return -1;
    }
    for (int i = 0; i < counter[0]; i++) {
        if (res[i] < 0) {
            // res holds ints, so the value is printed with %d.
            printf("Wrong At %d: %d < 0\n", i, res[i]);
            return -1;
        }
    }

    printf("All Correct!\n");
    return 0;
}
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
#!/bin/sh
set -e
cmake -B build
cmake --build build
build/main
--------------------------------------------------------------------------------