├── .clang-format ├── .clang-tidy ├── .clangd ├── .git-blame-ignore-revs ├── .github └── workflows │ ├── mirror-main-branch-to-master-branch.yml │ └── push-to-legacy-repositories.yml ├── .gitignore ├── CHANGELOG.md ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.TXT ├── README.md ├── cmake ├── AppendOptionIfAvailable.cmake ├── CPM.cmake ├── CubAddSubdir.cmake ├── CubBuildCompilerTargets.cmake ├── CubBuildTargetList.cmake ├── CubCompilerHacks.cmake ├── CubCudaConfig.cmake ├── CubHeaderTesting.cmake ├── CubInstallRules.cmake ├── CubUtilities.cmake └── header_test.in ├── cub ├── agent │ ├── agent_adjacent_difference.cuh │ ├── agent_batch_memcpy.cuh │ ├── agent_histogram.cuh │ ├── agent_merge_sort.cuh │ ├── agent_radix_sort_downsweep.cuh │ ├── agent_radix_sort_histogram.cuh │ ├── agent_radix_sort_onesweep.cuh │ ├── agent_radix_sort_upsweep.cuh │ ├── agent_reduce.cuh │ ├── agent_reduce_by_key.cuh │ ├── agent_rle.cuh │ ├── agent_scan.cuh │ ├── agent_scan_by_key.cuh │ ├── agent_segment_fixup.cuh │ ├── agent_segmented_radix_sort.cuh │ ├── agent_select_if.cuh │ ├── agent_spmv_orig.cuh │ ├── agent_sub_warp_merge_sort.cuh │ ├── agent_three_way_partition.cuh │ ├── agent_unique_by_key.cuh │ └── single_pass_scan_operators.cuh ├── block │ ├── block_adjacent_difference.cuh │ ├── block_discontinuity.cuh │ ├── block_exchange.cuh │ ├── block_histogram.cuh │ ├── block_load.cuh │ ├── block_merge_sort.cuh │ ├── block_radix_rank.cuh │ ├── block_radix_sort.cuh │ ├── block_raking_layout.cuh │ ├── block_reduce.cuh │ ├── block_run_length_decode.cuh │ ├── block_scan.cuh │ ├── block_shuffle.cuh │ ├── block_store.cuh │ ├── radix_rank_sort_operations.cuh │ └── specializations │ │ ├── block_histogram_atomic.cuh │ │ ├── block_histogram_sort.cuh │ │ ├── block_reduce_raking.cuh │ │ ├── block_reduce_raking_commutative_only.cuh │ │ ├── block_reduce_warp_reductions.cuh │ │ ├── block_scan_raking.cuh │ │ └── block_scan_warp_scans.cuh ├── cmake │ ├── cub-config-version.cmake │ 
├── cub-config.cmake │ ├── cub-header-search.cmake │ └── cub-header-search.cmake.in ├── config.cuh ├── cub.cuh ├── detail │ ├── choose_offset.cuh │ ├── cpp_compatibility.cuh │ ├── detect_cuda_runtime.cuh │ ├── device_double_buffer.cuh │ ├── device_synchronize.cuh │ ├── exec_check_disable.cuh │ ├── strong_load.cuh │ ├── strong_store.cuh │ ├── temporary_storage.cuh │ ├── type_traits.cuh │ └── uninitialized_copy.cuh ├── device │ ├── device_adjacent_difference.cuh │ ├── device_copy.cuh │ ├── device_histogram.cuh │ ├── device_memcpy.cuh │ ├── device_merge_sort.cuh │ ├── device_partition.cuh │ ├── device_radix_sort.cuh │ ├── device_reduce.cuh │ ├── device_run_length_encode.cuh │ ├── device_scan.cuh │ ├── device_segmented_radix_sort.cuh │ ├── device_segmented_reduce.cuh │ ├── device_segmented_sort.cuh │ ├── device_select.cuh │ ├── device_spmv.cuh │ └── dispatch │ │ ├── dispatch_adjacent_difference.cuh │ │ ├── dispatch_batch_memcpy.cuh │ │ ├── dispatch_histogram.cuh │ │ ├── dispatch_merge_sort.cuh │ │ ├── dispatch_radix_sort.cuh │ │ ├── dispatch_reduce.cuh │ │ ├── dispatch_reduce_by_key.cuh │ │ ├── dispatch_rle.cuh │ │ ├── dispatch_scan.cuh │ │ ├── dispatch_scan_by_key.cuh │ │ ├── dispatch_segmented_sort.cuh │ │ ├── dispatch_select_if.cuh │ │ ├── dispatch_spmv_orig.cuh │ │ ├── dispatch_three_way_partition.cuh │ │ └── dispatch_unique_by_key.cuh ├── grid │ ├── grid_barrier.cuh │ ├── grid_even_share.cuh │ ├── grid_mapping.cuh │ └── grid_queue.cuh ├── host │ └── mutex.cuh ├── iterator │ ├── arg_index_input_iterator.cuh │ ├── cache_modified_input_iterator.cuh │ ├── cache_modified_output_iterator.cuh │ ├── constant_input_iterator.cuh │ ├── counting_input_iterator.cuh │ ├── discard_output_iterator.cuh │ ├── tex_obj_input_iterator.cuh │ ├── tex_ref_input_iterator.cuh │ └── transform_input_iterator.cuh ├── thread │ ├── thread_load.cuh │ ├── thread_operators.cuh │ ├── thread_reduce.cuh │ ├── thread_scan.cuh │ ├── thread_search.cuh │ ├── thread_sort.cuh │ └── thread_store.cuh ├── 
util_allocator.cuh ├── util_arch.cuh ├── util_compiler.cuh ├── util_cpp_dialect.cuh ├── util_debug.cuh ├── util_deprecated.cuh ├── util_device.cuh ├── util_macro.cuh ├── util_math.cuh ├── util_namespace.cuh ├── util_ptx.cuh ├── util_type.cuh ├── version.cuh └── warp │ ├── specializations │ ├── warp_reduce_shfl.cuh │ ├── warp_reduce_smem.cuh │ ├── warp_scan_shfl.cuh │ └── warp_scan_smem.cuh │ ├── warp_exchange.cuh │ ├── warp_load.cuh │ ├── warp_merge_sort.cuh │ ├── warp_reduce.cuh │ ├── warp_scan.cuh │ └── warp_store.cuh ├── docs ├── developer_overview.md └── test_overview.md ├── examples ├── CMakeLists.txt ├── block │ ├── .gitignore │ ├── CMakeLists.txt │ ├── example_block_radix_sort.cu │ ├── example_block_reduce.cu │ ├── example_block_reduce_dyn_smem.cu │ └── example_block_scan.cu ├── cmake │ ├── CMakeLists.txt │ └── add_subdir │ │ ├── CMakeLists.txt │ │ └── dummy.cu └── device │ ├── .gitignore │ ├── CMakeLists.txt │ ├── example_device_partition_flagged.cu │ ├── example_device_partition_if.cu │ ├── example_device_radix_sort.cu │ ├── example_device_reduce.cu │ ├── example_device_scan.cu │ ├── example_device_select_flagged.cu │ ├── example_device_select_if.cu │ ├── example_device_select_unique.cu │ └── example_device_sort_find_non_trivial_runs.cu └── test ├── .gitignore ├── CMakeLists.txt ├── README.md ├── bfloat16.h ├── c2h ├── custom_type.cuh ├── generators.cu └── generators.cuh ├── catch2_runner.cu ├── catch2_test_block_adjacent_difference.cu ├── catch2_test_block_histogram.cu ├── catch2_test_block_load.cu ├── catch2_test_block_merge_sort.cu ├── catch2_test_block_radix_sort.cu ├── catch2_test_block_reduce.cu ├── catch2_test_block_run_length_decode.cu ├── catch2_test_block_scan.cu ├── catch2_test_block_shuffle.cu ├── catch2_test_block_store.cu ├── catch2_test_cdp_helper.h ├── catch2_test_cdp_wrapper.cu ├── catch2_test_helper.h ├── catch2_test_printing.cu ├── catch2_test_util_type.cu ├── catch2_test_warp_exchange.cu ├── catch2_test_warp_load.cu ├── 
catch2_test_warp_mask.cu ├── catch2_test_warp_merge_sort.cu ├── catch2_test_warp_reduce.cu ├── catch2_test_warp_scan.cu ├── catch2_test_warp_store.cu ├── cmake ├── CMakeLists.txt ├── check_source_files.cmake └── test_install │ └── CMakeLists.txt ├── fill_striped.cuh ├── half.h ├── link_a.cu ├── link_b.cu ├── link_main.cpp ├── mersenne.h ├── test_allocator.cu ├── test_block_radix_rank.cu ├── test_cdp_variant_state.cu ├── test_device_adjacent_difference.cu ├── test_device_batch_copy.cu ├── test_device_batch_memcpy.cu ├── test_device_histogram.cu ├── test_device_merge_sort.cu ├── test_device_radix_sort.cu ├── test_device_reduce.cu ├── test_device_reduce_by_key.cu ├── test_device_run_length_encode.cu ├── test_device_scan.cu ├── test_device_scan_by_key.cu ├── test_device_segmented_sort.cu ├── test_device_select_if.cu ├── test_device_select_unique.cu ├── test_device_select_unique_by_key.cu ├── test_device_spmv.cu ├── test_device_three_way_partition.cu ├── test_grid_barrier.cu ├── test_iterator.cu ├── test_iterator_deprecated.cu ├── test_namespace_wrapped.cu ├── test_temporary_storage_layout.cu ├── test_thread_operators.cu ├── test_thread_sort.cu ├── test_util.h └── test_util_vec.h /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | AccessModifierOffset: -2 3 | AlignAfterOpenBracket: Align 4 | AlignConsecutiveAssignments: true 5 | AlignEscapedNewlines: Right 6 | AlignOperands: true 7 | AllowAllArgumentsOnNextLine: false 8 | AllowAllConstructorInitializersOnNextLine: false 9 | AllowAllParametersOfDeclarationOnNextLine: false 10 | AllowShortBlocksOnASingleLine: false 11 | AllowShortCaseLabelsOnASingleLine: false 12 | AllowShortFunctionsOnASingleLine: All 13 | AllowShortIfStatementsOnASingleLine: Never 14 | AllowShortLambdasOnASingleLine: All 15 | AllowShortLoopsOnASingleLine: false 16 | AlwaysBreakAfterReturnType: None 17 | AlwaysBreakTemplateDeclarations: Yes 18 | BinPackArguments: false 19 | 
BinPackParameters: false 20 | BreakBeforeBraces: Custom 21 | BraceWrapping: 22 | AfterCaseLabel: false 23 | AfterClass: true 24 | AfterControlStatement: true 25 | AfterEnum: true 26 | AfterFunction: true 27 | AfterNamespace: true 28 | AfterStruct: true 29 | AfterUnion: true 30 | BeforeCatch: true 31 | BeforeElse: true 32 | IndentBraces: false 33 | SplitEmptyFunction: false 34 | SplitEmptyRecord: false 35 | BreakBeforeBinaryOperators: None 36 | BreakBeforeTernaryOperators: true 37 | BreakConstructorInitializers: BeforeComma 38 | BreakInheritanceList: BeforeComma 39 | ColumnLimit: 100 40 | CompactNamespaces: false 41 | ContinuationIndentWidth: 2 42 | IncludeBlocks: Regroup 43 | IncludeCategories: 44 | - Regex: '^$' 51 | Priority: 4 52 | IndentCaseLabels: true 53 | IndentPPDirectives: None 54 | IndentWidth: 2 55 | KeepEmptyLinesAtTheStartOfBlocks: true 56 | MaxEmptyLinesToKeep: 1 57 | NamespaceIndentation: None 58 | PenaltyBreakAssignment: 30 59 | PenaltyBreakBeforeFirstCallParameter: 50 60 | PenaltyBreakComment: 0 61 | PenaltyBreakFirstLessLess: 0 62 | PenaltyBreakString: 70 63 | PenaltyBreakTemplateDeclaration: 0 64 | PenaltyExcessCharacter: 100 65 | PenaltyReturnTypeOnItsOwnLine: 90 66 | PointerAlignment: Right 67 | ReflowComments: true 68 | SortIncludes: CaseInsensitive 69 | SpaceAfterCStyleCast: false 70 | SpaceAfterLogicalNot: false 71 | SpaceAfterTemplateKeyword: true 72 | SpaceBeforeAssignmentOperators: true 73 | SpaceBeforeCpp11BracedList: false 74 | SpaceBeforeCtorInitializerColon: true 75 | SpaceBeforeInheritanceColon: true 76 | SpaceBeforeParens: ControlStatements 77 | SpaceBeforeRangeBasedForLoopColon: true 78 | SpaceInEmptyParentheses: false 79 | SpacesBeforeTrailingComments: 1 80 | SpacesInAngles: false 81 | SpacesInCStyleCastParentheses: false 82 | SpacesInParentheses: false 83 | SpacesInSquareBrackets: false 84 | Standard: c++11 85 | TabWidth: 2 86 | UseTab: Never 87 | -------------------------------------------------------------------------------- 
/.clang-tidy: -------------------------------------------------------------------------------- 1 | --- 2 | Checks: 3 | 'modernize-*, 4 | -modernize-use-equals-default, 5 | -modernize-concat-nested-namespaces, 6 | -modernize-use-trailing-return-type' 7 | 8 | # -modernize-use-equals-default # auto-fix is broken (doesn't insert =default correctly) 9 | # -modernize-concat-nested-namespaces # auto-fix is broken (can delete code) 10 | # -modernize-use-trailing-return-type # just a preference 11 | 12 | WarningsAsErrors: '' 13 | HeaderFilterRegex: '' 14 | AnalyzeTemporaryDtors: false 15 | FormatStyle: none 16 | CheckOptions: 17 | - key: modernize-loop-convert.MaxCopySize 18 | value: '16' 19 | - key: modernize-loop-convert.MinConfidence 20 | value: reasonable 21 | - key: modernize-pass-by-value.IncludeStyle 22 | value: llvm 23 | - key: modernize-replace-auto-ptr.IncludeStyle 24 | value: llvm 25 | - key: modernize-use-nullptr.NullMacros 26 | value: 'NULL' 27 | ... 28 | -------------------------------------------------------------------------------- /.clangd: -------------------------------------------------------------------------------- 1 | # https://clangd.llvm.org/config 2 | 3 | # Apply a config conditionally to all C files 4 | If: 5 | PathMatch: .*\.(c|h)$ 6 | 7 | --- 8 | 9 | # Apply a config conditionally to all C++ files 10 | If: 11 | PathMatch: .*\.(c|h)pp 12 | 13 | --- 14 | 15 | # Apply a config conditionally to all CUDA files 16 | If: 17 | PathMatch: .*\.cuh? 18 | CompileFlags: 19 | Add: 20 | # Allow variadic CUDA functions 21 | - "-Xclang=-fcuda-allow-variadic-functions" 22 | 23 | --- 24 | 25 | # Tweak the clangd parse settings for all files 26 | CompileFlags: 27 | Compiler: clang++ 28 | CompilationDatabase: . 
29 | Add: 30 | - -x 31 | - cuda 32 | # report all errors 33 | - "-ferror-limit=0" 34 | - "-ftemplate-backtrace-limit=0" 35 | - "-stdlib=libc++" 36 | Remove: 37 | - -stdpar 38 | # strip CUDA fatbin args 39 | - "-Xfatbin*" 40 | - "-gpu=*" 41 | - "--diag_suppress*" 42 | # strip CUDA arch flags 43 | - "-gencode*" 44 | - "--generate-code*" 45 | # strip gcc's -fcoroutines 46 | - -fcoroutines 47 | # strip CUDA flags unknown to clang 48 | - "-ccbin*" 49 | - "--compiler-options*" 50 | - "--expt-extended-lambda" 51 | - "--expt-relaxed-constexpr" 52 | - "-forward-unknown-to-host-compiler" 53 | - "-Werror=cross-execution-space-call" 54 | Diagnostics: 55 | Suppress: 56 | - "variadic_device_fn" 57 | - "attributes_not_allowed" 58 | # The NVHPC version of _NVCXX_EXPAND_PACK macro triggers this clang error. 59 | # Temporarily suppressing it, but should probably fix 60 | - "template_param_shadow" 61 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # Exclude these commits from git-blame and similar tools. 2 | # 3 | # To use this file, run the following command from the repo root: 4 | # 5 | # ``` 6 | # $ git config blame.ignoreRevsFile .git-blame-ignore-revs 7 | # ``` 8 | # 9 | # Include a brief comment with each commit added, for example: 10 | # 11 | # ``` 12 | # d92d9f8baac5ec48a8f8718dd69f415a45efe372 # Initial clang-format 13 | # ``` 14 | # 15 | # Only add commits that are pure formatting changes (e.g. 16 | # clang-format version changes, etc). 
17 | -------------------------------------------------------------------------------- /.github/workflows/mirror-main-branch-to-master-branch.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - "main" 5 | 6 | jobs: 7 | mirror-main-branch-to-master-branch: 8 | name: Mirror main branch to master branch 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Mirror main branch to master branch 12 | id: mirror 13 | uses: google/mirror-branch-action@v1.0 14 | with: 15 | source: "main" 16 | dest: "master" 17 | github-token: ${{ secrets.GITHUB_TOKEN }} 18 | -------------------------------------------------------------------------------- /.github/workflows/push-to-legacy-repositories.yml: -------------------------------------------------------------------------------- 1 | on: push 2 | 3 | jobs: 4 | push-to-legacy-repositories: 5 | name: Push to legacy repositories 6 | runs-on: ubuntu-latest 7 | steps: 8 | - name: Push `main` to github.com/nvlabs/cub 9 | uses: wei/git-sync@v2 10 | if: github.repository == 'nvidia/cub' 11 | with: 12 | source_repo: "nvidia/cub" 13 | source_branch: "main" 14 | destination_repo: "nvlabs/cub" 15 | destination_branch: "main" 16 | ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} 17 | - name: Push all tags to github.com/nvlabs/cub 18 | uses: wei/git-sync@v2 19 | if: github.repository == 'nvidia/cub' 20 | with: 21 | source_repo: "nvidia/cub" 22 | source_branch: "refs/tags/*" 23 | destination_repo: "nvlabs/cub" 24 | destination_branch: "refs/tags/*" 25 | ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} 26 | - name: Push `main` to github.com/thrust/cub 27 | uses: wei/git-sync@v2 28 | if: github.repository == 'nvidia/cub' 29 | with: 30 | source_repo: "nvidia/cub" 31 | source_branch: "main" 32 | destination_repo: "thrust/cub" 33 | destination_branch: "main" 34 | ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} 35 | - name: Push all tags to github.com/thrust/cub 36 | uses: wei/git-sync@v2 37 
| if: github.repository == 'nvidia/cub' 38 | with: 39 | source_repo: "nvidia/cub" 40 | source_branch: "refs/tags/*" 41 | destination_repo: "thrust/cub" 42 | destination_branch: "refs/tags/*" 43 | ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .p4config 2 | *~ 3 | \#* 4 | /build 5 | .cache 6 | .vscode 7 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 3.15 is the minimum. 2 | # 3.17 for NVC++. 3 | # 3.18.3 for C++17 + CUDA. 4 | cmake_minimum_required(VERSION 3.15) 5 | 6 | # Remove this when we use the new CUDA_ARCHITECTURES properties. 7 | if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) 8 | cmake_policy(SET CMP0104 OLD) 9 | endif() 10 | 11 | # CXX is only needed for AppendOptionIfAvailable. 12 | project(CUB NONE) 13 | 14 | # Determine whether CUB is the top-level project or included into 15 | # another project via add_subdirectory(). 16 | if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}") 17 | set(CUB_TOPLEVEL_PROJECT ON) 18 | else() 19 | set(CUB_TOPLEVEL_PROJECT OFF) 20 | endif() 21 | 22 | # This must be done before any languages are enabled: 23 | if (CUB_TOPLEVEL_PROJECT) 24 | include(cmake/CubCompilerHacks.cmake) 25 | endif() 26 | 27 | # This must appear after our Compiler Hacks or else CMake will delete the cache 28 | # and reconfigure from scratch. 29 | # This must also appear before the installation rules, as it is required by the 30 | # GNUInstallDirs CMake module. 31 | enable_language(CXX) 32 | 33 | # Thrust has its own copy of CUB install rules to handle packaging usecases 34 | # where we want to install CUB headers but aren't actually building anything. 
35 | # In these cases the add_subdirectory(dependencies/cub) line in Thrust won't get 36 | # called so we can't rely on CUB providing its own rules. 37 | if (NOT CUB_IN_THRUST) 38 | option(CUB_ENABLE_INSTALL_RULES "Enable installation of CUB" ${CUB_TOPLEVEL_PROJECT}) 39 | if (CUB_ENABLE_INSTALL_RULES) 40 | include(cmake/CubInstallRules.cmake) 41 | endif() 42 | endif() 43 | 44 | # Support adding CUB to a parent project via add_subdirectory. 45 | # See examples/cmake/add_subdir/CMakeLists.txt for details. 46 | if (NOT CUB_TOPLEVEL_PROJECT AND NOT CUB_IN_THRUST) 47 | include(cmake/CubAddSubdir.cmake) 48 | return() 49 | endif() 50 | 51 | option(CUB_ENABLE_HEADER_TESTING "Test that all public headers compile." ON) 52 | option(CUB_ENABLE_TESTING "Build CUB testing suite." ON) 53 | option(CUB_ENABLE_EXAMPLES "Build CUB examples." ON) 54 | 55 | # This is needed for NVCXX QA, which requires a static set of executable names. 56 | # Only a single dialect may be enabled when this is off. 57 | option(CUB_ENABLE_CPP_DIALECT_IN_NAMES 58 | "Include C++ dialect information in target/object/etc names." 59 | ON 60 | ) 61 | mark_as_advanced(CUB_ENABLE_CPP_DIALECT_IN_NAMES) 62 | 63 | # This option is only used when CUB is built stand-alone; otherwise the Thrust 64 | # option has the same effect. 65 | if (NOT CUB_IN_THRUST) 66 | option(CUB_IGNORE_DEPRECATED_API 67 | "Suppress warnings about deprecated Thrust/CUB API." 68 | OFF 69 | ) 70 | endif() 71 | 72 | # Check if we're actually building anything before continuing. If not, no need 73 | # to search for deps, etc. This is a common approach for packagers that just 74 | # need the install rules. See GH issue NVIDIA/thrust#1211. 
75 | if (NOT (CUB_ENABLE_HEADER_TESTING OR 76 | CUB_ENABLE_TESTING OR 77 | CUB_ENABLE_EXAMPLES)) 78 | return() 79 | endif() 80 | 81 | include(cmake/AppendOptionIfAvailable.cmake) 82 | include(cmake/CubBuildCompilerTargets.cmake) 83 | include(cmake/CubBuildTargetList.cmake) 84 | include(cmake/CubCudaConfig.cmake) 85 | include(cmake/CubUtilities.cmake) 86 | 87 | if ("" STREQUAL "${CMAKE_BUILD_TYPE}") 88 | set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build." FORCE) 89 | 90 | set_property( 91 | CACHE CMAKE_BUILD_TYPE 92 | PROPERTY STRINGS Debug Release RelWithDebInfo MinSizeRel 93 | ) 94 | endif () 95 | 96 | set(CMAKE_CXX_EXTENSIONS OFF) 97 | 98 | # Where to put build outputs. Use CMAKE_BINARY_DIR so they'll show up alongside 99 | # Thrust targets when building as part of Thrust. 100 | set(CUB_LIBRARY_OUTPUT_DIR "${CMAKE_BINARY_DIR}/lib") 101 | set(CUB_EXECUTABLE_OUTPUT_DIR "${CMAKE_BINARY_DIR}/bin") 102 | 103 | cub_build_target_list() 104 | 105 | if (CUB_ENABLE_HEADER_TESTING) 106 | include(cmake/CubHeaderTesting.cmake) 107 | endif() 108 | 109 | # Both testing and examples use ctest 110 | if (CUB_ENABLE_TESTING OR CUB_ENABLE_EXAMPLES) 111 | include(CTest) 112 | enable_testing() 113 | endif() 114 | 115 | if (CUB_ENABLE_TESTING) 116 | add_subdirectory(test) 117 | endif() 118 | 119 | if (CUB_ENABLE_EXAMPLES) 120 | add_subdirectory(examples) 121 | endif() 122 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 2 | # Code of Conduct 3 | 4 | ## Overview 5 | 6 | This document defines the Code of Conduct followed and enforced for NVIDIA C++ 7 | Core Compute Libraries. 
8 | 9 | ### Intended Audience 10 | 11 | * Community 12 | * Developers 13 | * Project Leads 14 | 15 | ## Our Pledge 16 | 17 | In the interest of fostering an open and welcoming environment, we as 18 | contributors and maintainers pledge to making participation in our project and 19 | our community a harassment-free experience for everyone, regardless of age, 20 | body size, disability, ethnicity, sex characteristics, gender identity and 21 | expression, level of experience, education, socio-economic status, nationality, 22 | personal appearance, race, religion, or sexual identity and orientation. 23 | 24 | ## Our Standards 25 | 26 | Examples of behavior that contributes to creating a positive environment include: 27 | 28 | - Using welcoming and inclusive language. 29 | - Being respectful of differing viewpoints and experiences. 30 | - Gracefully accepting constructive criticism. 31 | - Focusing on what is best for the community. 32 | - Showing empathy towards other community members. 33 | 34 | Examples of unacceptable behavior by participants include: 35 | 36 | - The use of sexualized language or imagery and unwelcome sexual attention or 37 | advances. 38 | - Trolling, insulting/derogatory comments, and personal or political attacks. 39 | - Public or private harassment. 40 | - Publishing others’ private information, such as a physical or electronic 41 | address, without explicit permission. 42 | - Other conduct which could reasonably be considered inappropriate. 43 | 44 | ## Our Responsibilities 45 | 46 | Project maintainers are responsible for clarifying the standards of acceptable 47 | behavior and are expected to take appropriate and fair corrective action in 48 | response to any instances of unacceptable behavior. 
49 | 50 | Project maintainers have the right and responsibility to remove, edit, or 51 | reject comments, commits, code, wiki edits, issues, and other contributions 52 | that are not aligned to this Code of Conduct, or to ban temporarily or 53 | permanently any contributor for other behaviors that they deem inappropriate, 54 | threatening, offensive, or harmful. 55 | 56 | ## Scope 57 | 58 | This Code of Conduct applies both within project spaces and in public spaces 59 | when an individual is representing the project or its community. 60 | Examples of representing a project or community include using an official 61 | project email address, posting via an official social media account, or acting 62 | as an appointed representative at an online or offline event. 63 | Representation of a project may be further defined and clarified by project 64 | maintainers. 65 | 66 | ## Enforcement 67 | 68 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 69 | reported by contacting [cpp-conduct@nvidia.com](mailto:cpp-conduct@nvidia.com). 70 | All complaints will be reviewed and investigated and will result in a response 71 | that is deemed necessary and appropriate to the circumstances. 72 | The project team is obligated to maintain confidentiality with regard to the 73 | reporter of an incident. 74 | Further details of specific enforcement policies may be posted separately. 75 | 76 | Project maintainers who do not follow or enforce the Code of Conduct in good 77 | faith may face temporary or permanent repercussions as determined by other 78 | members of the project’s leadership. 79 | 80 | ## Attribution 81 | 82 | This Code of Conduct was taken from the [NVIDIA RAPIDS] project, which was 83 | adapted from the [Contributor Covenant version 1.4]. 84 | 85 | Please see this [FAQ] for answers to common questions about this Code of Conduct. 86 | 87 | ## Contact 88 | 89 | Please email [cpp-conduct@nvidia.com] for any Code of Conduct related matters. 
90 | 91 | 92 | [cpp-conduct@nvidia.com]: mailto:cpp-conduct@nvidia.com 93 | 94 | [FAQ]: https://www.contributor-covenant.org/faq 95 | 96 | [NVIDIA RAPIDS]: https://docs.rapids.ai/resources/conduct/ 97 | [Contributor Covenant version 1.4]: https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 98 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Table of Contents 2 | 3 | 1. [Contributing to CUB](#contributing-to-cub) 4 | 1. [CMake Options](#cmake-options) 5 | 1. [Development Model](#development-model) 6 | 7 | # Contributing to CUB 8 | 9 | CUB uses GitHub to manage all open-source development, including bug tracking, 10 | pull requests, and design discussions. CUB is tightly coupled to the Thrust 11 | project, and a compatible version of Thrust is required when working on the 12 | development version of CUB. 13 | 14 | To set up a CUB development branch, it is recommended to recursively clone the 15 | Thrust repository and use the CUB submodule at `dependencies/cub` to stage 16 | changes. CUB's tests and examples can be built by configuring Thrust with the 17 | CMake option `THRUST_INCLUDE_CUB_CMAKE=ON`. 18 | 19 | This process is described in more detail in Thrust's 20 | [CONTRIBUTING.md](https://nvidia.github.io/thrust/contributing.html). 21 | 22 | The CMake options in the following section may be used to customize CUB's build 23 | process. Note that some of these are controlled by Thrust for compatibility and 24 | may not have an effect when building CUB through the Thrust build system. This 25 | is pointed out in the documentation below where applicable. 26 | 27 | # CMake Options 28 | 29 | A CUB build is configured using CMake options. 
These may be passed to CMake 30 | using 31 | 32 | ``` 33 | cmake -D<option_name>=<option_value> [Thrust or CUB project source root] 34 | ``` 35 | 36 | or configured interactively with the `ccmake` or `cmake-gui` interfaces. 37 | 38 | The configuration options for CUB are: 39 | 40 | - `CMAKE_BUILD_TYPE={Release, Debug, RelWithDebInfo, MinSizeRel}` 41 | - Standard CMake build option. Default: `RelWithDebInfo` 42 | - `CUB_ENABLE_HEADER_TESTING={ON, OFF}` 43 | - Whether to test compile public headers. Default is `ON`. 44 | - `CUB_ENABLE_TESTING={ON, OFF}` 45 | - Whether to build unit tests. Default is `ON`. 46 | - `CUB_ENABLE_EXAMPLES={ON, OFF}` 47 | - Whether to build examples. Default is `ON`. 48 | - `CUB_ENABLE_DIALECT_CPPXX={ON, OFF}` 49 | - Setting this has no effect when building CUB as a component of Thrust. 50 | See Thrust's dialect options, which CUB will inherit. 51 | - Toggle whether a specific C++ dialect will be targeted. 52 | - Multiple dialects may be targeted in a single build. 53 | - Possible values of `XX` are `{11, 14, 17}`. 54 | - By default, only C++14 is enabled. 55 | - `CUB_ENABLE_COMPUTE_XX={ON, OFF}` 56 | - Setting this has no effect when building CUB as a component of Thrust. 57 | See Thrust's architecture options, which CUB will inherit. 58 | - Controls the targeted CUDA architecture(s) 59 | - Multiple options may be selected when using NVCC as the CUDA compiler. 60 | - Valid values of `XX` are: 61 | `{35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80}` 62 | - Default value depends on `CUB_DISABLE_ARCH_BY_DEFAULT`: 63 | - `CUB_ENABLE_COMPUTE_FUTURE={ON, OFF}` 64 | - Setting this has no effect when building CUB as a component of Thrust. 65 | See Thrust's architecture options, which CUB will inherit. 66 | - If enabled, CUDA objects will target the most recent virtual architecture 67 | in addition to the real architectures specified by the 68 | `CUB_ENABLE_COMPUTE_XX` options. 
69 | - Default value depends on `CUB_DISABLE_ARCH_BY_DEFAULT`: 70 | - `CUB_DISABLE_ARCH_BY_DEFAULT={ON, OFF}` 71 | - Setting this has no effect when building CUB as a component of Thrust. 72 | See Thrust's architecture options, which CUB will inherit. 73 | - When `ON`, all `CUB_ENABLE_COMPUTE_*` options are initially `OFF`. 74 | - Default: `OFF` (meaning all architectures are enabled by default) 75 | - `CUB_ENABLE_TESTS_WITH_RDC={ON, OFF}` 76 | - Whether to enable Relocatable Device Code when building tests. 77 | Default is `OFF`. 78 | - `CUB_ENABLE_EXAMPLES_WITH_RDC={ON, OFF}` 79 | - Whether to enable Relocatable Device Code when building examples. 80 | Default is `OFF`. 81 | - `CUB_ENABLE_INSTALL_RULES={ON, OFF}` 82 | - Setting this has no effect when building CUB as a component of Thrust. 83 | See Thrust's `THRUST_INSTALL_CUB_HEADERS` option, which controls this 84 | behavior. 85 | - If true, installation rules will be generated for CUB. Default is `ON` when 86 | building CUB alone, and `OFF` when CUB is a subproject added via CMake's 87 | `add_subdirectory`. 88 | 89 | # Development Model 90 | 91 | CUB follows the same development model as Thrust, described 92 | [here](https://nvidia.github.io/thrust/releases/versioning.html). 93 | -------------------------------------------------------------------------------- /LICENSE.TXT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010-2011, Duane Merrill. All rights reserved. 2 | Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 
8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of the NVIDIA CORPORATION nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | -------------------------------------------------------------------------------- /cmake/AppendOptionIfAvailable.cmake: -------------------------------------------------------------------------------- 1 | include_guard(GLOBAL) 2 | include(CheckCXXCompilerFlag) 3 | 4 | macro (APPEND_OPTION_IF_AVAILABLE _FLAG _LIST) 5 | 6 | string(MAKE_C_IDENTIFIER "CXX_FLAG_${_FLAG}" _VAR) 7 | check_cxx_compiler_flag(${_FLAG} ${_VAR}) 8 | 9 | if (${${_VAR}}) 10 | list(APPEND ${_LIST} ${_FLAG}) 11 | endif () 12 | 13 | endmacro () 14 | -------------------------------------------------------------------------------- /cmake/CubAddSubdir.cmake: -------------------------------------------------------------------------------- 1 | find_package(CUB REQUIRED CONFIG 2 | NO_DEFAULT_PATH # Only check the explicit path in HINTS: 3 | HINTS "${CMAKE_CURRENT_LIST_DIR}/.." 4 | ) 5 | -------------------------------------------------------------------------------- /cmake/CubBuildCompilerTargets.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # This file defines the `cub_build_compiler_targets()` function, which 3 | # creates the following interface targets: 4 | # 5 | # cub.compiler_interface 6 | # - Interface target providing compiler-specific options needed to build 7 | # CUB's tests, examples, etc. 
8 | 9 | function(cub_build_compiler_targets) 10 | set(cxx_compile_definitions) 11 | set(cxx_compile_options) 12 | set(cuda_compile_options) 13 | 14 | if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") 15 | append_option_if_available("/W4" cxx_compile_options) 16 | 17 | append_option_if_available("/WX" cxx_compile_options) 18 | 19 | # Suppress overly-pedantic/unavoidable warnings brought in with /W4: 20 | # C4324: structure was padded due to alignment specifier 21 | append_option_if_available("/wd4324" cxx_compile_options) 22 | # C4127: conditional expression is constant 23 | # This can be fixed with `if constexpr` when available, but there's no way 24 | # to silence these pre-C++17. 25 | # TODO We should have per-dialect interface targets so we can leave these 26 | # warnings enabled on C++17: 27 | append_option_if_available("/wd4127" cxx_compile_options) 28 | # C4505: unreferenced local function has been removed 29 | # The CUDA `host_runtime.h` header emits this for 30 | # `__cudaUnregisterBinaryUtil`. 31 | append_option_if_available("/wd4505" cxx_compile_options) 32 | # C4706: assignment within conditional expression 33 | # MSVC doesn't provide an opt-out for this warning when the assignment is 34 | # intentional. Clang will warn for these, but suppresses the warning when 35 | # double-parentheses are used around the assignment. We'll let Clang catch 36 | # unintentional assignments and suppress all such warnings on MSVC. 
37 | append_option_if_available("/wd4706" cxx_compile_options) 38 | 39 | # Some tests require /bigobj to fit everything into their object files: 40 | append_option_if_available("/bigobj" cxx_compile_options) 41 | else() 42 | append_option_if_available("-Wreorder" cuda_compile_options) 43 | 44 | append_option_if_available("-Werror" cxx_compile_options) 45 | append_option_if_available("-Wall" cxx_compile_options) 46 | append_option_if_available("-Wextra" cxx_compile_options) 47 | append_option_if_available("-Winit-self" cxx_compile_options) 48 | append_option_if_available("-Woverloaded-virtual" cxx_compile_options) 49 | append_option_if_available("-Wcast-qual" cxx_compile_options) 50 | append_option_if_available("-Wpointer-arith" cxx_compile_options) 51 | append_option_if_available("-Wunused-local-typedef" cxx_compile_options) 52 | append_option_if_available("-Wvla" cxx_compile_options) 53 | 54 | # Disable GNU extensions (flag is clang only) 55 | append_option_if_available("-Wgnu" cxx_compile_options) 56 | # Calling a variadic macro with zero args is a GNU extension until C++20, 57 | # but the THRUST_PP_ARITY macro is used with zero args. Need to see if this 58 | # is a real problem worth fixing. 59 | append_option_if_available("-Wno-gnu-zero-variadic-macro-arguments" cxx_compile_options) 60 | 61 | # This complains about functions in CUDA system headers when used with nvcc. 62 | append_option_if_available("-Wno-unused-function" cxx_compile_options) 63 | endif() 64 | 65 | if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}") 66 | if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3) 67 | # GCC 7.3 complains about name mangling changes due to `noexcept` 68 | # becoming part of the type system; we don't care. 69 | append_option_if_available("-Wno-noexcept-type" cxx_compile_options) 70 | endif() 71 | endif() 72 | 73 | if ("Intel" STREQUAL "${CMAKE_CXX_COMPILER_ID}") 74 | # Disable warning that inlining is inhibited by compiler thresholds. 
75 | append_option_if_available("-diag-disable=11074" cxx_compile_options) 76 | append_option_if_available("-diag-disable=11076" cxx_compile_options) 77 | endif() 78 | 79 | if ("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}") 80 | option(CUB_ENABLE_CT_PROFILING "Enable compilation time profiling" OFF) 81 | if (CUB_ENABLE_CT_PROFILING) 82 | append_option_if_available("-ftime-trace" cxx_compile_options) 83 | endif() 84 | endif() 85 | 86 | if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") 87 | list(APPEND cxx_compile_options -Mnodaz) 88 | # TODO: Managed memory is currently not supported on windows with WSL 89 | list(APPEND cxx_compile_options -gpu=nomanaged) 90 | endif() 91 | 92 | add_library(cub.compiler_interface INTERFACE) 93 | 94 | foreach (cxx_option IN LISTS cxx_compile_options) 95 | target_compile_options(cub.compiler_interface INTERFACE 96 | $<$:${cxx_option}> 97 | $<$:${cxx_option}> 98 | # Only use -Xcompiler with NVCC, not NVC++. 99 | # 100 | # CMake can't split genexs, so this can't be formatted better :( 101 | # This is: 102 | # if (using CUDA and CUDA_COMPILER is NVCC) add -Xcompiler=opt: 103 | $<$:-Xcompiler=${cxx_option}> 104 | ) 105 | endforeach() 106 | 107 | foreach (cuda_option IN LISTS cuda_compile_options) 108 | target_compile_options(cub.compiler_interface INTERFACE 109 | $<$:${cuda_option}> 110 | ) 111 | endforeach() 112 | 113 | # Add these for both CUDA and CXX targets: 114 | target_compile_definitions(cub.compiler_interface INTERFACE 115 | ${cxx_compile_definitions} 116 | ) 117 | 118 | # Promote warnings and display diagnostic numbers for nvcc: 119 | target_compile_options(cub.compiler_interface INTERFACE 120 | # If using CUDA w/ NVCC... 121 | # Display diagnostic numbers. 122 | $<$:-Xcudafe=--display_error_number> 123 | # Promote warnings. 124 | $<$:-Xcudafe=--promote_warnings> 125 | # Don't complain about deprecated GPU targets. 
126 | $<$:-Wno-deprecated-gpu-targets> 127 | ) 128 | endfunction() 129 | -------------------------------------------------------------------------------- /cmake/CubCompilerHacks.cmake: -------------------------------------------------------------------------------- 1 | # Set up compiler paths and apply temporary hacks to support NVC++. 2 | # This file must be included before enabling any languages. 3 | 4 | # Temporary hacks to make NVC++ work; this requires you to define 5 | # `CMAKE_CUDA_COMPILER_ID=NVCXX` and `CMAKE_CUDA_COMPILER_FORCED=ON`. 6 | if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") 7 | # If using NVC++, don't set CXX compiler 8 | if (NOT "${CMAKE_CXX_COMPILER}" STREQUAL "") 9 | unset(CMAKE_CXX_COMPILER CACHE) 10 | message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have" 11 | " specified a different ISO C++ compiler; NVC++ acts as both, so please" 12 | " unset the CMAKE_CXX_COMPILER variable." 13 | ) 14 | endif() 15 | 16 | # We don't set CMAKE_CUDA_HOST_COMPILER for NVC++; if we do, CMake tries to 17 | # pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to NVC++, which it doesn't 18 | # understand. 19 | if (NOT "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "") 20 | unset(CMAKE_CUDA_HOST_COMPILER CACHE) 21 | message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have" 22 | " specified a different host ISO C++ compiler; NVC++ acts as both, so" 23 | " please unset the CMAKE_CUDA_HOST_COMPILER variable." 24 | ) 25 | endif() 26 | 27 | set(CMAKE_CXX_COMPILER "${CMAKE_CUDA_COMPILER}") 28 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -cuda") 29 | set(CMAKE_CUDA_HOST_LINK_LAUNCHER "${CMAKE_CUDA_COMPILER}") 30 | set(CMAKE_CUDA_LINK_EXECUTABLE 31 | " -o ") 32 | endif () 33 | 34 | # We don't set CMAKE_CUDA_HOST_COMPILER for NVC++; if we do, CMake tries to 35 | # pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to NVC++, which it doesn't 36 | # understand. 
37 | if ((NOT "NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")) 38 | if (NOT ("${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "" OR 39 | "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "${CMAKE_CXX_COMPILER}")) 40 | set(tmp "${CMAKE_CUDA_HOST_COMPILER}") 41 | unset(CMAKE_CUDA_HOST_COMPILER CACHE) 42 | message(FATAL_ERROR 43 | "For convenience, CUB's test harness uses CMAKE_CXX_COMPILER for the " 44 | "CUDA host compiler. Refusing to overwrite specified " 45 | "CMAKE_CUDA_HOST_COMPILER -- please reconfigure without setting this " 46 | "variable. Currently:\n" 47 | "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}\n" 48 | "CMAKE_CUDA_HOST_COMPILER=${tmp}" 49 | ) 50 | endif () 51 | set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}") 52 | endif () 53 | 54 | # Temporary hacks to make NVC++ work; this requires you to define 55 | # `CMAKE_CUDA_COMPILER_ID=NVCXX` and `CMAKE_CUDA_COMPILER_FORCED=ON`. 56 | if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") 57 | # Need 3.17 for the properties used below. 58 | cmake_minimum_required(VERSION 3.17) 59 | 60 | set(CMAKE_CUDA_STANDARD_DEFAULT 03) 61 | 62 | set(CMAKE_CUDA03_STANDARD_COMPILE_OPTION "-std=c++03") 63 | set(CMAKE_CUDA03_EXTENSION_COMPILE_OPTION "-std=c++03") 64 | set(CMAKE_CUDA03_STANDARD__HAS_FULL_SUPPORT TRUE) 65 | set_property(GLOBAL PROPERTY CMAKE_CUDA03_KNOWN_FEATURES) 66 | 67 | set(CMAKE_CUDA11_STANDARD_COMPILE_OPTION "-std=c++11") 68 | set(CMAKE_CUDA11_EXTENSION_COMPILE_OPTION "-std=c++11") 69 | set(CMAKE_CUDA11_STANDARD__HAS_FULL_SUPPORT TRUE) 70 | set_property(GLOBAL PROPERTY CMAKE_CUDA11_KNOWN_FEATURES) 71 | 72 | set(CMAKE_CUDA14_STANDARD_COMPILE_OPTION "-std=c++14") 73 | set(CMAKE_CUDA14_EXTENSION_COMPILE_OPTION "-std=c++14") 74 | set(CMAKE_CUDA14_STANDARD__HAS_FULL_SUPPORT TRUE) 75 | set_property(GLOBAL PROPERTY CMAKE_CUDA14_KNOWN_FEATURES) 76 | 77 | set(CMAKE_CUDA17_STANDARD_COMPILE_OPTION "-std=c++17") 78 | set(CMAKE_CUDA17_EXTENSION_COMPILE_OPTION "-std=c++17") 79 | set(CMAKE_CUDA17_STANDARD__HAS_FULL_SUPPORT TRUE) 80 | 
set_property(GLOBAL PROPERTY CMAKE_CUDA17_KNOWN_FEATURES) 81 | 82 | include(Internal/FeatureTesting) 83 | include(Compiler/CMakeCommonCompilerMacros) 84 | cmake_record_cuda_compile_features() 85 | 86 | set(CMAKE_CUDA_COMPILE_FEATURES 87 | ${CMAKE_CUDA03_COMPILE_FEATURES} 88 | ${CMAKE_CUDA11_COMPILE_FEATURES} 89 | ${CMAKE_CUDA14_COMPILE_FEATURES} 90 | ${CMAKE_CUDA17_COMPILE_FEATURES} 91 | ${CMAKE_CUDA20_COMPILE_FEATURES} 92 | ) 93 | endif () 94 | -------------------------------------------------------------------------------- /cmake/CubCudaConfig.cmake: -------------------------------------------------------------------------------- 1 | enable_language(CUDA) 2 | 3 | if (NOT CUB_IN_THRUST) 4 | message(FATAL_ERROR 5 | "Building CUB as a standalone project is no longer supported. " 6 | "Use the Thrust repo instead.") 7 | endif() 8 | 9 | set(CUB_CUDA_FLAGS_BASE "${THRUST_CUDA_FLAGS_BASE}") 10 | set(CUB_CUDA_FLAGS_RDC "${THRUST_CUDA_FLAGS_RDC}") 11 | set(CUB_CUDA_FLAGS_NO_RDC "${THRUST_CUDA_FLAGS_NO_RDC}") 12 | 13 | # Update the enabled architectures list from thrust 14 | foreach (arch IN LISTS THRUST_KNOWN_COMPUTE_ARCHS) 15 | if (THRUST_ENABLE_COMPUTE_${arch}) 16 | set(CUB_ENABLE_COMPUTE_${arch} True) 17 | string(APPEND arch_message " sm_${arch}") 18 | else() 19 | set(CUB_ENABLE_COMPUTE_${arch} False) 20 | endif() 21 | endforeach() 22 | 23 | message(STATUS ${arch_message}) 24 | 25 | # 26 | # RDC options: 27 | # 28 | 29 | # RDC is off by default in NVCC and on by default in NVC++. Turning off RDC 30 | # isn't currently supported by NVC++. So, we default to RDC off for NVCC and 31 | # RDC on for NVC++. 32 | set(option_init OFF) 33 | if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") 34 | set(option_init ON) 35 | endif() 36 | 37 | option(CUB_ENABLE_TESTS_WITH_RDC 38 | "Build all CUB tests with RDC; tests that require RDC are not affected by this option." 
39 | ${option_init} 40 | ) 41 | 42 | option(CUB_ENABLE_EXAMPLES_WITH_RDC 43 | "Build all CUB examples with RDC; examples which require RDC are not affected by this option." 44 | ${option_init} 45 | ) 46 | 47 | # Check for RDC/SM compatibility and error/warn if necessary 48 | set(rdc_supported True) 49 | foreach (arch IN LISTS no_rdc_archs) 50 | if (CUB_ENABLE_COMPUTE_${arch}) 51 | set(rdc_supported False) 52 | break() 53 | endif() 54 | endforeach() 55 | 56 | set(rdc_opts 57 | CUB_ENABLE_TESTS_WITH_RDC 58 | CUB_ENABLE_EXAMPLES_WITH_RDC 59 | ) 60 | set(rdc_requested False) 61 | foreach (rdc_opt IN LISTS rdc_opts) 62 | if (${rdc_opt}) 63 | set(rdc_requested True) 64 | break() 65 | endif() 66 | endforeach() 67 | 68 | if (rdc_requested AND NOT rdc_supported) 69 | string(JOIN ", " no_rdc ${no_rdc_archs}) 70 | string(JOIN "\n" opts ${rdc_opts}) 71 | message(FATAL_ERROR 72 | "Architectures {${no_rdc}} do not support RDC and are incompatible with " 73 | "these options:\n${opts}" 74 | ) 75 | endif() 76 | 77 | 78 | # 79 | # Clang CUDA options 80 | # 81 | if ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") 82 | set(CUB_CUDA_FLAGS_BASE "${CUB_CUDA_FLAGS_BASE} -Wno-unknown-cuda-version -Xclang=-fcuda-allow-variadic-functions") 83 | endif() 84 | 85 | 86 | # By default RDC is not used: 87 | set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_NO_RDC}") 88 | -------------------------------------------------------------------------------- /cmake/CubHeaderTesting.cmake: -------------------------------------------------------------------------------- 1 | # For every public header, build a translation unit containing `#include
` 2 | # to let the compiler try to figure out warnings in that header if it is not otherwise 3 | # included in tests, and also to verify if the headers are modular enough. 4 | # .inl files are not globbed for, because they are not supposed to be used as public 5 | # entrypoints. 6 | 7 | # Meta target for all configs' header builds: 8 | add_custom_target(cub.all.headers) 9 | 10 | file(GLOB_RECURSE headers 11 | RELATIVE "${CUB_SOURCE_DIR}/cub" 12 | CONFIGURE_DEPENDS 13 | cub/*.cuh 14 | ) 15 | 16 | set(headertest_srcs) 17 | foreach (header IN LISTS headers) 18 | set(headertest_src "headers/${header}.cu") 19 | configure_file("${CUB_SOURCE_DIR}/cmake/header_test.in" "${headertest_src}") 20 | list(APPEND headertest_srcs "${headertest_src}") 21 | endforeach() 22 | 23 | function(cub_add_header_test label definitions) 24 | foreach(cub_target IN LISTS CUB_TARGETS) 25 | cub_get_target_property(config_prefix ${cub_target} PREFIX) 26 | 27 | set(headertest_target ${config_prefix}.headers.${label}) 28 | add_library(${headertest_target} OBJECT ${headertest_srcs}) 29 | target_link_libraries(${headertest_target} PUBLIC ${cub_target}) 30 | target_compile_definitions(${headertest_target} PRIVATE ${definitions}) 31 | cub_clone_target_properties(${headertest_target} ${cub_target}) 32 | 33 | if (CUB_IN_THRUST) 34 | thrust_fix_clang_nvcc_build_for(${headertest_target}) 35 | endif() 36 | 37 | add_dependencies(cub.all.headers ${headertest_target}) 38 | add_dependencies(${config_prefix}.all ${headertest_target}) 39 | endforeach() 40 | endfunction() 41 | 42 | # Wrap Thrust/CUB in a custom namespace to check proper use of ns macros: 43 | set(header_definitions 44 | "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" 45 | "CUB_WRAPPED_NAMESPACE=wrapped_cub") 46 | cub_add_header_test(base "${header_definitions}") 47 | 48 | list(APPEND header_definitions "CUB_DISABLE_BF16_SUPPORT") 49 | cub_add_header_test(bf16 "${header_definitions}") 50 | 51 | 
-------------------------------------------------------------------------------- /cmake/CubInstallRules.cmake: -------------------------------------------------------------------------------- 1 | # Thrust manages its own copy of these rules. Update ThrustInstallRules.cmake 2 | # if modifying this file. 3 | if (CUB_IN_THRUST) 4 | return() 5 | endif() 6 | 7 | # Bring in CMAKE_INSTALL_LIBDIR 8 | include(GNUInstallDirs) 9 | 10 | # CUB is a header library; no need to build anything before installing: 11 | set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY TRUE) 12 | 13 | install(DIRECTORY "${CUB_SOURCE_DIR}/cub" 14 | DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" 15 | FILES_MATCHING 16 | PATTERN "*.cuh" 17 | ) 18 | 19 | install(DIRECTORY "${CUB_SOURCE_DIR}/cub/cmake/" 20 | DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cub" 21 | PATTERN *.cmake.in EXCLUDE 22 | ) 23 | # Need to configure a file to store the infix specified in 24 | # CMAKE_INSTALL_INCLUDEDIR since it can be defined by the user 25 | set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/cub") 26 | configure_file("${CUB_SOURCE_DIR}/cub/cmake/cub-header-search.cmake.in" 27 | "${CUB_BINARY_DIR}/cub/cmake/cub-header-search.cmake" 28 | @ONLY) 29 | install(FILES "${CUB_BINARY_DIR}/cub/cmake/cub-header-search.cmake" 30 | DESTINATION "${install_location}") 31 | -------------------------------------------------------------------------------- /cmake/CubUtilities.cmake: -------------------------------------------------------------------------------- 1 | # Enable RDC for a CUDA target. 
Encapsulates compiler hacks: 2 | function(cub_enable_rdc_for_cuda_target target_name) 3 | if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") 4 | set_target_properties(${target_name} PROPERTIES 5 | COMPILE_FLAGS "-gpu=rdc" 6 | ) 7 | elseif ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") 8 | else() 9 | set_target_properties(${target_name} PROPERTIES 10 | CUDA_SEPARABLE_COMPILATION ON 11 | ) 12 | endif() 13 | endfunction() 14 | -------------------------------------------------------------------------------- /cmake/header_test.in: -------------------------------------------------------------------------------- 1 | // This source file checks that: 2 | // 1) Header compiles without error. 3 | // 2) Common macro collisions with platform/system headers are avoided. 4 | 5 | // Define CUB_MACRO_CHECK(macro, header), which emits a diagnostic indicating 6 | // a potential macro collision and halts. 7 | // 8 | // Use raw platform checks instead of the CUB_HOST_COMPILER macros since we 9 | // don't want to #include any headers other than the one being tested. 10 | // 11 | // This is only implemented for MSVC/GCC/Clang. 12 | #if defined(_MSC_VER) // MSVC 13 | 14 | // Fake up an error for MSVC 15 | #define CUB_MACRO_CHECK_IMPL(msg) \ 16 | /* Print message that looks like an error: */ \ 17 | __pragma(message(__FILE__ ":" CUB_MACRO_CHECK_IMPL0(__LINE__) \ 18 | ": error: " #msg)) \ 19 | /* abort compilation due to static_assert or syntax error: */ \ 20 | static_assert(false, #msg); 21 | #define CUB_MACRO_CHECK_IMPL0(x) CUB_MACRO_CHECK_IMPL1(x) 22 | #define CUB_MACRO_CHECK_IMPL1(x) #x 23 | 24 | #elif defined(__clang__) || defined(__GNUC__) 25 | 26 | // GCC/clang are easy: 27 | #define CUB_MACRO_CHECK_IMPL(msg) CUB_MACRO_CHECK_IMPL0(GCC error #msg) 28 | #define CUB_MACRO_CHECK_IMPL0(expr) _Pragma(#expr) 29 | 30 | #endif 31 | 32 | // Hacky way to build a string, but it works on all tested platforms. 
33 | #define CUB_MACRO_CHECK(MACRO, HEADER) \ 34 | CUB_MACRO_CHECK_IMPL(Identifier MACRO should not be used from CUB \ 35 | headers due to conflicts with HEADER macros.) 36 | 37 | // complex.h conflicts 38 | #define I CUB_MACRO_CHECK('I', complex.h) 39 | 40 | // windows.h conflicts 41 | #define small CUB_MACRO_CHECK('small', windows.h) 42 | // We can't enable these checks without breaking some builds -- some standard 43 | // library implementations unconditionally `#undef` these macros, which then 44 | // causes random failures later. 45 | // Leaving these commented out as a warning: Here be dragons. 46 | //#define min(...) CUB_MACRO_CHECK('min', windows.h) 47 | //#define max(...) CUB_MACRO_CHECK('max', windows.h) 48 | 49 | // termios.h conflicts (NVIDIA/thrust#1547) 50 | #define B0 CUB_MACRO_CHECK("B0", termios.h) 51 | 52 | #include 53 | 54 | #if defined(CUB_DISABLE_BF16_SUPPORT) 55 | #if defined(__CUDA_BF16_TYPES_EXIST__) 56 | #error CUB should not include cuda_bf16.h when BF16 support is disabled 57 | #endif 58 | #endif 59 | -------------------------------------------------------------------------------- /cub/block/block_raking_layout.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #include "../config.cuh" 38 | #include "../util_type.cuh" 39 | 40 | CUB_NAMESPACE_BEGIN 41 | 42 | /** 43 | * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) 44 | * \ingroup BlockModule 45 | * 46 | * \par Overview 47 | * This type facilitates a shared memory usage pattern where a block of CUDA 48 | * threads places elements into shared memory and then reduces the active 49 | * parallelism to one "raking" warp of threads for serially aggregating consecutive 50 | * sequences of shared items. Padding is inserted to eliminate bank conflicts 51 | * (for most data types). 
52 | * 53 | * \tparam T The data type to be exchanged. 54 | * \tparam BLOCK_THREADS The thread block size in threads. 55 | * \tparam LEGACY_PTX_ARCH [optional] Unused. 56 | */ 57 | template < 58 | typename T, 59 | int BLOCK_THREADS, 60 | int LEGACY_PTX_ARCH = 0> 61 | struct BlockRakingLayout 62 | { 63 | //--------------------------------------------------------------------- 64 | // Constants and type definitions 65 | //--------------------------------------------------------------------- 66 | 67 | enum 68 | { 69 | /// The total number of elements that need to be cooperatively reduced 70 | SHARED_ELEMENTS = BLOCK_THREADS, 71 | 72 | /// Maximum number of warp-synchronous raking threads 73 | MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(0)), 74 | 75 | /// Number of raking elements per warp-synchronous raking thread (rounded up) 76 | SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, 77 | 78 | /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) 79 | RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, 80 | 81 | /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) 82 | HAS_CONFLICTS = (CUB_SMEM_BANKS(0) % SEGMENT_LENGTH == 0), 83 | 84 | /// Degree of bank conflicts (e.g., 4-way) 85 | CONFLICT_DEGREE = (HAS_CONFLICTS) ? 
86 | (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(0) : 87 | 1, 88 | 89 | /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load 90 | USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2), 91 | 92 | /// Total number of elements in the raking grid 93 | GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING), 94 | 95 | /// Whether bounds checking can be skipped during raking (true when the number of reduction elements is a multiple of the number of raking threads) 96 | UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), 97 | }; 98 | 99 | 100 | /** 101 | * \brief Shared memory storage type 102 | */ 103 | struct __align__(16) _TempStorage 104 | { 105 | T buff[BlockRakingLayout::GRID_ELEMENTS]; 106 | }; 107 | 108 | /// Alias wrapper allowing storage to be unioned 109 | struct TempStorage : Uninitialized<_TempStorage> {}; 110 | 111 | 112 | /** 113 | * \brief Returns the location for the calling thread to place data into the grid 114 | */ 115 | static __device__ __forceinline__ T* PlacementPtr( 116 | TempStorage &temp_storage, 117 | unsigned int linear_tid) 118 | { 119 | // Offset for partial 120 | unsigned int offset = linear_tid; 121 | 122 | // Add in one padding element for every segment 123 | if (USE_SEGMENT_PADDING > 0) 124 | { 125 | offset += offset / SEGMENT_LENGTH; 126 | } 127 | 128 | // Incorporating a block of padding partials every shared memory segment 129 | return temp_storage.Alias().buff + offset; 130 | } 131 | 132 | 133 | /** 134 | * \brief Returns the location for the calling thread to begin sequential raking 135 | */ 136 | static __device__ __forceinline__ T* RakingPtr( 137 | TempStorage &temp_storage, 138 | unsigned int linear_tid) 139 | { 140 | return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING)); 141 | } 142 | }; 143 | 144 | CUB_NAMESPACE_END 145 | 146 | 
-------------------------------------------------------------------------------- /cub/block/radix_rank_sort_operations.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * radix_rank_sort_operations.cuh contains common abstractions, definitions and 31 | * operations used for radix sorting and ranking. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../config.cuh" 37 | #include "../util_ptx.cuh" 38 | #include "../util_type.cuh" 39 | 40 | 41 | CUB_NAMESPACE_BEGIN 42 | 43 | /** \brief Twiddling keys for radix sort. */ 44 | template 45 | struct RadixSortTwiddle 46 | { 47 | typedef Traits TraitsT; 48 | typedef typename TraitsT::UnsignedBits UnsignedBits; 49 | static __host__ __device__ __forceinline__ UnsignedBits In(UnsignedBits key) 50 | { 51 | key = TraitsT::TwiddleIn(key); 52 | if (IS_DESCENDING) key = ~key; 53 | return key; 54 | } 55 | static __host__ __device__ __forceinline__ UnsignedBits Out(UnsignedBits key) 56 | { 57 | if (IS_DESCENDING) key = ~key; 58 | key = TraitsT::TwiddleOut(key); 59 | return key; 60 | } 61 | static __host__ __device__ __forceinline__ UnsignedBits DefaultKey() 62 | { 63 | return Out(~UnsignedBits(0)); 64 | } 65 | }; 66 | 67 | /** \brief Base struct for digit extractor. Contains common code to provide 68 | special handling for floating-point -0.0. 69 | 70 | \note This handles correctly both the case when the keys are 71 | bitwise-complemented after twiddling for descending sort (in onesweep) as 72 | well as when the keys are not bit-negated, but the implementation handles 73 | descending sort separately (in other implementations in CUB). Twiddling 74 | alone maps -0.0f to 0x7fffffff and +0.0f to 0x80000000 for float, which are 75 | subsequent bit patterns and bitwise complements of each other. For onesweep, 76 | both -0.0f and +0.0f are mapped to the bit pattern of +0.0f (0x80000000) for 77 | ascending sort, and to the pattern of -0.0f (0x7fffffff) for descending 78 | sort. For all other sorting implementations in CUB, both are always mapped 79 | to +0.0f. 
Since bit patterns for both -0.0f and +0.0f are next to each other 80 | and only one of them is used, the sorting works correctly. For double, the 81 | same applies, but with 64-bit patterns. 82 | */ 83 | template 84 | struct BaseDigitExtractor 85 | { 86 | typedef Traits TraitsT; 87 | typedef typename TraitsT::UnsignedBits UnsignedBits; 88 | 89 | enum 90 | { 91 | FLOAT_KEY = TraitsT::CATEGORY == FLOATING_POINT, 92 | }; 93 | 94 | static __device__ __forceinline__ UnsignedBits ProcessFloatMinusZero(UnsignedBits key) 95 | { 96 | if (!FLOAT_KEY) { 97 | return key; 98 | } else { 99 | UnsignedBits TWIDDLED_MINUS_ZERO_BITS = 100 | TraitsT::TwiddleIn(UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1)); 101 | UnsignedBits TWIDDLED_ZERO_BITS = TraitsT::TwiddleIn(0); 102 | return key == TWIDDLED_MINUS_ZERO_BITS ? TWIDDLED_ZERO_BITS : key; 103 | } 104 | } 105 | }; 106 | 107 | /** \brief A wrapper type to extract digits. Uses the BFE intrinsic to extract a 108 | * digit from a key. */ 109 | template 110 | struct BFEDigitExtractor : BaseDigitExtractor 111 | { 112 | using typename BaseDigitExtractor::UnsignedBits; 113 | 114 | uint32_t bit_start, num_bits; 115 | explicit __device__ __forceinline__ BFEDigitExtractor( 116 | uint32_t bit_start = 0, uint32_t num_bits = 0) 117 | : bit_start(bit_start), num_bits(num_bits) 118 | { } 119 | 120 | __device__ __forceinline__ uint32_t Digit(UnsignedBits key) 121 | { 122 | return BFE(this->ProcessFloatMinusZero(key), bit_start, num_bits); 123 | } 124 | }; 125 | 126 | /** \brief A wrapper type to extract digits. Uses a combination of shift and 127 | * bitwise AND to extract digits. 
*/ 128 | template 129 | struct ShiftDigitExtractor : BaseDigitExtractor 130 | { 131 | using typename BaseDigitExtractor::UnsignedBits; 132 | 133 | uint32_t bit_start, mask; 134 | explicit __device__ __forceinline__ ShiftDigitExtractor( 135 | uint32_t bit_start = 0, uint32_t num_bits = 0) 136 | : bit_start(bit_start), mask((1 << num_bits) - 1) 137 | { } 138 | 139 | __device__ __forceinline__ uint32_t Digit(UnsignedBits key) 140 | { 141 | return uint32_t(this->ProcessFloatMinusZero(key) >> UnsignedBits(bit_start)) & mask; 142 | } 143 | }; 144 | 145 | CUB_NAMESPACE_END 146 | -------------------------------------------------------------------------------- /cub/block/specializations/block_histogram_atomic.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../config.cuh" 37 | 38 | CUB_NAMESPACE_BEGIN 39 | 40 | 41 | /** 42 | * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
43 | */ 44 | template 45 | struct BlockHistogramAtomic 46 | { 47 | /// Shared memory storage layout type 48 | struct TempStorage {}; 49 | 50 | 51 | /// Constructor 52 | __device__ __forceinline__ BlockHistogramAtomic( 53 | TempStorage &temp_storage) 54 | {} 55 | 56 | 57 | /// Composite data onto an existing histogram 58 | template < 59 | typename T, 60 | typename CounterT, 61 | int ITEMS_PER_THREAD> 62 | __device__ __forceinline__ void Composite( 63 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 64 | CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram 65 | { 66 | // Update histogram 67 | #pragma unroll 68 | for (int i = 0; i < ITEMS_PER_THREAD; ++i) 69 | { 70 | atomicAdd(histogram + items[i], 1); 71 | } 72 | } 73 | 74 | }; 75 | 76 | CUB_NAMESPACE_END 77 | 78 | -------------------------------------------------------------------------------- /cub/cmake/cub-config-version.cmake: -------------------------------------------------------------------------------- 1 | # Parse version information from version.cuh: 2 | include("${CMAKE_CURRENT_LIST_DIR}/cub-header-search.cmake") 3 | 4 | file(READ "${_CUB_VERSION_INCLUDE_DIR}/cub/version.cuh" CUB_VERSION_HEADER) 5 | string(REGEX MATCH "#define[ \t]+CUB_VERSION[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}") 6 | set(CUB_VERSION_FLAT ${CMAKE_MATCH_1}) 7 | # Note that CUB calls this the PATCH number, CMake calls it the TWEAK number: 8 | string(REGEX MATCH "#define[ \t]+CUB_PATCH_NUMBER[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}") 9 | set(CUB_VERSION_TWEAK ${CMAKE_MATCH_1}) 10 | 11 | math(EXPR CUB_VERSION_MAJOR "${CUB_VERSION_FLAT} / 100000") 12 | math(EXPR CUB_VERSION_MINOR "(${CUB_VERSION_FLAT} / 100) % 1000") 13 | math(EXPR CUB_VERSION_PATCH "${CUB_VERSION_FLAT} % 100") # CUB: "subminor" CMake: "patch" 14 | 15 | set(CUB_VERSION "${CUB_VERSION_MAJOR}.${CUB_VERSION_MINOR}.${CUB_VERSION_PATCH}.${CUB_VERSION_TWEAK}") 16 | 17 | set(PACKAGE_VERSION 
${CUB_VERSION}) 18 | set(PACKAGE_VERSION_COMPATIBLE FALSE) 19 | set(PACKAGE_VERSION_EXACT FALSE) 20 | set(PACKAGE_VERSION_UNSUITABLE FALSE) 21 | 22 | if(PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION) 23 | if(CUB_VERSION_MAJOR VERSION_EQUAL PACKAGE_FIND_VERSION_MAJOR AND 24 | CUB_VERSION_MINOR VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MINOR) 25 | set(PACKAGE_VERSION_COMPATIBLE TRUE) 26 | endif() 27 | 28 | if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION) 29 | set(PACKAGE_VERSION_EXACT TRUE) 30 | endif() 31 | endif() 32 | -------------------------------------------------------------------------------- /cub/cmake/cub-config.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # find_package(CUB) config file. 3 | # 4 | # Defines a CUB::CUB target that may be linked from user projects to include 5 | # CUB. 6 | 7 | if (TARGET CUB::CUB) 8 | return() 9 | endif() 10 | 11 | # Minimum supported libcudacxx version: 12 | set(cub_libcudacxx_version 1.8.0) 13 | 14 | function(_cub_declare_interface_alias alias_name ugly_name) 15 | # 1) Only IMPORTED and ALIAS targets can be placed in a namespace. 16 | # 2) When an IMPORTED library is linked to another target, its include 17 | # directories are treated as SYSTEM includes. 18 | # 3) nvcc will automatically check the CUDA Toolkit include path *before* the 19 | # system includes. This means that the Toolkit CUB will *always* be used 20 | # during compilation, and the include paths of an IMPORTED CUB::CUB 21 | # target will never have any effect. 22 | # 4) This behavior can be fixed by setting the property NO_SYSTEM_FROM_IMPORTED 23 | # on EVERY target that links to CUB::CUB. This would be a burden and a 24 | # footgun for our users. Forgetting this would silently pull in the wrong CUB! 
25 | # 5) A workaround is to make a non-IMPORTED library outside of the namespace, 26 | # configure it, and then ALIAS it into the namespace (or ALIAS and then 27 | # configure, that seems to work too). 28 | add_library(${ugly_name} INTERFACE) 29 | add_library(${alias_name} ALIAS ${ugly_name}) 30 | endfunction() 31 | 32 | # 33 | # Setup some internal cache variables 34 | # 35 | 36 | # Pull in the include dir detected by cub-config-version.cmake 37 | set(_CUB_INCLUDE_DIR "${_CUB_VERSION_INCLUDE_DIR}" 38 | CACHE INTERNAL "Location of CUB headers." 39 | FORCE 40 | ) 41 | unset(_CUB_VERSION_INCLUDE_DIR CACHE) # Clear tmp variable from cache 42 | 43 | if (${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) 44 | set(_CUB_QUIET ON CACHE INTERNAL "Quiet mode enabled for CUB find_package calls." FORCE) 45 | set(_CUB_QUIET_FLAG "QUIET" CACHE INTERNAL "" FORCE) 46 | else() 47 | set(_CUB_QUIET OFF CACHE INTERNAL "Quiet mode enabled for CUB find_package calls." FORCE) 48 | set(_CUB_QUIET_FLAG "" CACHE INTERNAL "" FORCE) 49 | endif() 50 | 51 | # 52 | # Setup dependencies 53 | # 54 | 55 | if (NOT TARGET CUB::libcudacxx) 56 | if (TARGET Thrust::libcudacxx) 57 | # Prefer the same libcudacxx as Thrust, if available: 58 | _cub_declare_interface_alias(CUB::libcudacxx _CUB_libcudacxx) 59 | target_link_libraries(_CUB_libcudacxx INTERFACE Thrust::libcudacxx) 60 | else() 61 | if (NOT TARGET libcudacxx::libcudacxx) 62 | # First do a non-required search for any co-packaged versions. 63 | # These are preferred. 64 | find_package(libcudacxx ${cub_libcudacxx_version} CONFIG 65 | ${_CUB_QUIET_FLAG} 66 | NO_DEFAULT_PATH # Only check the explicit HINTS below: 67 | HINTS 68 | "${_CUB_INCLUDE_DIR}/../libcudacxx" # Source layout 69 | "${_CUB_CMAKE_DIR}/.." # Install layout 70 | ) 71 | 72 | # A second required search allows externally packaged to be used and fails if 73 | # no suitable package exists. 
74 | find_package(libcudacxx ${cub_libcudacxx_version} CONFIG 75 | REQUIRED 76 | ${_CUB_QUIET_FLAG} 77 | ) 78 | endif() 79 | _cub_declare_interface_alias(CUB::libcudacxx _CUB_libcudacxx) 80 | target_link_libraries(_CUB_libcudacxx INTERFACE libcudacxx::libcudacxx) 81 | endif() 82 | endif() 83 | 84 | # 85 | # Setup targets 86 | # 87 | 88 | _cub_declare_interface_alias(CUB::CUB _CUB_CUB) 89 | target_include_directories(_CUB_CUB INTERFACE "${_CUB_INCLUDE_DIR}") 90 | target_link_libraries(_CUB_CUB INTERFACE CUB::libcudacxx) 91 | 92 | if (CUB_IGNORE_DEPRECATED_API OR THRUST_IGNORE_DEPRECATED_API) 93 | target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_API") 94 | endif() 95 | 96 | if (CUB_IGNORE_DEPRECATED_CPP_DIALECT OR 97 | THRUST_IGNORE_DEPRECATED_CPP_DIALECT) 98 | target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_DIALECT") 99 | endif() 100 | 101 | if (CUB_IGNORE_DEPRECATED_CPP_11 OR 102 | THRUST_IGNORE_DEPRECATED_CPP_11) 103 | target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_11") 104 | endif() 105 | 106 | if (CUB_IGNORE_DEPRECATED_COMPILER OR 107 | THRUST_IGNORE_DEPRECATED_COMPILER) 108 | target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_COMPILER") 109 | endif() 110 | 111 | # 112 | # Standardize version info 113 | # 114 | 115 | set(CUB_VERSION ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} CACHE INTERNAL "" FORCE) 116 | set(CUB_VERSION_MAJOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MAJOR} CACHE INTERNAL "" FORCE) 117 | set(CUB_VERSION_MINOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MINOR} CACHE INTERNAL "" FORCE) 118 | set(CUB_VERSION_PATCH ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_PATCH} CACHE INTERNAL "" FORCE) 119 | set(CUB_VERSION_TWEAK ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_TWEAK} CACHE INTERNAL "" FORCE) 120 | set(CUB_VERSION_COUNT ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_COUNT} CACHE INTERNAL "" FORCE) 121 | 122 | include(FindPackageHandleStandardArgs) 123 | if (NOT CUB_CONFIG) 124 | set(CUB_CONFIG 
"${CMAKE_CURRENT_LIST_FILE}") 125 | endif() 126 | find_package_handle_standard_args(CUB CONFIG_MODE) 127 | -------------------------------------------------------------------------------- /cub/cmake/cub-header-search.cmake: -------------------------------------------------------------------------------- 1 | # Parse version information from cub/version.cuh in the source tree 2 | set(_CUB_VERSION_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}/../..") 3 | if(EXISTS "${_CUB_VERSION_INCLUDE_DIR}/cub/version.cuh") 4 | set(_CUB_VERSION_INCLUDE_DIR "${_CUB_VERSION_INCLUDE_DIR}" CACHE FILEPATH "" FORCE) # Clear old result 5 | set_property(CACHE _CUB_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL) 6 | endif() 7 | -------------------------------------------------------------------------------- /cub/cmake/cub-header-search.cmake.in: -------------------------------------------------------------------------------- 1 | # Parse version information from cub/version.cuh: 2 | unset(_CUB_VERSION_INCLUDE_DIR CACHE) # Clear old result to force search 3 | 4 | # Find CMAKE_INSTALL_INCLUDEDIR=@CMAKE_INSTALL_INCLUDEDIR@ directory 5 | set(from_install_prefix "@install_location@") 6 | 7 | # Transform to a list of directories, replace each directory with "../" 8 | # and convert back to a string 9 | string(REGEX REPLACE "/" ";" from_install_prefix "${from_install_prefix}") 10 | list(TRANSFORM from_install_prefix REPLACE ".+" "../") 11 | list(JOIN from_install_prefix "" from_install_prefix) 12 | 13 | find_path(_CUB_VERSION_INCLUDE_DIR cub/version.cuh 14 | NO_DEFAULT_PATH # Only search explicit paths below: 15 | PATHS 16 | "${CMAKE_CURRENT_LIST_DIR}/${from_install_prefix}/@CMAKE_INSTALL_INCLUDEDIR@" 17 | ) 18 | set_property(CACHE _CUB_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL) 19 | -------------------------------------------------------------------------------- /cub/config.cuh: -------------------------------------------------------------------------------- 1 | 
/****************************************************************************** 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Static configuration header for the CUB project. 
31 | */ 32 | 33 | #pragma once 34 | 35 | #include "util_arch.cuh" 36 | #include "util_compiler.cuh" 37 | #include "util_cpp_dialect.cuh" 38 | #include "util_deprecated.cuh" 39 | #include "util_macro.cuh" 40 | #include "util_namespace.cuh" 41 | -------------------------------------------------------------------------------- /cub/cub.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * CUB umbrella include file 32 | */ 33 | 34 | #pragma once 35 | 36 | // Static configuration 37 | #include "config.cuh" 38 | 39 | // Block 40 | #include "block/block_adjacent_difference.cuh" 41 | #include "block/block_discontinuity.cuh" 42 | #include "block/block_exchange.cuh" 43 | #include "block/block_histogram.cuh" 44 | #include "block/block_load.cuh" 45 | #include "block/block_merge_sort.cuh" 46 | #include "block/block_radix_rank.cuh" 47 | #include "block/block_radix_sort.cuh" 48 | #include "block/block_reduce.cuh" 49 | #include "block/block_scan.cuh" 50 | #include "block/block_store.cuh" 51 | //#include "block/block_shift.cuh" 52 | 53 | // Device 54 | #include "device/device_adjacent_difference.cuh" 55 | #include "device/device_copy.cuh" 56 | #include "device/device_histogram.cuh" 57 | #include "device/device_memcpy.cuh" 58 | #include "device/device_merge_sort.cuh" 59 | #include "device/device_partition.cuh" 60 | #include "device/device_radix_sort.cuh" 61 | #include "device/device_reduce.cuh" 62 | #include "device/device_run_length_encode.cuh" 63 | #include "device/device_scan.cuh" 64 | #include "device/device_segmented_radix_sort.cuh" 65 | #include "device/device_segmented_reduce.cuh" 66 | #include "device/device_segmented_sort.cuh" 67 | #include "device/device_select.cuh" 68 | #include 
"device/device_spmv.cuh" 69 | 70 | // Grid 71 | //#include "grid/grid_barrier.cuh" 72 | #include "grid/grid_even_share.cuh" 73 | #include "grid/grid_mapping.cuh" 74 | #include "grid/grid_queue.cuh" 75 | 76 | // Thread 77 | #include "thread/thread_load.cuh" 78 | #include "thread/thread_operators.cuh" 79 | #include "thread/thread_reduce.cuh" 80 | #include "thread/thread_scan.cuh" 81 | #include "thread/thread_store.cuh" 82 | 83 | // Warp 84 | #include "warp/warp_exchange.cuh" 85 | #include "warp/warp_load.cuh" 86 | #include "warp/warp_merge_sort.cuh" 87 | #include "warp/warp_reduce.cuh" 88 | #include "warp/warp_scan.cuh" 89 | #include "warp/warp_store.cuh" 90 | 91 | // Iterator 92 | #include "iterator/arg_index_input_iterator.cuh" 93 | #include "iterator/cache_modified_input_iterator.cuh" 94 | #include "iterator/cache_modified_output_iterator.cuh" 95 | #include "iterator/constant_input_iterator.cuh" 96 | #include "iterator/counting_input_iterator.cuh" 97 | #include "iterator/discard_output_iterator.cuh" 98 | #include "iterator/tex_obj_input_iterator.cuh" 99 | #include "iterator/tex_ref_input_iterator.cuh" 100 | #include "iterator/transform_input_iterator.cuh" 101 | 102 | // Util 103 | #include "util_allocator.cuh" 104 | #include "util_arch.cuh" 105 | #include "util_debug.cuh" 106 | #include "util_device.cuh" 107 | #include "util_macro.cuh" 108 | #include "util_ptx.cuh" 109 | #include "util_type.cuh" 110 | -------------------------------------------------------------------------------- /cub/detail/choose_offset.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #pragma once 29 | 30 | #include 31 | 32 | #include 33 | #include 34 | 35 | CUB_NAMESPACE_BEGIN 36 | 37 | namespace detail 38 | { 39 | 40 | /** 41 | * ChooseOffsetT checks NumItemsT, the type of the num_items parameter, and 42 | * selects the offset type based on it. 
43 | */ 44 | template 45 | struct ChooseOffsetT 46 | { 47 | // NumItemsT must be an integral type (but not bool). 48 | static_assert( 49 | std::is_integral::value && 50 | !std::is_same::type, bool>::value, 51 | "NumItemsT must be an integral type, but not bool"); 52 | 53 | // Unsigned integer type for global offsets. 54 | using Type = typename std::conditional::type; 57 | }; 58 | 59 | } // namespace detail 60 | 61 | CUB_NAMESPACE_END 62 | 63 | -------------------------------------------------------------------------------- /cub/detail/cpp_compatibility.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | 18 | #pragma once 19 | 20 | #include 21 | 22 | #if CUB_CPP_DIALECT >= 2017 && __cpp_if_constexpr 23 | # define CUB_IF_CONSTEXPR if constexpr 24 | # define CUB_ELSE_IF_CONSTEXPR else if constexpr 25 | #else 26 | # define CUB_IF_CONSTEXPR if 27 | # define CUB_ELSE_IF_CONSTEXPR else if 28 | #endif 29 | -------------------------------------------------------------------------------- /cub/detail/detect_cuda_runtime.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 
3 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Utilities for CUDA dynamic parallelism. 
32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | 38 | #include 39 | 40 | CUB_NAMESPACE_BEGIN 41 | namespace detail 42 | { 43 | 44 | #ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: 45 | 46 | /** 47 | * \def CUB_DISABLE_CDP 48 | * 49 | * If defined, support for device-side usage of CUB is disabled. 50 | */ 51 | #define CUB_DISABLE_CDP 52 | 53 | /** 54 | * \def CUB_RDC_ENABLED 55 | * 56 | * Defined if RDC is enabled and CUB_DISABLE_CDP is not defined. 57 | */ 58 | #define CUB_RDC_ENABLED 59 | 60 | /** 61 | * \def CUB_RUNTIME_FUNCTION 62 | * 63 | * Execution space for functions that can use the CUDA runtime API (`__host__` 64 | * when RDC is off, `__host__ __device__` when RDC is on). 65 | */ 66 | #define CUB_RUNTIME_FUNCTION 67 | 68 | /** 69 | * \def CUB_RUNTIME_ENABLED 70 | * 71 | * Whether or not the active compiler pass is allowed to invoke device kernels 72 | * or methods from the CUDA runtime API. 73 | * 74 | * This macro should not be used in CUB, as it depends on `__CUDA_ARCH__` 75 | * and is not compatible with `NV_IF_TARGET`. It is provided for legacy 76 | * purposes only. 77 | * 78 | * Replace any usages with `CUB_RDC_ENABLED` and `NV_IF_TARGET`. 79 | */ 80 | #define CUB_RUNTIME_ENABLED 81 | 82 | #else // Non-doxygen pass: 83 | 84 | #ifndef CUB_RUNTIME_FUNCTION 85 | 86 | #if defined(__CUDACC_RDC__) && !defined(CUB_DISABLE_CDP) 87 | 88 | #define CUB_RDC_ENABLED 89 | #define CUB_RUNTIME_FUNCTION __host__ __device__ 90 | 91 | #else // RDC disabled: 92 | 93 | #define CUB_RUNTIME_FUNCTION __host__ 94 | 95 | #endif // RDC enabled 96 | 97 | #if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__) 98 | // Legacy only -- do not use in new code. 
99 | #define CUB_RUNTIME_ENABLED 100 | #endif 101 | 102 | #endif // CUB_RUNTIME_FUNCTION predefined 103 | 104 | #ifdef CUB_RDC_ENABLED 105 | // Detect available version of CDP: 106 | #if __CUDACC_VER_MAJOR__ < 12 || defined(CUDA_FORCE_CDP1_IF_SUPPORTED) 107 | #define CUB_DETAIL_CDPv1 108 | #else 109 | #define CUB_DETAIL_CDPv2 110 | #endif 111 | #endif 112 | 113 | #endif // Do not document 114 | 115 | } // namespace detail 116 | CUB_NAMESPACE_END 117 | -------------------------------------------------------------------------------- /cub/detail/device_double_buffer.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | 22 | CUB_NAMESPACE_BEGIN 23 | 24 | namespace detail 25 | { 26 | 27 | 28 | /** 29 | * @brief A double-buffer storage wrapper for multi-pass stream 30 | * transformations that require more than one storage array for 31 | * streaming intermediate results back and forth. 32 | * 33 | * Many multi-pass computations require a pair of "ping-pong" storage buffers 34 | * (e.g., one for reading from and the other for writing to, and then 35 | * vice-versa for the subsequent pass). This class wraps a pair of device 36 | * buffer pointers. 
37 | * 38 | * Unlike `cub::DoubleBuffer`, this class doesn't provide a "selector" member 39 | * to track which buffer is "current". The main reason for this class's existence 40 | * is the performance difference. Since `cub::DoubleBuffer` relies on a 41 | * runtime variable to index its pointer array, the pointers are placed in local 42 | * memory instead of registers. Local memory accesses significantly affect 43 | * performance. In contrast, this class swaps pointers, so all operations 44 | * can be performed in registers. 45 | */ 46 | template 47 | class device_double_buffer 48 | { 49 | /// Pair of device buffer pointers 50 | T *m_current_buffer {}; 51 | T *m_alternate_buffer {}; 52 | 53 | public: 54 | /** 55 | * @param current 56 | * The currently valid buffer 57 | * 58 | * @param alternate 59 | * Alternate storage buffer of the same size as @p current 60 | */ 61 | __host__ __device__ __forceinline__ device_double_buffer(T *current, 62 | T *alternate) 63 | : m_current_buffer(current) 64 | , m_alternate_buffer(alternate) 65 | {} 66 | 67 | /// \brief Return pointer to the currently valid buffer 68 | __host__ __device__ __forceinline__ T *current() const 69 | { 70 | return m_current_buffer; 71 | } 72 | 73 | /// \brief Return pointer to the alternate (currently invalid) buffer 74 | __host__ __device__ __forceinline__ T *alternate() const 75 | { 76 | return m_alternate_buffer; 77 | } 78 | /// \brief Swap the current and alternate buffer pointers 79 | __host__ __device__ void swap() 80 | { 81 | T *tmp = m_current_buffer; 82 | m_current_buffer = m_alternate_buffer; 83 | m_alternate_buffer = tmp; 84 | } 85 | }; 86 | 87 | 88 | } // namespace detail 89 | 90 | CUB_NAMESPACE_END 91 | -------------------------------------------------------------------------------- /cub/detail/device_synchronize.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, 
Version 2.0 (the "License"); 5 | * you may not use this file except in 
compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | 26 | #include 27 | 28 | CUB_NAMESPACE_BEGIN 29 | 30 | namespace detail 31 | { 32 | 33 | /** 34 | * Call `cudaDeviceSynchronize()` using the proper API for the current CUB and 35 | * CUDA configuration. 36 | */ 37 | CUB_EXEC_CHECK_DISABLE 38 | CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize() 39 | { 40 | cudaError_t result = cudaErrorNotSupported; 41 | 42 | // Device-side sync is only available under CDPv1: 43 | #if defined(CUB_DETAIL_CDPv1) 44 | 45 | #if ((__CUDACC_VER_MAJOR__ > 11) || \ 46 | ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6))) 47 | // CUDA >= 11.6 48 | #define CUB_TMP_DEVICE_SYNC_IMPL \ 49 | result = __cudaDeviceSynchronizeDeprecationAvoidance(); 50 | #else // CUDA < 11.6: 51 | #define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize(); 52 | #endif 53 | 54 | #else // CDPv2 or no CDP: 55 | 56 | #define CUB_TMP_DEVICE_SYNC_IMPL /* unavailable */ 57 | 58 | #endif // CDP version 59 | 60 | NV_IF_TARGET(NV_IS_HOST, 61 | (result = cudaDeviceSynchronize();), 62 | (CUB_TMP_DEVICE_SYNC_IMPL)); 63 | 64 | #undef CUB_TMP_DEVICE_SYNC_IMPL 65 | 66 | return result; 67 | } 68 | 69 | } // namespace detail 70 | 71 | CUB_NAMESPACE_END 72 | -------------------------------------------------------------------------------- /cub/detail/exec_check_disable.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 
2021 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | /** 22 | * @def CUB_EXEC_CHECK_DISABLE 23 | * Wrapper around `#pragma nv_exec_check_disable`. 24 | */ 25 | 26 | // #pragma nv_exec_check_disable is only recognized by NVCC. 27 | #if defined(__CUDACC__) && \ 28 | !defined(_NVHPC_CUDA) && \ 29 | !(defined(__CUDA__) && defined(__clang__)) 30 | 31 | #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC 32 | #define CUB_EXEC_CHECK_DISABLE __pragma("nv_exec_check_disable") 33 | #else // !MSVC 34 | #define CUB_EXEC_CHECK_DISABLE _Pragma("nv_exec_check_disable") 35 | #endif // MSVC 36 | 37 | #else // !NVCC 38 | 39 | #define CUB_EXEC_CHECK_DISABLE 40 | 41 | #endif // NVCC 42 | -------------------------------------------------------------------------------- /cub/detail/type_traits.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Wrappers and extensions around utilities. 
CUB_NAMESPACE_BEGIN
namespace detail {

// NOTE(review): the template parameter and argument lists in this region
// appear to have been stripped from this copy (everything between angle
// brackets is missing), so the declarations below are not well-formed as
// shown. Restore them from the canonical header before building; they are
// deliberately not guessed here.

/// Alias for the result type of an invocation. Uses
/// `::cuda::std::result_of` when `CUB_CPP_DIALECT < 2017` and
/// `::cuda::std::invoke_result_t` for 2017 and newer dialects.
template
using invoke_result_t =
#if CUB_CPP_DIALECT < 2017
  typename ::cuda::std::result_of::type;
#else // 2017+
  ::cuda::std::invoke_result_t;
#endif

/// The type of intermediate accumulator (according to P2322R6).
/// Defined as a `::cuda::std::decay`-ed type; the decayed operand is not
/// legible in this copy (presumably an `invoke_result_t` instantiation —
/// TODO confirm).
template
using accumulator_t =
  typename ::cuda::std::decay>::type;

} // namespace detail
CUB_NAMESPACE_END
CUB_NAMESPACE_BEGIN


namespace detail
{

// uninitialized_copy(ptr, val): store `val` into the object slot at `ptr`.
//
// Three definitions are provided:
//  - NVHPC builds (`_NVHPC_CUDA`): a single overload that always constructs
//    the value in place with placement-new (workaround for NVBug 3384810).
//  - All other builds: two overloads with the same function signature that
//    are distinguished only by a trailing defaulted non-type template
//    parameter. The first assigns through the pointer (`*ptr = ...`), the
//    second constructs in place with placement-new.
//
// NOTE(review): the template parameter lists, the conditions selecting
// between the two non-NVHPC overloads, and the explicit argument of
// `::cuda::std::forward` appear to have been stripped from this copy
// (everything between angle brackets is missing). Presumably the selector is
// a type trait on `T` deciding whether plain assignment into raw storage is
// acceptable — TODO confirm against the canonical header and restore.

#if defined(_NVHPC_CUDA)
template
__host__ __device__ void uninitialized_copy(T *ptr, U &&val)
{
  // NVBug 3384810
  new (ptr) T(::cuda::std::forward(val));
}
#else
template ::value,
  int
  >::type = 0>
__host__ __device__ void uninitialized_copy(T *ptr, U &&val)
{
  // Assignment form: writes through the pointer without running a constructor.
  *ptr = ::cuda::std::forward(val);
}

template ::value,
  int
  >::type = 0>
__host__ __device__ void uninitialized_copy(T *ptr, U &&val)
{
  // Construction form: builds a new T at `ptr` from `val`.
  new (ptr) T(::cuda::std::forward(val));
}
#endif

} // namespace detail


CUB_NAMESPACE_END
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_debug.cuh" 37 | #include "../config.cuh" 38 | #include "../thread/thread_load.cuh" 39 | 40 | CUB_NAMESPACE_BEGIN 41 | 42 | 43 | /** 44 | * \addtogroup GridModule 45 | * @{ 46 | */ 47 | 48 | 49 | /** 50 | * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid 51 | */ 52 | class GridBarrier 53 | { 54 | protected : 55 | 56 | typedef unsigned int SyncFlag; 57 | 58 | // Counters in global device memory 59 | SyncFlag* d_sync; 60 | 61 | public: 62 | 63 | /** 64 | * Constructor 65 | */ 66 | GridBarrier() : d_sync(NULL) {} 67 | 68 | 69 | /** 70 | * Synchronize 71 | */ 72 | __device__ __forceinline__ void Sync() const 73 | { 74 | volatile SyncFlag *d_vol_sync = d_sync; 75 | 76 | // Threadfence and syncthreads to make sure global writes are visible before 77 | // thread-0 reports in with its sync counter 78 | __threadfence(); 79 | CTA_SYNC(); 80 | 81 | if (blockIdx.x == 0) 82 | { 83 | // Report in ourselves 84 | if (threadIdx.x == 0) 85 | { 86 | d_vol_sync[blockIdx.x] = 1; 87 | } 88 | 89 | CTA_SYNC(); 90 | 91 | // Wait for everyone else to report in 92 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 93 | { 94 | while (ThreadLoad(d_sync + peer_block) == 0) 95 | { 96 | __threadfence_block(); 97 | } 98 | } 99 | 100 | CTA_SYNC(); 101 | 102 | // Let everyone know it's safe to proceed 103 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 104 | { 105 | d_vol_sync[peer_block] = 0; 106 | } 107 | } 108 | else 109 | { 110 | if (threadIdx.x == 0) 111 | { 112 | // Report in 113 | d_vol_sync[blockIdx.x] = 1; 114 | 115 | // Wait for acknowledgment 116 | while (ThreadLoad(d_sync + blockIdx.x) == 1) 117 
| { 118 | __threadfence_block(); 119 | } 120 | } 121 | 122 | CTA_SYNC(); 123 | } 124 | } 125 | }; 126 | 127 | 128 | /** 129 | * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. 130 | * 131 | * Uses RAII for lifetime, i.e., device resources are reclaimed when 132 | * the destructor is called. 133 | */ 134 | class GridBarrierLifetime : public GridBarrier 135 | { 136 | protected: 137 | 138 | // Number of bytes backed by d_sync 139 | size_t sync_bytes; 140 | 141 | public: 142 | 143 | /** 144 | * Constructor 145 | */ 146 | GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} 147 | 148 | 149 | /** 150 | * DeviceFrees and resets the progress counters 151 | */ 152 | cudaError_t HostReset() 153 | { 154 | cudaError_t retval = cudaSuccess; 155 | if (d_sync) 156 | { 157 | CubDebug(retval = cudaFree(d_sync)); 158 | d_sync = NULL; 159 | } 160 | sync_bytes = 0; 161 | return retval; 162 | } 163 | 164 | 165 | /** 166 | * Destructor 167 | */ 168 | virtual ~GridBarrierLifetime() 169 | { 170 | HostReset(); 171 | } 172 | 173 | 174 | /** 175 | * Sets up the progress counters for the next kernel launch (lazily 176 | * allocating and initializing them if necessary) 177 | */ 178 | cudaError_t Setup(int sweep_grid_size) 179 | { 180 | cudaError_t retval = cudaSuccess; 181 | do { 182 | size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); 183 | if (new_sync_bytes > sync_bytes) 184 | { 185 | if (d_sync) 186 | { 187 | if (CubDebug(retval = cudaFree(d_sync))) break; 188 | } 189 | 190 | sync_bytes = new_sync_bytes; 191 | 192 | // Allocate and initialize to zero 193 | if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; 194 | if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; 195 | } 196 | } while (0); 197 | 198 | return retval; 199 | } 200 | }; 201 | 202 | 203 | /** @} */ // end group GridModule 204 | 205 | CUB_NAMESPACE_END 206 | 207 | 
-------------------------------------------------------------------------------- /cub/grid/grid_mapping.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
CUB_NAMESPACE_BEGIN


/**
 * \addtogroup GridModule
 * @{
 */


/******************************************************************************
 * Mapping policies
 *****************************************************************************/


/**
 * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
 */
enum GridMappingStrategy
{
    /**
     * \brief A "raking" access pattern in which each thread block is
     * assigned a consecutive sequence of input tiles
     *
     * \par Overview
     * The input is evenly partitioned into \p p segments, where \p p is
     * constant and corresponds loosely to the number of thread blocks that may
     * actively reside on the target device. Each segment is comprised of
     * consecutive tiles, where a tile is a small, constant-sized unit of input
     * to be processed to completion before the thread block terminates or
     * obtains more work. The kernel invokes \p p thread blocks, each
     * of which iteratively consumes a segment of n/p elements
     * in tile-size increments.
     */
    GRID_MAPPING_RAKE,

    /**
     * \brief A "strip mining" access pattern in which the input tiles assigned
     * to each thread block are separated by a stride equal to the extent of
     * the grid.
     *
     * \par Overview
     * The input is evenly partitioned into \p p sets, where \p p is
     * constant and corresponds loosely to the number of thread blocks that may
     * actively reside on the target device. Each set is comprised of
     * data tiles separated by stride \p tiles, where a tile is a small,
     * constant-sized unit of input to be processed to completion before the
     * thread block terminates or obtains more work. The kernel invokes \p p
     * thread blocks, each of which iteratively consumes a segment of
     * n/p elements in tile-size increments.
     */
    GRID_MAPPING_STRIP_MINE,

    /**
     * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
     *
     * \par Overview
     * The input is treated as a queue to be dynamically consumed by a grid of
     * thread blocks. Work is atomically dequeued in tiles, where a tile is a
     * unit of input to be processed to completion before the thread block
     * terminates or obtains more work. The grid size \p p is constant,
     * loosely corresponding to the number of thread blocks that may actively
     * reside on the target device.
     */
    GRID_MAPPING_DYNAMIC,
};


/** @} */       // end group GridModule

CUB_NAMESPACE_END
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple portable mutex 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | 38 | #include 39 | #include 40 | 41 | 42 | CUB_NAMESPACE_BEGIN 43 | 44 | 45 | /** 46 | * Wraps std::mutex 47 | * @deprecated [Since CUB 2.1.0] The `cub::Mutex` is deprecated and will be removed 48 | * in a future release. 
Use `std::mutex` instead. 49 | */ 50 | struct CUB_DEPRECATED Mutex 51 | { 52 | std::mutex mtx; 53 | 54 | void Lock() 55 | { 56 | mtx.lock(); 57 | } 58 | 59 | void Unlock() 60 | { 61 | mtx.unlock(); 62 | } 63 | }; 64 | 65 | 66 | CUB_NAMESPACE_END 67 | -------------------------------------------------------------------------------- /cub/iterator/tex_ref_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include 40 | 41 | CUB_NAMESPACE_BEGIN 42 | 43 | /** 44 | * \addtogroup UtilIterator 45 | * @{ 46 | */ 47 | 48 | /** 49 | * \brief A random-access input wrapper for dereferencing array values through texture cache. 50 | * 51 | * \deprecated [Since 1.13.0] The CUDA texture management APIs used by 52 | * TexRefInputIterator are deprecated. Use cub::TexObjInputIterator instead. 53 | * 54 | * \par Overview 55 | * - TexRefInputIterator wraps a native device pointer of type ValueType*. References 56 | * to elements are to be loaded through texture cache. 57 | * - Can be used to load any data type from memory through texture cache. 58 | * - Can be manipulated and exchanged within and between host and device 59 | * functions, can only be constructed within host functions, and can only be 60 | * dereferenced within device functions. 61 | * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture 62 | * reference. Only one TexRefInputIterator instance can be bound at any given time for a 63 | * specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host 64 | * thread, and (4) compilation .o unit. 
65 | * - With regard to nested/dynamic parallelism, TexRefInputIterator iterators may only be 66 | * created by the host thread and used by a top-level kernel (i.e. the one which is launched 67 | * from the host). 68 | * - Compatible with Thrust API v1.7 or newer. 69 | * 70 | * \par Snippet 71 | * The code snippet below illustrates the use of \p TexRefInputIterator to 72 | * dereference a device array of doubles through texture cache. 73 | * \par 74 | * \code 75 | * #include // or equivalently 76 | * 77 | * // Declare, allocate, and initialize a device array 78 | * int num_items; // e.g., 7 79 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] 80 | * 81 | * // Create an iterator wrapper 82 | * cub::TexRefInputIterator itr; 83 | * itr.BindTexture(d_in, sizeof(double) * num_items); 84 | * ... 85 | * 86 | * // Within device code: 87 | * printf("%f\n", itr[0]); // 8.0 88 | * printf("%f\n", itr[1]); // 6.0 89 | * printf("%f\n", itr[6]); // 9.0 90 | * 91 | * ... 92 | * itr.UnbindTexture(); 93 | * 94 | * \endcode 95 | * 96 | * \tparam T The value type of this iterator 97 | * \tparam UNIQUE_ID A globally-unique identifier (within the compilation unit) to name the underlying texture reference 98 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 99 | */ 100 | template < 101 | typename T, 102 | int /*UNIQUE_ID*/, 103 | typename OffsetT = std::ptrdiff_t> 104 | using TexRefInputIterator CUB_DEPRECATED = cub::TexObjInputIterator; 105 | 106 | /** @} */ // end group UtilIterator 107 | 108 | CUB_NAMESPACE_END 109 | -------------------------------------------------------------------------------- /cub/thread/thread_search.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential search 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | #include 42 | 43 | CUB_NAMESPACE_BEGIN 44 | 45 | 46 | /** 47 | * Computes the begin offsets into A and B for the specific diagonal 48 | */ 49 | template < 50 | typename AIteratorT, 51 | typename BIteratorT, 52 | typename OffsetT, 53 | typename CoordinateT> 54 | __host__ __device__ __forceinline__ void MergePathSearch( 55 | OffsetT diagonal, 56 | AIteratorT a, 57 | BIteratorT b, 58 | OffsetT a_len, 59 | OffsetT b_len, 60 | CoordinateT& path_coordinate) 61 | { 62 | /// The value type of the input iterator 63 | using T = cub::detail::value_t; 64 | 65 | OffsetT split_min = CUB_MAX(diagonal - b_len, 0); 66 | OffsetT split_max = CUB_MIN(diagonal, a_len); 67 | 68 | while (split_min < split_max) 69 | { 70 | OffsetT split_pivot = (split_min + split_max) >> 1; 71 | if (a[split_pivot] <= b[diagonal - split_pivot - 1]) 72 | { 73 | // Move candidate split range up A, down B 74 | split_min = split_pivot + 1; 75 | } 76 | else 77 | { 78 | // Move candidate split range up B, down A 79 | split_max = split_pivot; 80 | } 81 | } 82 | 83 | path_coordinate.x = CUB_MIN(split_min, a_len); 84 | path_coordinate.y = diagonal - split_min; 85 | } 86 | 87 | 88 | 89 | /** 90 | * \brief Returns the offset of the first value within \p input which does not compare less than \p val 91 | */ 92 | template < 93 | typename InputIteratorT, 94 | typename OffsetT, 95 | typename T> 96 | __device__ __forceinline__ OffsetT LowerBound( 97 | InputIteratorT input, ///< [in] Input sequence 98 | OffsetT num_items, ///< [in] Input sequence length 99 | T val) ///< [in] Search key 100 | { 101 | OffsetT retval = 0; 102 | while (num_items > 0) 103 | { 104 | OffsetT half = num_items >> 1; 105 | if (input[retval + half] < val) 106 | { 107 | retval = retval + 
(half + 1); 108 | num_items = num_items - (half + 1); 109 | } 110 | else 111 | { 112 | num_items = half; 113 | } 114 | } 115 | 116 | return retval; 117 | } 118 | 119 | 120 | /** 121 | * \brief Returns the offset of the first value within \p input which compares greater than \p val 122 | */ 123 | template < 124 | typename InputIteratorT, 125 | typename OffsetT, 126 | typename T> 127 | __device__ __forceinline__ OffsetT UpperBound( 128 | InputIteratorT input, ///< [in] Input sequence 129 | OffsetT num_items, ///< [in] Input sequence length 130 | T val) ///< [in] Search key 131 | { 132 | OffsetT retval = 0; 133 | while (num_items > 0) 134 | { 135 | OffsetT half = num_items >> 1; 136 | if (val < input[retval + half]) 137 | { 138 | num_items = half; 139 | } 140 | else 141 | { 142 | retval = retval + (half + 1); 143 | num_items = num_items - (half + 1); 144 | } 145 | } 146 | 147 | return retval; 148 | } 149 | 150 | 151 | #if defined(__CUDA_FP16_TYPES_EXIST__) 152 | template < 153 | typename InputIteratorT, 154 | typename OffsetT> 155 | __device__ __forceinline__ OffsetT UpperBound( 156 | InputIteratorT input, ///< [in] Input sequence 157 | OffsetT num_items, ///< [in] Input sequence length 158 | __half val) ///< [in] Search key 159 | { 160 | OffsetT retval = 0; 161 | while (num_items > 0) 162 | { 163 | OffsetT half = num_items >> 1; 164 | 165 | bool lt; 166 | NV_IF_TARGET(NV_PROVIDES_SM_53, 167 | (lt = val < input[retval + half];), 168 | (lt = __half2float(val) < __half2float(input[retval + half]);)); 169 | 170 | if (lt) 171 | { 172 | num_items = half; 173 | } 174 | else 175 | { 176 | retval = retval + (half + 1); 177 | num_items = num_items - (half + 1); 178 | } 179 | } 180 | 181 | return retval; 182 | } 183 | #endif 184 | 185 | CUB_NAMESPACE_END 186 | -------------------------------------------------------------------------------- /cub/thread/thread_sort.cuh: -------------------------------------------------------------------------------- 1 | 
/****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | #pragma once 29 | 30 | #include "../config.cuh" 31 | #include "../util_ptx.cuh" 32 | #include "../util_type.cuh" 33 | 34 | CUB_NAMESPACE_BEGIN 35 | 36 | 37 | template 38 | __device__ __forceinline__ void Swap(T &lhs, T &rhs) 39 | { 40 | T temp = lhs; 41 | lhs = rhs; 42 | rhs = temp; 43 | } 44 | 45 | 46 | /** 47 | * @brief Sorts data using odd-even sort method 48 | * 49 | * The sorting method is stable. Further details can be found in: 50 | * A. Nico Habermann. Parallel neighbor sort (or the glory of the induction 51 | * principle). Technical Report AD-759 248, Carnegie Mellon University, 1972. 52 | * 53 | * @tparam KeyT 54 | * Key type 55 | * 56 | * @tparam ValueT 57 | * Value type. If `cub::NullType` is used as `ValueT`, only keys are sorted. 58 | * 59 | * @tparam CompareOp 60 | * functor type having member `bool operator()(KeyT lhs, KeyT rhs)` 61 | * 62 | * @tparam ITEMS_PER_THREAD 63 | * The number of items per thread 64 | * 65 | * @param[in,out] keys 66 | * Keys to sort 67 | * 68 | * @param[in,out] items 69 | * Values to sort 70 | * 71 | * @param[in] compare_op 72 | * Comparison function object which returns true if the first argument is 73 | * ordered before the second 74 | */ 75 | template 79 | __device__ __forceinline__ void 80 | StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], 81 | ValueT (&items)[ITEMS_PER_THREAD], 82 | CompareOp compare_op) 83 | { 84 | constexpr bool KEYS_ONLY = std::is_same::value; 85 | 86 | #pragma unroll 87 | for (int i = 0; i < ITEMS_PER_THREAD; ++i) 88 | { 89 | #pragma unroll 90 | for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) 91 | { 92 | if (compare_op(keys[j + 1], keys[j])) 93 | { 94 | Swap(keys[j], keys[j + 1]); 95 | if (!KEYS_ONLY) 96 | { 97 | Swap(items[j], items[j + 1]); 98 | } 99 | } 100 | } // inner loop 101 | } // outer loop 102 | } 103 | 104 | 105 | CUB_NAMESPACE_END 106 | 
-------------------------------------------------------------------------------- /cub/util_arch.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Static architectural properties by SM version. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | #include 39 | 40 | // Legacy include; this functionality used to be defined in here. 41 | #include 42 | 43 | CUB_NAMESPACE_BEGIN 44 | 45 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 46 | 47 | // \deprecated [Since 2.1.0] 48 | #define CUB_USE_COOPERATIVE_GROUPS 49 | 50 | /// In device code, CUB_PTX_ARCH expands to the PTX version for which we are 51 | /// compiling. In host code, CUB_PTX_ARCH's value is implementation defined. 52 | #ifndef CUB_PTX_ARCH 53 | #if defined(_NVHPC_CUDA) 54 | // __NVCOMPILER_CUDA_ARCH__ is the target PTX version, and is defined 55 | // when compiling both host code and device code. Currently, only one 56 | // PTX version can be targeted. 57 | #define CUB_PTX_ARCH __NVCOMPILER_CUDA_ARCH__ 58 | #elif !defined(__CUDA_ARCH__) 59 | #define CUB_PTX_ARCH 0 60 | #else 61 | #define CUB_PTX_ARCH __CUDA_ARCH__ 62 | #endif 63 | #endif 64 | 65 | // These definitions were intended for internal use only and are now obsolete. 66 | // If you relied on them, consider porting your code to use the functionality 67 | // in libcu++'s header. 68 | // For a temporary workaround, define CUB_PROVIDE_LEGACY_ARCH_MACROS to make 69 | // them available again. These should be considered deprecated and will be 70 | // fully removed in a future version. 
71 | #ifdef CUB_PROVIDE_LEGACY_ARCH_MACROS 72 | #ifndef CUB_IS_DEVICE_CODE 73 | #if defined(_NVHPC_CUDA) 74 | #define CUB_IS_DEVICE_CODE __builtin_is_device_code() 75 | #define CUB_IS_HOST_CODE (!__builtin_is_device_code()) 76 | #define CUB_INCLUDE_DEVICE_CODE 1 77 | #define CUB_INCLUDE_HOST_CODE 1 78 | #elif CUB_PTX_ARCH > 0 79 | #define CUB_IS_DEVICE_CODE 1 80 | #define CUB_IS_HOST_CODE 0 81 | #define CUB_INCLUDE_DEVICE_CODE 1 82 | #define CUB_INCLUDE_HOST_CODE 0 83 | #else 84 | #define CUB_IS_DEVICE_CODE 0 85 | #define CUB_IS_HOST_CODE 1 86 | #define CUB_INCLUDE_DEVICE_CODE 0 87 | #define CUB_INCLUDE_HOST_CODE 1 88 | #endif 89 | #endif 90 | #endif // CUB_PROVIDE_LEGACY_ARCH_MACROS 91 | 92 | /// Maximum number of devices supported. 93 | #ifndef CUB_MAX_DEVICES 94 | #define CUB_MAX_DEVICES (128) 95 | #endif 96 | 97 | static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0."); 98 | 99 | 100 | /// Number of threads per warp 101 | #ifndef CUB_LOG_WARP_THREADS 102 | #define CUB_LOG_WARP_THREADS(unused) (5) 103 | #define CUB_WARP_THREADS(unused) (1 << CUB_LOG_WARP_THREADS(0)) 104 | 105 | #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(0) 106 | #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(0) 107 | #endif 108 | 109 | 110 | /// Number of smem banks 111 | #ifndef CUB_LOG_SMEM_BANKS 112 | #define CUB_LOG_SMEM_BANKS(unused) (5) 113 | #define CUB_SMEM_BANKS(unused) (1 << CUB_LOG_SMEM_BANKS(0)) 114 | 115 | #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(0) 116 | #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS 117 | #endif 118 | 119 | 120 | /// Oversubscription factor 121 | #ifndef CUB_SUBSCRIPTION_FACTOR 122 | #define CUB_SUBSCRIPTION_FACTOR(unused) (5) 123 | #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(0) 124 | #endif 125 | 126 | 127 | /// Prefer padding overhead vs X-way conflicts greater than this threshold 128 | #ifndef CUB_PREFER_CONFLICT_OVER_PADDING 129 | #define CUB_PREFER_CONFLICT_OVER_PADDING(unused) (1) 130 | #define 
CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(0) 131 | #endif 132 | 133 | 134 | template < 135 | int NOMINAL_4B_BLOCK_THREADS, 136 | int NOMINAL_4B_ITEMS_PER_THREAD, 137 | typename T> 138 | struct RegBoundScaling 139 | { 140 | enum { 141 | ITEMS_PER_THREAD = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))), 142 | BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), 143 | }; 144 | }; 145 | 146 | 147 | template < 148 | int NOMINAL_4B_BLOCK_THREADS, 149 | int NOMINAL_4B_ITEMS_PER_THREAD, 150 | typename T> 151 | struct MemBoundScaling 152 | { 153 | enum { 154 | ITEMS_PER_THREAD = CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)), 155 | BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), 156 | }; 157 | }; 158 | 159 | 160 | 161 | 162 | #endif // Do not document 163 | 164 | CUB_NAMESPACE_END 165 | -------------------------------------------------------------------------------- /cub/util_compiler.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Detect compiler information. 
31 | */ 32 | 33 | #pragma once 34 | 35 | // enumerate host compilers we know about 36 | #define CUB_HOST_COMPILER_UNKNOWN 0 37 | #define CUB_HOST_COMPILER_MSVC 1 38 | #define CUB_HOST_COMPILER_GCC 2 39 | #define CUB_HOST_COMPILER_CLANG 3 40 | 41 | // enumerate device compilers we know about 42 | #define CUB_DEVICE_COMPILER_UNKNOWN 0 43 | #define CUB_DEVICE_COMPILER_MSVC 1 44 | #define CUB_DEVICE_COMPILER_GCC 2 45 | #define CUB_DEVICE_COMPILER_NVCC 3 46 | #define CUB_DEVICE_COMPILER_CLANG 4 47 | 48 | // figure out which host compiler we're using 49 | #if defined(_MSC_VER) 50 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC 51 | # define CUB_MSVC_VERSION _MSC_VER 52 | # define CUB_MSVC_VERSION_FULL _MSC_FULL_VER 53 | #elif defined(__clang__) 54 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG 55 | # define CUB_CLANG_VERSION \ 56 | (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) 57 | #elif defined(__GNUC__) 58 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_GCC 59 | # define CUB_GCC_VERSION \ 60 | (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) 61 | #else 62 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_UNKNOWN 63 | #endif // CUB_HOST_COMPILER 64 | 65 | // figure out which device compiler we're using 66 | #if defined(__CUDACC__) || defined(_NVHPC_CUDA) 67 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC 68 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC 69 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC 70 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC 71 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_GCC 72 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG 73 | // CUDA-capable clang should behave similar to NVCC. 
74 | # if defined(__CUDA__) 75 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC 76 | # else 77 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_CLANG 78 | # endif 79 | #else 80 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_UNKNOWN 81 | #endif 82 | -------------------------------------------------------------------------------- /cub/util_deprecated.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Define CUB_DEPRECATED macro. 31 | */ 32 | 33 | #pragma once 34 | 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | 42 | #if defined(THRUST_IGNORE_DEPRECATED_API) && !defined(CUB_IGNORE_DEPRECATED_API) 43 | # define CUB_IGNORE_DEPRECATED_API 44 | #endif 45 | 46 | #ifdef CUB_IGNORE_DEPRECATED_API 47 | # define CUB_DEPRECATED 48 | # define CUB_DEPRECATED_BECAUSE(MSG) 49 | #elif CUB_CPP_DIALECT >= 2014 50 | # define CUB_DEPRECATED [[deprecated]] 51 | # define CUB_DEPRECATED_BECAUSE(MSG) [[deprecated(MSG)]] 52 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC 53 | # define CUB_DEPRECATED __declspec(deprecated) 54 | # define CUB_DEPRECATED_BECAUSE(MSG) __declspec(deprecated(MSG)) 55 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG 56 | # define CUB_DEPRECATED __attribute__((deprecated)) 57 | # define CUB_DEPRECATED_BECAUSE(MSG) __attribute__((deprecated(MSG))) 58 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC 59 | # define CUB_DEPRECATED __attribute__((deprecated)) 60 | # define CUB_DEPRECATED_BECAUSE(MSG) __attribute__((deprecated(MSG))) 61 | #else 62 | # define CUB_DEPRECATED 63 | # define CUB_DEPRECATED_BECAUSE(MSG) 64 | #endif 65 | 66 | #define CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED \ 67 | CUB_DEPRECATED_BECAUSE( \ 68 | "CUB no longer accepts `debug_synchronous` parameter. 
" \ 69 | "Define CUB_DEBUG_SYNC instead, or silence this message with " \ 70 | "CUB_IGNORE_DEPRECATED_API.") 71 | 72 | #define CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG \ 73 | if (debug_synchronous) \ 74 | { \ 75 | _CubLog("%s\n", \ 76 | "CUB no longer accepts `debug_synchronous` parameter. " \ 77 | "Define CUB_DEBUG_SYNC instead."); \ 78 | } 79 | 80 | -------------------------------------------------------------------------------- /cub/util_macro.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Common C/C++ macro utilities 31 | ******************************************************************************/ 32 | 33 | #pragma once 34 | 35 | #include 36 | 37 | #include "util_namespace.cuh" 38 | 39 | CUB_NAMESPACE_BEGIN 40 | 41 | 42 | /** 43 | * \addtogroup UtilModule 44 | * @{ 45 | */ 46 | 47 | #ifndef CUB_ALIGN 48 | #if defined(_WIN32) || defined(_WIN64) 49 | /// Align struct 50 | #define CUB_ALIGN(bytes) __declspec(align(32)) 51 | #else 52 | /// Align struct 53 | #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) 54 | #endif 55 | #endif 56 | 57 | #define CUB_PREVENT_MACRO_SUBSTITUTION 58 | 59 | template 60 | constexpr __host__ __device__ auto min CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, 61 | U &&u) 62 | -> decltype(t < u ? ::cuda::std::forward(t) : ::cuda::std::forward(u)) 63 | { 64 | return t < u ? ::cuda::std::forward(t) : ::cuda::std::forward(u); 65 | } 66 | 67 | template 68 | constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, 69 | U &&u) 70 | -> decltype(t < u ? ::cuda::std::forward(u) : ::cuda::std::forward(t)) 71 | { 72 | return t < u ? ::cuda::std::forward(u) : ::cuda::std::forward(t); 73 | } 74 | 75 | #ifndef CUB_MAX 76 | /// Select maximum(a, b) 77 | #define CUB_MAX(a, b) (((b) > (a)) ? 
(b) : (a)) 78 | #endif 79 | 80 | #ifndef CUB_MIN 81 | /// Select minimum(a, b) 82 | #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a)) 83 | #endif 84 | 85 | #ifndef CUB_QUOTIENT_FLOOR 86 | /// Quotient of x/y rounded down to nearest integer 87 | #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) 88 | #endif 89 | 90 | #ifndef CUB_QUOTIENT_CEILING 91 | /// Quotient of x/y rounded up to nearest integer 92 | #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) 93 | #endif 94 | 95 | #ifndef CUB_ROUND_UP_NEAREST 96 | /// x rounded up to the nearest multiple of y 97 | #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) 98 | #endif 99 | 100 | #ifndef CUB_ROUND_DOWN_NEAREST 101 | /// x rounded down to the nearest multiple of y 102 | #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) 103 | #endif 104 | 105 | 106 | #ifndef CUB_STATIC_ASSERT 107 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 108 | #define CUB_CAT_(a, b) a ## b 109 | #define CUB_CAT(a, b) CUB_CAT_(a, b) 110 | #endif // DOXYGEN_SHOULD_SKIP_THIS 111 | 112 | /// Static assert 113 | #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] 114 | #endif 115 | 116 | /** @} */ // end group UtilModule 117 | 118 | CUB_NAMESPACE_END 119 | -------------------------------------------------------------------------------- /cub/util_math.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Define helper math functions. 31 | */ 32 | 33 | #pragma once 34 | 35 | #include 36 | 37 | #include "util_namespace.cuh" 38 | #include "util_macro.cuh" 39 | 40 | CUB_NAMESPACE_BEGIN 41 | 42 | namespace detail 43 | { 44 | 45 | template 46 | using is_integral_or_enum = 47 | std::integral_constant::value || std::is_enum::value>; 49 | 50 | __host__ __device__ __forceinline__ constexpr std::size_t 51 | VshmemSize(std::size_t max_shmem, 52 | std::size_t shmem_per_block, 53 | std::size_t num_blocks) 54 | { 55 | return shmem_per_block > max_shmem ? 
shmem_per_block * num_blocks : 0; 56 | } 57 | 58 | } 59 | 60 | /** 61 | * Divide n by d, round up if any remainder, and return the result. 62 | * 63 | * Effectively performs `(n + d - 1) / d`, but is robust against the case where 64 | * `(n + d - 1)` would overflow. 65 | */ 66 | template 67 | __host__ __device__ __forceinline__ constexpr NumeratorT 68 | DivideAndRoundUp(NumeratorT n, DenominatorT d) 69 | { 70 | static_assert(cub::detail::is_integral_or_enum::value && 71 | cub::detail::is_integral_or_enum::value, 72 | "DivideAndRoundUp is only intended for integral types."); 73 | 74 | // Static cast to undo integral promotion. 75 | return static_cast(n / d + (n % d != 0 ? 1 : 0)); 76 | } 77 | 78 | constexpr __device__ __host__ int 79 | Nominal4BItemsToItemsCombined(int nominal_4b_items_per_thread, int combined_bytes) 80 | { 81 | return (cub::min)(nominal_4b_items_per_thread, 82 | (cub::max)(1, 83 | nominal_4b_items_per_thread * 8 / 84 | combined_bytes)); 85 | } 86 | 87 | template 88 | constexpr __device__ __host__ int 89 | Nominal4BItemsToItems(int nominal_4b_items_per_thread) 90 | { 91 | return (cub::min)(nominal_4b_items_per_thread, 92 | (cub::max)(1, 93 | nominal_4b_items_per_thread * 4 / 94 | static_cast(sizeof(T)))); 95 | } 96 | 97 | template 98 | constexpr __device__ __host__ int 99 | Nominal8BItemsToItems(int nominal_8b_items_per_thread) 100 | { 101 | return sizeof(ItemT) <= 8u 102 | ? nominal_8b_items_per_thread 103 | : (cub::min)(nominal_8b_items_per_thread, 104 | (cub::max)(1, 105 | ((nominal_8b_items_per_thread * 8) + 106 | static_cast(sizeof(ItemT)) - 1) / 107 | static_cast(sizeof(ItemT)))); 108 | } 109 | 110 | /** 111 | * \brief Computes the midpoint of the integers 112 | * 113 | * Extra operation is performed in order to prevent overflow. 
114 | * 115 | * \return Half the sum of \p begin and \p end 116 | */ 117 | template 118 | constexpr __device__ __host__ T MidPoint(T begin, T end) 119 | { 120 | return begin + (end - begin) / 2; 121 | } 122 | 123 | CUB_NAMESPACE_END 124 | -------------------------------------------------------------------------------- /cub/version.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /*! \file version.cuh 29 | * \brief Compile-time macros encoding CUB release version 30 | * 31 | * <cub/version.cuh> is the only CUB header that is guaranteed to 32 | * change with every CUB release. 33 | * 34 | */ 35 | 36 | #pragma once 37 | 38 | /*! \def CUB_VERSION 39 | * \brief The preprocessor macro \p CUB_VERSION encodes the version 40 | * number of the CUB library. 41 | * 42 | * CUB_VERSION % 100 is the sub-minor version. 43 | * CUB_VERSION / 100 % 1000 is the minor version. 44 | * CUB_VERSION / 100000 is the major version. 45 | */ 46 | #define CUB_VERSION 200200 47 | 48 | /*! \def CUB_MAJOR_VERSION 49 | * \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the 50 | * major version number of the CUB library. 51 | */ 52 | #define CUB_MAJOR_VERSION (CUB_VERSION / 100000) 53 | 54 | /*! \def CUB_MINOR_VERSION 55 | * \brief The preprocessor macro \p CUB_MINOR_VERSION encodes the 56 | * minor version number of the CUB library. 57 | */ 58 | #define CUB_MINOR_VERSION (CUB_VERSION / 100 % 1000) 59 | 60 | /*! \def CUB_SUBMINOR_VERSION 61 | * \brief The preprocessor macro \p CUB_SUBMINOR_VERSION encodes the 62 | * sub-minor version number of the CUB library. 63 | */ 64 | #define CUB_SUBMINOR_VERSION (CUB_VERSION % 100) 65 | 66 | /*! 
\def CUB_PATCH_NUMBER 67 | * \brief The preprocessor macro \p CUB_PATCH_NUMBER encodes the 68 | * patch number of the CUB library. 69 | */ 70 | #define CUB_PATCH_NUMBER 0 71 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Create meta targets that build all examples for a single configuration: 2 | foreach(cub_target IN LISTS CUB_TARGETS) 3 | cub_get_target_property(config_prefix ${cub_target} PREFIX) 4 | set(config_meta_target ${config_prefix}.examples) 5 | add_custom_target(${config_meta_target}) 6 | add_dependencies(${config_prefix}.all ${config_meta_target}) 7 | endforeach() 8 | 9 | # Update flags to reflect RDC options. See note in CubCudaConfig.cmake -- 10 | # these flag variables behave unintuitively: 11 | if (CUB_ENABLE_EXAMPLES_WITH_RDC) 12 | set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_RDC}") 13 | else() 14 | set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_NO_RDC}") 15 | endif() 16 | 17 | ## cub_add_example 18 | # 19 | # Add an example executable and register it with ctest. 20 | # 21 | # target_name_var: Variable name to overwrite with the name of the example 22 | # target. Useful for post-processing target information per-backend. 23 | # example_name: The name of the example minus ".example." For 24 | # instance, examples/vector.cu will be "vector", and examples/cuda/copy.cu 25 | # would be "cuda.copy". 26 | # example_src: The source file that implements the example. 27 | # cub_target: The reference cub target with configuration information. 
#
function(cub_add_example target_name_var example_name example_src cub_target)
  cub_get_target_property(config_prefix ${cub_target} PREFIX)

  # The actual name of the example's target; also returned to the caller
  # through the variable named by target_name_var:
  set(example_target ${config_prefix}.example.${example_name})
  set(${target_name_var} ${example_target} PARENT_SCOPE)

  # Related target names:
  set(config_meta_target ${config_prefix}.examples)
  set(example_meta_target cub.all.example.${example_name})

  add_executable(${example_target} "${example_src}")
  target_link_libraries(${example_target} ${cub_target})
  cub_clone_target_properties(${example_target} ${cub_target})
  target_include_directories(${example_target} PRIVATE "${CUB_SOURCE_DIR}/examples")

  if (CUB_IN_THRUST)
    thrust_fix_clang_nvcc_build_for(${example_target})
  endif()

  # Add to the active configuration's meta target
  add_dependencies(${config_meta_target} ${example_target})

  # Meta target that builds examples with this name for all configurations:
  if (NOT TARGET ${example_meta_target})
    add_custom_target(${example_meta_target})
  endif()
  add_dependencies(${example_meta_target} ${example_target})

  if (CUB_ENABLE_EXAMPLES_WITH_RDC)
    cub_enable_rdc_for_cuda_target(${example_target})
  endif()

  # Register the example with ctest. The generator expression resolves to the
  # full path of the built executable.
  add_test(NAME ${example_target}
    COMMAND "$<TARGET_FILE:${example_target}>"
  )
endfunction()

add_subdirectory(cmake)
add_subdirectory(block)
add_subdirectory(device)
--------------------------------------------------------------------------------
/examples/block/.gitignore:
--------------------------------------------------------------------------------
/bin
/Debug
/Release
/cuda55.sdf
/cuda55.suo
/cuda60.sdf
/cuda60.suo
--------------------------------------------------------------------------------
/examples/block/CMakeLists.txt:
-------------------------------------------------------------------------------- 1 | file(GLOB_RECURSE example_srcs 2 | RELATIVE "${CMAKE_CURRENT_LIST_DIR}" 3 | CONFIGURE_DEPENDS 4 | example_*.cu 5 | ) 6 | 7 | foreach (cub_target IN LISTS CUB_TARGETS) 8 | foreach (example_src IN LISTS example_srcs) 9 | get_filename_component(example_name "${example_src}" NAME_WE) 10 | string(REGEX REPLACE 11 | "^example_block_" "block." 12 | example_name "${example_name}" 13 | ) 14 | cub_add_example(target_name ${example_name} "${example_src}" ${cub_target}) 15 | endforeach() 16 | endforeach() 17 | -------------------------------------------------------------------------------- /examples/cmake/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_test( 2 | NAME cub.example.cmake.add_subdir 3 | COMMAND "${CMAKE_COMMAND}" 4 | --log-level=VERBOSE 5 | -G "${CMAKE_GENERATOR}" 6 | -S "${CMAKE_CURRENT_SOURCE_DIR}/add_subdir" 7 | -B "${CMAKE_CURRENT_BINARY_DIR}/add_subdir" 8 | -D "CUB_ROOT=${CUB_SOURCE_DIR}" 9 | -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" 10 | -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" 11 | ) 12 | -------------------------------------------------------------------------------- /examples/cmake/add_subdir/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This example demonstrates / tests adding CUB via a CMake add_subdirectory 2 | # call from a parent project. 3 | 4 | cmake_minimum_required(VERSION 3.15) 5 | 6 | # Silence warnings about empty CUDA_ARCHITECTURES properties on example targets: 7 | if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) 8 | cmake_policy(SET CMP0104 OLD) 9 | endif() 10 | 11 | project(CubAddSubDirExample CUDA) 12 | 13 | # Use your project's checkout of CUB here, for most cases 14 | # `add_subdirectory(cub)` will be sufficient. 
15 | add_subdirectory("${CUB_ROOT}" cub) 16 | 17 | # Link the CUB::CUB target to your project's targets 18 | add_executable(HelloCUB dummy.cu) 19 | target_link_libraries(HelloCUB CUB::CUB) 20 | 21 | # 22 | # Validation 23 | # 24 | 25 | function(assert_target target_name) 26 | if (NOT TARGET "${target_name}") 27 | message(FATAL_ERROR "Target '${target_name}' not defined.") 28 | endif() 29 | endfunction() 30 | 31 | assert_target(CUB::CUB) 32 | assert_target(HelloCUB) 33 | -------------------------------------------------------------------------------- /examples/cmake/add_subdir/dummy.cu: -------------------------------------------------------------------------------- 1 | #include <cub/version.cuh> 2 | 3 | #include <iostream> 4 | // Print the CUB_VERSION macro provided by the CUB headers. 5 | int main() 6 | { 7 | std::cout << "Hello from CUB version " << CUB_VERSION << ":\n"; 8 | } 9 | -------------------------------------------------------------------------------- /examples/device/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | /Debug 3 | /ipch 4 | /Release 5 | /cuda55.sdf 6 | /cuda55.suo 7 | /cuda60.sdf 8 | /cuda60.suo 9 | -------------------------------------------------------------------------------- /examples/device/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | file(GLOB_RECURSE example_srcs 2 | RELATIVE "${CMAKE_CURRENT_LIST_DIR}" 3 | CONFIGURE_DEPENDS 4 | example_*.cu 5 | ) 6 | 7 | foreach (cub_target IN LISTS CUB_TARGETS) 8 | foreach (example_src IN LISTS example_srcs) 9 | get_filename_component(example_name "${example_src}" NAME_WE) 10 | string(REGEX REPLACE 11 | "^example_device_" "device."
12 | example_name "${example_name}" 13 | ) 14 | cub_add_example(target_name ${example_name} "${example_src}" ${cub_target}) 15 | endforeach() 16 | endforeach() 17 | -------------------------------------------------------------------------------- /examples/device/example_device_reduce.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Simple example of DeviceReduce::Sum(). 31 | * 32 | * Sums an array of int keys. 33 | * 34 | * To compile using the command line: 35 | * nvcc -arch=sm_XX example_device_reduce.cu -I../.. -lcudart -O3 36 | * 37 | ******************************************************************************/ 38 | 39 | // Ensure printing of CUDA runtime errors to console 40 | #define CUB_STDERR 41 | 42 | #include 43 | 44 | #include 45 | #include 46 | 47 | #include "../../test/test_util.h" 48 | 49 | using namespace cub; 50 | 51 | 52 | //--------------------------------------------------------------------- 53 | // Globals, constants and typedefs 54 | //--------------------------------------------------------------------- 55 | 56 | bool g_verbose = false; // Whether to display input/output to console 57 | CachingDeviceAllocator g_allocator(true); // Caching allocator for device memory 58 | 59 | 60 | //--------------------------------------------------------------------- 61 | // Test generation 62 | //--------------------------------------------------------------------- 63 | 64 | /** 65 | * Initialize problem 66 | */ 67 | void Initialize( 68 | int *h_in, 69 | int num_items) 70 | { 71 | for (int i = 0; i < num_items; ++i) 72 | h_in[i] = i; 73 | 74 | if (g_verbose) 75 | { 76 | 
printf("Input:\n"); 77 | DisplayResults(h_in, num_items); 78 | printf("\n\n"); 79 | } 80 | } 81 | 82 | 83 | /** 84 | * Compute solution 85 | */ 86 | void Solve( 87 | int *h_in, 88 | int &h_reference, 89 | int num_items) 90 | { 91 | for (int i = 0; i < num_items; ++i) 92 | { 93 | if (i == 0) 94 | h_reference = h_in[0]; 95 | else 96 | h_reference += h_in[i]; 97 | } 98 | } 99 | 100 | 101 | //--------------------------------------------------------------------- 102 | // Main 103 | //--------------------------------------------------------------------- 104 | 105 | /** 106 | * Main 107 | */ 108 | int main(int argc, char** argv) 109 | { 110 | int num_items = 150; 111 | 112 | // Initialize command line 113 | CommandLineArgs args(argc, argv); 114 | g_verbose = args.CheckCmdLineFlag("v"); 115 | args.GetCmdLineArgument("n", num_items); 116 | 117 | // Print usage 118 | if (args.CheckCmdLineFlag("help")) 119 | { 120 | printf("%s " 121 | "[--n= " 122 | "[--device=] " 123 | "[--v] " 124 | "\n", argv[0]); 125 | exit(0); 126 | } 127 | 128 | // Initialize device 129 | CubDebugExit(args.DeviceInit()); 130 | 131 | printf("cub::DeviceReduce::Sum() %d items (%d-byte elements)\n", 132 | num_items, (int) sizeof(int)); 133 | fflush(stdout); 134 | 135 | // Allocate host arrays 136 | int* h_in = new int[num_items]; 137 | int h_reference{}; 138 | 139 | // Initialize problem and solution 140 | Initialize(h_in, num_items); 141 | Solve(h_in, h_reference, num_items); 142 | 143 | // Allocate problem device arrays 144 | int *d_in = NULL; 145 | CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items)); 146 | 147 | // Initialize device input 148 | CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice)); 149 | 150 | // Allocate device output array 151 | int *d_out = NULL; 152 | CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * 1)); 153 | 154 | // Request and allocate temporary storage 155 | void *d_temp_storage = NULL; 156 | 
size_t temp_storage_bytes = 0; 157 | CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); 158 | CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); 159 | 160 | // Run 161 | CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items)); 162 | 163 | // Check for correctness (and display results, if specified) 164 | int compare = CompareDeviceResults(&h_reference, d_out, 1, g_verbose, g_verbose); 165 | printf("\t%s", compare ? "FAIL" : "PASS"); 166 | AssertEquals(0, compare); 167 | 168 | // Cleanup 169 | if (h_in) delete[] h_in; 170 | if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); 171 | if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); 172 | if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); 173 | 174 | printf("\n\n"); 175 | 176 | return 0; 177 | } 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /test/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | /link_main.obj 3 | /dummy/ 4 | -------------------------------------------------------------------------------- /test/README.md: -------------------------------------------------------------------------------- 1 | # Test Parametrization 2 | 3 | Some of CUB's tests are very slow to build and are capable of exhausting RAM 4 | during compilation/linking. To avoid such issues, large tests are split into 5 | multiple executables to take advantage of parallel computation and reduce memory 6 | usage. 7 | 8 | CUB facilitates this by checking for special `%PARAM%` comments in each test's 9 | source code, and then uses this information to generate multiple executables 10 | with different configurations. 11 | 12 | ## Using `%PARAM%` 13 | 14 | The `%PARAM%` hint provides an automated method of generating multiple test 15 | executables from a single source file. 
To use it, add one or more special 16 | comments to the test source file: 17 | 18 | ```cpp 19 | // %PARAM% [definition] [label] [values] 20 | ``` 21 | 22 | CMake will parse the source file and extract these comments, using them to 23 | generate multiple test executables for the full cartesian product of values. 24 | 25 | - `definition` will be used as a preprocessor definition name. By convention, 26 | these begin with `TEST_`. 27 | - `label` is a short, human-readable label that will be used in the test 28 | executable's name to identify the test variant. 29 | - `values` is a colon-separated list of values used during test generation. Only 30 | numeric values have been tested. 31 | 32 | ## Special Labels 33 | 34 | ### CDP / RDC Testing 35 | 36 | If a `label` is `cdp`, it is assumed that the parameter is used to explicitly 37 | test variants built with and without CDP support. The `values` for such a 38 | parameter must be `0:1`, with `0` indicating CDP disabled (RDC off) and `1` 39 | indicating CDP enabled (RDC on). 40 | 41 | Tests that do not contain a variant labeled `cdp` will only enable RDC if 42 | the CMake variable `CUB_ENABLE_TESTS_WITH_RDC` is true. 
43 | 44 | ## Example 45 | 46 | For example, if `test_baz.cu` contains the following lines: 47 | 48 | ```cpp 49 | // %PARAM% TEST_FOO foo 0:1:2 50 | // %PARAM% TEST_CDP cdp 0:1 51 | ``` 52 | 53 | Six executables and CTest targets will be generated with unique definitions 54 | (only c++17 targets shown): 55 | 56 | | Executable Name | Preprocessor Definitions | RDC State | 57 | |----------------------------------|-----------------------------|-----------| 58 | | `cub.cpp17.test.baz.foo_0.cdp_0` | `-DTEST_FOO=0 -DTEST_CDP=0` | Disabled | 59 | | `cub.cpp17.test.baz.foo_0.cdp_1` | `-DTEST_FOO=0 -DTEST_CDP=1` | Enabled | 60 | | `cub.cpp17.test.baz.foo_1.cdp_0` | `-DTEST_FOO=1 -DTEST_CDP=0` | Disabled | 61 | | `cub.cpp17.test.baz.foo_1.cdp_1` | `-DTEST_FOO=1 -DTEST_CDP=1` | Enabled | 62 | | `cub.cpp17.test.baz.foo_2.cdp_0` | `-DTEST_FOO=2 -DTEST_CDP=0` | Disabled | 63 | | `cub.cpp17.test.baz.foo_2.cdp_1` | `-DTEST_FOO=2 -DTEST_CDP=1` | Enabled | 64 | 65 | ## Changing `%PARAM%` Hints 66 | 67 | Since CMake does not automatically reconfigure the build when source files are 68 | modified, CMake will need to be rerun manually whenever the `%PARAM%` comments 69 | change. 70 | 71 | ## Building and Running Split Tests 72 | 73 | CMake will generate individual build and test targets for each test variant, and 74 | also provides build "metatargets" that compile all variants of a given test. 75 | 76 | The variants follow the usual naming convention for CUB's tests, but include a 77 | suffix that differentiates them (e.g. `.foo_X.cdp_Y` in the example above).
78 | 79 | ### Individual Test Variants 80 | 81 | Continuing with the `test_baz.cu` example, the test variant that uses 82 | `-DTEST_FOO=1 -DTEST_CDP=1` can be built and run alone: 83 | 84 | ```bash 85 | # Build a single variant: 86 | make cub.cpp17.test.baz.foo_1.cdp_1 87 | 88 | # Run a single variant: 89 | bin/cub.cpp17.test.baz.foo_1.cdp_1 90 | 91 | # Run a single variant using CTest regex: 92 | ctest -R cub\.cpp17\.test\.baz\.foo_1\.cdp_1 93 | ``` 94 | 95 | ### All Variants of a Test 96 | 97 | Using a metatarget and the proper regex, all variants of a test can be built and 98 | executed without listing all variants explicitly: 99 | 100 | ```bash 101 | # Build all variants using the `.all` metatarget 102 | make cub.cpp17.test.baz.all 103 | 104 | # Run all variants: 105 | ctest -R cub\.cpp17\.test\.baz\. 106 | ``` 107 | 108 | ## Debugging 109 | 110 | Running CMake with `--log-level=VERBOSE` will print out extra information about 111 | all detected test variants. 112 | 113 | ## Additional Info 114 | 115 | Ideally, only parameters that directly influence kernel template instantiations 116 | should be split out in this way. If changing a parameter doesn't change the 117 | kernel template type, the same kernel will be compiled into multiple 118 | executables. This defeats the purpose of splitting up the test since the 119 | compiler will generate redundant code across the new split executables. 120 | 121 | The best candidate parameters for splitting are input value types, rather than 122 | integral parameters like BLOCK_THREADS, etc. Splitting by value type allows more 123 | infrastructure (data generation, validation) to be reused. Splitting other 124 | parameters can cause build times to increase since type-related infrastructure 125 | has to be rebuilt for each test variant.
126 | -------------------------------------------------------------------------------- /test/c2h/custom_type.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | #pragma once 29 | 30 | #include 31 | #include 32 | #include 33 | 34 | #include 35 | 36 | namespace c2h 37 | { 38 | 39 | class custom_type_state_t 40 | { 41 | std::size_t m_key{}; 42 | std::size_t m_val{}; 43 | 44 | public: 45 | __host__ __device__ void set_key(std::size_t key) { m_key = key; } 46 | __host__ __device__ std::size_t get_key() const { return m_key; } 47 | __host__ __device__ void set_val(std::size_t val) { m_val = val; } 48 | __host__ __device__ std::size_t get_val() const { return m_val; } 49 | }; 50 | 51 | template class... Policies> 52 | class custom_type_t : public custom_type_state_t 53 | , public Policies>... 54 | { 55 | 56 | public: 57 | friend __host__ std::ostream &operator<<(std::ostream &os, 58 | const custom_type_t &self) 59 | { 60 | return os << "{ " << self.get_key() << ", " << self.get_val() << " }"; 61 | } 62 | 63 | }; 64 | 65 | template 66 | class less_comparable_t 67 | { 68 | // The CUDA compiler follows the IA64 ABI for class layout, while the 69 | // Microsoft host compiler does not. 70 | char workaround_msvc; 71 | 72 | public: 73 | __host__ __device__ bool operator<(const CustomType& other) const 74 | { 75 | return static_cast(*this).get_key() 76 | < other.get_key(); 77 | } 78 | }; 79 | 80 | template 81 | class lexicographical_less_comparable_t 82 | { 83 | // The CUDA compiler follows the IA64 ABI for class layout, while the 84 | // Microsoft host compiler does not. 
85 | char workaround_msvc; 86 | 87 | public: 88 | __host__ __device__ bool operator<(const CustomType& other) const 89 | { 90 | return static_cast<const CustomType&>(*this).get_key() < other.get_key() || 91 | (static_cast<const CustomType&>(*this).get_key() == other.get_key() && 92 | static_cast<const CustomType&>(*this).get_val() < other.get_val()); 93 | } 94 | }; 95 | // Policy mixin (expects to be a base of CustomType): operator== compares both key and value. 96 | template <class CustomType> 97 | class equal_comparable_t 98 | { 99 | // The CUDA compiler follows the IA64 ABI for class layout, while the 100 | // Microsoft host compiler does not. 101 | char workaround_msvc; 102 | 103 | public: 104 | __host__ __device__ bool operator==(const CustomType& other) const 105 | { 106 | const CustomType& self = static_cast<const CustomType&>(*this); 107 | 108 | return self.get_key() == other.get_key() && 109 | self.get_val() == other.get_val(); 110 | } 111 | }; 112 | // Policy mixin (expects to be a base of CustomType): operator- subtracts keys and values member-wise. 113 | template <class CustomType> 114 | class subtractable_t 115 | { 116 | // The CUDA compiler follows the IA64 ABI for class layout, while the 117 | // Microsoft host compiler does not. 118 | char workaround_msvc; 119 | 120 | public: 121 | __host__ __device__ CustomType operator-(const CustomType& other) const 122 | { 123 | CustomType result{}; 124 | 125 | const CustomType& self = static_cast<const CustomType&>(*this); 126 | 127 | result.set_key(self.get_key() - other.get_key()); 128 | result.set_val(self.get_val() - other.get_val()); 129 | 130 | return result; 131 | } 132 | }; 133 | // Policy mixin (expects to be a base of CustomType): operator+ adds keys and values member-wise. 134 | template <class CustomType> 135 | class accumulateable_t 136 | { 137 | // The CUDA compiler follows the IA64 ABI for class layout, while the 138 | // Microsoft host compiler does not. 139 | char workaround_msvc; 140 | 141 | public: 142 | __host__ __device__ CustomType operator+(const CustomType& other) const 143 | { 144 | CustomType result{}; 145 | 146 | const CustomType& self = static_cast<const CustomType&>(*this); 147 | 148 | result.set_key(self.get_key() + other.get_key()); 149 | result.set_val(self.get_val() + other.get_val()); 150 | 151 | return result; 152 | } 153 | }; 154 | 155 | } // c2h 156 | 157 | namespace std { 158 | template class...
Policies> 159 | class numeric_limits> 160 | { 161 | public: 162 | static c2h::custom_type_t max() 163 | { 164 | c2h::custom_type_t val; 165 | val.set_key(std::numeric_limits::max()); 166 | val.set_val(std::numeric_limits::max()); 167 | return val; 168 | } 169 | 170 | static c2h::custom_type_t lowest() 171 | { 172 | c2h::custom_type_t val; 173 | val.set_key(std::numeric_limits::lowest()); 174 | val.set_val(std::numeric_limits::lowest()); 175 | return val; 176 | } 177 | }; 178 | } 179 | 180 | -------------------------------------------------------------------------------- /test/c2h/generators.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #pragma once 29 | 30 | #include 31 | 32 | #include 33 | 34 | #include 35 | 36 | namespace c2h 37 | { 38 | 39 | namespace detail 40 | { 41 | 42 | template 43 | class value_wrapper_t 44 | { 45 | T m_val{}; 46 | 47 | public: 48 | explicit value_wrapper_t(T val) : m_val(val) {} 49 | explicit value_wrapper_t(int val) : m_val(static_cast(val)) {} 50 | T get() const { return m_val; } 51 | }; 52 | 53 | } 54 | 55 | class seed_t : public detail::value_wrapper_t 56 | { 57 | using value_wrapper_t::value_wrapper_t; 58 | }; 59 | 60 | class modulo_t : public detail::value_wrapper_t 61 | { 62 | using value_wrapper_t::value_wrapper_t; 63 | }; 64 | 65 | namespace detail 66 | { 67 | 68 | void gen(seed_t seed, 69 | char* data, 70 | c2h::custom_type_state_t min, 71 | c2h::custom_type_state_t max, 72 | std::size_t elements, 73 | std::size_t element_size); 74 | 75 | } 76 | 77 | template