├── .clang-format
├── .clang-tidy
├── .clangd
├── .git-blame-ignore-revs
├── .github
    └── workflows
    │   ├── mirror-main-branch-to-master-branch.yml
    │   └── push-to-legacy-repositories.yml
├── .gitignore
├── CHANGELOG.md
├── CMakeLists.txt
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE.TXT
├── README.md
├── cmake
    ├── AppendOptionIfAvailable.cmake
    ├── CPM.cmake
    ├── CubAddSubdir.cmake
    ├── CubBuildCompilerTargets.cmake
    ├── CubBuildTargetList.cmake
    ├── CubCompilerHacks.cmake
    ├── CubCudaConfig.cmake
    ├── CubHeaderTesting.cmake
    ├── CubInstallRules.cmake
    ├── CubUtilities.cmake
    └── header_test.in
├── cub
    ├── agent
    │   ├── agent_adjacent_difference.cuh
    │   ├── agent_batch_memcpy.cuh
    │   ├── agent_histogram.cuh
    │   ├── agent_merge_sort.cuh
    │   ├── agent_radix_sort_downsweep.cuh
    │   ├── agent_radix_sort_histogram.cuh
    │   ├── agent_radix_sort_onesweep.cuh
    │   ├── agent_radix_sort_upsweep.cuh
    │   ├── agent_reduce.cuh
    │   ├── agent_reduce_by_key.cuh
    │   ├── agent_rle.cuh
    │   ├── agent_scan.cuh
    │   ├── agent_scan_by_key.cuh
    │   ├── agent_segment_fixup.cuh
    │   ├── agent_segmented_radix_sort.cuh
    │   ├── agent_select_if.cuh
    │   ├── agent_spmv_orig.cuh
    │   ├── agent_sub_warp_merge_sort.cuh
    │   ├── agent_three_way_partition.cuh
    │   ├── agent_unique_by_key.cuh
    │   └── single_pass_scan_operators.cuh
    ├── block
    │   ├── block_adjacent_difference.cuh
    │   ├── block_discontinuity.cuh
    │   ├── block_exchange.cuh
    │   ├── block_histogram.cuh
    │   ├── block_load.cuh
    │   ├── block_merge_sort.cuh
    │   ├── block_radix_rank.cuh
    │   ├── block_radix_sort.cuh
    │   ├── block_raking_layout.cuh
    │   ├── block_reduce.cuh
    │   ├── block_run_length_decode.cuh
    │   ├── block_scan.cuh
    │   ├── block_shuffle.cuh
    │   ├── block_store.cuh
    │   ├── radix_rank_sort_operations.cuh
    │   └── specializations
    │   │   ├── block_histogram_atomic.cuh
    │   │   ├── block_histogram_sort.cuh
    │   │   ├── block_reduce_raking.cuh
    │   │   ├── block_reduce_raking_commutative_only.cuh
    │   │   ├── block_reduce_warp_reductions.cuh
    │   │   ├── block_scan_raking.cuh
    │   │   └── block_scan_warp_scans.cuh
    ├── cmake
    │   ├── cub-config-version.cmake
    │   ├── cub-config.cmake
    │   ├── cub-header-search.cmake
    │   └── cub-header-search.cmake.in
    ├── config.cuh
    ├── cub.cuh
    ├── detail
    │   ├── choose_offset.cuh
    │   ├── cpp_compatibility.cuh
    │   ├── detect_cuda_runtime.cuh
    │   ├── device_double_buffer.cuh
    │   ├── device_synchronize.cuh
    │   ├── exec_check_disable.cuh
    │   ├── strong_load.cuh
    │   ├── strong_store.cuh
    │   ├── temporary_storage.cuh
    │   ├── type_traits.cuh
    │   └── uninitialized_copy.cuh
    ├── device
    │   ├── device_adjacent_difference.cuh
    │   ├── device_copy.cuh
    │   ├── device_histogram.cuh
    │   ├── device_memcpy.cuh
    │   ├── device_merge_sort.cuh
    │   ├── device_partition.cuh
    │   ├── device_radix_sort.cuh
    │   ├── device_reduce.cuh
    │   ├── device_run_length_encode.cuh
    │   ├── device_scan.cuh
    │   ├── device_segmented_radix_sort.cuh
    │   ├── device_segmented_reduce.cuh
    │   ├── device_segmented_sort.cuh
    │   ├── device_select.cuh
    │   ├── device_spmv.cuh
    │   └── dispatch
    │   │   ├── dispatch_adjacent_difference.cuh
    │   │   ├── dispatch_batch_memcpy.cuh
    │   │   ├── dispatch_histogram.cuh
    │   │   ├── dispatch_merge_sort.cuh
    │   │   ├── dispatch_radix_sort.cuh
    │   │   ├── dispatch_reduce.cuh
    │   │   ├── dispatch_reduce_by_key.cuh
    │   │   ├── dispatch_rle.cuh
    │   │   ├── dispatch_scan.cuh
    │   │   ├── dispatch_scan_by_key.cuh
    │   │   ├── dispatch_segmented_sort.cuh
    │   │   ├── dispatch_select_if.cuh
    │   │   ├── dispatch_spmv_orig.cuh
    │   │   ├── dispatch_three_way_partition.cuh
    │   │   └── dispatch_unique_by_key.cuh
    ├── grid
    │   ├── grid_barrier.cuh
    │   ├── grid_even_share.cuh
    │   ├── grid_mapping.cuh
    │   └── grid_queue.cuh
    ├── host
    │   └── mutex.cuh
    ├── iterator
    │   ├── arg_index_input_iterator.cuh
    │   ├── cache_modified_input_iterator.cuh
    │   ├── cache_modified_output_iterator.cuh
    │   ├── constant_input_iterator.cuh
    │   ├── counting_input_iterator.cuh
    │   ├── discard_output_iterator.cuh
    │   ├── tex_obj_input_iterator.cuh
    │   ├── tex_ref_input_iterator.cuh
    │   └── transform_input_iterator.cuh
    ├── thread
    │   ├── thread_load.cuh
    │   ├── thread_operators.cuh
    │   ├── thread_reduce.cuh
    │   ├── thread_scan.cuh
    │   ├── thread_search.cuh
    │   ├── thread_sort.cuh
    │   └── thread_store.cuh
    ├── util_allocator.cuh
    ├── util_arch.cuh
    ├── util_compiler.cuh
    ├── util_cpp_dialect.cuh
    ├── util_debug.cuh
    ├── util_deprecated.cuh
    ├── util_device.cuh
    ├── util_macro.cuh
    ├── util_math.cuh
    ├── util_namespace.cuh
    ├── util_ptx.cuh
    ├── util_type.cuh
    ├── version.cuh
    └── warp
    │   ├── specializations
    │       ├── warp_reduce_shfl.cuh
    │       ├── warp_reduce_smem.cuh
    │       ├── warp_scan_shfl.cuh
    │       └── warp_scan_smem.cuh
    │   ├── warp_exchange.cuh
    │   ├── warp_load.cuh
    │   ├── warp_merge_sort.cuh
    │   ├── warp_reduce.cuh
    │   ├── warp_scan.cuh
    │   └── warp_store.cuh
├── docs
    ├── developer_overview.md
    └── test_overview.md
├── examples
    ├── CMakeLists.txt
    ├── block
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── example_block_radix_sort.cu
    │   ├── example_block_reduce.cu
    │   ├── example_block_reduce_dyn_smem.cu
    │   └── example_block_scan.cu
    ├── cmake
    │   ├── CMakeLists.txt
    │   └── add_subdir
    │   │   ├── CMakeLists.txt
    │   │   └── dummy.cu
    └── device
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── example_device_partition_flagged.cu
    │   ├── example_device_partition_if.cu
    │   ├── example_device_radix_sort.cu
    │   ├── example_device_reduce.cu
    │   ├── example_device_scan.cu
    │   ├── example_device_select_flagged.cu
    │   ├── example_device_select_if.cu
    │   ├── example_device_select_unique.cu
    │   └── example_device_sort_find_non_trivial_runs.cu
└── test
    ├── .gitignore
    ├── CMakeLists.txt
    ├── README.md
    ├── bfloat16.h
    ├── c2h
        ├── custom_type.cuh
        ├── generators.cu
        └── generators.cuh
    ├── catch2_runner.cu
    ├── catch2_test_block_adjacent_difference.cu
    ├── catch2_test_block_histogram.cu
    ├── catch2_test_block_load.cu
    ├── catch2_test_block_merge_sort.cu
    ├── catch2_test_block_radix_sort.cu
    ├── catch2_test_block_reduce.cu
    ├── catch2_test_block_run_length_decode.cu
    ├── catch2_test_block_scan.cu
    ├── catch2_test_block_shuffle.cu
    ├── catch2_test_block_store.cu
    ├── catch2_test_cdp_helper.h
    ├── catch2_test_cdp_wrapper.cu
    ├── catch2_test_helper.h
    ├── catch2_test_printing.cu
    ├── catch2_test_util_type.cu
    ├── catch2_test_warp_exchange.cu
    ├── catch2_test_warp_load.cu
    ├── catch2_test_warp_mask.cu
    ├── catch2_test_warp_merge_sort.cu
    ├── catch2_test_warp_reduce.cu
    ├── catch2_test_warp_scan.cu
    ├── catch2_test_warp_store.cu
    ├── cmake
        ├── CMakeLists.txt
        ├── check_source_files.cmake
        └── test_install
        │   └── CMakeLists.txt
    ├── fill_striped.cuh
    ├── half.h
    ├── link_a.cu
    ├── link_b.cu
    ├── link_main.cpp
    ├── mersenne.h
    ├── test_allocator.cu
    ├── test_block_radix_rank.cu
    ├── test_cdp_variant_state.cu
    ├── test_device_adjacent_difference.cu
    ├── test_device_batch_copy.cu
    ├── test_device_batch_memcpy.cu
    ├── test_device_histogram.cu
    ├── test_device_merge_sort.cu
    ├── test_device_radix_sort.cu
    ├── test_device_reduce.cu
    ├── test_device_reduce_by_key.cu
    ├── test_device_run_length_encode.cu
    ├── test_device_scan.cu
    ├── test_device_scan_by_key.cu
    ├── test_device_segmented_sort.cu
    ├── test_device_select_if.cu
    ├── test_device_select_unique.cu
    ├── test_device_select_unique_by_key.cu
    ├── test_device_spmv.cu
    ├── test_device_three_way_partition.cu
    ├── test_grid_barrier.cu
    ├── test_iterator.cu
    ├── test_iterator_deprecated.cu
    ├── test_namespace_wrapped.cu
    ├── test_temporary_storage_layout.cu
    ├── test_thread_operators.cu
    ├── test_thread_sort.cu
    ├── test_util.h
    └── test_util_vec.h


/.clang-format:
--------------------------------------------------------------------------------
 1 | BasedOnStyle: LLVM
 2 | AccessModifierOffset: -2
 3 | AlignAfterOpenBracket: Align
 4 | AlignConsecutiveAssignments: true
 5 | AlignEscapedNewlines: Right
 6 | AlignOperands: true
 7 | AllowAllArgumentsOnNextLine: false
 8 | AllowAllConstructorInitializersOnNextLine: false
 9 | AllowAllParametersOfDeclarationOnNextLine: false
10 | AllowShortBlocksOnASingleLine: false
11 | AllowShortCaseLabelsOnASingleLine: false
12 | AllowShortFunctionsOnASingleLine: All
13 | AllowShortIfStatementsOnASingleLine: Never
14 | AllowShortLambdasOnASingleLine: All
15 | AllowShortLoopsOnASingleLine: false
16 | AlwaysBreakAfterReturnType: None
17 | AlwaysBreakTemplateDeclarations: Yes
18 | BinPackArguments: false
19 | BinPackParameters: false
20 | BreakBeforeBraces: Custom
21 | BraceWrapping:
22 |   AfterCaseLabel: false
23 |   AfterClass: true
24 |   AfterControlStatement: true
25 |   AfterEnum: true
26 |   AfterFunction: true
27 |   AfterNamespace: true
28 |   AfterStruct: true
29 |   AfterUnion: true
30 |   BeforeCatch: true
31 |   BeforeElse: true
32 |   IndentBraces: false
33 |   SplitEmptyFunction: false
34 |   SplitEmptyRecord: false
35 | BreakBeforeBinaryOperators: None
36 | BreakBeforeTernaryOperators: true
37 | BreakConstructorInitializers: BeforeComma
38 | BreakInheritanceList: BeforeComma
39 | ColumnLimit: 100
40 | CompactNamespaces: false
41 | ContinuationIndentWidth: 2
42 | IncludeBlocks:   Regroup
43 | IncludeCategories:
44 |   - Regex:           '^<cub'
45 |     Priority:        1
46 |   - Regex:           '^<thrust'
47 |     Priority:        2
48 |   - Regex:           '^<cuda'
49 |     Priority:        3
50 |   - Regex:           '^<[a-z_]*>$'
51 |     Priority:        4
52 | IndentCaseLabels: true
53 | IndentPPDirectives: None
54 | IndentWidth: 2
55 | KeepEmptyLinesAtTheStartOfBlocks: true
56 | MaxEmptyLinesToKeep: 1
57 | NamespaceIndentation: None
58 | PenaltyBreakAssignment: 30
59 | PenaltyBreakBeforeFirstCallParameter: 50
60 | PenaltyBreakComment: 0
61 | PenaltyBreakFirstLessLess: 0
62 | PenaltyBreakString: 70
63 | PenaltyBreakTemplateDeclaration: 0
64 | PenaltyExcessCharacter: 100
65 | PenaltyReturnTypeOnItsOwnLine: 90
66 | PointerAlignment: Right
67 | ReflowComments: true
68 | SortIncludes: CaseInsensitive
69 | SpaceAfterCStyleCast: false
70 | SpaceAfterLogicalNot: false
71 | SpaceAfterTemplateKeyword: true
72 | SpaceBeforeAssignmentOperators: true
73 | SpaceBeforeCpp11BracedList: false
74 | SpaceBeforeCtorInitializerColon: true
75 | SpaceBeforeInheritanceColon: true
76 | SpaceBeforeParens: ControlStatements
77 | SpaceBeforeRangeBasedForLoopColon: true
78 | SpaceInEmptyParentheses: false
79 | SpacesBeforeTrailingComments: 1
80 | SpacesInAngles: false
81 | SpacesInCStyleCastParentheses: false
82 | SpacesInParentheses: false
83 | SpacesInSquareBrackets: false
84 | Standard: c++11
85 | TabWidth: 2
86 | UseTab: Never
87 | 


--------------------------------------------------------------------------------
/.clang-tidy:
--------------------------------------------------------------------------------
 1 | ---
 2 | Checks:
 3 |       'modernize-*,
 4 |        -modernize-use-equals-default,
 5 |        -modernize-concat-nested-namespaces,
 6 |        -modernize-use-trailing-return-type'
 7 | 
 8 |       # -modernize-use-equals-default        # auto-fix is broken (doesn't insert =default correctly)
 9 |       # -modernize-concat-nested-namespaces  # auto-fix is broken (can delete code)
10 |       # -modernize-use-trailing-return-type  # just a preference
11 | 
12 | WarningsAsErrors: ''
13 | HeaderFilterRegex: ''
14 | AnalyzeTemporaryDtors: false
15 | FormatStyle:     none
16 | CheckOptions:
17 |  - key:             modernize-loop-convert.MaxCopySize
18 |    value:           '16'
19 |  - key:             modernize-loop-convert.MinConfidence
20 |    value:           reasonable
21 |  - key:             modernize-pass-by-value.IncludeStyle
22 |    value:           llvm
23 |  - key:             modernize-replace-auto-ptr.IncludeStyle
24 |    value:           llvm
25 |  - key:             modernize-use-nullptr.NullMacros
26 |    value:           'NULL'
27 | ...
28 | 


--------------------------------------------------------------------------------
/.clangd:
--------------------------------------------------------------------------------
 1 | # https://clangd.llvm.org/config
 2 | 
 3 | # Apply a config conditionally to all C files
 4 | If:
 5 |   PathMatch: .*\.(c|h)$
 6 | 
 7 | ---
 8 | 
 9 | # Apply a config conditionally to all C++ files
10 | If:
11 |   PathMatch: .*\.(c|h)pp
12 | 
13 | ---
14 | 
15 | # Apply a config conditionally to all CUDA files
16 | If:
17 |   PathMatch: .*\.cuh?
18 | CompileFlags:
19 |   Add:
20 |     # Allow variadic CUDA functions
21 |     - "-Xclang=-fcuda-allow-variadic-functions"
22 | 
23 | ---
24 | 
25 | # Tweak the clangd parse settings for all files
26 | CompileFlags:
27 |   Compiler: clang++
28 |   CompilationDatabase: .
29 |   Add:
30 |     - -x
31 |     - cuda
32 |     # report all errors
33 |     - "-ferror-limit=0"
34 |     - "-ftemplate-backtrace-limit=0"
35 |     - "-stdlib=libc++"
36 |   Remove:
37 |     - -stdpar
38 |     # strip CUDA fatbin args
39 |     - "-Xfatbin*"
40 |     - "-gpu=*"
41 |     - "--diag_suppress*"
42 |     # strip CUDA arch flags
43 |     - "-gencode*"
44 |     - "--generate-code*"
45 |     # strip gcc's -fcoroutines
46 |     - -fcoroutines
47 |     # strip CUDA flags unknown to clang
48 |     - "-ccbin*"
49 |     - "--compiler-options*"
50 |     - "--expt-extended-lambda"
51 |     - "--expt-relaxed-constexpr"
52 |     - "-forward-unknown-to-host-compiler"
53 |     - "-Werror=cross-execution-space-call"
54 | Diagnostics:
55 |   Suppress:
56 |     - "variadic_device_fn"
57 |     - "attributes_not_allowed"
58 |     # The NVHPC version of _NVCXX_EXPAND_PACK macro triggers this clang error.
59 |     # Temporarily suppressing it, but should probably fix
60 |     - "template_param_shadow"
61 | 


--------------------------------------------------------------------------------
/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
 1 | # Exclude these commits from git-blame and similar tools.
 2 | #
 3 | # To use this file, run the following command from the repo root:
 4 | #
 5 | # ```
 6 | # $ git config blame.ignoreRevsFile .git-blame-ignore-revs
 7 | # ```
 8 | #
 9 | # Include a brief comment with each commit added, for example:
10 | #
11 | # ```
12 | # d92d9f8baac5ec48a8f8718dd69f415a45efe372 # Initial clang-format
13 | # ```
14 | #
15 | # Only add commits that are pure formatting changes (e.g.
16 | # clang-format version changes, etc).
17 | 


--------------------------------------------------------------------------------
/.github/workflows/mirror-main-branch-to-master-branch.yml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - "main"
 5 | 
 6 | jobs:
 7 |   mirror-main-branch-to-master-branch:
 8 |     name: Mirror main branch to master branch
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |     - name: Mirror main branch to master branch
12 |       id: mirror
13 |       uses: google/mirror-branch-action@v1.0
14 |       with:
15 |         source: "main"
16 |         dest: "master"
17 |         github-token: ${{ secrets.GITHUB_TOKEN }}
18 | 


--------------------------------------------------------------------------------
/.github/workflows/push-to-legacy-repositories.yml:
--------------------------------------------------------------------------------
 1 | on: push
 2 | 
 3 | jobs:
 4 |   push-to-legacy-repositories:
 5 |     name: Push to legacy repositories
 6 |     runs-on: ubuntu-latest
 7 |     steps:
 8 |     - name: Push `main` to github.com/nvlabs/cub
 9 |       uses: wei/git-sync@v2
10 |       if: github.repository == 'nvidia/cub'
11 |       with:
12 |         source_repo: "nvidia/cub"
13 |         source_branch: "main"
14 |         destination_repo: "nvlabs/cub"
15 |         destination_branch: "main"
16 |         ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }}
17 |     - name: Push all tags to github.com/nvlabs/cub
18 |       uses: wei/git-sync@v2
19 |       if: github.repository == 'nvidia/cub'
20 |       with:
21 |         source_repo: "nvidia/cub"
22 |         source_branch: "refs/tags/*"
23 |         destination_repo: "nvlabs/cub"
24 |         destination_branch: "refs/tags/*"
25 |         ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }}
26 |     - name: Push `main` to github.com/thrust/cub
27 |       uses: wei/git-sync@v2
28 |       if: github.repository == 'nvidia/cub'
29 |       with:
30 |         source_repo: "nvidia/cub"
31 |         source_branch: "main"
32 |         destination_repo: "thrust/cub"
33 |         destination_branch: "main"
34 |         ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }}
35 |     - name: Push all tags to github.com/thrust/cub
36 |       uses: wei/git-sync@v2
37 |       if: github.repository == 'nvidia/cub'
38 |       with:
39 |         source_repo: "nvidia/cub"
40 |         source_branch: "refs/tags/*"
41 |         destination_repo: "thrust/cub"
42 |         destination_branch: "refs/tags/*"
43 |         ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }}
44 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .p4config
2 | *~
3 | \#*
4 | /build
5 | .cache
6 | .vscode
7 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | # 3.15 is the minimum.
  2 | # 3.17 for NVC++.
  3 | # 3.18.3 for C++17 + CUDA.
  4 | cmake_minimum_required(VERSION 3.15)
  5 | 
  6 | # Remove this when we use the new CUDA_ARCHITECTURES properties.
  7 | if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
  8 |   cmake_policy(SET CMP0104 OLD)
  9 | endif()
 10 | 
 11 | # No languages are enabled here; CXX is enabled below, after the compiler hacks.
 12 | project(CUB NONE)
 13 | 
 14 | # Determine whether CUB is the top-level project or included into
 15 | # another project via add_subdirectory().
 16 | if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}")
 17 |   set(CUB_TOPLEVEL_PROJECT ON)
 18 | else()
 19 |   set(CUB_TOPLEVEL_PROJECT OFF)
 20 | endif()
 21 | 
 22 | # This must be done before any languages are enabled:
 23 | if (CUB_TOPLEVEL_PROJECT)
 24 |   include(cmake/CubCompilerHacks.cmake)
 25 | endif()
 26 | 
 27 | # This must appear after our Compiler Hacks or else CMake will delete the cache
 28 | # and reconfigure from scratch.
 29 | # This must also appear before the installation rules, as it is required by the
 30 | # GNUInstallDirs CMake module.
 31 | enable_language(CXX)
 32 | 
 33 | # Thrust has its own copy of CUB install rules to handle packaging usecases
 34 | # where we want to install CUB headers but aren't actually building anything.
 35 | # In these cases the add_subdirectory(dependencies/cub) line in Thrust won't get
 36 | # called so we can't rely on CUB providing its own rules.
 37 | if (NOT CUB_IN_THRUST)
 38 |   option(CUB_ENABLE_INSTALL_RULES "Enable installation of CUB" ${CUB_TOPLEVEL_PROJECT})
 39 |   if (CUB_ENABLE_INSTALL_RULES)
 40 |     include(cmake/CubInstallRules.cmake)
 41 |   endif()
 42 | endif()
 43 | 
 44 | # Support adding CUB to a parent project via add_subdirectory.
 45 | # See examples/cmake/add_subdir/CMakeLists.txt for details.
 46 | if (NOT CUB_TOPLEVEL_PROJECT AND NOT CUB_IN_THRUST)
 47 |   include(cmake/CubAddSubdir.cmake)
 48 |   return()
 49 | endif()
 50 | 
 51 | option(CUB_ENABLE_HEADER_TESTING "Test that all public headers compile." ON)
 52 | option(CUB_ENABLE_TESTING "Build CUB testing suite." ON)
 53 | option(CUB_ENABLE_EXAMPLES "Build CUB examples." ON)
 54 | 
 55 | # This is needed for NVCXX QA, which requires a static set of executable names.
 56 | # Only a single dialect may be enabled when this is off.
 57 | option(CUB_ENABLE_CPP_DIALECT_IN_NAMES
 58 |   "Include C++ dialect information in target/object/etc names."
 59 |   ON
 60 | )
 61 | mark_as_advanced(CUB_ENABLE_CPP_DIALECT_IN_NAMES)
 62 | 
 63 | # This option is only used when CUB is built stand-alone; otherwise the Thrust
 64 | # option has the same effect.
 65 | if (NOT CUB_IN_THRUST)
 66 |   option(CUB_IGNORE_DEPRECATED_API
 67 |     "Suppress warnings about deprecated Thrust/CUB API."
 68 |     OFF
 69 |   )
 70 | endif()
 71 | 
 72 | # Check if we're actually building anything before continuing. If not, no need
 73 | # to search for deps, etc. This is a common approach for packagers that just
 74 | # need the install rules. See GH issue NVIDIA/thrust#1211.
 75 | if (NOT (CUB_ENABLE_HEADER_TESTING OR
 76 |          CUB_ENABLE_TESTING OR
 77 |          CUB_ENABLE_EXAMPLES))
 78 |   return()
 79 | endif()
 80 | 
 81 | include(cmake/AppendOptionIfAvailable.cmake)
 82 | include(cmake/CubBuildCompilerTargets.cmake)
 83 | include(cmake/CubBuildTargetList.cmake)
 84 | include(cmake/CubCudaConfig.cmake)
 85 | include(cmake/CubUtilities.cmake)
 86 | 
 87 | if ("" STREQUAL "${CMAKE_BUILD_TYPE}")
 88 |   set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build." FORCE)
 89 | 
 90 |   set_property(
 91 |     CACHE CMAKE_BUILD_TYPE
 92 |     PROPERTY STRINGS Debug Release RelWithDebInfo MinSizeRel
 93 |   )
 94 | endif ()
 95 | 
 96 | set(CMAKE_CXX_EXTENSIONS OFF)
 97 | 
 98 | # Where to put build outputs. Use CMAKE_BINARY_DIR so they'll show up alongside
 99 | # Thrust targets when building as part of Thrust.
100 | set(CUB_LIBRARY_OUTPUT_DIR "${CMAKE_BINARY_DIR}/lib")
101 | set(CUB_EXECUTABLE_OUTPUT_DIR "${CMAKE_BINARY_DIR}/bin")
102 | 
103 | cub_build_target_list()
104 | 
105 | if (CUB_ENABLE_HEADER_TESTING)
106 |   include(cmake/CubHeaderTesting.cmake)
107 | endif()
108 | 
109 | # Both testing and examples use ctest
110 | if (CUB_ENABLE_TESTING OR CUB_ENABLE_EXAMPLES)
111 |   include(CTest)
112 |   enable_testing()
113 | endif()
114 | 
115 | if (CUB_ENABLE_TESTING)
116 |   add_subdirectory(test)
117 | endif()
118 | 
119 | if (CUB_ENABLE_EXAMPLES)
120 |   add_subdirectory(examples)
121 | endif()
122 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # Code of Conduct
 3 | 
 4 | ## Overview
 5 | 
 6 | This document defines the Code of Conduct followed and enforced for NVIDIA C++
 7 |   Core Compute Libraries.
 8 | 
 9 | ### Intended Audience
10 | 
11 | * Community
12 | * Developers
13 | * Project Leads
14 | 
15 | ## Our Pledge
16 | 
17 | In the interest of fostering an open and welcoming environment, we as
18 |   contributors and maintainers pledge to making participation in our project and
19 |   our community a harassment-free experience for everyone, regardless of age,
20 |   body size, disability, ethnicity, sex characteristics, gender identity and
21 |   expression, level of experience, education, socio-economic status, nationality,
22 |   personal appearance, race, religion, or sexual identity and orientation.
23 | 
24 | ## Our Standards
25 | 
26 | Examples of behavior that contributes to creating a positive environment include:
27 | 
28 | - Using welcoming and inclusive language.
29 | - Being respectful of differing viewpoints and experiences.
30 | - Gracefully accepting constructive criticism.
31 | - Focusing on what is best for the community.
32 | - Showing empathy towards other community members.
33 | 
34 | Examples of unacceptable behavior by participants include:
35 | 
36 | - The use of sexualized language or imagery and unwelcome sexual attention or
37 |     advances.
38 | - Trolling, insulting/derogatory comments, and personal or political attacks.
39 | - Public or private harassment.
40 | - Publishing others’ private information, such as a physical or electronic
41 |     address, without explicit permission.
42 | - Other conduct which could reasonably be considered inappropriate.
43 | 
44 | ## Our Responsibilities
45 | 
46 | Project maintainers are responsible for clarifying the standards of acceptable
47 |   behavior and are expected to take appropriate and fair corrective action in
48 |   response to any instances of unacceptable behavior.
49 | 
50 | Project maintainers have the right and responsibility to remove, edit, or
51 |   reject comments, commits, code, wiki edits, issues, and other contributions
52 |   that are not aligned to this Code of Conduct, or to ban temporarily or
53 |   permanently any contributor for other behaviors that they deem inappropriate,
54 |   threatening, offensive, or harmful.
55 | 
56 | ## Scope
57 | 
58 | This Code of Conduct applies both within project spaces and in public spaces
59 |   when an individual is representing the project or its community.
60 | Examples of representing a project or community include using an official
61 |   project email address, posting via an official social media account, or acting
62 |   as an appointed representative at an online or offline event.
63 | Representation of a project may be further defined and clarified by project
64 |   maintainers.
65 | 
66 | ## Enforcement
67 | 
68 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
69 |   reported by contacting [cpp-conduct@nvidia.com](mailto:cpp-conduct@nvidia.com).
70 | All complaints will be reviewed and investigated and will result in a response
71 |   that is deemed necessary and appropriate to the circumstances.
72 | The project team is obligated to maintain confidentiality with regard to the
73 |   reporter of an incident.
74 | Further details of specific enforcement policies may be posted separately.
75 | 
76 | Project maintainers who do not follow or enforce the Code of Conduct in good
77 |   faith may face temporary or permanent repercussions as determined by other
78 |   members of the project’s leadership.
79 | 
80 | ## Attribution
81 | 
82 | This Code of Conduct was taken from the [NVIDIA RAPIDS] project, which was
83 |   adapted from the [Contributor Covenant version 1.4].
84 | 
85 | Please see this [FAQ] for answers to common questions about this Code of Conduct.
86 | 
87 | ## Contact
88 | 
89 | Please email [cpp-conduct@nvidia.com] for any Code of Conduct related matters.
90 | 
91 | 
92 | [cpp-conduct@nvidia.com]: mailto:cpp-conduct@nvidia.com
93 | 
94 | [FAQ]: https://www.contributor-covenant.org/faq
95 | 
96 | [NVIDIA RAPIDS]: https://docs.rapids.ai/resources/conduct/
97 | [Contributor Covenant version 1.4]: https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
98 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Table of Contents
 2 | 
 3 | 1. [Contributing to CUB](#contributing-to-cub)
 4 | 1. [CMake Options](#cmake-options)
 5 | 1. [Development Model](#development-model)
 6 | 
 7 | # Contributing to CUB
 8 | 
 9 | CUB uses GitHub to manage all open-source development, including bug tracking,
10 | pull requests, and design discussions. CUB is tightly coupled to the Thrust
11 | project, and a compatible version of Thrust is required when working on the
12 | development version of CUB.
13 | 
14 | To set up a CUB development branch, it is recommended to recursively clone the
15 | Thrust repository and use the CUB submodule at `dependencies/cub` to stage
16 | changes. CUB's tests and examples can be built by configuring Thrust with the
17 | CMake option `THRUST_INCLUDE_CUB_CMAKE=ON`.
18 | 
19 | This process is described in more detail in Thrust's
20 | [CONTRIBUTING.md](https://nvidia.github.io/thrust/contributing.html).
21 | 
22 | The CMake options in the following section may be used to customize CUB's build
23 | process. Note that some of these are controlled by Thrust for compatibility and
24 | may not have an effect when building CUB through the Thrust build system. This
25 | is pointed out in the documentation below where applicable.
26 | 
27 | # CMake Options
28 | 
29 | A CUB build is configured using CMake options. These may be passed to CMake
30 | using
31 | 
32 | ```
33 | cmake -D<option_name>=<value> [Thrust or CUB project source root]
34 | ```
35 | 
36 | or configured interactively with the `ccmake` or `cmake-gui` interfaces.
37 | 
38 | The configuration options for CUB are:
39 | 
40 | - `CMAKE_BUILD_TYPE={Release, Debug, RelWithDebInfo, MinSizeRel}`
41 |   - Standard CMake build option. Default: `RelWithDebInfo`
42 | - `CUB_ENABLE_HEADER_TESTING={ON, OFF}`
43 |   - Whether to test compile public headers. Default is `ON`.
44 | - `CUB_ENABLE_TESTING={ON, OFF}`
45 |   - Whether to build unit tests. Default is `ON`.
46 | - `CUB_ENABLE_EXAMPLES={ON, OFF}`
47 |   - Whether to build examples. Default is `ON`.
48 | - `CUB_ENABLE_DIALECT_CPPXX={ON, OFF}`
49 |   - Setting this has no effect when building CUB as a component of Thrust.
50 |     See Thrust's dialect options, which CUB will inherit.
51 |   - Toggle whether a specific C++ dialect will be targeted.
52 |   - Multiple dialects may be targeted in a single build.
53 |   - Possible values of `XX` are `{11, 14, 17}`.
54 |   - By default, only C++14 is enabled.
55 | - `CUB_ENABLE_COMPUTE_XX={ON, OFF}`
56 |   - Setting this has no effect when building CUB as a component of Thrust.
57 |     See Thrust's architecture options, which CUB will inherit.
58 |   - Controls the targeted CUDA architecture(s).
59 |   - Multiple options may be selected when using NVCC as the CUDA compiler.
60 |   - Valid values of `XX` are:
61 |     `{35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80}`
62 |   - Default value depends on `CUB_DISABLE_ARCH_BY_DEFAULT` (described below).
63 | - `CUB_ENABLE_COMPUTE_FUTURE={ON, OFF}`
64 |   - Setting this has no effect when building CUB as a component of Thrust.
65 |     See Thrust's architecture options, which CUB will inherit.
66 |   - If enabled, CUDA objects will target the most recent virtual architecture
67 |     in addition to the real architectures specified by the
68 |     `CUB_ENABLE_COMPUTE_XX` options.
69 |   - Default value depends on `CUB_DISABLE_ARCH_BY_DEFAULT` (described below).
70 | - `CUB_DISABLE_ARCH_BY_DEFAULT={ON, OFF}`
71 |   - Setting this has no effect when building CUB as a component of Thrust.
72 |     See Thrust's architecture options, which CUB will inherit.
73 |   - When `ON`, all `CUB_ENABLE_COMPUTE_*` options are initially `OFF`.
74 |   - Default: `OFF` (meaning all architectures are enabled by default)
75 | - `CUB_ENABLE_TESTS_WITH_RDC={ON, OFF}`
76 |   - Whether to enable Relocatable Device Code when building tests.
77 |     Default is `OFF`.
78 | - `CUB_ENABLE_EXAMPLES_WITH_RDC={ON, OFF}`
79 |   - Whether to enable Relocatable Device Code when building examples.
80 |     Default is `OFF`.
81 | - `CUB_ENABLE_INSTALL_RULES={ON, OFF}`
82 |   - Setting this has no effect when building CUB as a component of Thrust.
83 |     See Thrust's `THRUST_INSTALL_CUB_HEADERS` option, which controls this
84 |     behavior.
85 |   - If true, installation rules will be generated for CUB. Default is `ON` when
86 |     building CUB alone, and `OFF` when CUB is a subproject added via CMake's
87 |     `add_subdirectory`.
88 | 
89 | # Development Model
90 | 
91 | CUB follows the same development model as Thrust, described
92 | [here](https://nvidia.github.io/thrust/releases/versioning.html).
93 | 


--------------------------------------------------------------------------------
/LICENSE.TXT:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
 2 | Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 |    *  Redistributions of source code must retain the above copyright
 7 |       notice, this list of conditions and the following disclaimer.
 8 |    *  Redistributions in binary form must reproduce the above copyright
 9 |       notice, this list of conditions and the following disclaimer in the
10 |       documentation and/or other materials provided with the distribution.
11 |    *  Neither the name of the NVIDIA CORPORATION nor the
12 |       names of its contributors may be used to endorse or promote products
13 |       derived from this software without specific prior written permission.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | 


--------------------------------------------------------------------------------
/cmake/AppendOptionIfAvailable.cmake:
--------------------------------------------------------------------------------
 1 | include_guard(GLOBAL)
 2 | include(CheckCXXCompilerFlag)
 3 | 
 4 | macro (APPEND_OPTION_IF_AVAILABLE _FLAG _LIST)
 5 | # Appends _FLAG to the list variable named by _LIST if check_cxx_compiler_flag accepts it.
 6 | string(MAKE_C_IDENTIFIER "CXX_FLAG_${_FLAG}" _VAR)
 7 | check_cxx_compiler_flag(${_FLAG} ${_VAR})
 8 | # _VAR holds the *name* of the per-flag result variable (CXX_FLAG_<flag> as a C identifier).
 9 | if (${${_VAR}})
10 |   list(APPEND ${_LIST} ${_FLAG})
11 | endif ()
12 | # NOTE: this is a macro, so _VAR stays set in the caller's scope afterwards.
13 | endmacro ()
14 | 


--------------------------------------------------------------------------------
/cmake/CubAddSubdir.cmake:
--------------------------------------------------------------------------------
1 | find_package(CUB REQUIRED CONFIG
2 |   NO_DEFAULT_PATH # Only check the explicit path in HINTS:
3 |   HINTS "${CMAKE_CURRENT_LIST_DIR}/.."
4 | )
5 | 


--------------------------------------------------------------------------------
/cmake/CubBuildCompilerTargets.cmake:
--------------------------------------------------------------------------------
  1 | #
  2 | # This file defines the `cub_build_compiler_targets()` function, which
  3 | # creates the following interface targets:
  4 | #
  5 | # cub.compiler_interface
  6 | # - Interface target providing compiler-specific options needed to build
  7 | #   CUB's tests, examples, etc.
  8 | 
  9 | function(cub_build_compiler_targets)
 10 |   set(cxx_compile_definitions)
 11 |   set(cxx_compile_options)
 12 |   set(cuda_compile_options)
 13 |   # Options are collected per language here and attached to cub.compiler_interface below.
 14 |   if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
 15 |     append_option_if_available("/W4" cxx_compile_options)
 16 | 
 17 |     append_option_if_available("/WX" cxx_compile_options)
 18 | 
 19 |     # Suppress overly-pedantic/unavoidable warnings brought in with /W4:
 20 |     # C4324: structure was padded due to alignment specifier
 21 |     append_option_if_available("/wd4324" cxx_compile_options)
 22 |     # C4127: conditional expression is constant
 23 |     # This can be fixed with `if constexpr` when available, but there's no way
 24 |     # to silence these pre-C++17.
 25 |     # TODO We should have per-dialect interface targets so we can leave these
 26 |     # warnings enabled on C++17:
 27 |     append_option_if_available("/wd4127" cxx_compile_options)
 28 |     # C4505: unreferenced local function has been removed
 29 |     # The CUDA `host_runtime.h` header emits this for
 30 |     # `__cudaUnregisterBinaryUtil`.
 31 |     append_option_if_available("/wd4505" cxx_compile_options)
 32 |     # C4706: assignment within conditional expression
 33 |     # MSVC doesn't provide an opt-out for this warning when the assignment is
 34 |     # intentional. Clang will warn for these, but suppresses the warning when
 35 |     # double-parentheses are used around the assignment. We'll let Clang catch
 36 |     # unintentional assignments and suppress all such warnings on MSVC.
 37 |     append_option_if_available("/wd4706" cxx_compile_options)
 38 | 
 39 |     # Some tests require /bigobj to fit everything into their object files:
 40 |     append_option_if_available("/bigobj" cxx_compile_options)
 41 |   else()
 42 |     append_option_if_available("-Wreorder" cuda_compile_options)
 43 |     # (-Wreorder is in the CUDA-only list, so NVCC receives it directly, not via -Xcompiler.)
 44 |     append_option_if_available("-Werror" cxx_compile_options)
 45 |     append_option_if_available("-Wall" cxx_compile_options)
 46 |     append_option_if_available("-Wextra" cxx_compile_options)
 47 |     append_option_if_available("-Winit-self" cxx_compile_options)
 48 |     append_option_if_available("-Woverloaded-virtual" cxx_compile_options)
 49 |     append_option_if_available("-Wcast-qual" cxx_compile_options)
 50 |     append_option_if_available("-Wpointer-arith" cxx_compile_options)
 51 |     append_option_if_available("-Wunused-local-typedef" cxx_compile_options)
 52 |     append_option_if_available("-Wvla" cxx_compile_options)
 53 | 
 54 |     # Diagnose use of GNU extensions (flag is clang only)
 55 |     append_option_if_available("-Wgnu" cxx_compile_options)
 56 |     # Calling a variadic macro with zero args is a GNU extension until C++20,
 57 |     # but the THRUST_PP_ARITY macro is used with zero args. Need to see if this
 58 |     # is a real problem worth fixing.
 59 |     append_option_if_available("-Wno-gnu-zero-variadic-macro-arguments" cxx_compile_options)
 60 | 
 61 |     # This complains about functions in CUDA system headers when used with nvcc.
 62 |     append_option_if_available("-Wno-unused-function" cxx_compile_options)
 63 |   endif()
 64 | 
 65 |   if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
 66 |     if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3)
 67 |       # GCC 7.3 complains about name mangling changes due to `noexcept`
 68 |       # becoming part of the type system; we don't care.
 69 |       append_option_if_available("-Wno-noexcept-type" cxx_compile_options)
 70 |     endif()
 71 |   endif()
 72 | 
 73 |   if ("Intel" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
 74 |     # Disable warning that inlining is inhibited by compiler thresholds.
 75 |     append_option_if_available("-diag-disable=11074" cxx_compile_options)
 76 |     append_option_if_available("-diag-disable=11076" cxx_compile_options)
 77 |   endif()
 78 | 
 79 |   if ("Clang" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
 80 |     option(CUB_ENABLE_CT_PROFILING "Enable compilation time profiling" OFF)
 81 |     if (CUB_ENABLE_CT_PROFILING)
 82 |       append_option_if_available("-ftime-trace" cxx_compile_options)
 83 |     endif()
 84 |   endif()
 85 | 
 86 |   if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
 87 |     list(APPEND cxx_compile_options -Mnodaz)
 88 |     # TODO: Managed memory is currently not supported on windows with WSL
 89 |     list(APPEND cxx_compile_options -gpu=nomanaged)
 90 |   endif()
 91 | 
 92 |   add_library(cub.compiler_interface INTERFACE)
 93 | 
 94 |   foreach (cxx_option IN LISTS cxx_compile_options)
 95 |     target_compile_options(cub.compiler_interface INTERFACE
 96 |       $<$<COMPILE_LANGUAGE:CXX>:${cxx_option}>
 97 |       $<$<COMPILE_LANG_AND_ID:CUDA,NVCXX>:${cxx_option}>
 98 |       # Only use -Xcompiler with NVCC, not NVC++.
 99 |       #
100 |       # CMake can't split genexs, so this can't be formatted better :(
101 |       # This is:
102 |       # if (using CUDA and CUDA_COMPILER is NVCC) add -Xcompiler=opt:
103 |       $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Xcompiler=${cxx_option}>
104 |     )
105 |   endforeach()
106 | 
107 |   foreach (cuda_option IN LISTS cuda_compile_options)
108 |     target_compile_options(cub.compiler_interface INTERFACE
109 |       $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:${cuda_option}>
110 |     )
111 |   endforeach()
112 |   # NOTE(review): cxx_compile_definitions is never appended to above, so the call below adds nothing.
113 |   # Add these for both CUDA and CXX targets:
114 |   target_compile_definitions(cub.compiler_interface INTERFACE
115 |     ${cxx_compile_definitions}
116 |   )
117 | 
118 |   # Promote warnings and display diagnostic numbers for nvcc:
119 |   target_compile_options(cub.compiler_interface INTERFACE
120 |     # If using CUDA w/ NVCC...
121 |     # Display diagnostic numbers.
122 |     $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Xcudafe=--display_error_number>
123 |     # Promote warnings.
124 |     $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Xcudafe=--promote_warnings>
125 |     # Don't complain about deprecated GPU targets.
126 |     $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Wno-deprecated-gpu-targets>
127 |   )
128 | endfunction()
129 | 


--------------------------------------------------------------------------------
/cmake/CubCompilerHacks.cmake:
--------------------------------------------------------------------------------
 1 | # Set up compiler paths and apply temporary hacks to support NVC++.
 2 | # This file must be included before enabling any languages.
 3 | 
 4 | # Temporary hacks to make NVC++ work; this requires you to define
 5 | # `CMAKE_CUDA_COMPILER_ID=NVCXX` and `CMAKE_CUDA_COMPILER_FORCED=ON`.
 6 | if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
 7 |   # If using NVC++, don't set CXX compiler
 8 |   if (NOT "${CMAKE_CXX_COMPILER}" STREQUAL "")
 9 |     unset(CMAKE_CXX_COMPILER CACHE)
10 |     message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have"
11 |       " specified a different ISO C++ compiler; NVC++ acts as both, so please"
12 |       " unset the CMAKE_CXX_COMPILER variable."
13 |     )
14 |   endif()
15 | 
16 |   # We don't set CMAKE_CUDA_HOST_COMPILER for NVC++; if we do, CMake tries to
17 |   # pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to NVC++, which it doesn't
18 |   # understand.
19 |   if (NOT "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "")
20 |     unset(CMAKE_CUDA_HOST_COMPILER CACHE)
21 |     message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have"
22 |       " specified a different host ISO C++ compiler; NVC++ acts as both, so"
23 |       " please unset the CMAKE_CUDA_HOST_COMPILER variable."
24 |     )
25 |   endif()
26 | 
27 |   set(CMAKE_CXX_COMPILER "${CMAKE_CUDA_COMPILER}")
28 |   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -cuda")
29 |   set(CMAKE_CUDA_HOST_LINK_LAUNCHER "${CMAKE_CUDA_COMPILER}")
30 |   set(CMAKE_CUDA_LINK_EXECUTABLE
31 |     "<CMAKE_CUDA_HOST_LINK_LAUNCHER> <FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
32 | endif ()
33 | 
34 | # For every CUDA compiler other than NVC++, force CMAKE_CUDA_HOST_COMPILER to
35 | # CMAKE_CXX_COMPILER. A different, explicitly specified host compiler is
36 | # rejected (and removed from the cache) rather than silently overwritten.
37 | if ((NOT "NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}"))
38 |   if (NOT ("${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "" OR
39 |     "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "${CMAKE_CXX_COMPILER}"))
40 |     set(tmp "${CMAKE_CUDA_HOST_COMPILER}")
41 |     unset(CMAKE_CUDA_HOST_COMPILER CACHE)
42 |     message(FATAL_ERROR
43 |       "For convenience, CUB's test harness uses CMAKE_CXX_COMPILER for the "
44 |       "CUDA host compiler. Refusing to overwrite specified "
45 |       "CMAKE_CUDA_HOST_COMPILER -- please reconfigure without setting this "
46 |       "variable. Currently:\n"
47 |       "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}\n"
48 |       "CMAKE_CUDA_HOST_COMPILER=${tmp}"
49 |     )
50 |   endif ()
51 |   set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
52 | endif ()
53 | 
54 | # Temporary hacks to make NVC++ work; this requires you to define
55 | # `CMAKE_CUDA_COMPILER_ID=NVCXX` and `CMAKE_CUDA_COMPILER_FORCED=ON`.
56 | if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
57 |   # Need 3.17 for the properties used below.
58 |   cmake_minimum_required(VERSION 3.17)
59 | 
60 |   set(CMAKE_CUDA_STANDARD_DEFAULT 03)
61 |   # For each dialect below, the standard and extension options both map to plain -std=c++NN.
62 |   set(CMAKE_CUDA03_STANDARD_COMPILE_OPTION "-std=c++03")
63 |   set(CMAKE_CUDA03_EXTENSION_COMPILE_OPTION "-std=c++03")
64 |   set(CMAKE_CUDA03_STANDARD__HAS_FULL_SUPPORT TRUE)
65 |   set_property(GLOBAL PROPERTY CMAKE_CUDA03_KNOWN_FEATURES)
66 | 
67 |   set(CMAKE_CUDA11_STANDARD_COMPILE_OPTION "-std=c++11")
68 |   set(CMAKE_CUDA11_EXTENSION_COMPILE_OPTION "-std=c++11")
69 |   set(CMAKE_CUDA11_STANDARD__HAS_FULL_SUPPORT TRUE)
70 |   set_property(GLOBAL PROPERTY CMAKE_CUDA11_KNOWN_FEATURES)
71 | 
72 |   set(CMAKE_CUDA14_STANDARD_COMPILE_OPTION "-std=c++14")
73 |   set(CMAKE_CUDA14_EXTENSION_COMPILE_OPTION "-std=c++14")
74 |   set(CMAKE_CUDA14_STANDARD__HAS_FULL_SUPPORT TRUE)
75 |   set_property(GLOBAL PROPERTY CMAKE_CUDA14_KNOWN_FEATURES)
76 | 
77 |   set(CMAKE_CUDA17_STANDARD_COMPILE_OPTION "-std=c++17")
78 |   set(CMAKE_CUDA17_EXTENSION_COMPILE_OPTION "-std=c++17")
79 |   set(CMAKE_CUDA17_STANDARD__HAS_FULL_SUPPORT TRUE)
80 |   set_property(GLOBAL PROPERTY CMAKE_CUDA17_KNOWN_FEATURES)
81 |   # NOTE(review): CMAKE_CUDA20_COMPILE_FEATURES is aggregated below, but no CUDA20 options are defined above.
82 |   include(Internal/FeatureTesting)
83 |   include(Compiler/CMakeCommonCompilerMacros)
84 |   cmake_record_cuda_compile_features()
85 | 
86 |   set(CMAKE_CUDA_COMPILE_FEATURES
87 |     ${CMAKE_CUDA03_COMPILE_FEATURES}
88 |     ${CMAKE_CUDA11_COMPILE_FEATURES}
89 |     ${CMAKE_CUDA14_COMPILE_FEATURES}
90 |     ${CMAKE_CUDA17_COMPILE_FEATURES}
91 |     ${CMAKE_CUDA20_COMPILE_FEATURES}
92 |   )
93 | endif ()
94 | 


--------------------------------------------------------------------------------
/cmake/CubCudaConfig.cmake:
--------------------------------------------------------------------------------
 1 | enable_language(CUDA)
 2 | 
 3 | if (NOT CUB_IN_THRUST)
 4 |   message(FATAL_ERROR
 5 |     "Building CUB as a standalone project is no longer supported. "
 6 |     "Use the Thrust repo instead.")
 7 | endif()
 8 | 
 9 | set(CUB_CUDA_FLAGS_BASE "${THRUST_CUDA_FLAGS_BASE}")
10 | set(CUB_CUDA_FLAGS_RDC "${THRUST_CUDA_FLAGS_RDC}")
11 | set(CUB_CUDA_FLAGS_NO_RDC "${THRUST_CUDA_FLAGS_NO_RDC}")
12 | 
13 | # Mirror Thrust's enabled architectures into the CUB_ENABLE_COMPUTE_* variables and report them.
14 | set(arch_message "CUB: Enabled CUDA architectures:") # Explicit start value; do not rely on an inherited one.
15 | foreach (arch IN LISTS THRUST_KNOWN_COMPUTE_ARCHS)
16 |   if (THRUST_ENABLE_COMPUTE_${arch})
17 |     set(CUB_ENABLE_COMPUTE_${arch} True)
18 |     string(APPEND arch_message " sm_${arch}")
19 |   else()
20 |     set(CUB_ENABLE_COMPUTE_${arch} False)
21 |   endif()
22 | endforeach()
23 | message(STATUS "${arch_message}")
24 | 
25 | #
26 | # RDC options:
27 | #
28 | 
29 | # RDC is off by default in NVCC and on by default in NVC++. Turning off RDC
30 | # isn't currently supported by NVC++. So, we default to RDC off for NVCC and
31 | # RDC on for NVC++.
32 | set(option_init OFF)
33 | if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
34 |   set(option_init ON)
35 | endif()
36 | 
37 | option(CUB_ENABLE_TESTS_WITH_RDC
38 |   "Build all CUB tests with RDC; tests that require RDC are not affected by this option."
39 |   ${option_init}
40 | )
41 | 
42 | option(CUB_ENABLE_EXAMPLES_WITH_RDC
43 |   "Build all CUB examples with RDC; examples which require RDC are not affected by this option."
44 |   ${option_init}
45 | )
46 | 
47 | # Check for RDC/SM compatibility and raise a fatal error if necessary
48 | set(rdc_supported True)
49 | foreach (arch IN LISTS no_rdc_archs)
50 |   if (CUB_ENABLE_COMPUTE_${arch})
51 |     set(rdc_supported False)
52 |     break()
53 |   endif()
54 | endforeach()
55 | # NOTE(review): `no_rdc_archs` above is not set in this file; presumably inherited from Thrust -- confirm.
56 | set(rdc_opts
57 |   CUB_ENABLE_TESTS_WITH_RDC
58 |   CUB_ENABLE_EXAMPLES_WITH_RDC
59 | )
60 | set(rdc_requested False)
61 | foreach (rdc_opt IN LISTS rdc_opts)
62 |   if (${rdc_opt})
63 |     set(rdc_requested True)
64 |     break()
65 |   endif()
66 | endforeach()
67 | # Fail only when some RDC option is ON and an enabled architecture is listed as lacking RDC support:
68 | if (rdc_requested AND NOT rdc_supported)
69 |   string(JOIN ", " no_rdc ${no_rdc_archs})
70 |   string(JOIN "\n" opts ${rdc_opts})
71 |   message(FATAL_ERROR
72 |     "Architectures {${no_rdc}} do not support RDC and are incompatible with "
73 |     "these options:\n${opts}"
74 |   )
75 | endif()
76 | 
77 | 
78 | # 
79 | # Clang CUDA options 
80 | #
81 | if ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
82 |   set(CUB_CUDA_FLAGS_BASE "${CUB_CUDA_FLAGS_BASE} -Wno-unknown-cuda-version -Xclang=-fcuda-allow-variadic-functions")
83 | endif()
84 | 
85 | 
86 | # By default RDC is not used:
87 | set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_NO_RDC}")
88 | 


--------------------------------------------------------------------------------
/cmake/CubHeaderTesting.cmake:
--------------------------------------------------------------------------------
 1 | # For every public header, build a translation unit containing `#include <header>`
 2 | # to let the compiler try to figure out warnings in that header if it is not otherwise
 3 | # included in tests, and also to verify if the headers are modular enough.
 4 | # .inl files are not globbed for, because they are not supposed to be used as public
 5 | # entrypoints.
 6 | 
 7 | # Meta target for all configs' header builds:
 8 | add_custom_target(cub.all.headers)
 9 | 
10 | file(GLOB_RECURSE headers
11 |   RELATIVE "${CUB_SOURCE_DIR}/cub"
12 |   CONFIGURE_DEPENDS
13 |   cub/*.cuh
14 | )
15 | 
16 | set(headertest_srcs)
17 | foreach (header IN LISTS headers)
18 |   set(headertest_src "headers/${header}.cu")
19 |   configure_file("${CUB_SOURCE_DIR}/cmake/header_test.in" "${headertest_src}")
20 |   list(APPEND headertest_srcs "${headertest_src}")
21 | endforeach()
22 | # cub_add_header_test(<label> <definitions>): build the header-test sources once per target in CUB_TARGETS.
23 | function(cub_add_header_test label definitions)
24 |   foreach(cub_target IN LISTS CUB_TARGETS)
25 |     cub_get_target_property(config_prefix ${cub_target} PREFIX)
26 |     # One OBJECT library per CUB target, named <prefix>.headers.<label>, compiled with `definitions`:
27 |     set(headertest_target ${config_prefix}.headers.${label})
28 |     add_library(${headertest_target} OBJECT ${headertest_srcs})
29 |     target_link_libraries(${headertest_target} PUBLIC ${cub_target})
30 |     target_compile_definitions(${headertest_target} PRIVATE ${definitions})
31 |     cub_clone_target_properties(${headertest_target} ${cub_target})
32 | 
33 |     if (CUB_IN_THRUST)
34 |       thrust_fix_clang_nvcc_build_for(${headertest_target})
35 |     endif()
36 |     # Make both cub.all.headers and the target's own <prefix>.all depend on the new library:
37 |     add_dependencies(cub.all.headers ${headertest_target})
38 |     add_dependencies(${config_prefix}.all ${headertest_target})
39 |   endforeach()
40 | endfunction()
41 | 
42 | # Wrap Thrust/CUB in a custom namespace to check proper use of ns macros:
43 | set(header_definitions 
44 |   "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" 
45 |   "CUB_WRAPPED_NAMESPACE=wrapped_cub")
46 | cub_add_header_test(base "${header_definitions}")
47 | # Second variant: same definitions plus CUB_DISABLE_BF16_SUPPORT (header_test.in then rejects cuda_bf16.h types).
48 | list(APPEND header_definitions "CUB_DISABLE_BF16_SUPPORT")
49 | cub_add_header_test(bf16 "${header_definitions}")
50 | 
51 | 


--------------------------------------------------------------------------------
/cmake/CubInstallRules.cmake:
--------------------------------------------------------------------------------
 1 | # Thrust manages its own copy of these rules. Update ThrustInstallRules.cmake
 2 | # if modifying this file.
 3 | if (CUB_IN_THRUST)
 4 |   return()
 5 | endif()
 6 | 
 7 | # Bring in CMAKE_INSTALL_LIBDIR
 8 | include(GNUInstallDirs)
 9 | 
10 | # CUB is a header library; no need to build anything before installing:
11 | set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY TRUE)
12 | 
13 | install(DIRECTORY "${CUB_SOURCE_DIR}/cub"
14 |   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
15 |   FILES_MATCHING
16 |     PATTERN "*.cuh"
17 | )
18 | 
19 | install(DIRECTORY "${CUB_SOURCE_DIR}/cub/cmake/"
20 |   DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cub"
21 |   PATTERN *.cmake.in EXCLUDE
22 | )
23 | # Need to configure a file to store the infix specified in
24 | # CMAKE_INSTALL_INCLUDEDIR since it can be defined by the user
25 | set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/cub")
26 | configure_file("${CUB_SOURCE_DIR}/cub/cmake/cub-header-search.cmake.in"
27 |   "${CUB_BINARY_DIR}/cub/cmake/cub-header-search.cmake"
28 |   @ONLY)
29 | install(FILES "${CUB_BINARY_DIR}/cub/cmake/cub-header-search.cmake"
30 |   DESTINATION "${install_location}")
31 | 


--------------------------------------------------------------------------------
/cmake/CubUtilities.cmake:
--------------------------------------------------------------------------------
 1 | # Enable RDC (relocatable device code) for CUDA target `target_name`; the mechanism depends on the CUDA compiler:
 2 | function(cub_enable_rdc_for_cuda_target target_name)
 3 |   if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
 4 |     set_target_properties(${target_name} PROPERTIES
 5 |       COMPILE_FLAGS "-gpu=rdc" # NOTE(review): replaces any COMPILE_FLAGS value already set on the target.
 6 |     )
 7 |   elseif ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") # No-op branch; NOTE(review): presumably RDC is unsupported with Clang -- confirm.
 8 |   else()
 9 |     set_target_properties(${target_name} PROPERTIES
10 |       CUDA_SEPARABLE_COMPILATION ON
11 |     )
12 |   endif()
13 | endfunction()
14 | 


--------------------------------------------------------------------------------
/cmake/header_test.in:
--------------------------------------------------------------------------------
 1 | // This source file checks that:
 2 | // 1) Header <cub/${header}> compiles without error.
 3 | // 2) Common macro collisions with platform/system headers are avoided.
 4 | 
 5 | // Define CUB_MACRO_CHECK(macro, header), which emits a diagnostic indicating
 6 | // a potential macro collision and halts.
 7 | //
 8 | // Use raw platform checks instead of the CUB_HOST_COMPILER macros since we
 9 | // don't want to #include any headers other than the one being tested.
10 | //
11 | // This is only implemented for MSVC/GCC/Clang.
12 | #if defined(_MSC_VER) // MSVC
13 | 
14 | // Fake up an error for MSVC
15 | #define CUB_MACRO_CHECK_IMPL(msg)                                              \
16 |   /* Print message that looks like an error: */                                \
17 |   __pragma(message(__FILE__ ":" CUB_MACRO_CHECK_IMPL0(__LINE__)                \
18 |                    ": error: " #msg))                                          \
19 |   /* abort compilation due to static_assert or syntax error: */                \
20 |   static_assert(false, #msg);
21 | #define CUB_MACRO_CHECK_IMPL0(x) CUB_MACRO_CHECK_IMPL1(x)
22 | #define CUB_MACRO_CHECK_IMPL1(x) #x
23 | 
24 | #elif defined(__clang__) || defined(__GNUC__)
25 | 
26 | // GCC/clang are easy:
27 | #define CUB_MACRO_CHECK_IMPL(msg) CUB_MACRO_CHECK_IMPL0(GCC error #msg)
28 | #define CUB_MACRO_CHECK_IMPL0(expr) _Pragma(#expr)
29 | 
30 | #endif
31 | 
32 | // Hacky way to build a string, but it works on all tested platforms.
33 | #define CUB_MACRO_CHECK(MACRO, HEADER)                                         \
34 |   CUB_MACRO_CHECK_IMPL(Identifier MACRO should not be used from CUB            \
35 |                        headers due to conflicts with HEADER macros.)
36 | 
37 | // complex.h conflicts
38 | #define I CUB_MACRO_CHECK('I', complex.h)
39 | 
40 | // windows.h conflicts
41 | #define small CUB_MACRO_CHECK('small', windows.h)
42 | // We can't enable these checks without breaking some builds -- some standard
43 | // library implementations unconditionally `#undef` these macros, which then
44 | // causes random failures later.
45 | // Leaving these commented out as a warning: Here be dragons.
46 | //#define min(...) CUB_MACRO_CHECK('min', windows.h)
47 | //#define max(...) CUB_MACRO_CHECK('max', windows.h)
48 | 
49 | // termios.h conflicts (NVIDIA/thrust#1547)
50 | #define B0 CUB_MACRO_CHECK("B0", termios.h)
51 | 
52 | #include <cub/${header}>
53 | 
54 | #if defined(CUB_DISABLE_BF16_SUPPORT)
55 | #if defined(__CUDA_BF16_TYPES_EXIST__)
56 | #error CUB should not include cuda_bf16.h when BF16 support is disabled
57 | #endif
58 | #endif
59 | 


--------------------------------------------------------------------------------
/cub/block/block_raking_layout.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  4 |  * 
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  * 
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /**
 30 |  * \file
 31 |  * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
 32 |  */
 33 | 
 34 | 
 35 | #pragma once
 36 | 
 37 | #include "../config.cuh"
 38 | #include "../util_type.cuh"
 39 | 
 40 | CUB_NAMESPACE_BEGIN
 41 | 
 42 | /**
 43 |  * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data.    ![](raking.png)
 44 |  * \ingroup BlockModule
 45 |  *
 46 |  * \par Overview
 47 |  * This type facilitates a shared memory usage pattern where a block of CUDA
 48 |  * threads places elements into shared memory and then reduces the active
 49 |  * parallelism to one "raking" warp of threads for serially aggregating consecutive
 50 |  * sequences of shared items.  Padding is inserted to eliminate bank conflicts
 51 |  * (for most data types).
 52 |  *
 53 |  * \tparam T                        The data type to be exchanged.
 54 |  * \tparam BLOCK_THREADS            The thread block size in threads.
 55 |  * \tparam LEGACY_PTX_ARCH          <b>[optional]</b> Unused.
 56 |  */
 57 | template <
 58 |     typename    T,
 59 |     int         BLOCK_THREADS,
 60 |     int         LEGACY_PTX_ARCH = 0>
 61 | struct BlockRakingLayout
 62 | {
 63 |     //---------------------------------------------------------------------
 64 |     // Constants and type definitions
 65 |     //---------------------------------------------------------------------
 66 | 
 67 |     enum
 68 |     {
 69 |         /// The total number of elements that need to be cooperatively reduced
 70 |         SHARED_ELEMENTS = BLOCK_THREADS,
 71 | 
 72 |         /// Maximum number of warp-synchronous raking threads
 73 |         MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(0)),
 74 | 
 75 |         /// Number of raking elements per warp-synchronous raking thread (rounded up)
 76 |         SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
 77 | 
 78 |         /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
 79 |         RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
 80 | 
 81 |         /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
 82 |         HAS_CONFLICTS = (CUB_SMEM_BANKS(0) % SEGMENT_LENGTH == 0),
 83 | 
 84 |         /// Degree of bank conflicts (e.g., 4-way)
 85 |         CONFLICT_DEGREE = (HAS_CONFLICTS) ?
 86 |             (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(0) :
 87 |             1,
 88 | 
 89 |         /// Pad each segment with one element when the segment length is even and greater than 2 (i.e., not relatively prime to warp size and can't be optimized as a vector load)
 90 |         USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2),
 91 | 
 92 |         /// Total number of elements in the raking grid (including any padding elements)
 93 |         GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING),
 94 | 
 95 |         /// Whether raking can skip bounds checking (true when the number of reduction elements is a multiple of the number of raking threads)
 96 |         UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
 97 |     };
 98 | 
 99 | 
100 |     /**
101 |      * \brief Shared memory storage type
102 |      */
103 |     struct __align__(16) _TempStorage
104 |     {
105 |         T buff[BlockRakingLayout::GRID_ELEMENTS];
106 |     };
107 | 
108 |     /// Alias wrapper allowing storage to be unioned
109 |     struct TempStorage : Uninitialized<_TempStorage> {};
110 | 
111 |     /**
112 |      * \brief Returns the location for the calling thread to place data into the grid
113 |      * (\p linear_tid is used as the element's unpadded index; segment padding is added here)
114 |      */
115 |     static __device__ __forceinline__ T* PlacementPtr(
116 |         TempStorage &temp_storage,
117 |         unsigned int linear_tid)
118 |     {
119 |         // Unpadded offset of this thread's element
120 |         unsigned int offset = linear_tid;
121 | 
122 |         // Skip one padding element for every complete segment that precedes this offset
123 |         if (USE_SEGMENT_PADDING > 0)
124 |         {
125 |             offset += offset / SEGMENT_LENGTH;
126 |         }
127 | 
128 |         // Address of the (possibly padded) slot inside the storage buffer
129 |         return temp_storage.Alias().buff + offset;
130 |     }
131 | 
132 |     /**
133 |      * \brief Returns the location for the calling thread to begin sequential raking
134 |      * (the start of padded segment number \p linear_tid)
135 |      */
136 |     static __device__ __forceinline__ T* RakingPtr(
137 |         TempStorage &temp_storage,
138 |         unsigned int linear_tid)
139 |     {
140 |         return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING));
141 |     }
142 | };
143 | 
144 | CUB_NAMESPACE_END
145 | 
146 | 


--------------------------------------------------------------------------------
/cub/block/radix_rank_sort_operations.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011-2020, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /**
 29 |  * \file
 30 |  * radix_rank_sort_operations.cuh contains common abstractions, definitions and
 31 |  * operations used for radix sorting and ranking.
 32 |  */
 33 | 
 34 | #pragma once
 35 | 
 36 | #include "../config.cuh"
 37 | #include "../util_ptx.cuh"
 38 | #include "../util_type.cuh"
 39 | 
 40 | 
 41 | CUB_NAMESPACE_BEGIN
 42 | 
 43 | /** \brief Twiddling keys for radix sort. */
 44 | template <bool IS_DESCENDING, typename KeyT>
 45 | struct RadixSortTwiddle
 46 | {
 47 |     typedef Traits<KeyT> TraitsT;
 48 |     typedef typename TraitsT::UnsignedBits UnsignedBits;
 49 | 
 50 |     // Applies TraitsT::TwiddleIn, then bitwise-complements the result for descending sorts.
 51 |     static __host__ __device__ __forceinline__ UnsignedBits In(UnsignedBits key)
 52 |     {
 53 |         const UnsignedBits twiddled = TraitsT::TwiddleIn(key);
 54 |         return IS_DESCENDING ? UnsignedBits(~twiddled) : twiddled;
 55 |     }
 56 |     // Reverses the steps of In(): complement first (descending only), then TraitsT::TwiddleOut.
 57 |     static __host__ __device__ __forceinline__ UnsignedBits Out(UnsignedBits key)
 58 |     {
 59 |         const UnsignedBits uncomplemented = IS_DESCENDING ? UnsignedBits(~key) : key;
 60 |         return TraitsT::TwiddleOut(uncomplemented);
 61 |     }
 62 |     // Key obtained by passing the all-ones bit pattern through Out().
 63 |     static __host__ __device__ __forceinline__ UnsignedBits DefaultKey()
 64 |     { return Out(~UnsignedBits(0)); }
 65 | };
 66 | 
 67 | /** \brief Base struct for digit extractor. Contains common code to provide
 68 |     special handling for floating-point -0.0.
 69 | 
 70 |     \note This handles correctly both the case when the keys are
 71 |     bitwise-complemented after twiddling for descending sort (in onesweep) as
 72 |     well as when the keys are not bit-negated, but the implementation handles
 73 |     descending sort separately (in other implementations in CUB). Twiddling
 74 |     alone maps -0.0f to 0x7fffffff and +0.0f to 0x80000000 for float, which are
 75 |     subsequent bit patterns and bitwise complements of each other. For onesweep,
 76 |     both -0.0f and +0.0f are mapped to the bit pattern of +0.0f (0x80000000) for
 77 |     ascending sort, and to the pattern of -0.0f (0x7fffffff) for descending
 78 |     sort. For all other sorting implementations in CUB, both are always mapped
 79 |     to +0.0f. Since bit patterns for both -0.0f and +0.0f are next to each other
 80 |     and only one of them is used, the sorting works correctly. For double, the
 81 |     same applies, but with 64-bit patterns.
 82 | */
 83 | template <typename KeyT>
 84 | struct BaseDigitExtractor
 85 | {
 86 |     typedef Traits<KeyT> TraitsT;
 87 |     typedef typename TraitsT::UnsignedBits UnsignedBits;
 88 | 
 89 |     enum
 90 |     {
 91 |         FLOAT_KEY = TraitsT::CATEGORY == FLOATING_POINT,
 92 |     };
 93 | 
 94 |     // For floating-point keys, maps the twiddled pattern of -0.0 to that of +0.0; other keys pass through.
 95 |     static __device__ __forceinline__ UnsignedBits ProcessFloatMinusZero(UnsignedBits key)
 96 |     {
 97 |         if (FLOAT_KEY) {
 98 |             const UnsignedBits twiddled_minus_zero = TraitsT::TwiddleIn(
 99 |                 UnsignedBits(1) << UnsignedBits(8 * sizeof(UnsignedBits) - 1));
100 |             const UnsignedBits twiddled_plus_zero = TraitsT::TwiddleIn(0);
101 |             if (key == twiddled_minus_zero) return twiddled_plus_zero;
102 |         }
103 |         return key;
104 |     }
105 | };
106 | 
107 | /** \brief A wrapper type to extract digits. Uses the BFE intrinsic to extract a
108 |  * digit from a key. */
109 | template <typename KeyT>
110 | struct BFEDigitExtractor : BaseDigitExtractor<KeyT>
111 | {
112 |     using typename BaseDigitExtractor<KeyT>::UnsignedBits;
113 | 
114 |     // Digit bit range, passed to BFE() as (bit_start, num_bits)
115 |     uint32_t bit_start;
116 |     uint32_t num_bits;
117 | 
118 |     explicit __device__ __forceinline__ BFEDigitExtractor(uint32_t bit_start = 0, uint32_t num_bits = 0)
119 |         : bit_start(bit_start), num_bits(num_bits) {}
120 | 
121 |     // Applies ProcessFloatMinusZero() to the key, then extracts the digit with BFE()
122 |     __device__ __forceinline__ uint32_t Digit(UnsignedBits key)
123 |     { return BFE(this->ProcessFloatMinusZero(key), bit_start, num_bits); }
124 | };
125 | 
126 | /** \brief A wrapper type to extract digits. Uses a combination of shift and
127 |  * bitwise and to extract digits. */
128 | template <typename KeyT>
129 | struct ShiftDigitExtractor : BaseDigitExtractor<KeyT>
130 | {
131 |     using typename BaseDigitExtractor<KeyT>::UnsignedBits;
132 | 
133 |     uint32_t bit_start, mask;   // right-shift amount, and a mask with the low num_bits bits set
134 |     // The mask is built in uint32_t: signed `(1 << num_bits) - 1` overflows at 31 bits and is undefined from 32 bits up.
135 |     explicit __device__ __forceinline__ ShiftDigitExtractor(uint32_t bit_start = 0, uint32_t num_bits = 0)
136 |         : bit_start(bit_start), mask(num_bits >= 32 ? ~uint32_t(0) : (uint32_t(1) << num_bits) - 1)
137 |     { }
138 |     // Returns the digit: the key (after ProcessFloatMinusZero) shifted right by bit_start, then masked.
139 |     __device__ __forceinline__ uint32_t Digit(UnsignedBits key)
140 |     {
141 |         return uint32_t(this->ProcessFloatMinusZero(key) >> UnsignedBits(bit_start)) & mask;
142 |     }
143 | };
144 | 
145 | CUB_NAMESPACE_END
146 | 


--------------------------------------------------------------------------------
/cub/block/specializations/block_histogram_atomic.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 4 |  * 
 5 |  * Redistribution and use in source and binary forms, with or without
 6 |  * modification, are permitted provided that the following conditions are met:
 7 |  *     * Redistributions of source code must retain the above copyright
 8 |  *       notice, this list of conditions and the following disclaimer.
 9 |  *     * Redistributions in binary form must reproduce the above copyright
10 |  *       notice, this list of conditions and the following disclaimer in the
11 |  *       documentation and/or other materials provided with the distribution.
12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
13 |  *       names of its contributors may be used to endorse or promote products
14 |  *       derived from this software without specific prior written permission.
15 |  * 
16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  *
27 |  ******************************************************************************/
28 | 
29 | /**
30 |  * \file
31 |  * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
32 |  */
33 | 
34 | #pragma once
35 | 
36 | #include "../../config.cuh"
37 | 
38 | CUB_NAMESPACE_BEGIN
39 | 
40 | 
41 | /**
42 |  * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
43 |  */
44 | template <int BINS>
45 | struct BlockHistogramAtomic
46 | {
47 |     /// Shared memory storage layout type (empty for this specialization)
48 |     struct TempStorage {};
49 | 
50 | 
51 |     /// Constructor (the storage reference is accepted but not used)
52 |     __device__ __forceinline__ BlockHistogramAtomic(TempStorage &temp_storage) {}
53 | 
54 | 
55 |     /**
56 |      * \brief Composite data onto an existing histogram
57 |      *
58 |      * For each of the calling thread's items, the counter at index
59 |      * \p items[k] is incremented by one via atomicAdd.
60 |      */
61 |     template <typename T, typename CounterT, int ITEMS_PER_THREAD>
62 |     __device__ __forceinline__ void Composite(
63 |         T        (&items)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input values to histogram
64 |         CounterT histogram[BINS])               ///< [in,out] Shared/device-accessible memory histogram, updated in place
65 |     {
66 |         #pragma unroll
67 |         for (int item = 0; item < ITEMS_PER_THREAD; ++item)
68 |         {
69 |             CounterT *bin_counter = histogram + items[item];
70 |             atomicAdd(bin_counter, 1);
71 |         }
72 |     }
73 | 
74 | };
75 | 
76 | CUB_NAMESPACE_END
77 | 
78 | 


--------------------------------------------------------------------------------
/cub/cmake/cub-config-version.cmake:
--------------------------------------------------------------------------------
 1 | # Parse version information from version.cuh:
 2 | include("${CMAKE_CURRENT_LIST_DIR}/cub-header-search.cmake")
 3 | 
 4 | file(READ "${_CUB_VERSION_INCLUDE_DIR}/cub/version.cuh" CUB_VERSION_HEADER)
 5 | string(REGEX MATCH "#define[ \t]+CUB_VERSION[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}")
 6 | set(CUB_VERSION_FLAT ${CMAKE_MATCH_1})
 7 | # Note that CUB calls this the PATCH number, CMake calls it the TWEAK number:
 8 | string(REGEX MATCH "#define[ \t]+CUB_PATCH_NUMBER[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}")
 9 | set(CUB_VERSION_TWEAK ${CMAKE_MATCH_1})
10 | # Decode the flat number: major = flat / 100000, minor = (flat / 100) % 1000, patch = flat % 100.
11 | math(EXPR CUB_VERSION_MAJOR "${CUB_VERSION_FLAT} / 100000")
12 | math(EXPR CUB_VERSION_MINOR "(${CUB_VERSION_FLAT} / 100) % 1000")
13 | math(EXPR CUB_VERSION_PATCH "${CUB_VERSION_FLAT} % 100") # CUB: "subminor" CMake: "patch"
14 | 
15 | set(CUB_VERSION "${CUB_VERSION_MAJOR}.${CUB_VERSION_MINOR}.${CUB_VERSION_PATCH}.${CUB_VERSION_TWEAK}")
16 | 
17 | set(PACKAGE_VERSION ${CUB_VERSION})
18 | set(PACKAGE_VERSION_COMPATIBLE FALSE)
19 | set(PACKAGE_VERSION_EXACT FALSE)
20 | set(PACKAGE_VERSION_UNSUITABLE FALSE)
21 | # Compatible: this version >= the requested one, same major, and minor >= requested minor. Exact: versions are equal.
22 | if(PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION)
23 |   if(CUB_VERSION_MAJOR VERSION_EQUAL PACKAGE_FIND_VERSION_MAJOR AND
24 |      CUB_VERSION_MINOR VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MINOR)
25 |     set(PACKAGE_VERSION_COMPATIBLE TRUE)
26 |   endif()
27 | 
28 |   if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
29 |     set(PACKAGE_VERSION_EXACT TRUE)
30 |   endif()
31 | endif()
32 | 


--------------------------------------------------------------------------------
/cub/cmake/cub-config.cmake:
--------------------------------------------------------------------------------
  1 | #
  2 | # find_package(CUB) config file.
  3 | #
  4 | # Defines a CUB::CUB target that may be linked from user projects to include
  5 | # CUB.
  6 | 
  7 | if (TARGET CUB::CUB)
  8 |   return()
  9 | endif()
 10 | 
 11 | # Minimum supported libcudacxx version:
 12 | set(cub_libcudacxx_version 1.8.0)
 13 | 
 14 | function(_cub_declare_interface_alias alias_name ugly_name)
 15 |   # Creates the INTERFACE library `ugly_name` and ALIASes it as `alias_name`. Rationale:
 16 |   # 1) Only IMPORTED and ALIAS targets can be placed in a namespace.
 17 |   # 2) When an IMPORTED library is linked to another target, its include
 18 |   #    directories are treated as SYSTEM includes.
 19 |   # 3) nvcc will automatically check the CUDA Toolkit include path *before* the
 20 |   #    system includes. This means that the Toolkit CUB will *always* be used
 21 |   #    during compilation, and the include paths of an IMPORTED CUB::CUB
 22 |   #    target will never have any effect.
 23 |   # 4) This behavior can be fixed by setting the property NO_SYSTEM_FROM_IMPORTED
 24 |   #    on EVERY target that links to CUB::CUB. This would be a burden and a
 25 |   #    footgun for our users. Forgetting this would silently pull in the wrong CUB!
 26 |   # 5) A workaround is to make a non-IMPORTED library outside of the namespace,
 27 |   #    configure it, and then ALIAS it into the namespace (or ALIAS and then configure, that seems to work too).
 28 |   add_library(${ugly_name} INTERFACE)
 29 |   add_library(${alias_name} ALIAS ${ugly_name})
 30 | endfunction()
 31 | 
 32 | #
 33 | # Setup some internal cache variables
 34 | #
 35 | 
 36 | # Pull in the include dir detected by cub-config-version.cmake
 37 | set(_CUB_INCLUDE_DIR "${_CUB_VERSION_INCLUDE_DIR}"
 38 |   CACHE INTERNAL "Location of CUB headers."
 39 |   FORCE
 40 | )
 41 | unset(_CUB_VERSION_INCLUDE_DIR CACHE) # Clear tmp variable from cache
 42 | # Record whether this find_package call was QUIET; _CUB_QUIET_FLAG is forwarded to the nested find_package calls below.
 43 | if (${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY)
 44 |   set(_CUB_QUIET ON CACHE INTERNAL "Quiet mode enabled for CUB find_package calls." FORCE)
 45 |   set(_CUB_QUIET_FLAG "QUIET" CACHE INTERNAL "" FORCE)
 46 | else()
 47 |   set(_CUB_QUIET OFF CACHE INTERNAL "Quiet mode enabled for CUB find_package calls." FORCE)
 48 |   set(_CUB_QUIET_FLAG "" CACHE INTERNAL "" FORCE)
 49 | endif()
 50 | 
 51 | #
 52 | # Setup dependencies
 53 | #
 54 | 
 55 | if (NOT TARGET CUB::libcudacxx)
 56 |   if (TARGET Thrust::libcudacxx)
 57 |     # Prefer the same libcudacxx as Thrust, if available:
 58 |     _cub_declare_interface_alias(CUB::libcudacxx _CUB_libcudacxx)
 59 |     target_link_libraries(_CUB_libcudacxx INTERFACE Thrust::libcudacxx)
 60 |   else()
 61 |     if (NOT TARGET libcudacxx::libcudacxx)
 62 |       # First do a non-required search for any co-packaged versions; these are preferred.
 63 |       # The install-layout hint is relative to this config file (_CUB_CMAKE_DIR is not set in this file).
 64 |       find_package(libcudacxx ${cub_libcudacxx_version} CONFIG
 65 |         ${_CUB_QUIET_FLAG}
 66 |         NO_DEFAULT_PATH # Only check the explicit HINTS below:
 67 |         HINTS
 68 |           "${_CUB_INCLUDE_DIR}/../libcudacxx"           # Source layout
 69 |           "${CMAKE_CURRENT_LIST_DIR}/.."                # Install layout
 70 |       )
 71 | 
 72 |       # A second required search allows an externally packaged libcudacxx to be
 73 |       # used and fails if no suitable package exists.
 74 |       find_package(libcudacxx ${cub_libcudacxx_version} CONFIG
 75 |         REQUIRED
 76 |         ${_CUB_QUIET_FLAG}
 77 |       )
 78 |     endif()
 79 |     _cub_declare_interface_alias(CUB::libcudacxx _CUB_libcudacxx)
 80 |     target_link_libraries(_CUB_libcudacxx INTERFACE libcudacxx::libcudacxx)
 81 |   endif()
 82 | endif()
 83 | 
 84 | #
 85 | # Setup targets
 86 | #
 87 | 
 88 | _cub_declare_interface_alias(CUB::CUB _CUB_CUB)
 89 | target_include_directories(_CUB_CUB INTERFACE "${_CUB_INCLUDE_DIR}")
 90 | target_link_libraries(_CUB_CUB INTERFACE CUB::libcudacxx)
 91 | # Each CUB_IGNORE_DEPRECATED_* option (or its THRUST_* counterpart) becomes an INTERFACE compile definition on the target.
 92 | if (CUB_IGNORE_DEPRECATED_API OR THRUST_IGNORE_DEPRECATED_API)
 93 |   target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_API")
 94 | endif()
 95 | 
 96 | if (CUB_IGNORE_DEPRECATED_CPP_DIALECT OR
 97 |     THRUST_IGNORE_DEPRECATED_CPP_DIALECT)
 98 |   target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_DIALECT")
 99 | endif()
100 | 
101 | if (CUB_IGNORE_DEPRECATED_CPP_11 OR
102 |     THRUST_IGNORE_DEPRECATED_CPP_11)
103 |   target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_11")
104 | endif()
105 | 
106 | if (CUB_IGNORE_DEPRECATED_COMPILER OR
107 |     THRUST_IGNORE_DEPRECATED_COMPILER)
108 |   target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_COMPILER")
109 | endif()
110 | 
111 | #
112 | # Standardize version info
113 | #
114 | 
115 | set(CUB_VERSION ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} CACHE INTERNAL "" FORCE)
116 | set(CUB_VERSION_MAJOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MAJOR} CACHE INTERNAL "" FORCE)
117 | set(CUB_VERSION_MINOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MINOR} CACHE INTERNAL "" FORCE)
118 | set(CUB_VERSION_PATCH ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_PATCH} CACHE INTERNAL "" FORCE)
119 | set(CUB_VERSION_TWEAK ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_TWEAK} CACHE INTERNAL "" FORCE)
120 | set(CUB_VERSION_COUNT ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_COUNT} CACHE INTERNAL "" FORCE)
121 | # Default CUB_CONFIG to this file if it is not already set, then hand off to find_package_handle_standard_args.
122 | include(FindPackageHandleStandardArgs)
123 | if (NOT CUB_CONFIG)
124 |   set(CUB_CONFIG "${CMAKE_CURRENT_LIST_FILE}")
125 | endif()
126 | find_package_handle_standard_args(CUB CONFIG_MODE)
127 | 


--------------------------------------------------------------------------------
/cub/cmake/cub-header-search.cmake:
--------------------------------------------------------------------------------
1 | # Source-tree layout: the include dir is two levels above this file; cache it only if cub/version.cuh exists there
2 | set(_CUB_VERSION_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}/../..")
3 | if(EXISTS "${_CUB_VERSION_INCLUDE_DIR}/cub/version.cuh")
4 |   set(_CUB_VERSION_INCLUDE_DIR "${_CUB_VERSION_INCLUDE_DIR}" CACHE FILEPATH "" FORCE) # Clear old result
5 |   set_property(CACHE _CUB_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL)
6 | endif()
7 | 


--------------------------------------------------------------------------------
/cub/cmake/cub-header-search.cmake.in:
--------------------------------------------------------------------------------
 1 | # Locate the include directory that contains cub/version.cuh:
 2 | unset(_CUB_VERSION_INCLUDE_DIR CACHE) # Clear old result to force search
 3 | 
 4 | # Find CMAKE_INSTALL_INCLUDEDIR=@CMAKE_INSTALL_INCLUDEDIR@ directory
 5 | set(from_install_prefix "@install_location@")
 6 | 
 7 | # Transform to a list of directories, replace each directory with "../"
 8 | # and convert back to a string
 9 | string(REGEX REPLACE "/" ";" from_install_prefix "${from_install_prefix}")
10 | list(TRANSFORM from_install_prefix REPLACE ".+" "../")
11 | list(JOIN from_install_prefix "" from_install_prefix)
12 | 
13 | find_path(_CUB_VERSION_INCLUDE_DIR cub/version.cuh
14 |   NO_DEFAULT_PATH # Only search explicit paths below:
15 |   PATHS
16 |     "${CMAKE_CURRENT_LIST_DIR}/${from_install_prefix}/@CMAKE_INSTALL_INCLUDEDIR@"
17 | )
18 | set_property(CACHE _CUB_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL)
19 | 


--------------------------------------------------------------------------------
/cub/config.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | /**
29 |  * \file
30 |  * Static configuration header for the CUB project.
31 |  */
32 | 
33 | #pragma once
34 | 
35 | #include "util_arch.cuh"
36 | #include "util_compiler.cuh"
37 | #include "util_cpp_dialect.cuh"
38 | #include "util_deprecated.cuh"
39 | #include "util_macro.cuh"
40 | #include "util_namespace.cuh"
41 | 


--------------------------------------------------------------------------------
/cub/cub.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  4 |  *
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  *
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /**
 30 |  * \file
 31 |  * CUB umbrella include file
 32 |  */
 33 | 
 34 | #pragma once
 35 | 
 36 | // Static configuration
 37 | #include "config.cuh"
 38 | 
 39 | // Block
 40 | #include "block/block_adjacent_difference.cuh"
 41 | #include "block/block_discontinuity.cuh"
 42 | #include "block/block_exchange.cuh"
 43 | #include "block/block_histogram.cuh"
 44 | #include "block/block_load.cuh"
 45 | #include "block/block_merge_sort.cuh"
 46 | #include "block/block_radix_rank.cuh"
 47 | #include "block/block_radix_sort.cuh"
 48 | #include "block/block_reduce.cuh"
 49 | #include "block/block_scan.cuh"
 50 | #include "block/block_store.cuh"
 51 | //#include "block/block_shift.cuh"
 52 | 
 53 | // Device
 54 | #include "device/device_adjacent_difference.cuh"
 55 | #include "device/device_copy.cuh"
 56 | #include "device/device_histogram.cuh"
 57 | #include "device/device_memcpy.cuh"
 58 | #include "device/device_merge_sort.cuh"
 59 | #include "device/device_partition.cuh"
 60 | #include "device/device_radix_sort.cuh"
 61 | #include "device/device_reduce.cuh"
 62 | #include "device/device_run_length_encode.cuh"
 63 | #include "device/device_scan.cuh"
 64 | #include "device/device_segmented_radix_sort.cuh"
 65 | #include "device/device_segmented_reduce.cuh"
 66 | #include "device/device_segmented_sort.cuh"
 67 | #include "device/device_select.cuh"
 68 | #include "device/device_spmv.cuh"
 69 | 
 70 | // Grid
 71 | //#include "grid/grid_barrier.cuh"
 72 | #include "grid/grid_even_share.cuh"
 73 | #include "grid/grid_mapping.cuh"
 74 | #include "grid/grid_queue.cuh"
 75 | 
 76 | // Thread
 77 | #include "thread/thread_load.cuh"
 78 | #include "thread/thread_operators.cuh"
 79 | #include "thread/thread_reduce.cuh"
 80 | #include "thread/thread_scan.cuh"
 81 | #include "thread/thread_store.cuh"
 82 | 
 83 | // Warp
 84 | #include "warp/warp_exchange.cuh"
 85 | #include "warp/warp_load.cuh"
 86 | #include "warp/warp_merge_sort.cuh"
 87 | #include "warp/warp_reduce.cuh"
 88 | #include "warp/warp_scan.cuh"
 89 | #include "warp/warp_store.cuh"
 90 | 
 91 | // Iterator
 92 | #include "iterator/arg_index_input_iterator.cuh"
 93 | #include "iterator/cache_modified_input_iterator.cuh"
 94 | #include "iterator/cache_modified_output_iterator.cuh"
 95 | #include "iterator/constant_input_iterator.cuh"
 96 | #include "iterator/counting_input_iterator.cuh"
 97 | #include "iterator/discard_output_iterator.cuh"
 98 | #include "iterator/tex_obj_input_iterator.cuh"
 99 | #include "iterator/tex_ref_input_iterator.cuh"
100 | #include "iterator/transform_input_iterator.cuh"
101 | 
102 | // Util
103 | #include "util_allocator.cuh"
104 | #include "util_arch.cuh"
105 | #include "util_debug.cuh"
106 | #include "util_device.cuh"
107 | #include "util_macro.cuh"
108 | #include "util_ptx.cuh"
109 | #include "util_type.cuh"
110 | 


--------------------------------------------------------------------------------
/cub/detail/choose_offset.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | #pragma once
29 | 
30 | #include <cub/config.cuh>
31 | 
32 | #include <cstdint>
33 | #include <type_traits>
34 | 
35 | CUB_NAMESPACE_BEGIN
36 | 
37 | namespace detail
38 | {
39 | 
40 | /**
41 |  * ChooseOffsetT checks NumItemsT, the type of the num_items parameter, and
42 |  * selects the offset type based on it.
43 |  */
44 | template <typename NumItemsT>
45 | struct ChooseOffsetT
46 | {
47 |   // Reject non-integral types, and bool (with any cv-qualification).
48 |   static_assert(
49 |     !std::is_same<typename std::remove_cv<NumItemsT>::type, bool>::value &&
50 |       std::is_integral<NumItemsT>::value,
51 |     "NumItemsT must be an integral type, but not bool");
52 | 
53 |   // Offset type: std::uint32_t when NumItemsT is at most 4 bytes wide,
54 |   // unsigned long long otherwise.
55 |   using Type = typename std::
56 |     conditional<(sizeof(NumItemsT) > 4), unsigned long long, std::uint32_t>::type;
57 | };
58 | 
59 | } // namespace detail
60 | 
61 | CUB_NAMESPACE_END
62 | 
63 | 


--------------------------------------------------------------------------------
/cub/detail/cpp_compatibility.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 | *  Copyright 2022 NVIDIA Corporation
 3 | *
 4 | *  Licensed under the Apache License, Version 2.0 (the "License");
 5 | *  you may not use this file except in compliance with the License.
 6 | *  You may obtain a copy of the License at
 7 | *
 8 | *      http://www.apache.org/licenses/LICENSE-2.0
 9 | *
10 | *  Unless required by applicable law or agreed to in writing, software
11 | *  distributed under the License is distributed on an "AS IS" BASIS,
12 | *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | *  See the License for the specific language governing permissions and
14 | *  limitations under the License.
15 | */
16 | 
17 | 
18 | #pragma once
19 | 
20 | #include <cub/util_cpp_dialect.cuh>
21 | 
22 | #if CUB_CPP_DIALECT >= 2017 && __cpp_if_constexpr
23 | #  define CUB_IF_CONSTEXPR if constexpr
24 | #  define CUB_ELSE_IF_CONSTEXPR else if constexpr
25 | #else // `if constexpr` unavailable: expand to plain `if` / `else if`
26 | #  define CUB_IF_CONSTEXPR if
27 | #  define CUB_ELSE_IF_CONSTEXPR else if
28 | #endif // CUB_CPP_DIALECT >= 2017 && __cpp_if_constexpr
29 | 


--------------------------------------------------------------------------------
/cub/detail/detect_cuda_runtime.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  4 |  *
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  *
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /**
 30 |  * \file
 31 |  * Utilities for CUDA dynamic parallelism.
 32 |  */
 33 | 
 34 | #pragma once
 35 | 
 36 | #include <cub/util_namespace.cuh>
 37 | 
 38 | #include <cuda_runtime_api.h>
 39 | 
 40 | CUB_NAMESPACE_BEGIN
 41 | namespace detail
 42 | {
 43 | 
 44 | #ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes:
 45 | 
 46 | /**
 47 |  * \def CUB_DISABLE_CDP
 48 |  *
 49 |  * If defined, support for device-side usage of CUB is disabled.
 50 |  */
 51 | #define CUB_DISABLE_CDP
 52 | 
 53 | /**
 54 |  * \def CUB_RDC_ENABLED
 55 |  *
 56 |  * Defined if RDC is enabled and CUB_DISABLE_CDP is not defined.
 57 |  */
 58 | #define CUB_RDC_ENABLED
 59 | 
 60 | /**
 61 |  * \def CUB_RUNTIME_FUNCTION
 62 |  *
 63 |  * Execution space for functions that can use the CUDA runtime API:
 64 |  * `__host__ __device__` if `CUB_RDC_ENABLED` is defined, else `__host__`.
 65 |  */
 66 | #define CUB_RUNTIME_FUNCTION
 67 | 
 68 | /**
 69 |  * \def CUB_RUNTIME_ENABLED
 70 |  *
 71 |  * Whether or not the active compiler pass is allowed to invoke device kernels
 72 |  * or methods from the CUDA runtime API.
 73 |  *
 74 |  * This macro should not be used in CUB, as it depends on `__CUDA_ARCH__`
 75 |  * and is not compatible with `NV_IF_TARGET`. It is provided for legacy
 76 |  * purposes only.
 77 |  *
 78 |  * Replace any usages with `CUB_RDC_ENABLED` and `NV_IF_TARGET`.
 79 |  */
 80 | #define CUB_RUNTIME_ENABLED
 81 | 
 82 | #else // Non-doxygen pass:
 83 | 
 84 | #ifndef CUB_RUNTIME_FUNCTION
 85 | 
 86 | #if defined(__CUDACC_RDC__) && !defined(CUB_DISABLE_CDP)
 87 | 
 88 | #define CUB_RDC_ENABLED
 89 | #define CUB_RUNTIME_FUNCTION __host__ __device__
 90 | 
 91 | #else // RDC disabled:
 92 | 
 93 | #define CUB_RUNTIME_FUNCTION __host__
 94 | 
 95 | #endif // RDC enabled
 96 | 
 97 | #if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__)
 98 | // Legacy only -- do not use in new code.
 99 | #define CUB_RUNTIME_ENABLED
100 | #endif // no __CUDA_ARCH__ (host pass) or RDC build
101 | 
102 | #endif // CUB_RUNTIME_FUNCTION predefined
103 | 
104 | #ifdef CUB_RDC_ENABLED
105 | // Detect available version of CDP (v1 for CUDA < 12 or when forced, else v2):
106 | #if __CUDACC_VER_MAJOR__ < 12 || defined(CUDA_FORCE_CDP1_IF_SUPPORTED)
107 | #define CUB_DETAIL_CDPv1
108 | #else
109 | #define CUB_DETAIL_CDPv2
110 | #endif // CDP version
111 | #endif // CUB_RDC_ENABLED
112 | 
113 | #endif // Do not document
114 | 
115 | } // namespace detail
116 | CUB_NAMESPACE_END
117 | 


--------------------------------------------------------------------------------
/cub/detail/device_double_buffer.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright 2021 NVIDIA Corporation
 3 |  *
 4 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 5 |  *  you may not use this file except in compliance with the License.
 6 |  *  You may obtain a copy of the License at
 7 |  *
 8 |  *      http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  *  Unless required by applicable law or agreed to in writing, software
11 |  *  distributed under the License is distributed on an "AS IS" BASIS,
12 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  *  See the License for the specific language governing permissions and
14 |  *  limitations under the License.
15 |  */
16 | 
17 | #pragma once
18 | 
19 | #include <cub/util_namespace.cuh>
20 | 
21 | 
22 | CUB_NAMESPACE_BEGIN
23 | 
24 | namespace detail
25 | {
26 | 
27 | 
28 | /**
29 |  * @brief It's a double-buffer storage wrapper for multi-pass stream
30 |  *        transformations that require more than one storage array for
31 |  *        streaming intermediate results back and forth.
32 |  *
33 |  * Many multi-pass computations require a pair of "ping-pong" storage buffers
34 |  * (e.g., one for reading from and the other for writing to, and then
35 |  * vice-versa for the subsequent pass). This structure wraps a set of device
36 |  * buffers.
37 |  *
38 |  * Unlike `cub::DoubleBuffer`, this class doesn't provide a "selector" member
39 |  * to track which buffer is "current". The main reason for this class's
40 |  * existence is performance. Since `cub::DoubleBuffer` relies on a runtime
41 |  * variable to index its pointer array, the pointers are placed in local
42 |  * memory instead of registers. Local memory accesses significantly affect
43 |  * performance. In contrast, this class swaps pointers, so all operations
44 |  * can be performed in registers.
45 |  */
46 | template <typename T>
47 | class device_double_buffer
48 | {
49 |   /// Buffer reported by `current()`
50 |   T *m_current_buffer = nullptr;
51 |   /// Buffer reported by `alternate()`
52 |   T *m_alternate_buffer = nullptr;
53 | 
54 | public:
55 |   /**
56 |    * @brief Wrap a pair of existing buffers; nothing is allocated or copied
57 |    *
58 |    * @param current
59 |    *   The currently valid buffer
60 |    *
61 |    * @param alternate
62 |    *   Alternate storage buffer of the same size as @p current
63 |    */
64 |   __host__ __device__ __forceinline__
65 |   device_double_buffer(T *current, T *alternate)
66 |       : m_current_buffer{current}
67 |       , m_alternate_buffer{alternate}
68 |   {}
69 | 
70 |   /// \brief Pointer to the buffer that currently holds valid data
71 |   __host__ __device__ __forceinline__ T *current() const { return m_current_buffer; }
72 | 
73 |   /// \brief Pointer to the buffer that is currently not the valid one
74 |   __host__ __device__ __forceinline__ T *alternate() const { return m_alternate_buffer; }
75 | 
76 |   /**
77 |    * @brief Exchange the two pointers, so `current()` and `alternate()` trade results
78 |    */
79 |   __host__ __device__ void swap()
80 |   {
81 |     T *const previously_current = m_current_buffer;
82 |     m_current_buffer            = m_alternate_buffer;
83 |     m_alternate_buffer          = previously_current;
84 |   }
85 | };
86 | 
87 | 
88 | } // namespace detail
89 | 
90 | CUB_NAMESPACE_END
91 | 


--------------------------------------------------------------------------------
/cub/detail/device_synchronize.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright 2021 NVIDIA Corporation
 3 |  *
 4 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 5 |  *  you may not use this file except in compliance with the License.
 6 |  *  You may obtain a copy of the License at
 7 |  *
 8 |  *      http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  *  Unless required by applicable law or agreed to in writing, software
11 |  *  distributed under the License is distributed on an "AS IS" BASIS,
12 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  *  See the License for the specific language governing permissions and
14 |  *  limitations under the License.
15 |  */
16 | 
17 | #pragma once
18 | 
19 | #include <cub/detail/detect_cuda_runtime.cuh>
20 | #include <cub/detail/exec_check_disable.cuh>
21 | #include <cub/util_arch.cuh>
22 | #include <cub/util_namespace.cuh>
23 | 
24 | #include <nv/target>
25 | 
26 | #include <cuda_runtime_api.h>
27 | 
28 | CUB_NAMESPACE_BEGIN
29 | 
30 | namespace detail
31 | {
32 | 
33 | /**
34 |  * Synchronize the device via the API matching the CUB/CUDA configuration;
35 |  * returns `cudaErrorNotSupported` from device code when CDPv1 is unavailable.
36 |  */
37 | CUB_EXEC_CHECK_DISABLE
38 | CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize()
39 | {
40 |   cudaError_t result = cudaErrorNotSupported;
41 | 
42 |   // Device-side sync is only available under CDPv1:
43 | #if defined(CUB_DETAIL_CDPv1)
44 | 
45 | #if ((__CUDACC_VER_MAJOR__ > 11) ||                                            \
46 |      ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6)))
47 |   // CUDA >= 11.6
48 | #define CUB_TMP_DEVICE_SYNC_IMPL                                               \
49 |   result = __cudaDeviceSynchronizeDeprecationAvoidance();
50 | #else // CUDA < 11.6:
51 | #define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize();
52 | #endif // CUDA version
53 | 
54 | #else // CDPv2 or no CDP:
55 | // The device branch expands to nothing, so `result` keeps cudaErrorNotSupported.
56 | #define CUB_TMP_DEVICE_SYNC_IMPL /* unavailable */
57 | 
58 | #endif // CDP version
59 |   // Host passes call the runtime API directly; device passes run the macro chosen above:
60 |   NV_IF_TARGET(NV_IS_HOST,
61 |                (result = cudaDeviceSynchronize();),
62 |                (CUB_TMP_DEVICE_SYNC_IMPL));
63 | 
64 | #undef CUB_TMP_DEVICE_SYNC_IMPL
65 | 
66 |   return result;
67 | }
68 | 
69 | } // namespace detail
70 | 
71 | CUB_NAMESPACE_END
72 | 


--------------------------------------------------------------------------------
/cub/detail/exec_check_disable.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 | *  Copyright 2021 NVIDIA Corporation
 3 | *
 4 | *  Licensed under the Apache License, Version 2.0 (the "License");
 5 | *  you may not use this file except in compliance with the License.
 6 | *  You may obtain a copy of the License at
 7 | *
 8 | *      http://www.apache.org/licenses/LICENSE-2.0
 9 | *
10 | *  Unless required by applicable law or agreed to in writing, software
11 | *  distributed under the License is distributed on an "AS IS" BASIS,
12 | *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | *  See the License for the specific language governing permissions and
14 | *  limitations under the License.
15 | */
16 | 
17 | #pragma once
18 | 
19 | #include <cub/util_compiler.cuh>
20 | 
21 | /**
22 |  * @def CUB_EXEC_CHECK_DISABLE
23 |  * Wrapper around `#pragma nv_exec_check_disable`; empty for non-NVCC compilers.
24 |  */
25 | 
26 | // #pragma nv_exec_check_disable is only recognized by NVCC (NVHPC and clang-CUDA are excluded).
27 | #if defined(__CUDACC__) && \
28 |     !defined(_NVHPC_CUDA) && \
29 |     !(defined(__CUDA__) && defined(__clang__))
30 | // With an MSVC host compiler the pragma is emitted via `__pragma`, otherwise via `_Pragma`.
31 | #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
32 | #define CUB_EXEC_CHECK_DISABLE __pragma("nv_exec_check_disable")
33 | #else // !MSVC
34 | #define CUB_EXEC_CHECK_DISABLE _Pragma("nv_exec_check_disable")
35 | #endif // MSVC
36 | 
37 | #else // !NVCC
38 | // Any other compiler: the macro expands to nothing.
39 | #define CUB_EXEC_CHECK_DISABLE
40 | 
41 | #endif // NVCC
42 | 


--------------------------------------------------------------------------------
/cub/detail/type_traits.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | /**
29 |  * \file
30 |  * Wrappers and extensions around <type_traits> utilities.
31 |  */
32 | 
33 | #pragma once
34 | 
35 | #include <cub/util_cpp_dialect.cuh>
36 | #include <cub/util_namespace.cuh>
37 | 
38 | #include <cuda/std/type_traits>
39 | 
40 | 
41 | CUB_NAMESPACE_BEGIN
42 | namespace detail {
43 | 
44 | /// Result type of invoking `Invokable` with arguments of types `Args...`
45 | #if CUB_CPP_DIALECT < 2017
46 | template <typename Invokable, typename... Args>
47 | using invoke_result_t = typename ::cuda::std::result_of<Invokable(Args...)>::type;
48 | #else // 2017+
49 | template <typename Invokable, typename... Args>
50 | using invoke_result_t = ::cuda::std::invoke_result_t<Invokable, Args...>;
51 | #endif
52 | 
53 | /// Decayed result of `Invokable(InitT, InputT)`: the intermediate accumulator type (per P2322R6)
54 | template <typename Invokable, typename InitT, typename InputT>
55 | using accumulator_t = typename ::cuda::std::decay<invoke_result_t<Invokable, InitT, InputT>>::type;
56 | 
57 | } // namespace detail
58 | CUB_NAMESPACE_END
59 | 


--------------------------------------------------------------------------------
/cub/detail/uninitialized_copy.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | #pragma once
29 | 
30 | #include <cub/config.cuh>
31 | 
32 | #include <cuda/std/type_traits>
33 | 
34 | CUB_NAMESPACE_BEGIN
35 | 
36 | 
37 | namespace detail
38 | {
39 | 
40 | // uninitialized_copy(ptr, val): write the forwarded `val` into `*ptr`. Which overload
41 | // is used depends on the compiler and on whether T is trivially copyable.
42 | #if defined(_NVHPC_CUDA)
43 | // NVHPC builds always construct in place (see NVBug 3384810).
44 | template <typename T, typename U>
45 | __host__ __device__ void uninitialized_copy(T *ptr, U &&val)
46 | {
47 |   new (ptr) T(::cuda::std::forward<U>(val));
48 | }
49 | #else
50 | // Trivially copyable T: assign the forwarded value straight through `ptr`.
51 | template <
52 |   typename T,
53 |   typename U,
54 |   typename ::cuda::std::enable_if< ::cuda::std::is_trivially_copyable<T>::value, int>::type = 0>
55 | __host__ __device__ void uninitialized_copy(T *ptr, U &&val)
56 | {
57 |   *ptr = ::cuda::std::forward<U>(val);
58 | }
59 | 
60 | // Any other T: construct a T at `ptr` from the forwarded value via placement new.
61 | template <
62 |   typename T,
63 |   typename U,
64 |   typename ::cuda::std::enable_if<!::cuda::std::is_trivially_copyable<T>::value, int>::type = 0>
65 | __host__ __device__ void uninitialized_copy(T *ptr, U &&val)
66 | {
67 |   new (ptr) T(::cuda::std::forward<U>(val));
68 | }
69 | #endif
70 | 
71 | } // namespace detail
72 | 
73 | 
74 | CUB_NAMESPACE_END
75 | 
76 | 


--------------------------------------------------------------------------------
/cub/grid/grid_barrier.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  4 |  * 
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  * 
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /**
 30 |  * \file
 31 |  * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid
 32 |  */
 33 | 
 34 | #pragma once
 35 | 
 36 | #include "../util_debug.cuh"
 37 | #include "../config.cuh"
 38 | #include "../thread/thread_load.cuh"
 39 | 
 40 | CUB_NAMESPACE_BEGIN
 41 | 
 42 | 
 43 | /**
 44 |  * \addtogroup GridModule
 45 |  * @{
 46 |  */
 47 | 
 48 | 
 49 | /**
 50 |  * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
 51 |  */
 52 | class GridBarrier
 53 | {
 54 | protected :
 55 | 
 56 |     typedef unsigned int SyncFlag;
 57 | 
 58 |     // One sync flag per thread block (indexed by blockIdx.x), in global device memory
 59 |     SyncFlag* d_sync;
 60 | 
 61 | public:
 62 | 
 63 |     /**
 64 |      * Constructor; d_sync starts out NULL and is not allocated by this class
 65 |      */
 66 |     GridBarrier() : d_sync(NULL) {}
 67 | 
 68 | 
 69 |     /**
 70 |      * Synchronize: every block raises its flag, block 0 waits for all flags and then clears them
 71 |      */
 72 |     __device__ __forceinline__ void Sync() const
 73 |     {
 74 |         volatile SyncFlag *d_vol_sync = d_sync;
 75 |         // Flag stores go through this volatile alias; polling loads use ThreadLoad<LOAD_CG>
 76 |         // Threadfence and syncthreads to make sure global writes are visible before
 77 |         // thread-0 reports in with its sync counter
 78 |         __threadfence();
 79 |         CTA_SYNC();
 80 | 
 81 |         if (blockIdx.x == 0)
 82 |         {
 83 |             // Report in ourselves
 84 |             if (threadIdx.x == 0)
 85 |             {
 86 |                 d_vol_sync[blockIdx.x] = 1;
 87 |             }
 88 | 
 89 |             CTA_SYNC();
 90 | 
 91 |             // Wait for everyone to report in (block 0's threads stride over all gridDim.x flags)
 92 |             for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
 93 |             {
 94 |                 while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
 95 |                 {
 96 |                     __threadfence_block();
 97 |                 }
 98 |             }
 99 | 
100 |             CTA_SYNC();
101 | 
102 |             // Let everyone know it's safe to proceed by resetting every flag to 0
103 |             for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
104 |             {
105 |                 d_vol_sync[peer_block] = 0;
106 |             }
107 |         }
108 |         else
109 |         {
110 |             if (threadIdx.x == 0)
111 |             {
112 |                 // Report in
113 |                 d_vol_sync[blockIdx.x] = 1;
114 | 
115 |                 // Wait for acknowledgment (block 0 resets our flag to 0)
116 |                 while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
117 |                 {
118 |                     __threadfence_block();
119 |                 }
120 |             }
121 | 
122 |             CTA_SYNC();
123 |         }
124 |     }
125 | };
126 | 
127 | 
128 | /**
129 |  * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
130 |  *
131 |  * Uses RAII for lifetime, i.e., device resources are reclaimed when the destructor is called.
132 |  * NOTE(review): the implicit copy operations are not disabled; a copy would free d_sync twice.
133 |  */
134 | class GridBarrierLifetime : public GridBarrier
135 | {
136 | protected:
137 | 
138 |     // Number of bytes backed by d_sync
139 |     size_t sync_bytes;
140 | 
141 | public:
142 | 
143 |     /**
144 |      * Constructor: no device storage is held initially
145 |      */
146 |     GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
147 | 
148 | 
149 |     /**
150 |      * Frees the device storage (if any) and resets d_sync / sync_bytes; returns the cudaFree status
151 |      */
152 |     cudaError_t HostReset()
153 |     {
154 |         cudaError_t retval = cudaSuccess;
155 |         if (d_sync)
156 |         {
157 |             CubDebug(retval = cudaFree(d_sync));
158 |             d_sync = NULL;
159 |         }
160 |         sync_bytes = 0;
161 |         return retval;
162 |     }
163 | 
164 | 
165 |     /**
166 |      * Destructor: releases the device storage via HostReset()
167 |      */
168 |     virtual ~GridBarrierLifetime()
169 |     {
170 |         HostReset();
171 |     }
172 | 
173 | 
174 |     /**
175 |      * Sets up the progress counters for the next kernel launch (lazily allocating and
176 |      * zeroing them if necessary); a failed call never leaves a freed pointer in d_sync
177 |      */
178 |     cudaError_t Setup(int sweep_grid_size)
179 |     {
180 |         cudaError_t retval = cudaSuccess;
181 |         do {
182 |             size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
183 |             if (new_sync_bytes > sync_bytes)
184 |             {
185 |                 // Drop the old storage; HostReset() also clears d_sync and sync_bytes,
186 |                 // so bailing out below cannot leave a dangling pointer or a stale size
187 |                 if ((retval = HostReset()) != cudaSuccess) break;
188 | 
189 |                 // Allocate and initialize to zero
190 |                 SyncFlag *d_new_sync = NULL;
191 |                 if (CubDebug(retval = cudaMalloc((void**) &d_new_sync, new_sync_bytes))) break;
192 |                 d_sync = d_new_sync;
193 |                 if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
194 |                 sync_bytes = new_sync_bytes; // only record the size once the flags are zeroed
195 |             }
196 |         } while (0);
197 | 
198 |         return retval;
199 |     }
200 | };
201 | 
202 | 
203 | /** @} */       // end group GridModule
204 | 
205 | CUB_NAMESPACE_END
206 | 
207 | 


--------------------------------------------------------------------------------
/cub/grid/grid_mapping.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  4 |  *
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  *
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /**
 30 |  * \file
 31 |  * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
 32 |  */
 33 | 
 34 | #pragma once
 35 | 
 36 | #include "../config.cuh"
 37 | 
 38 | CUB_NAMESPACE_BEGIN
 39 | 
 40 | 
 41 | /**
 42 |  * \addtogroup GridModule
 43 |  * @{
 44 |  */
 45 | 
 46 | 
 47 | /******************************************************************************
 48 |  * Mapping policies
 49 |  *****************************************************************************/
 50 | 
 51 | 
 52 | /**
 53 |  * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
 54 |  */
 55 | enum GridMappingStrategy
 56 | {
 57 |     /**
 58 |      * \brief A "raking" access pattern in which each thread block is
 59 |      * assigned a consecutive sequence of input tiles
 60 |      *
 61 |      * \par Overview
 62 |      * The input is evenly partitioned into \p p segments, where \p p is
 63 |      * constant and corresponds loosely to the number of thread blocks that may
 64 |      * actively reside on the target device. Each segment is comprised of
 65 |      * consecutive tiles, where a tile is a small, constant-sized unit of input
 66 |      * to be processed to completion before the thread block terminates or
 67 |      * obtains more work.  The kernel invokes \p p thread blocks, each
 68 |      * of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements
 69 |      * in tile-size increments.
 70 |      */
 71 |     GRID_MAPPING_RAKE,
 72 | 
 73 |     /**
 74 |      * \brief A "strip mining" access pattern in which the input tiles assigned
 75 |      * to each thread block are separated by a stride equal to the extent of
 76 |      * the grid.
 77 |      *
 78 |      * \par Overview
 79 |      * The input is evenly partitioned into \p p sets, where \p p is
 80 |      * constant and corresponds loosely to the number of thread blocks that may
 81 |      * actively reside on the target device. Each set is comprised of
 82 |      * data tiles separated by stride \p tiles, where a tile is a small,
 83 |      * constant-sized unit of input to be processed to completion before the
 84 |      * thread block terminates or obtains more work.  The kernel invokes \p p
 85 |      * thread blocks, each of which iteratively consumes a set of
 86 |      * <em>n</em>/<em>p</em> elements in tile-size increments.
 87 |      */
 88 |     GRID_MAPPING_STRIP_MINE,
 89 | 
 90 |     /**
 91 |      * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
 92 |      *
 93 |      * \par Overview
 94 |      * The input is treated as a queue to be dynamically consumed by a grid of
 95 |      * thread blocks.  Work is atomically dequeued in tiles, where a tile is a
 96 |      * unit of input to be processed to completion before the thread block
 97 |      * terminates or obtains more work.  The grid size \p p is constant,
 98 |      * loosely corresponding to the number of thread blocks that may actively
 99 |      * reside on the target device.
100 |      */
101 |     GRID_MAPPING_DYNAMIC,
102 | };
103 | 
104 | 
105 | /** @} */       // end group GridModule
106 | 
107 | CUB_NAMESPACE_END
108 | 
109 | 


--------------------------------------------------------------------------------
/cub/host/mutex.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 3 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 4 |  *
 5 |  * Redistribution and use in source and binary forms, with or without
 6 |  * modification, are permitted provided that the following conditions are met:
 7 |  *     * Redistributions of source code must retain the above copyright
 8 |  *       notice, this list of conditions and the following disclaimer.
 9 |  *     * Redistributions in binary form must reproduce the above copyright
10 |  *       notice, this list of conditions and the following disclaimer in the
11 |  *       documentation and/or other materials provided with the distribution.
12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
13 |  *       names of its contributors may be used to endorse or promote products
14 |  *       derived from this software without specific prior written permission.
15 |  *
16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  *
27 |  ******************************************************************************/
28 | 
29 | /**
30 |  * \file
31 |  * Simple portable mutex
32 |  */
33 | 
34 | #pragma once
35 | 
36 | #include <mutex>
37 | 
38 | #include <cub/config.cuh>
39 | #include <cub/util_deprecated.cuh>
40 | 
41 | 
42 | CUB_NAMESPACE_BEGIN
43 | 
44 | 
45 | /**
46 |  * Thin wrapper exposing `Lock()` / `Unlock()` on top of a `std::mutex`.
47 |  *
48 |  * The `std::mutex` member is public and can also be locked directly.
49 |  *
50 |  * @deprecated [Since CUB 2.1.0] The `cub::Mutex` is deprecated and will be removed
51 |  *             in a future release. Use `std::mutex` instead.
52 |  */
53 | struct CUB_DEPRECATED Mutex
54 | {
55 |     /// Underlying standard mutex
56 |     std::mutex mtx;
57 | 
58 |     /// Acquire the mutex (forwards to `std::mutex::lock`)
59 |     void Lock() { mtx.lock(); }
60 | 
61 |     /// Release the mutex (forwards to `std::mutex::unlock`)
62 |     void Unlock() { mtx.unlock(); }
63 | };
64 | 
65 | 
66 | CUB_NAMESPACE_END
67 | 


--------------------------------------------------------------------------------
/cub/iterator/tex_ref_input_iterator.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  4 |  *
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  *
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /**
 30 |  * \file
 31 |  * Random-access iterator types
 32 |  */
 33 | 
 34 | #pragma once
 35 | 
 36 | #include <cub/config.cuh>
 37 | #include <cub/iterator/tex_obj_input_iterator.cuh>
 38 | 
 39 | #include <cstddef>
 40 | 
 41 | CUB_NAMESPACE_BEGIN
 42 | 
 43 | /**
 44 |  * \addtogroup UtilIterator
 45 |  * @{
 46 |  */
 47 | 
 48 | /**
 49 |  * \brief A random-access input wrapper for dereferencing array values through texture cache.
 50 |  *
 51 |  * \deprecated [Since 1.13.0] The CUDA texture management APIs used by
 52 |  * TexRefInputIterator are deprecated. Use cub::TexObjInputIterator instead.
 53 |  *
 54 |  * \par Overview
 55 |  * - TexRefInputIterator wraps a native device pointer of type <tt>ValueType*</tt>. References
 56 |  *   to elements are to be loaded through texture cache.
 57 |  * - Can be used to load any data type from memory through texture cache.
 58 |  * - Can be manipulated and exchanged within and between host and device
 59 |  *   functions, can only be constructed within host functions, and can only be
 60 |  *   dereferenced within device functions.
 61 |  * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture
 62 |  *   reference.  Only one TexRefInputIterator instance can be bound at any given time for a
 63 |  *   specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host
 64 |  *   thread, and (4) compilation .o unit.
 65 |  * - With regard to nested/dynamic parallelism, TexRefInputIterator iterators may only be
 66 |  *   created by the host thread and used by a top-level kernel (i.e. the one which is launched
 67 |  *   from the host).
 68 |  * - Compatible with Thrust API v1.7 or newer.
 69 |  *
 70 |  * \par Snippet
 71 |  * The code snippet below illustrates the use of \p TexRefInputIterator to
 72 |  * dereference a device array of doubles through texture cache.
 73 |  * \par
 74 |  * \code
 75 |  * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_ref_input_iterator.cuh>
 76 |  *
 77 |  * // Declare, allocate, and initialize a device array
 78 |  * int num_items;   // e.g., 7
 79 |  * double *d_in;    // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
 80 |  *
 81 |  * // Create an iterator wrapper
 82 |  * cub::TexRefInputIterator<double, __LINE__> itr;
 83 |  * itr.BindTexture(d_in, sizeof(double) * num_items);
 84 |  * ...
 85 |  *
 86 |  * // Within device code:
 87 |  * printf("%f\n", itr[0]);      // 8.0
 88 |  * printf("%f\n", itr[1]);      // 6.0
 89 |  * printf("%f\n", itr[6]);      // 9.0
 90 |  *
 91 |  * ...
 92 |  * itr.UnbindTexture();
 93 |  *
 94 |  * \endcode
 95 |  *
 96 |  * \tparam T                    The value type of this iterator
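 97 |  * \tparam UNIQUE_ID            A globally-unique identifier (within the compilation unit) to name the underlying texture reference. Retained for source compatibility only: the alias declaration leaves this parameter unnamed and forwards just \p T and \p OffsetT to cub::TexObjInputIterator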
 98 |  * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
 99 |  */
100 | template <
101 |     typename    T,
102 |     int         /*UNIQUE_ID*/,                  // unnamed: accepted so existing instantiations still compile, but not forwarded
103 |     typename    OffsetT = std::ptrdiff_t>
104 | using TexRefInputIterator CUB_DEPRECATED = cub::TexObjInputIterator<T, OffsetT>;   // deprecated alias of the texture-object iterator
105 | 
106 | /** @} */       // end group UtilIterator
107 | 
108 | CUB_NAMESPACE_END
109 | 


--------------------------------------------------------------------------------
/cub/thread/thread_search.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  4 |  * 
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  * 
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /**
 30 |  * \file
 31 |  * Thread utilities for sequential search
 32 |  */
 33 | 
 34 | #pragma once
 35 | 
 36 | #include <iterator>
 37 | #include <cub/util_namespace.cuh>
 38 | #include <cub/util_type.cuh>
 39 | #include <cub/config.cuh>
 40 | 
 41 | #include <nv/target>
 42 | 
 43 | CUB_NAMESPACE_BEGIN
 44 | 
 45 | 
 46 | /**
 47 |  * Computes the begin offsets into A and B for the specific diagonal (binary search along the diagonal; on return path_coordinate.x + path_coordinate.y == diagonal)
 48 |  */
 49 | template <
 50 |     typename AIteratorT,
 51 |     typename BIteratorT,
 52 |     typename OffsetT,
 53 |     typename CoordinateT>
 54 | __host__ __device__ __forceinline__ void MergePathSearch(
 55 |     OffsetT         diagonal,           ///< [in] Diagonal to search (total number of items preceding the split)
 56 |     AIteratorT      a,                  ///< [in] Sequence A, indexed with operator[]
 57 |     BIteratorT      b,                  ///< [in] Sequence B, indexed with operator[]
 58 |     OffsetT         a_len,              ///< [in] Length of A
 59 |     OffsetT         b_len,              ///< [in] Length of B
 60 |     CoordinateT&    path_coordinate)    ///< [out] .x = begin offset into A, .y = begin offset into B
 61 | {
 62 |     // Bounds on the number of A items before the diagonal: at least
 63 |     // (diagonal - b_len) clamped to zero, at most min(diagonal, a_len).
 64 | 
 65 |     OffsetT split_min = CUB_MAX(diagonal - b_len, 0);
 66 |     OffsetT split_max = CUB_MIN(diagonal, a_len);
 67 | 
 68 |     while (split_min < split_max)
 69 |     {
 70 |         OffsetT split_pivot = split_min + ((split_max - split_min) >> 1);  // midpoint computed without overflowing OffsetT
 71 |         if (a[split_pivot] <= b[diagonal - split_pivot - 1])
 72 |         {
 73 |             // Move candidate split range up A, down B
 74 |             split_min = split_pivot + 1;
 75 |         }
 76 |         else
 77 |         {
 78 |             // Move candidate split range up B, down A
 79 |             split_max = split_pivot;
 80 |         }
 81 |     }
 82 | 
 83 |     path_coordinate.x = CUB_MIN(split_min, a_len);
 84 |     path_coordinate.y = diagonal - split_min;
 85 | }
 86 | 
 87 | 
 88 | 
 89 | /**
 90 |  * \brief Binary search for the offset of the first value within \p input that is not ordered before \p val (i.e., the first value for which <tt>value < val</tt> is false)
 91 |  */
 92 | template <
 93 |     typename InputIteratorT,
 94 |     typename OffsetT,
 95 |     typename T>
 96 | __device__ __forceinline__ OffsetT LowerBound(
 97 |     InputIteratorT      input,              ///< [in] Sequence to search
 98 |     OffsetT             num_items,          ///< [in] Number of items in \p input
 99 |     T                   val)                ///< [in] Key to locate
100 | {
101 |     OffsetT first = 0;
102 |     while (num_items > 0)
103 |     {
104 |         const OffsetT step = num_items >> 1;
105 |         if (!(input[first + step] < val))
106 |         {
107 |             num_items = step;                       // answer lies at or before the probed item
108 |         }
109 |         else
110 |         {
111 |             first     = first + (step + 1);         // answer lies strictly after the probed item
112 |             num_items = num_items - (step + 1);
113 |         }
114 |     }
115 | 
116 |     return first;
117 | }
118 | 
119 | 
120 | /**
121 |  * \brief Binary search for the offset of the first value within \p input that \p val is ordered before (i.e., the first value for which <tt>val < value</tt> is true)
122 |  */
123 | template <
124 |     typename InputIteratorT,
125 |     typename OffsetT,
126 |     typename T>
127 | __device__ __forceinline__ OffsetT UpperBound(
128 |     InputIteratorT      input,              ///< [in] Sequence to search
129 |     OffsetT             num_items,          ///< [in] Number of items in \p input
130 |     T                   val)                ///< [in] Key to locate
131 | {
132 |     OffsetT first = 0;
133 |     while (num_items > 0)
134 |     {
135 |         const OffsetT step = num_items >> 1;
136 |         if (!(val < input[first + step]))
137 |         {
138 |             first     = first + (step + 1);         // probed item is not greater: continue to its right
139 |             num_items = num_items - (step + 1);
140 |         }
141 |         else
142 |         {
143 |             num_items = step;                       // probed item is greater: keep it as a candidate
144 |         }
145 |     }
146 | 
147 |     return first;
148 | }
149 | 
150 | 
151 | #if defined(__CUDA_FP16_TYPES_EXIST__)   // __half overload of UpperBound; compiled only when this macro is defined
152 | template <
153 |     typename InputIteratorT,
154 |     typename OffsetT>
155 | __device__ __forceinline__ OffsetT UpperBound(
156 |     InputIteratorT      input,              ///< [in] Input sequence
157 |     OffsetT             num_items,          ///< [in] Input sequence length
158 |     __half              val)                ///< [in] Search key
159 | {
160 |     OffsetT retval = 0;
161 |     while (num_items > 0)
162 |     {
163 |         OffsetT half = num_items >> 1;
164 |         // Evaluate (val < input[retval + half]): directly on __half in the NV_PROVIDES_SM_53 branch, otherwise on floats via __half2float
165 |         bool lt;
166 |         NV_IF_TARGET(NV_PROVIDES_SM_53,
167 |                      (lt = val < input[retval + half];),
168 |                      (lt = __half2float(val) < __half2float(input[retval + half]);));
169 |         // Same bisection step as the generic UpperBound overload
170 |         if (lt)
171 |         {
172 |             num_items = half;
173 |         }
174 |         else
175 |         {
176 |             retval = retval + (half + 1);
177 |             num_items = num_items - (half + 1);
178 |         }
179 |     }
180 | 
181 |     return retval;
182 | }
183 | #endif
184 | 
185 | CUB_NAMESPACE_END
186 | 


--------------------------------------------------------------------------------
/cub/thread/thread_sort.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | #pragma once
 29 | 
 30 | #include "../config.cuh"
 31 | #include "../util_ptx.cuh"
 32 | #include "../util_type.cuh"
 33 | 
 34 | CUB_NAMESPACE_BEGIN
 35 | 
 36 | 
 37 | template <typename T>
 38 | __device__ __forceinline__ void Swap(T &lhs, T &rhs)   // exchanges the two values through one temporary copy
 39 | {
 40 |   T saved_lhs(lhs);
 41 |   lhs = rhs;
 42 |   rhs = saved_lhs;
 43 | }
 44 | 
 45 | 
 46 | /**
 47 |  * @brief Sorts data using odd-even sort method
 48 |  *
 49 |  * The sorting method is stable. Further details can be found in:
 50 |  * A. Nico Habermann. Parallel neighbor sort (or the glory of the induction
 51 |  * principle). Technical Report AD-759 248, Carnegie Mellon University, 1972.
 52 |  *
 53 |  * @tparam KeyT
 54 |  *   Key type
 55 |  *
 56 |  * @tparam ValueT
 57 |  *   Value type. If `cub::NullType` is used as `ValueT`, only keys are sorted.
 58 |  *
 59 |  * @tparam CompareOp
 60 |  *   functor type having member `bool operator()(KeyT lhs, KeyT rhs)`
 61 |  *
 62 |  * @tparam ITEMS_PER_THREAD
 63 |  *   The number of items per thread (length of both `keys` and `items`)
 64 |  *
 65 |  * @param[in,out] keys
 66 |  *   Keys to sort (sorted in place)
 67 |  *
 68 |  * @param[in,out] items
 69 |  *   Values to sort; permuted alongside the keys unless `ValueT` is `cub::NullType`
 70 |  *
 71 |  * @param[in] compare_op
 72 |  *   Comparison function object which returns true if the first argument is
 73 |  *   ordered before the second
 74 |  */
 75 | template <typename KeyT,
 76 |           typename ValueT,
 77 |           typename CompareOp,
 78 |           int ITEMS_PER_THREAD>
 79 | __device__ __forceinline__ void
 80 | StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD],
 81 |                   ValueT (&items)[ITEMS_PER_THREAD],
 82 |                   CompareOp compare_op)
 83 | {
 84 |   constexpr bool KEYS_ONLY = std::is_same<ValueT, NullType>::value; // NOTE(review): needs <type_traits>; presumably included transitively — confirm
 85 | 
 86 |   #pragma unroll
 87 |   for (int i = 0; i < ITEMS_PER_THREAD; ++i) // ITEMS_PER_THREAD passes over the array
 88 |   {
 89 |   #pragma unroll
 90 |     for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) // even passes start at j = 0, odd passes at j = 1
 91 |     {
 92 |       if (compare_op(keys[j + 1], keys[j])) // swap only when the right neighbour is ordered before the left one
 93 |       {
 94 |         Swap(keys[j], keys[j + 1]);
 95 |         if (!KEYS_ONLY) // values follow their keys; skipped for cub::NullType
 96 |         {
 97 |           Swap(items[j], items[j + 1]);
 98 |         }
 99 |       }
100 |     } // inner loop
101 |   }   // outer loop
102 | }
103 | 
104 | 
105 | CUB_NAMESPACE_END
106 | 


--------------------------------------------------------------------------------
/cub/util_arch.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  4 |  *
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  *
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /**
 30 |  * \file
 31 |  * Static architectural properties by SM version.
 32 |  */
 33 | 
 34 | #pragma once
 35 | 
 36 | #include <cub/util_cpp_dialect.cuh>
 37 | #include <cub/util_namespace.cuh>
 38 | #include <cub/util_macro.cuh>
 39 | 
 40 | // Legacy include; this functionality used to be defined in here.
 41 | #include <cub/detail/detect_cuda_runtime.cuh>
 42 | 
 43 | CUB_NAMESPACE_BEGIN
 44 | 
 45 | #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
 46 | 
 47 | // \deprecated [Since 2.1.0] Applies to CUB_USE_COOPERATIVE_GROUPS, which is defined unconditionally on the next line.
 48 | #define CUB_USE_COOPERATIVE_GROUPS
 49 | 
 50 | /// In device code, CUB_PTX_ARCH expands to the PTX version for which we are
 51 | /// compiling. In host code, CUB_PTX_ARCH's value is implementation defined.
 52 | #ifndef CUB_PTX_ARCH // an existing definition of CUB_PTX_ARCH takes precedence
 53 |     #if defined(_NVHPC_CUDA)
 54 |         // __NVCOMPILER_CUDA_ARCH__ is the target PTX version, and is defined
 55 |         // when compiling both host code and device code. Currently, only one
 56 |         // PTX version can be targeted.
 57 |         #define CUB_PTX_ARCH __NVCOMPILER_CUDA_ARCH__
 58 |     #elif !defined(__CUDA_ARCH__) // no __CUDA_ARCH__ available: CUB_PTX_ARCH becomes 0
 59 |         #define CUB_PTX_ARCH 0
 60 |     #else // otherwise forward the compiler-provided __CUDA_ARCH__
 61 |         #define CUB_PTX_ARCH __CUDA_ARCH__
 62 |     #endif
 63 | #endif
 64 | 
 65 | // These definitions were intended for internal use only and are now obsolete.
 66 | // If you relied on them, consider porting your code to use the functionality
 67 | // in libcu++'s <nv/target> header.
 68 | // For a temporary workaround, define CUB_PROVIDE_LEGACY_ARCH_MACROS to make
 69 | // them available again. These should be considered deprecated and will be
 70 | // fully removed in a future version.
 71 | #ifdef CUB_PROVIDE_LEGACY_ARCH_MACROS
 72 |     #ifndef CUB_IS_DEVICE_CODE // skip the whole group if CUB_IS_DEVICE_CODE is already defined
 73 |         #if defined(_NVHPC_CUDA) // NVHPC: host/device is queried via __builtin_is_device_code(); both INCLUDE flags are 1
 74 |             #define CUB_IS_DEVICE_CODE __builtin_is_device_code()
 75 |             #define CUB_IS_HOST_CODE (!__builtin_is_device_code())
 76 |             #define CUB_INCLUDE_DEVICE_CODE 1
 77 |             #define CUB_INCLUDE_HOST_CODE 1
 78 |         #elif CUB_PTX_ARCH > 0 // non-zero CUB_PTX_ARCH: device flags are 1, host flags are 0
 79 |             #define CUB_IS_DEVICE_CODE 1
 80 |             #define CUB_IS_HOST_CODE 0
 81 |             #define CUB_INCLUDE_DEVICE_CODE 1
 82 |             #define CUB_INCLUDE_HOST_CODE 0
 83 |         #else // CUB_PTX_ARCH is 0: host flags are 1, device flags are 0
 84 |             #define CUB_IS_DEVICE_CODE 0
 85 |             #define CUB_IS_HOST_CODE 1
 86 |             #define CUB_INCLUDE_DEVICE_CODE 0
 87 |             #define CUB_INCLUDE_HOST_CODE 1
 88 |         #endif
 89 |     #endif
 90 | #endif // CUB_PROVIDE_LEGACY_ARCH_MACROS
 91 | 
 92 | /// Maximum number of devices supported (overridable by defining CUB_MAX_DEVICES beforehand).
 93 | #ifndef CUB_MAX_DEVICES
 94 |     #define CUB_MAX_DEVICES (128)
 95 | #endif
 96 | 
 97 | static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0.");
 98 | 
 99 | 
100 | /// Number of threads per warp (log2 and absolute value); the macro argument is ignored
101 | #ifndef CUB_LOG_WARP_THREADS
102 |     #define CUB_LOG_WARP_THREADS(unused) (5)
103 |     #define CUB_WARP_THREADS(unused) (1 << CUB_LOG_WARP_THREADS(0))
104 |     // Object-like spellings that expand to the function-like macros above
105 |     #define CUB_PTX_WARP_THREADS        CUB_WARP_THREADS(0)
106 |     #define CUB_PTX_LOG_WARP_THREADS    CUB_LOG_WARP_THREADS(0)
107 | #endif
108 | 
109 | 
110 | /// Number of smem banks (log2 and absolute value); the macro argument is ignored
111 | #ifndef CUB_LOG_SMEM_BANKS
112 |     #define CUB_LOG_SMEM_BANKS(unused) (5)
113 |     #define CUB_SMEM_BANKS(unused) (1 << CUB_LOG_SMEM_BANKS(0))
114 |     // Object-like spellings: each one must invoke the function-like macro it forwards to
115 |     #define CUB_PTX_LOG_SMEM_BANKS      CUB_LOG_SMEM_BANKS(0)
116 |     #define CUB_PTX_SMEM_BANKS          CUB_SMEM_BANKS(0)
117 | #endif
118 | 
119 | 
120 | /// Oversubscription factor; the macro argument is ignored
121 | #ifndef CUB_SUBSCRIPTION_FACTOR
122 |     #define CUB_SUBSCRIPTION_FACTOR(unused) (5)
123 |     #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(0)
124 | #endif
125 | 
126 | 
127 | /// Prefer padding overhead vs X-way conflicts greater than this threshold; the macro argument is ignored
128 | #ifndef CUB_PREFER_CONFLICT_OVER_PADDING
129 |     #define CUB_PREFER_CONFLICT_OVER_PADDING(unused) (1)
130 |     #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(0)
131 | #endif
132 | 
133 | 
134 | template <
135 |     int NOMINAL_4B_BLOCK_THREADS,       // block size tuned for 4-byte items
136 |     int NOMINAL_4B_ITEMS_PER_THREAD,    // items per thread tuned for 4-byte items
137 |     typename T>                         // actual item type; only sizeof(T) is used
138 | struct RegBoundScaling                  // rescales the nominal 4-byte tuning to sizeof(T)
139 | {
140 |     enum {
141 |         ITEMS_PER_THREAD    = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))), // shrinks for T wider than 4 bytes, never grows for narrower T, at least 1
142 |         BLOCK_THREADS       = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), // capped by how many threads' items fit in 1024 * 48 bytes, rounded up to a multiple of 32 (presumably a 48 KiB shared-memory budget and the warp size — confirm)
143 |     };
144 | };
145 | 
146 | 
147 | template <
148 |     int NOMINAL_4B_BLOCK_THREADS,       // block size tuned for 4-byte items
149 |     int NOMINAL_4B_ITEMS_PER_THREAD,    // items per thread tuned for 4-byte items
150 |     typename T>                         // actual item type; only sizeof(T) is used
151 | struct MemBoundScaling                  // rescales the nominal 4-byte tuning to sizeof(T)
152 | {
153 |     enum {
154 |         ITEMS_PER_THREAD    = CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)), // inversely proportional to sizeof(T), at most twice the nominal value, at least 1
155 |         BLOCK_THREADS       = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), // capped by how many threads' items fit in 1024 * 48 bytes, rounded up to a multiple of 32 (presumably a 48 KiB shared-memory budget and the warp size — confirm)
156 |     };
157 | };
158 | 
159 | 
160 | 
161 | 
162 | #endif  // Do not document
163 | 
164 | CUB_NAMESPACE_END
165 | 


--------------------------------------------------------------------------------
/cub/util_compiler.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 |  *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | /**
29 |  * \file
30 |  * Detect compiler information.
31 |  */
32 | 
33 | #pragma once
34 | 
35 | // enumerate host compilers we know about (possible values of CUB_HOST_COMPILER)
36 | #define CUB_HOST_COMPILER_UNKNOWN 0
37 | #define CUB_HOST_COMPILER_MSVC 1
38 | #define CUB_HOST_COMPILER_GCC 2
39 | #define CUB_HOST_COMPILER_CLANG 3
40 | 
41 | // enumerate device compilers we know about (possible values of CUB_DEVICE_COMPILER)
42 | #define CUB_DEVICE_COMPILER_UNKNOWN 0
43 | #define CUB_DEVICE_COMPILER_MSVC 1
44 | #define CUB_DEVICE_COMPILER_GCC 2
45 | #define CUB_DEVICE_COMPILER_NVCC 3
46 | #define CUB_DEVICE_COMPILER_CLANG 4
47 | 
48 | // figure out which host compiler we're using (first match wins: _MSC_VER, then __clang__, then __GNUC__)
49 | #if defined(_MSC_VER)
50 | #  define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC
51 | #  define CUB_MSVC_VERSION _MSC_VER
52 | #  define CUB_MSVC_VERSION_FULL _MSC_FULL_VER
53 | #elif defined(__clang__) // tested before __GNUC__; clang typically defines __GNUC__ as well
54 | #  define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG
55 | #  define CUB_CLANG_VERSION                                                    \
56 |     (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
57 | #elif defined(__GNUC__) // version encoded as major * 10000 + minor * 100 + patch, like CUB_CLANG_VERSION
58 | #  define CUB_HOST_COMPILER CUB_HOST_COMPILER_GCC
59 | #  define CUB_GCC_VERSION                                                      \
60 |     (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
61 | #else
62 | #  define CUB_HOST_COMPILER CUB_HOST_COMPILER_UNKNOWN
63 | #endif // CUB_HOST_COMPILER
64 | 
65 | // figure out which device compiler we're using (CUDA compilers first, otherwise mirror the host compiler)
66 | #if defined(__CUDACC__) || defined(_NVHPC_CUDA)
67 | #  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC
68 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
69 | #  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC
70 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC
71 | #  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_GCC
72 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG
73 | // CUDA-capable clang should behave similar to NVCC, so it is reported as NVCC when __CUDA__ is defined.
74 | #  if defined(__CUDA__)
75 | #    define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC
76 | #  else
77 | #    define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_CLANG
78 | #  endif
79 | #else
80 | #  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_UNKNOWN
81 | #endif
82 | 


--------------------------------------------------------------------------------
/cub/util_deprecated.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 |  *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | /**
29 |  * \file
30 |  * Define CUB_DEPRECATED macro.
31 |  */
32 | 
33 | #pragma once
34 | 
35 | 
36 | #include <cub/detail/type_traits.cuh>
37 | #include <cub/util_compiler.cuh>
38 | #include <cub/util_cpp_dialect.cuh>
39 | #include <cub/util_debug.cuh>
40 | 
41 | 
42 | #if defined(THRUST_IGNORE_DEPRECATED_API) && !defined(CUB_IGNORE_DEPRECATED_API)
43 | #  define CUB_IGNORE_DEPRECATED_API
44 | #endif
45 | 
46 | #ifdef CUB_IGNORE_DEPRECATED_API
47 | #  define CUB_DEPRECATED
48 | #  define CUB_DEPRECATED_BECAUSE(MSG)
49 | #elif CUB_CPP_DIALECT >= 2014
50 | #  define CUB_DEPRECATED [[deprecated]]
51 | #  define CUB_DEPRECATED_BECAUSE(MSG) [[deprecated(MSG)]]
52 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
53 | #  define CUB_DEPRECATED __declspec(deprecated)
54 | #  define CUB_DEPRECATED_BECAUSE(MSG) __declspec(deprecated(MSG))
55 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG
56 | #  define CUB_DEPRECATED __attribute__((deprecated))
57 | #  define CUB_DEPRECATED_BECAUSE(MSG) __attribute__((deprecated(MSG)))
58 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC
59 | #  define CUB_DEPRECATED __attribute__((deprecated))
60 | #  define CUB_DEPRECATED_BECAUSE(MSG) __attribute__((deprecated(MSG)))
61 | #else
62 | #  define CUB_DEPRECATED
63 | #  define CUB_DEPRECATED_BECAUSE(MSG)
64 | #endif
65 | // Deprecation marker whose message says the `debug_synchronous` parameter is no longer accepted.
66 | #define CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED                         \
67 |   CUB_DEPRECATED_BECAUSE(                                                      \
68 |     "CUB no longer accepts `debug_synchronous` parameter. "                    \
69 |     "Define CUB_DEBUG_SYNC instead, or silence this message with "             \
70 |     "CUB_IGNORE_DEPRECATED_API.")
71 | // Expands to a complete `if` block: logs the notice via _CubLog when `debug_synchronous` (read from the expansion scope) is true.
72 | #define CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG                                \
73 |   if (debug_synchronous)                                                       \
74 |   {                                                                            \
75 |     _CubLog("%s\n",                                                            \
76 |             "CUB no longer accepts `debug_synchronous` parameter. "            \
77 |             "Define CUB_DEBUG_SYNC instead.");                                 \
78 |   }
79 | 
80 | 


--------------------------------------------------------------------------------
/cub/util_macro.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  4 |  * 
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  * 
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /******************************************************************************
 30 |  * Common C/C++ macro utilities
 31 |  ******************************************************************************/
 32 | 
 33 | #pragma once
 34 | 
 35 | #include <cuda/std/utility>
 36 | 
 37 | #include "util_namespace.cuh"
 38 | 
 39 | CUB_NAMESPACE_BEGIN
 40 | 
 41 | 
 42 | /**
 43 |  * \addtogroup UtilModule
 44 |  * @{
 45 |  */
 46 | 
 47 | #ifndef CUB_ALIGN
 48 |     #if defined(_WIN32) || defined(_WIN64)
 49 |         /// Align struct to \p bytes (MSVC spelling; honours the argument instead of a fixed 32)
 50 |         #define CUB_ALIGN(bytes) __declspec(align(bytes))
 51 |     #else
 52 |         /// Align struct to \p bytes (GCC/Clang attribute spelling)
 53 |         #define CUB_ALIGN(bytes) __attribute__((aligned(bytes)))
 54 |     #endif
 55 | #endif
 56 | 
 57 | #define CUB_PREVENT_MACRO_SUBSTITUTION
 58 | /// Returns the forwarded \p t when `t < u`, otherwise the forwarded \p u; the empty macro after the name keeps a function-like `min` macro from expanding.
 59 | template <typename T, typename U>
 60 | constexpr __host__ __device__ auto min CUB_PREVENT_MACRO_SUBSTITUTION(T &&t,
 61 |                                                                       U &&u)
 62 |   -> decltype(t < u ? ::cuda::std::forward<T>(t) : ::cuda::std::forward<U>(u))
 63 | {
 64 |   return t < u ? ::cuda::std::forward<T>(t) : ::cuda::std::forward<U>(u);
 65 | }
 66 | /// Returns the forwarded \p u when `t < u`, otherwise the forwarded \p t; the empty macro after the name keeps a function-like `max` macro from expanding.
 67 | template <typename T, typename U>
 68 | constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t,
 69 |                                                                       U &&u)
 70 |   -> decltype(t < u ? ::cuda::std::forward<U>(u) : ::cuda::std::forward<T>(t))
 71 | {
 72 |   return t < u ? ::cuda::std::forward<U>(u) : ::cuda::std::forward<T>(t);
 73 | }
 74 | 
 75 | #ifndef CUB_MAX
 76 |     /// Select maximum(a, b); each argument appears twice in the expansion, so avoid side effects
 77 |     #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a))
 78 | #endif
 79 | 
 80 | #ifndef CUB_MIN
 81 |     /// Select minimum(a, b); each argument appears twice in the expansion, so avoid side effects
 82 |     #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a))
 83 | #endif
 84 | 
 85 | #ifndef CUB_QUOTIENT_FLOOR
 86 |     /// Quotient of x/y using plain `/` (rounds down for non-negative integer operands)
 87 |     #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y))
 88 | #endif
 89 | 
 90 | #ifndef CUB_QUOTIENT_CEILING
 91 |     /// Quotient of x/y rounded up to nearest integer (positive integer operands; `x + y - 1` can overflow)
 92 |     #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
 93 | #endif
 94 | 
 95 | #ifndef CUB_ROUND_UP_NEAREST
 96 |     /// x rounded up to the nearest multiple of y (every use of y is parenthesized, so expression arguments are safe)
 97 |     #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * (y))
 98 | #endif
 99 | 
100 | #ifndef CUB_ROUND_DOWN_NEAREST
101 |     /// x rounded down to the nearest multiple of y (every use of y is parenthesized, so expression arguments are safe)
102 |     #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * (y))
103 | #endif
104 | 
105 | 
106 | #ifndef CUB_STATIC_ASSERT
107 |     #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document: two-level token pasting so arguments such as __LINE__ expand first
108 |         #define CUB_CAT_(a, b) a ## b
109 |         #define CUB_CAT(a, b) CUB_CAT_(a, b)
110 |     #endif // DOXYGEN_SHOULD_SKIP_THIS
111 | 
112 |     /// Static assert: declares an array typedef whose size is -1 (ill-formed) when \p cond is false; \p msg is not used
113 |     #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1]
114 | #endif
115 | 
116 | /** @} */       // end group UtilModule
117 | 
118 | CUB_NAMESPACE_END
119 | 


--------------------------------------------------------------------------------
/cub/util_math.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /**
 29 |  * \file
 30 |  * Define helper math functions.
 31 |  */
 32 | 
 33 | #pragma once
 34 | 
 35 | #include <type_traits>
 36 | 
 37 | #include "util_namespace.cuh"
 38 | #include "util_macro.cuh"
 39 | 
 40 | CUB_NAMESPACE_BEGIN
 41 | 
 42 | namespace detail
 43 | {
 44 | // integral_constant<bool, ...> that is true when T is an integral type or an enum.
 45 | template <typename T>
 46 | using is_integral_or_enum =
 47 |   std::integral_constant<bool,
 48 |                          std::is_integral<T>::value || std::is_enum<T>::value>;
 49 | // Returns shmem_per_block * num_blocks when shmem_per_block exceeds max_shmem, otherwise 0. NOTE(review): presumably sizes a fallback buffer -- confirm with callers.
 50 | __host__ __device__ __forceinline__ constexpr  std::size_t
 51 | VshmemSize(std::size_t max_shmem,
 52 |            std::size_t shmem_per_block,
 53 |            std::size_t num_blocks)
 54 | {
 55 |   return shmem_per_block > max_shmem ? shmem_per_block * num_blocks : 0;
 56 | }
 57 | 
 58 | } // namespace detail
 59 | 
 60 | /**
 61 |  * Integer ceiling division: the quotient `n / d`, plus one whenever the
 62 |  * division leaves a remainder.
 63 |  *
 64 |  * Effectively `(n + d - 1) / d`, but the sum `(n + d - 1)` is never formed, so it cannot overflow.
 65 |  */
 66 | template <typename NumeratorT, typename DenominatorT>
 67 | __host__ __device__ __forceinline__ constexpr NumeratorT
 68 | DivideAndRoundUp(NumeratorT n, DenominatorT d)
 69 | {
 70 |   static_assert(cub::detail::is_integral_or_enum<DenominatorT>::value &&
 71 |                 cub::detail::is_integral_or_enum<NumeratorT>::value,
 72 |                 "DivideAndRoundUp is only intended for integral types.");
 73 | 
 74 |   // The arithmetic runs in the promoted type; cast back to NumeratorT.
 75 |   return static_cast<NumeratorT>((n % d == 0 ? 0 : 1) + n / d);
 76 | }
 77 | /// Returns `nominal_4b_items_per_thread * 8 / combined_bytes`, raised to at least 1 and then capped at the nominal count. NOTE(review): `combined_bytes` is presumably key + value size -- confirm.
 78 | constexpr __device__ __host__ int
 79 | Nominal4BItemsToItemsCombined(int nominal_4b_items_per_thread, int combined_bytes)
 80 | {
 81 |   return (cub::min)(nominal_4b_items_per_thread,
 82 |                     (cub::max)(1,
 83 |                                nominal_4b_items_per_thread * 8 /
 84 |                                combined_bytes));
 85 | }
 86 | /// Returns `nominal_4b_items_per_thread * 4 / sizeof(T)`, raised to at least 1 and then capped at the nominal count.
 87 | template <typename T>
 88 | constexpr __device__ __host__ int
 89 | Nominal4BItemsToItems(int nominal_4b_items_per_thread)
 90 | {
 91 |   return (cub::min)(nominal_4b_items_per_thread,
 92 |                     (cub::max)(1,
 93 |                                nominal_4b_items_per_thread * 4 /
 94 |                                  static_cast<int>(sizeof(T))));
 95 | }
 96 | /// Returns the nominal count unchanged for items of at most 8 bytes; otherwise `nominal * 8 / sizeof(ItemT)` rounded up, raised to at least 1 and capped at the nominal count.
 97 | template <typename ItemT>
 98 | constexpr __device__ __host__ int
 99 | Nominal8BItemsToItems(int nominal_8b_items_per_thread)
100 | {
101 |   return sizeof(ItemT) <= 8u
102 |            ? nominal_8b_items_per_thread
103 |            : (cub::min)(nominal_8b_items_per_thread,
104 |                         (cub::max)(1,
105 |                                    ((nominal_8b_items_per_thread * 8) +
106 |                                     static_cast<int>(sizeof(ItemT)) - 1) /
107 |                                      static_cast<int>(sizeof(ItemT))));
108 | }
109 | 
110 | /**
111 |  * \brief Computes the midpoint of two integers
112 |  *
113 |  * Evaluated as `begin + (end - begin) / 2`, which avoids the overflow that `(begin + end) / 2` could hit.
114 |  *
115 |  * \return The value halfway between \p begin and \p end
116 |  */
117 | template <typename T>
118 | constexpr __device__ __host__ T MidPoint(T begin, T end)
119 | {
120 |   return begin + (end - begin) / 2;
121 | }
122 | 
123 | CUB_NAMESPACE_END
124 | 


--------------------------------------------------------------------------------
/cub/version.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | /*! \file version.cuh
29 |  *  \brief Compile-time macros encoding CUB release version
30 |  *
31 |  *         <cub/version.cuh> is the only CUB header that is guaranteed to
32 |  *         change with every CUB release.
33 |  *
34 |  */
35 | 
36 | #pragma once
37 | 
38 | /*! \def CUB_VERSION
39 |  *  \brief The preprocessor macro \p CUB_VERSION encodes the version
40 |  *         number of the CUB library.
41 |  *
42 |  *         <tt>CUB_VERSION % 100</tt> is the sub-minor version.
43 |  *         <tt>CUB_VERSION / 100 % 1000</tt> is the minor version.
44 |  *         <tt>CUB_VERSION / 100000</tt> is the major version.
45 |  */
46 | #define CUB_VERSION 200200 // decodes to major 2, minor 2, sub-minor 0
47 | 
48 | /*! \def CUB_MAJOR_VERSION
49 |  *  \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the
50 |  *         major version number of the CUB library.
51 |  */
52 | #define CUB_MAJOR_VERSION     (CUB_VERSION / 100000)
53 | 
54 | /*! \def CUB_MINOR_VERSION
55 |  *  \brief The preprocessor macro \p CUB_MINOR_VERSION encodes the
56 |  *         minor version number of the CUB library.
57 |  */
58 | #define CUB_MINOR_VERSION     (CUB_VERSION / 100 % 1000)
59 | 
60 | /*! \def CUB_SUBMINOR_VERSION
61 |  *  \brief The preprocessor macro \p CUB_SUBMINOR_VERSION encodes the
62 |  *         sub-minor version number of the CUB library.
63 |  */
64 | #define CUB_SUBMINOR_VERSION  (CUB_VERSION % 100)
65 | 
66 | /*! \def CUB_PATCH_NUMBER
67 |  *  \brief The preprocessor macro \p CUB_PATCH_NUMBER encodes the
68 |  *         patch number of the CUB library (defined on its own, not derived from \p CUB_VERSION).
69 |  */
70 | #define CUB_PATCH_NUMBER 0
71 | 


--------------------------------------------------------------------------------
/examples/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Create one `<config_prefix>.examples` meta target per CUB configuration:
 2 | foreach(cub_target IN LISTS CUB_TARGETS)
 3 |   cub_get_target_property(config_prefix ${cub_target} PREFIX)
 4 |   set(config_meta_target ${config_prefix}.examples)
 5 |   add_custom_target(${config_meta_target})
 6 |   add_dependencies(${config_prefix}.all ${config_meta_target}) # `<config_prefix>.all` also builds the examples
 7 | endforeach()
 8 | 
 9 | # Set CMAKE_CUDA_FLAGS to the base flags plus either the RDC or the no-RDC flags.
10 | # See note in CubCudaConfig.cmake -- these flag variables behave unintuitively:
11 | if (CUB_ENABLE_EXAMPLES_WITH_RDC)
12 |   set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_RDC}")
13 | else()
14 |   set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_NO_RDC}")
15 | endif()
16 | 
17 | ## cub_add_example
18 | #
19 | # Add an example executable and register it with ctest.
20 | #
21 | # target_name_var: Variable name to overwrite with the name of the example
22 | #   target. Useful for post-processing target information per-backend.
23 | # example_name: The name of the example minus "<config_prefix>.example." For
24 | #   instance, examples/vector.cu will be "vector", and examples/cuda/copy.cu
25 | #   would be "cuda.copy".
26 | # example_src: The source file that implements the example.
27 | # cub_target: The reference cub target with configuration information.
28 | #
29 | function(cub_add_example target_name_var example_name example_src cub_target)
30 |   cub_get_target_property(config_prefix ${cub_target} PREFIX)
31 | 
32 |   # The actual name of the example's target (also returned to the caller):
33 |   set(example_target ${config_prefix}.example.${example_name})
34 |   set(${target_name_var} ${example_target} PARENT_SCOPE)
35 | 
36 |   # Related target names:
37 |   set(config_meta_target ${config_prefix}.examples)
38 |   set(example_meta_target cub.all.example.${example_name})
39 | 
40 |   add_executable(${example_target} "${example_src}")
41 |   target_link_libraries(${example_target} ${cub_target})
42 |   cub_clone_target_properties(${example_target} ${cub_target})
43 |   target_include_directories(${example_target} PRIVATE "${CUB_SOURCE_DIR}/examples")
44 | 
45 |   if (CUB_IN_THRUST)
46 |     thrust_fix_clang_nvcc_build_for(${example_target})
47 |   endif()
48 | 
49 |   # Add to the active configuration's meta target
50 |   add_dependencies(${config_meta_target} ${example_target})
51 | 
52 |   # Meta target that builds examples with this name for all configurations
53 |   # (created on first use, shared by later configurations):
54 |   if (NOT TARGET ${example_meta_target})
55 |     add_custom_target(${example_meta_target})
56 |   endif()
57 |   add_dependencies(${example_meta_target} ${example_target})
58 |   if (CUB_ENABLE_EXAMPLES_WITH_RDC)
59 |     cub_enable_rdc_for_cuda_target(${example_target})
60 |   endif()
61 | 
62 |   add_test(NAME ${example_target}
63 |     COMMAND "$<TARGET_FILE:${example_target}>"
64 |   )
65 | endfunction()
66 | 
67 | add_subdirectory(cmake)  # examples/cmake
68 | add_subdirectory(block)  # examples/block
69 | add_subdirectory(device) # examples/device
70 | 


--------------------------------------------------------------------------------
/examples/block/.gitignore:
--------------------------------------------------------------------------------
1 | /bin
2 | /Debug
3 | /Release
4 | /cuda55.sdf
5 | /cuda55.suo
6 | /cuda60.sdf
7 | /cuda60.suo
8 | 


--------------------------------------------------------------------------------
/examples/block/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | file(GLOB_RECURSE example_srcs # every example_*.cu below this directory, as relative paths
 2 |   RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
 3 |   CONFIGURE_DEPENDS
 4 |   example_*.cu
 5 | )
 6 | 
 7 | foreach (cub_target IN LISTS CUB_TARGETS) # one example target per source per CUB configuration
 8 |   foreach (example_src IN LISTS example_srcs)
 9 |     get_filename_component(example_name "${example_src}" NAME_WE)
10 |     string(REGEX REPLACE # e.g. "example_block_foo" -> "block.foo"
11 |       "^example_block_" "block."
12 |       example_name "${example_name}"
13 |     )
14 |     cub_add_example(target_name ${example_name} "${example_src}" ${cub_target})
15 |   endforeach()
16 | endforeach()
17 | 


--------------------------------------------------------------------------------
/examples/cmake/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_test( # Runs a CMake configure of the add_subdir example project
 2 |   NAME cub.example.cmake.add_subdir
 3 |   COMMAND "${CMAKE_COMMAND}"
 4 |     --log-level=VERBOSE
 5 |     -G "${CMAKE_GENERATOR}"
 6 |     -S "${CMAKE_CURRENT_SOURCE_DIR}/add_subdir"
 7 |     -B "${CMAKE_CURRENT_BINARY_DIR}/add_subdir"
 8 |     -D "CUB_ROOT=${CUB_SOURCE_DIR}" # the example passes this path to add_subdirectory()
 9 |     -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" # reuse this build's CUDA compiler
10 |     -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" # reuse this build's build type
11 | )
12 | 


--------------------------------------------------------------------------------
/examples/cmake/add_subdir/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # This example demonstrates / tests adding CUB via a CMake add_subdirectory
 2 | # call from a parent project.
 3 | 
 4 | cmake_minimum_required(VERSION 3.15)
 5 | 
 6 | # Silence warnings about empty CUDA_ARCHITECTURES properties on example targets (policy CMP0104 is only set on CMake 3.18 and newer):
 7 | if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
 8 |   cmake_policy(SET CMP0104 OLD)
 9 | endif()
10 | 
11 | project(CubAddSubDirExample CUDA)
12 | 
13 | # Use your project's checkout of CUB here, for most cases
14 | # `add_subdirectory(cub)` will be sufficient.
15 | add_subdirectory("${CUB_ROOT}" cub)
16 | 
17 | # Link the CUB::CUB target to your project's targets
18 | add_executable(HelloCUB dummy.cu)
19 | target_link_libraries(HelloCUB CUB::CUB)
20 | 
21 | #
22 | # Validation
23 | #
24 | # Stop configuration with a FATAL_ERROR unless `target_name` names an existing target.
25 | function(assert_target target_name)
26 |   if (NOT TARGET "${target_name}")
27 |     message(FATAL_ERROR "Target '${target_name}' not defined.")
28 |   endif()
29 | endfunction()
30 | 
31 | assert_target(CUB::CUB)
32 | assert_target(HelloCUB)
33 | 


--------------------------------------------------------------------------------
/examples/cmake/add_subdir/dummy.cu:
--------------------------------------------------------------------------------
1 | #include <cub/config.cuh>
2 | 
3 | #include <iostream>
4 | // Prints the value of the CUB_VERSION macro to standard output.
5 | int main()
6 | {
7 |   std::cout << "Hello from CUB version " << CUB_VERSION << ":\n";
8 | }
9 | 


--------------------------------------------------------------------------------
/examples/device/.gitignore:
--------------------------------------------------------------------------------
1 | /bin
2 | /Debug
3 | /ipch
4 | /Release
5 | /cuda55.sdf
6 | /cuda55.suo
7 | /cuda60.sdf
8 | /cuda60.suo
9 | 


--------------------------------------------------------------------------------
/examples/device/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | file(GLOB_RECURSE example_srcs # every example_*.cu below this directory, as relative paths
 2 |   RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
 3 |   CONFIGURE_DEPENDS
 4 |   example_*.cu
 5 | )
 6 | 
 7 | foreach (cub_target IN LISTS CUB_TARGETS) # one example target per source per CUB configuration
 8 |   foreach (example_src IN LISTS example_srcs)
 9 |     get_filename_component(example_name "${example_src}" NAME_WE)
10 |     string(REGEX REPLACE # e.g. "example_device_reduce" -> "device.reduce"
11 |       "^example_device_" "device."
12 |       example_name "${example_name}"
13 |     )
14 |     cub_add_example(target_name ${example_name} "${example_src}" ${cub_target})
15 |   endforeach()
16 | endforeach()
17 | 


--------------------------------------------------------------------------------
/examples/device/example_device_reduce.cu:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  4 |  *
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  *
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /******************************************************************************
 30 |  * Simple example of DeviceReduce::Sum().
 31 |  *
 32 |  * Sums an array of int keys.
 33 |  *
 34 |  * To compile using the command line:
 35 |  *   nvcc -arch=sm_XX example_device_reduce.cu -I../.. -lcudart -O3
 36 |  *
 37 |  ******************************************************************************/
 38 | 
 39 | // Ensure printing of CUDA runtime errors to console
 40 | #define CUB_STDERR
 41 | 
 42 | #include <stdio.h>
 43 | 
 44 | #include <cub/util_allocator.cuh>
 45 | #include <cub/device/device_reduce.cuh>
 46 | 
 47 | #include "../../test/test_util.h"
 48 | 
 49 | using namespace cub;
 50 | 
 51 | 
 52 | //---------------------------------------------------------------------
 53 | // Globals, constants and typedefs
 54 | //---------------------------------------------------------------------
 55 | 
 56 | bool                    g_verbose = false;  // Whether to display input/output to console
 57 | CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
 58 | 
 59 | 
 60 | //---------------------------------------------------------------------
 61 | // Test generation
 62 | //---------------------------------------------------------------------
 63 | 
 64 | /**
 65 |  * Fill the host input with 0, 1, ..., num_items - 1 (echoed when verbose)
 66 |  */
 67 | void Initialize(
 68 |     int   *h_in,
 69 |     int     num_items)
 70 | {
 71 |     int idx = 0;
 72 |     while (idx < num_items) { h_in[idx] = idx; ++idx; }
 73 | 
 74 |     if (!g_verbose)
 75 |         return;
 76 | 
 77 |     printf("Input:\n");
 78 |     DisplayResults(h_in, num_items);
 79 |     printf("\n\n");
 80 | }
 81 | 
 82 | 
 83 | /**
 84 |  * Compute the host-side reference sum of h_in[0 .. num_items)
 85 |  */
 86 | void Solve(
 87 |     int           *h_in,
 88 |     int           &h_reference,
 89 |     int             num_items)
 90 | {
 91 |     // Nothing to accumulate: leave h_reference untouched
 92 |     if (num_items <= 0)
 93 |         return;
 94 |     // Seed with the first element, then fold in the rest
 95 |     h_reference = h_in[0];
 96 |     for (int idx = 1; idx < num_items; ++idx)
 97 |         h_reference += h_in[idx];
 98 | }
 99 | 
100 | 
101 | //---------------------------------------------------------------------
102 | // Main
103 | //---------------------------------------------------------------------
104 | 
105 | /**
106 |  * Main: sum num_items ints with DeviceReduce::Sum and compare against the host reference
107 |  */
108 | int main(int argc, char** argv)
109 | {
110 |     int num_items = 150;
111 | 
112 |     // Initialize command line
113 |     CommandLineArgs args(argc, argv);
114 |     g_verbose = args.CheckCmdLineFlag("v");
115 |     args.GetCmdLineArgument("n", num_items);
116 | 
117 |     // Print usage (every optional argument is shown in balanced brackets)
118 |     if (args.CheckCmdLineFlag("help"))
119 |     {
120 |         printf("%s "
121 |             "[--n=<input items>] "
122 |             "[--device=<device-id>] "
123 |             "[--v] "
124 |             "\n", argv[0]);
125 |         exit(0);
126 |     }
127 | 
128 |     // Initialize device
129 |     CubDebugExit(args.DeviceInit());
130 | 
131 |     printf("cub::DeviceReduce::Sum() %d items (%d-byte elements)\n",
132 |         num_items, (int) sizeof(int));
133 |     fflush(stdout);
134 | 
135 |     // Allocate host arrays
136 |     int* h_in = new int[num_items];
137 |     int  h_reference{};
138 | 
139 |     // Initialize problem and solution
140 |     Initialize(h_in, num_items);
141 |     Solve(h_in, h_reference, num_items);
142 | 
143 |     // Allocate problem device arrays
144 |     int *d_in = nullptr;
145 |     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(int) * num_items));
146 | 
147 |     // Initialize device input
148 |     CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(int) * num_items, cudaMemcpyHostToDevice));
149 | 
150 |     // Allocate device output array
151 |     int *d_out = nullptr;
152 |     CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(int) * 1));
153 | 
154 |     // First call, with null temporary storage, requests temp_storage_bytes; then allocate it
155 |     void            *d_temp_storage = nullptr;
156 |     size_t          temp_storage_bytes = 0;
157 |     CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
158 |     CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));
159 | 
160 |     // Run
161 |     CubDebugExit(DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
162 | 
163 |     // Check for correctness (and display results, if specified)
164 |     int compare = CompareDeviceResults(&h_reference, d_out, 1, g_verbose, g_verbose);
165 |     printf("\t%s", compare ? "FAIL" : "PASS");
166 |     AssertEquals(0, compare);
167 | 
168 |     // Cleanup (delete[] on a null pointer is a no-op, so no guard is needed)
169 |     delete[] h_in;
170 |     if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
171 |     if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
172 |     if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));
173 | 
174 |     printf("\n\n");
175 | 
176 |     return 0;
177 | }
178 | 
179 | 
180 | 
181 | 


--------------------------------------------------------------------------------
/test/.gitignore:
--------------------------------------------------------------------------------
1 | /bin
2 | /link_main.obj
3 | /dummy/
4 | 


--------------------------------------------------------------------------------
/test/README.md:
--------------------------------------------------------------------------------
  1 | # Test Parametrization
  2 | 
  3 | Some of CUB's tests are very slow to build and are capable of exhausting RAM
  4 | during compilation/linking. To avoid such issues, large tests are split into
  5 | multiple executables to take advantage of parallel computation and reduce memory
  6 | usage.
  7 | 
  8 | CUB facilitates this by checking for special `%PARAM%` comments in each test's
  9 | source code, and then uses this information to generate multiple executables
 10 | with different configurations.
 11 | 
 12 | ## Using `%PARAM%`
 13 | 
 14 | The `%PARAM%` hint provides an automated method of generating multiple test
 15 | executables from a single source file. To use it, add one or more special
 16 | comments to the test source file:
 17 | 
 18 | ```cpp
 19 | // %PARAM% [definition] [label] [values]
 20 | ```
 21 | 
 22 | CMake will parse the source file and extract these comments, using them to
 23 | generate multiple test executables for the full cartesian product of values.
 24 | 
 25 | - `definition` will be used as a preprocessor definition name. By convention,
 26 |   these begin with `TEST_`.
 27 | - `label` is a short, human-readable label that will be used in the test
 28 |   executable's name to identify the test variant.
 29 | - `values` is a colon-separated list of values used during test generation. Only
 30 |   numeric values have been tested.
 31 | 
 32 | ## Special Labels
 33 | 
 34 | ### CDP / RDC Testing
 35 | 
 36 | If a `label` is `cdp`, it is assumed that the parameter is used to explicitly
 37 | test variants built with and without CDP support. The `values` for such a
 38 | parameter must be `0:1`, with `0` indicating CDP disabled (RDC off) and `1`
 39 | indicating CDP enabled (RDC on).
 40 | 
 41 | Tests that do not contain a variant labeled `cdp` will only enable RDC if
 42 | the CMake variable `CUB_ENABLE_TESTS_WITH_RDC` is true.
 43 | 
 44 | ## Example
 45 | 
 46 | For example, if `test_baz.cu` contains the following lines:
 47 | 
 48 | ```cpp
 49 | // %PARAM% TEST_FOO foo 0:1:2
 50 | // %PARAM% TEST_CDP cdp 0:1
 51 | ```
 52 | 
 53 | Six executables and CTest targets will be generated with unique definitions
 54 | (only c++17 targets shown):
 55 | 
 56 | | Executable Name                  | Preprocessor Definitions    | RDC State |
 57 | |----------------------------------|-----------------------------|-----------|
 58 | | `cub.cpp17.test.baz.foo_0.cdp_0` | `-DTEST_FOO=0 -DTEST_CDP=0` | Disabled  |
 59 | | `cub.cpp17.test.baz.foo_0.cdp_1` | `-DTEST_FOO=0 -DTEST_CDP=1` | Enabled   |
 60 | | `cub.cpp17.test.baz.foo_1.cdp_0` | `-DTEST_FOO=1 -DTEST_CDP=0` | Disabled  |
 61 | | `cub.cpp17.test.baz.foo_1.cdp_1` | `-DTEST_FOO=1 -DTEST_CDP=1` | Enabled   |
 62 | | `cub.cpp17.test.baz.foo_2.cdp_0` | `-DTEST_FOO=2 -DTEST_CDP=0` | Disabled  |
 63 | | `cub.cpp17.test.baz.foo_2.cdp_1` | `-DTEST_FOO=2 -DTEST_CDP=1` | Enabled   |
 64 | 
 65 | ## Changing `%PARAM%` Hints
 66 | 
 67 | Since CMake does not automatically reconfigure the build when source files are
 68 | modified, CMake will need to be rerun manually whenever the `%PARAM%` comments
 69 | change.
 70 | 
 71 | ## Building and Running Split Tests
 72 | 
 73 | CMake will generate individual build and test targets for each test variant, and
 74 | also provides build "metatargets" that compile all variants of a given test.
 75 | 
 76 | The variants follow the usual naming convention for CUB's tests, but include a
 77 | suffix that differentiates them (e.g. `.foo_X.cdp_Y` in the example above).
 78 | 
 79 | ### Individual Test Variants
 80 | 
 81 | Continuing with the `test_baz.cu` example, the test variant that uses
 82 | `-DTEST_FOO=1 -DTEST_CDP=1` can be built and run alone:
 83 | 
 84 | ```bash
 85 | # Build a single variant:
 86 | make cub.cpp17.test.baz.foo_1.cdp_1
 87 | 
 88 | # Run a single variant:
 89 | bin/cub.cpp17.test.baz.foo_1.cdp_1
 90 | 
 91 | # Run a single variant using CTest regex:
 92 | ctest -R cub\.cpp17\.test\.baz\.foo_1\.cdp_1
 93 | ```
 94 | 
 95 | ### All Variants of a Test
 96 | 
 97 | Using a metatarget and the proper regex, all variants of a test can be built and
 98 | executed without listing all variants explicitly:
 99 | 
100 | ```bash
101 | # Build all variants using the `.all` metatarget
102 | make cub.cpp17.test.baz.all
103 | 
104 | # Run all variants:
105 | ctest -R cub\.cpp17\.test\.baz\.
106 | ```
107 | 
108 | ## Debugging
109 | 
110 | Running CMake with `--log-level=VERBOSE` will print out extra information about
111 | all detected test variants.
112 | 
113 | ## Additional Info
114 | 
115 | Ideally, only parameters that directly influence kernel template instantiations
116 | should be split out in this way. If changing a parameter doesn't change the
117 | kernel template type, the same kernel will be compiled into multiple
118 | executables. This defeats the purpose of splitting up the test since the
119 | compiler will generate redundant code across the new split executables.
120 | 
121 | The best candidate parameters for splitting are input value types, rather than
122 | integral parameters like BLOCK_THREADS, etc. Splitting by value type allows more
123 | infrastructure (data generation, validation) to be reused. Splitting other
124 | parameters can cause build times to increase since type-related infrastructure
125 | has to be rebuilt for each test variant.
126 | 


--------------------------------------------------------------------------------
/test/c2h/custom_type.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 | * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Redistribution and use in source and binary forms, with or without
  5 | * modification, are permitted provided that the following conditions are met:
  6 | *     * Redistributions of source code must retain the above copyright
  7 | *       notice, this list of conditions and the following disclaimer.
  8 | *     * Redistributions in binary form must reproduce the above copyright
  9 | *       notice, this list of conditions and the following disclaimer in the
 10 | *       documentation and/or other materials provided with the distribution.
 11 | *     * Neither the name of the NVIDIA CORPORATION nor the
 12 | *       names of its contributors may be used to endorse or promote products
 13 | *       derived from this software without specific prior written permission.
 14 | *
 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 | *
 26 | ******************************************************************************/
 27 | 
 28 | #pragma once
 29 | 
 30 | #include <limits>
 31 | #include <memory>
 32 | #include <ostream>
 33 | 
 34 | #include <thrust/device_vector.h>
 35 | 
 36 | namespace c2h
 37 | {
 38 | 
 39 | // Key/value payload shared by all custom_type_t instantiations; both fields start at zero.
 40 | class custom_type_state_t
 41 | {
 42 | public:
 43 |   __host__ __device__ std::size_t get_key() const { return m_key; }
 44 |   __host__ __device__ std::size_t get_val() const { return m_val; }
 45 |   __host__ __device__ void set_key(std::size_t key) { m_key = key; }
 46 |   __host__ __device__ void set_val(std::size_t val) { m_val = val; }
 47 | private:
 48 |   std::size_t m_key{}, m_val{};
 49 | };
 50 | 
 51 | // CRTP aggregate of the key/value state and each policy mixin (Policies<custom_type_t>).
 52 | template <template<typename> class... Policies>
 53 | class custom_type_t : public custom_type_state_t
 54 |                     , public Policies<custom_type_t<Policies...>>...
 55 | {
 56 | public:
 57 |   // Streams the value as "{ key, val }"; host-only.
 58 |   friend __host__ std::ostream &operator<<(std::ostream &out, const custom_type_t &value)
 59 |   {
 60 |     out << "{ " << value.get_key() << ", " << value.get_val() << " }";
 61 |     return out;
 62 |   }
 63 | };
 64 | 
 65 | template <class CustomType>
 66 | class less_comparable_t
 67 | {
 68 |   // The CUDA compiler follows the IA64 ABI for class layout, while the
 69 |   // Microsoft host compiler does not (hence this placeholder member).
 70 |   char workaround_msvc;
 71 | 
 72 | public:
 73 |   __host__ __device__ bool operator<(const CustomType& other) const // key-only ordering
 74 |   {
 75 |     const CustomType& self = static_cast<const CustomType&>(*this);
 76 |     return self.get_key() < other.get_key();
 77 |   }
 78 | };
 79 | 
 80 | template <class CustomType>
 81 | class lexicographical_less_comparable_t
 82 | {
 83 |   // The CUDA compiler follows the IA64 ABI for class layout, while the
 84 |   // Microsoft host compiler does not (hence this placeholder member).
 85 |   char workaround_msvc;
 86 | 
 87 | public:
 88 |   __host__ __device__ bool operator<(const CustomType& other) const // key first, then value
 89 |   {
 90 |     const CustomType& self = static_cast<const CustomType&>(*this);
 91 |     if (self.get_key() != other.get_key()) { return self.get_key() < other.get_key(); }
 92 |     return self.get_val() < other.get_val();
 93 |   }
 94 | };
 95 | 
 96 | template <class CustomType>
 97 | class equal_comparable_t
 98 | {
 99 |   // The CUDA compiler follows the IA64 ABI for class layout, while the
100 |   // Microsoft host compiler does not (hence this placeholder member).
101 |   char workaround_msvc;
102 | 
103 | public:
104 |   __host__ __device__ bool operator==(const CustomType& other) const // key and value must both match
105 |   {
106 |     const CustomType& lhs = static_cast<const CustomType&>(*this);
107 |     const bool same_key   = lhs.get_key() == other.get_key();
108 |     const bool same_val   = lhs.get_val() == other.get_val();
109 |     return same_key && same_val;
110 |   }
111 | };
112 | 
113 | template <class CustomType>
114 | class subtractable_t
115 | {
116 |   // The CUDA compiler follows the IA64 ABI for class layout, while the
117 |   // Microsoft host compiler does not (hence this placeholder member).
118 |   char workaround_msvc;
119 | 
120 | public:
121 |   __host__ __device__ CustomType operator-(const CustomType& other) const // component-wise difference
122 |   {
123 |     const CustomType& lhs = static_cast<const CustomType&>(*this);
124 |     const auto key_diff   = lhs.get_key() - other.get_key();
125 |     const auto val_diff   = lhs.get_val() - other.get_val();
126 | 
127 |     CustomType difference{};
128 |     difference.set_key(key_diff);
129 |     difference.set_val(val_diff);
130 |     return difference;
131 |   }
132 | };
133 | 
134 | template <class CustomType>
135 | class accumulateable_t
136 | {
137 |   // The CUDA compiler follows the IA64 ABI for class layout, while the
138 |   // Microsoft host compiler does not (hence this placeholder member).
139 |   char workaround_msvc;
140 | 
141 | public:
142 |   __host__ __device__ CustomType operator+(const CustomType& other) const // component-wise sum
143 |   {
144 |     const CustomType& lhs = static_cast<const CustomType&>(*this);
145 |     const auto key_sum    = lhs.get_key() + other.get_key();
146 |     const auto val_sum    = lhs.get_val() + other.get_val();
147 | 
148 |     CustomType total{};
149 |     total.set_key(key_sum);
150 |     total.set_val(val_sum);
151 |     return total;
152 |   }
153 | };
154 | 
155 | } // c2h
156 | 
157 | namespace std {
158 |   // numeric_limits for c2h::custom_type_t: each bound sets key and val to the std::size_t bound.
159 |   template<template<typename> class... Policies>
160 |   class numeric_limits<c2h::custom_type_t<Policies...>>
161 |   {
162 |     using value_type = c2h::custom_type_t<Policies...>;
163 |     // Builds a value whose key and val are both `bound`.
164 |     static value_type make(std::size_t bound)
165 |     {
166 |       value_type val;
167 |       val.set_key(bound);
168 |       val.set_val(bound);
169 |       return val;
170 |     }
171 | 
172 |   public:
173 |     static constexpr bool is_specialized = true; // lets generic code detect this specialization
174 |     static value_type min()    { return make(std::numeric_limits<std::size_t>::min()); }
175 |     static value_type max()    { return make(std::numeric_limits<std::size_t>::max()); }
176 |     static value_type lowest() { return make(std::numeric_limits<std::size_t>::lowest()); }
177 |   };
178 | }
179 | 
180 | 


--------------------------------------------------------------------------------
/test/c2h/generators.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 | * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Redistribution and use in source and binary forms, with or without
  5 | * modification, are permitted provided that the following conditions are met:
  6 | *     * Redistributions of source code must retain the above copyright
  7 | *       notice, this list of conditions and the following disclaimer.
  8 | *     * Redistributions in binary form must reproduce the above copyright
  9 | *       notice, this list of conditions and the following disclaimer in the
 10 | *       documentation and/or other materials provided with the distribution.
 11 | *     * Neither the name of the NVIDIA CORPORATION nor the
 12 | *       names of its contributors may be used to endorse or promote products
 13 | *       derived from this software without specific prior written permission.
 14 | *
 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 | *
 26 | ******************************************************************************/
 27 | 
 28 | #pragma once
 29 | 
 30 | #include <limits>
 31 | 
 32 | #include <thrust/device_vector.h>
 33 | 
 34 | #include <c2h/custom_type.cuh>
 35 | 
 36 | namespace c2h
 37 | {
 38 | 
 39 | namespace detail
 40 | {
 41 | 
 42 | template <class T>
 43 | class value_wrapper_t // strongly-typed holder for a single T
 44 | {
 45 | public:
 46 |   explicit value_wrapper_t(T initial) : m_val(initial) {}
 47 |   explicit value_wrapper_t(int initial) : m_val(static_cast<T>(initial)) {} // int convenience overload
 48 |   T get() const { return m_val; }
 49 | private:
 50 |   T m_val{};
 51 | };
 52 | 
 53 | }
 54 | 
 55 | class seed_t : public detail::value_wrapper_t<unsigned long long int> // seed tag taken by the bounded gen() overloads
 56 | {
 57 |   using value_wrapper_t::value_wrapper_t; // inherit both explicit constructors
 58 | };
 59 | 
 60 | class modulo_t : public detail::value_wrapper_t<std::size_t> // tag taken by gen(modulo_t, device_vector<T>&)
 61 | {
 62 |   using value_wrapper_t::value_wrapper_t; // inherit both explicit constructors
 63 | };
 64 | 
 65 | namespace detail
 66 | {
 67 |   
 68 | void gen(seed_t seed,                  // declaration only; the definition is not in this header
 69 |          char* data,                   // the custom_type_t overload passes the vector's raw device pointer
 70 |          c2h::custom_type_state_t min, // lower bound, received as the key/value state only
 71 |          c2h::custom_type_state_t max, // upper bound, received as the key/value state only
 72 |          std::size_t elements,         // the custom_type_t overload passes data.size()
 73 |          std::size_t element_size);    // the custom_type_t overload passes sizeof(custom_type_t<...>)
 74 | 
 75 | }
 76 | 
 77 | // Forwards to detail::gen with the vector's raw device pointer, bounds, count and element size.
 78 | template <template <typename> class... Ps>
 79 | void gen(
 80 |   seed_t seed,
 81 |   thrust::device_vector<c2h::custom_type_t<Ps...>> &data,
 82 |   c2h::custom_type_t<Ps...> min = std::numeric_limits<c2h::custom_type_t<Ps...>>::lowest(),
 83 |   c2h::custom_type_t<Ps...> max = std::numeric_limits<c2h::custom_type_t<Ps...>>::max())
 84 | {
 85 |   using value_t = c2h::custom_type_t<Ps...>;
 86 | 
 87 |   value_t *const d_values = thrust::raw_pointer_cast(data.data());
 88 |   char *const d_bytes     = reinterpret_cast<char*>(d_values);
 89 | 
 90 |   detail::gen(seed, d_bytes, min, max, data.size(), sizeof(value_t));
 91 | }
 92 | 
 93 | template <typename T>
 94 | void gen(seed_t seed,                           // declaration only; the definition is not in this header
 95 |          thrust::device_vector<T> &data,        // device vector handed to the generator
 96 |          T min = std::numeric_limits<T>::min(), // note: defaults to min(), whereas the custom_type_t overload uses lowest()
 97 |          T max = std::numeric_limits<T>::max());
 98 | 
 99 | template <typename T>
100 | void gen(modulo_t mod, thrust::device_vector<T> &data); // declaration only; presumably fills data with values reduced modulo mod — TODO confirm
101 | 
102 | } // c2h
103 | 
104 | 


--------------------------------------------------------------------------------
/test/catch2_runner.cu:
--------------------------------------------------------------------------------
1 | #define CUB_CONFIG_MAIN
2 | #include "catch2_test_helper.h"
3 | 
4 | 


--------------------------------------------------------------------------------
/test/catch2_test_printing.cu:
--------------------------------------------------------------------------------
 1 | #include "test_util.h"
 2 | 
 3 | #include "catch2_test_helper.h"
 4 | 
 5 | template <typename T>
 6 | std::string print(T val) // renders val via operator<< and returns the resulting text
 7 | {
 8 |   std::stringstream buffer;
 9 |   buffer << val;
10 |   return buffer.str();
11 | }
12 | 
13 | #if CUB_IS_INT128_ENABLED // the 128-bit cases are compiled only when this macro is non-zero
14 | TEST_CASE("Test utils can print __int128", "[test][utils]")
15 | {
16 |   REQUIRE( print(__int128_t{0}) == "0" );
17 |   REQUIRE( print(__int128_t{42}) == "42" );
18 |   REQUIRE( print(__int128_t{-1}) == "-1" );
19 |   REQUIRE( print(__int128_t{-42}) == "-42" );
20 |   REQUIRE( print(-1 * (__int128_t{1} << 120)) == "-1329227995784915872903807060280344576" ); // -(2^120): does not fit in 64 bits
21 | }
22 | // Unsigned counterpart of the signed checks above.
23 | TEST_CASE("Test utils can print __uint128", "[test][utils]")
24 | {
25 |   REQUIRE( print(__uint128_t{0}) == "0" );
26 |   REQUIRE( print(__uint128_t{1}) == "1" );
27 |   REQUIRE( print(__uint128_t{42}) == "42" );
28 |   REQUIRE( print(__uint128_t{1} << 120) == "1329227995784915872903807060280344576" ); // 2^120: does not fit in 64 bits
29 | }
30 | #endif
31 | 
32 | TEST_CASE("Test utils can print KeyValuePair", "[test][utils]")
33 | {
34 |   REQUIRE( print(cub::KeyValuePair<int, int>{42, -42}) == "(42,-42)" ); // expected format: "(key,value)", no spaces
35 | }
36 | 
37 | 


--------------------------------------------------------------------------------
/test/catch2_test_util_type.cu:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | #include <cub/iterator/counting_input_iterator.cuh>
29 | #include <cub/iterator/discard_output_iterator.cuh>
30 | #include <cub/util_type.cuh>
31 | 
32 | #include <cuda/std/type_traits>
33 | 
34 | // Has to go after all cub headers. Otherwise, this test won't catch unused
35 | // variables in cub kernels.
36 | #include "catch2_test_helper.h"
37 | 
38 | CUB_TEST("Tests non_void_value_t", "[util][type]")
39 | {
40 |   using fallback_t        = float; // type expected whenever the iterator's value type is void
41 |   using void_fancy_it     = cub::DiscardOutputIterator<std::size_t>; // treated below as having a void value type
42 |   using non_void_fancy_it = cub::CountingInputIterator<int>; // treated below as having int as value type
43 | 
44 |   // falls back for const void*
45 |   STATIC_REQUIRE(
46 |     ::cuda::std::is_same<fallback_t, //
47 |                          cub::detail::non_void_value_t<const void *, fallback_t>>::value);
48 |   // falls back for const volatile void*
49 |   STATIC_REQUIRE(
50 |     ::cuda::std::is_same<fallback_t, //
51 |                          cub::detail::non_void_value_t<const volatile void *, fallback_t>>::value);
52 |   // falls back for volatile void*
53 |   STATIC_REQUIRE(
54 |     ::cuda::std::is_same<fallback_t, //
55 |                          cub::detail::non_void_value_t<volatile void *, fallback_t>>::value);
56 |   // falls back for void*
57 |   STATIC_REQUIRE(::cuda::std::is_same<fallback_t, //
58 |                                       cub::detail::non_void_value_t<void *, fallback_t>>::value);
59 |   // works for int* (the fallback argument is unused here, so even `void` is acceptable)
60 |   STATIC_REQUIRE(::cuda::std::is_same<int, //
61 |                                       cub::detail::non_void_value_t<int *, void>>::value);
62 |   // falls back for fancy iterator with a void value type
63 |   STATIC_REQUIRE(
64 |     ::cuda::std::is_same<fallback_t, //
65 |                          cub::detail::non_void_value_t<void_fancy_it, fallback_t>>::value);
66 |   // works for a fancy iterator that has int as value type
67 |   STATIC_REQUIRE(
68 |     ::cuda::std::is_same<int, //
69 |                          cub::detail::non_void_value_t<non_void_fancy_it, fallback_t>>::value);
70 | }
71 | 


--------------------------------------------------------------------------------
/test/catch2_test_warp_mask.cu:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | #include <cub/util_ptx.cuh>
 29 | 
 30 | // Has to go after all cub headers. Otherwise, this test won't catch unused
 31 | // variables in cub kernels.
 32 | #include "catch2_test_helper.h"
 33 | 
 34 | // Logical warps per CUB_WARP_THREADS(0) lanes: the quotient when logical_warp_threads
 35 | // is a power of two, otherwise a single logical warp.
 36 | template <int logical_warp_threads>
 37 | struct total_warps_t
 38 | {
 39 |   static constexpr unsigned int value()
 40 |   {
 41 |     return cub::PowerOfTwo<logical_warp_threads>::VALUE
 42 |              ? static_cast<unsigned int>(CUB_WARP_THREADS(0) / logical_warp_threads) : 1u;
 43 |   }
 44 | };
 45 | 
 46 | bool is_lane_involved(unsigned int member_mask, unsigned int lane) // true iff bit `lane` of member_mask is set
 47 | {
 48 |   return (member_mask & (1u << lane)) != 0u; // unsigned literal: `1 << 31` would shift into int's sign bit
 49 | }
 50 | 
 51 | using logical_warp_threads      = c2h::iota<1, 32>; // presumably a list of consecutive logical warp sizes starting at 1 — bound semantics of c2h::iota not visible here, TODO confirm
 52 | using power_of_two_warp_threads = c2h::enum_type_list<int, 1, 2, 4, 8, 16, 32>; // power-of-two logical warp sizes only
 53 | 
 54 | CUB_TEST("Warp mask ignores lanes before current logical warp",
 55 |          "[mask][warp]",
 56 |          power_of_two_warp_threads)
 57 | {
 58 |   constexpr int logical_warp_thread  = c2h::get<0, TestType>::value;
 59 |   constexpr unsigned int total_warps = total_warps_t<logical_warp_thread>::value();
 60 |   // For every logical warp, no lane below its first lane may be set in its mask.
 61 |   for (unsigned int warp_id = 0; warp_id < total_warps; warp_id++)
 62 |   {
 63 |     const unsigned int warp_mask  = cub::WarpMask<logical_warp_thread>(warp_id);
 64 |     const unsigned int warp_begin = logical_warp_thread * warp_id;
 65 | 
 66 |     for (unsigned int prev_warp_lane = 0; prev_warp_lane < warp_begin; prev_warp_lane++)
 67 |     {
 68 |       REQUIRE_FALSE(is_lane_involved(warp_mask, prev_warp_lane));
 69 |     }
 70 |   }
 71 | }
 72 | 
 73 | CUB_TEST("Warp mask involves lanes of current logical warp", "[mask][warp]", logical_warp_threads)
 74 | {
 75 |   constexpr int logical_warp_thread  = c2h::get<0, TestType>::value;
 76 |   constexpr unsigned int total_warps = total_warps_t<logical_warp_thread>::value();
 77 |   // For every logical warp, each lane in [warp_begin, warp_end) must be set in its mask.
 78 |   for (unsigned int warp_id = 0; warp_id < total_warps; warp_id++)
 79 |   {
 80 |     const unsigned int warp_mask  = cub::WarpMask<logical_warp_thread>(warp_id);
 81 |     const unsigned int warp_begin = logical_warp_thread * warp_id;
 82 |     const unsigned int warp_end   = warp_begin + logical_warp_thread;
 83 | 
 84 |     for (unsigned int warp_lane = warp_begin; warp_lane < warp_end; warp_lane++)
 85 |     {
 86 |       REQUIRE(is_lane_involved(warp_mask, warp_lane));
 87 |     }
 88 |   }
 89 | }
 90 | 
 91 | CUB_TEST("Warp mask ignores lanes after current logical warp", "[mask][warp]", logical_warp_threads)
 92 | {
 93 |   constexpr int logical_warp_thread  = c2h::get<0, TestType>::value;
 94 |   constexpr unsigned int total_warps = total_warps_t<logical_warp_thread>::value();
 95 |   // For every logical warp, no lane in [warp_end, CUB_WARP_THREADS(0)) may be set in its mask.
 96 |   for (unsigned int warp_id = 0; warp_id < total_warps; warp_id++)
 97 |   {
 98 |     const unsigned int warp_mask  = cub::WarpMask<logical_warp_thread>(warp_id);
 99 |     const unsigned int warp_begin = logical_warp_thread * warp_id;
100 |     const unsigned int warp_end   = warp_begin + logical_warp_thread;
101 | 
102 |     for (unsigned int post_warp_lane = warp_end; post_warp_lane < CUB_WARP_THREADS(0);
103 |          post_warp_lane++)
104 |     {
105 |       REQUIRE_FALSE(is_lane_involved(warp_mask, post_warp_lane));
106 |     }
107 |   }
108 | }
109 | 


--------------------------------------------------------------------------------
/test/cmake/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | if (NOT CUB_IN_THRUST) # Thrust has its own checks for this:
 2 |   # Configure the test_install project, which checks `find_package` on an installed CUB:
 3 |   add_test(
 4 |     NAME cub.test.cmake.test_install
 5 |     COMMAND "${CMAKE_COMMAND}"
 6 |       --log-level=VERBOSE
 7 |       -G "${CMAKE_GENERATOR}"
 8 |       -S "${CMAKE_CURRENT_SOURCE_DIR}/test_install"
 9 |       -B "${CMAKE_CURRENT_BINARY_DIR}/test_install"
10 |       -D "CUB_BINARY_DIR=${CUB_BINARY_DIR}"
11 |       -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
12 |       -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}"
13 |       -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
14 |   )
15 | endif()
16 | 
17 | # Check source code for issues that can be found by pattern matching (runs the script with -P):
18 | add_test(
19 |   NAME cub.test.cmake.check_source_files
20 |   COMMAND
21 |     "${CMAKE_COMMAND}"
22 |       -D "CUB_SOURCE_DIR=${CUB_SOURCE_DIR}"
23 |       -P "${CMAKE_CURRENT_LIST_DIR}/check_source_files.cmake"
24 | )
25 | 


--------------------------------------------------------------------------------
/test/cmake/test_install/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Test that an installation of the project can be located by find_package() call
 2 | # with appropriate prefix settings.
 3 | #
 4 | # Expects CUB_BINARY_DIR to be set to an existing cub build directory.
 5 | 
 6 | cmake_minimum_required(VERSION 3.15)
 7 | 
 8 | project(CubTestInstall CXX CUDA)
 9 | 
10 | # This will eventually get deleted recursively -- keep that in mind if modifying
11 | set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/install_prefix/")
12 | 
13 | function(do_manual_install)
14 |   # Inspired by the VTK-m install tests: glob up every cmake_install.cmake
15 |   # under CUB_BINARY_DIR and include() (i.e. run) each one, which effectively
16 |   # installs the project into the current value of CMAKE_INSTALL_PREFIX.
17 | 
18 |   # Gather all of the install files from CUB's build tree (recursive glob):
19 |   file(GLOB_RECURSE install_files
20 |     LIST_DIRECTORIES False
21 |     "${CUB_BINARY_DIR}/cmake_install.cmake"
22 |   )
23 | 
24 |   message(STATUS "Locating install files...")
25 |   foreach (install_file IN LISTS install_files)
26 |     message(STATUS "  * ${install_file}")
27 |   endforeach()
28 | 
29 |   message(STATUS "Building install tree...")
30 |   foreach(install_file IN LISTS install_files)
31 |     include("${install_file}")
32 |   endforeach()
33 | endfunction()
34 | 
35 | function(do_cleanup) # deletes the whole temporary install tree
36 |   message(STATUS "Removing ${CMAKE_INSTALL_PREFIX}")
37 |   file(REMOVE_RECURSE "${CMAKE_INSTALL_PREFIX}") # recursive: CMAKE_INSTALL_PREFIX must stay inside the build dir
38 | endfunction()
39 | 
40 | # Aborts configuration unless the truthiness of the variable named by
41 | # `var_name` agrees with `expect`.
42 | function(assert_boolean var_name expect)
43 |   # Expected true, but the variable evaluates to false:
44 |   if (expect AND NOT ${var_name})
45 |     message(FATAL_ERROR "'${var_name}' is false, expected true.")
46 |   # Expected false, but the variable evaluates to true:
47 |   elseif (NOT expect AND ${var_name})
48 |     message(FATAL_ERROR "'${var_name}' is true, expected false.")
49 |   endif()
50 | endfunction()
51 | 
52 | function(assert_target target_name) # fatal error unless `target_name` is a defined target
53 |   if (NOT TARGET "${target_name}")
54 |     message(FATAL_ERROR "Target '${target_name}' not defined.")
55 |   endif()
56 | endfunction()
57 | 
58 | # Locates the CUB package installed under CMAKE_INSTALL_PREFIX and validates it.
59 | function(find_installed_project)
60 |   set(CMAKE_PREFIX_PATH "${CMAKE_INSTALL_PREFIX}")
61 |   find_package(CUB CONFIG)
62 | 
63 |   if (NOT CUB_FOUND)
64 |     message(FATAL_ERROR
65 |       "find_package(CUB) failed. "
66 |       "CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}"
67 |     )
68 |   endif()
69 | 
70 |   # Test some internal config vars to check that this is the expected install:
71 |   # TODO The cmake_path (3.19) command will provide more robust ways to do this
72 |   #
73 |   # Escape regex special characters in the install prefix, see
74 |   # https://gitlab.kitware.com/cmake/cmake/-/issues/18580
75 |   string(REGEX REPLACE "([][+.*()^])" "\\\\\\1"
76 |     prefix_regex
77 |     "${CMAKE_INSTALL_PREFIX}"
78 |   )
79 |   if (NOT _CUB_INCLUDE_DIR MATCHES "^${prefix_regex}")
80 |     message(FATAL_ERROR
81 |       "Found CUB in unexpected location: "
82 |       " * _CUB_INCLUDE_DIR=${_CUB_INCLUDE_DIR} "
83 |       " * ExpectedPrefix=${CMAKE_INSTALL_PREFIX}"
84 |     )
85 |   endif()
86 | 
87 |   assert_target(CUB::CUB)
88 | endfunction()
89 | 
90 | do_cleanup() # Prepare for new installation
91 | do_manual_install()
92 | find_installed_project()
93 | do_cleanup() # Clean up if successful
94 | 


--------------------------------------------------------------------------------
/test/fill_striped.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | #include <type_traits>
 29 | 
 30 | template <typename T, typename = int>
 31 | struct has_x : std::false_type
 32 | {};
 33 | 
 34 | template <typename T>
 35 | struct has_x<T, decltype((void)T::x, 0)> : std::true_type
 36 | {};
 37 | 
 38 | template <typename T, typename = int>
 39 | struct has_y : std::false_type
 40 | {};
 41 | 
 42 | template <typename T>
 43 | struct has_y<T, decltype((void)T::y, 0)> : std::true_type
 44 | {};
 45 | 
 46 | template <typename T, typename = int>
 47 | struct has_z : std::false_type
 48 | {};
 49 | 
 50 | template <typename T>
 51 | struct has_z<T, decltype((void)T::z, 0)> : std::true_type
 52 | {};
 53 | 
 54 | template <typename T, typename = int>
 55 | struct has_w : std::false_type
 56 | {};
 57 | 
 58 | template <typename T>
 59 | struct has_w<T, decltype((void)T::w, 0)> : std::true_type
 60 | {};
 61 | 
 62 | template <typename ScalarT, typename = int>
 63 | struct component_type_impl_t
 64 | {
 65 |   using type = ScalarT;
 66 | };
 67 | 
 68 | template <typename VectorT>
 69 | struct component_type_impl_t<VectorT, decltype((void)VectorT::x, 0)>
 70 | {
 71 |   using type = decltype(std::declval<VectorT>().x);
 72 | };
 73 | 
 74 | template <typename T>
 75 | using component_type_t = typename component_type_impl_t<T>::type;
 76 | 
 77 | template <typename VectorT>
 78 | struct scalar_to_vec_t
 79 | {
 80 |   using component_t = component_type_t<VectorT>;
 81 | 
 82 |   template <typename T, typename V = VectorT>
 83 |   __host__ __device__ __forceinline__
 84 |     typename std::enable_if<std::is_same<V, VectorT>::value && !has_x<V>::value, V>::type
 85 |     operator()(T scalar)
 86 |   {
 87 |     return static_cast<component_t>(scalar);
 88 |   }
 89 | 
 90 |   template <typename T, typename V = VectorT>
 91 |   __host__ __device__ __forceinline__
 92 |     typename std::enable_if<std::is_same<V, VectorT>::value && has_x<V>::value && !has_y<V>::value,
 93 |                             V>::type
 94 |     operator()(T scalar)
 95 |   {
 96 |     V val;
 97 |     val.x = static_cast<component_t>(scalar);
 98 |     return val;
 99 |   }
100 | 
101 |   template <typename T, typename V = VectorT>
102 |   __host__ __device__ __forceinline__
103 |     typename std::enable_if<std::is_same<V, VectorT>::value && has_y<V>::value && !has_z<V>::value,
104 |                             V>::type
105 |     operator()(T scalar)
106 |   {
107 |     V val;
108 |     val.x = static_cast<component_t>(scalar);
109 |     val.y = static_cast<component_t>(scalar);
110 |     return val;
111 |   }
112 | 
113 |   template <typename T, typename V = VectorT>
114 |   __host__ __device__ __forceinline__
115 |     typename std::enable_if<std::is_same<V, VectorT>::value && has_z<V>::value && !has_w<V>::value,
116 |                             V>::type
117 |     operator()(T scalar)
118 |   {
119 |     V val;
120 |     val.x = static_cast<component_t>(scalar);
121 |     val.y = static_cast<component_t>(scalar);
122 |     val.z = static_cast<component_t>(scalar);
123 |     return val;
124 |   }
125 | 
126 |   template <typename T, typename V = VectorT>
127 |   __host__ __device__ __forceinline__
128 |     typename std::enable_if<std::is_same<V, VectorT>::value && has_w<V>::value, V>::type
129 |     operator()(T scalar)
130 |   {
131 |     V val;
132 |     val.x = static_cast<component_t>(scalar);
133 |     val.y = static_cast<component_t>(scalar);
134 |     val.z = static_cast<component_t>(scalar);
135 |     val.w = static_cast<component_t>(scalar);
136 |     return val;
137 |   }
138 | };
139 | 
// Writes (BlockThreads / LogicalWarpThreads) * LogicalWarpThreads * ItemsPerThread
// consecutive elements through `it`.
//
// Elements are produced warp by warp, lane by lane, item by item. The value
// written for (warp, lane, item) is
//
//   warp * (LogicalWarpThreads * ItemsPerThread) + lane + item * LogicalWarpThreads
//
// converted to the iterator's value type with scalar_to_vec_t, so vector
// value types receive the same number in every component.
template <int LogicalWarpThreads, int ItemsPerThread, int BlockThreads, typename IteratorT>
void fill_striped(IteratorT it)
{
  using value_type = cub::detail::value_t<IteratorT>;

  constexpr int num_warps      = BlockThreads / LogicalWarpThreads;
  constexpr int warp_tile_size = LogicalWarpThreads * ItemsPerThread;

  scalar_to_vec_t<value_type> to_value;

  for (int warp = 0; warp < num_warps; ++warp)
  {
    for (int lane = 0; lane < LogicalWarpThreads; ++lane)
    {
      // Index of this lane's first item within the whole block
      const int first_index = warp * warp_tile_size + lane;

      for (int item = 0; item < ItemsPerThread; ++item)
      {
        *it = to_value(first_index + item * LogicalWarpThreads);
        ++it;
      }
    }
  }
}
164 | 


--------------------------------------------------------------------------------
/test/link_a.cu:
--------------------------------------------------------------------------------
 1 | #include <cub/cub.cuh>
 2 | 
 3 | void a()
 4 | {
 5 |     printf("a() called\n");
 6 | 
 7 |     cub::DoubleBuffer<unsigned int>     d_keys;
 8 |     cub::DoubleBuffer<cub::NullType>    d_values;
 9 |     size_t                              temp_storage_bytes = 0;
10 |     cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024);
11 | }
12 | 


--------------------------------------------------------------------------------
/test/link_b.cu:
--------------------------------------------------------------------------------
 1 | #include <cub/cub.cuh>
 2 | 
 3 | void b()
 4 | {
 5 |     printf("b() called\n");
 6 | 
 7 |     cub::DoubleBuffer<unsigned int>     d_keys;
 8 |     cub::DoubleBuffer<cub::NullType>    d_values;
 9 |     size_t                              temp_storage_bytes = 0;
10 |     cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024);
11 | }
12 | 


--------------------------------------------------------------------------------
/test/link_main.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | extern void a();
 4 | extern void b();
 5 | 
 6 | int main()
 7 | {
 8 |     printf("hello world\n");
 9 |     return 0;
10 | }
11 | 


--------------------------------------------------------------------------------
/test/mersenne.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  A C-program for MT19937, with initialization improved 2002/1/26.
  3 |  Coded by Takuji Nishimura and Makoto Matsumoto.
  4 | 
  5 |  Before using, initialize the state by using init_genrand(seed)
  6 |  or init_by_array(init_key, key_length).
  7 | 
  8 |  Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
  9 |  All rights reserved.
 10 | 
 11 |  Redistribution and use in source and binary forms, with or without
 12 |  modification, are permitted provided that the following conditions
 13 |  are met:
 14 | 
 15 |  1. Redistributions of source code must retain the above copyright
 16 |  notice, this list of conditions and the following disclaimer.
 17 | 
 18 |  2. Redistributions in binary form must reproduce the above copyright
 19 |  notice, this list of conditions and the following disclaimer in the
 20 |  documentation and/or other materials provided with the distribution.
 21 | 
 22 |  3. The names of its contributors may not be used to endorse or promote
 23 |  products derived from this software without specific prior written
 24 |  permission.
 25 | 
 26 |  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 27 |  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 28 |  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 29 |  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 30 |  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 31 |  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 32 |  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 33 |  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 34 |  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 35 |  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 36 |  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 37 | 
 38 | 
 39 |  Any feedback is very welcome.
 40 |  http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
 41 |  email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
 42 |  */
 43 | 
 44 | #include <stdio.h>
 45 | 
 46 | namespace mersenne {
 47 | 
 48 | /* Period parameters */
 49 | const unsigned int N          = 624;
 50 | const unsigned int M          = 397;
 51 | const unsigned int MATRIX_A   = 0x9908b0df; /* constant vector a */
 52 | const unsigned int UPPER_MASK = 0x80000000; /* most significant w-r bits */
 53 | const unsigned int LOWER_MASK = 0x7fffffff; /* least significant r bits */
 54 | 
 55 | static unsigned int mt[N];  /* the array for the state vector  */
 56 | static int mti = N + 1;     /* mti==N+1 means mt[N] is not initialized */
 57 | 
 58 | /* initializes mt[N] with a seed */
 59 | void init_genrand(unsigned int s)
 60 | {
 61 |     mt[0] = s & 0xffffffff;
 62 |     for (mti = 1; mti < static_cast<int>(N); mti++)
 63 |     {
 64 |         mt[mti] = (1812433253 * (mt[mti - 1] ^ (mt[mti - 1] >> 30)) + mti);
 65 | 
 66 |         /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for mtiplier. */
 67 |         /* In the previous versions, MSBs of the seed affect   */
 68 |         /* only MSBs of the array mt[].                        */
 69 |         /* 2002/01/09 modified by Makoto Matsumoto             */
 70 | 
 71 |         mt[mti] &= 0xffffffff;
 72 |         /* for >32 bit machines */
 73 |     }
 74 | }
 75 | 
 76 | /* initialize by an array with array-length */
 77 | /* init_key is the array for initializing keys */
 78 | /* key_length is its length */
 79 | /* slight change for C++, 2004/2/26 */
 80 | void init_by_array(unsigned int init_key[], int key_length)
 81 | {
 82 |     int i, j, k;
 83 |     init_genrand(19650218);
 84 |     i = 1;
 85 |     j = 0;
 86 |     k = (static_cast<int>(N) > key_length
 87 | 	 ? static_cast<int>(N)
 88 | 	 : key_length);
 89 |     for (; k; k--)
 90 |     {
 91 |         mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1664525))
 92 |             + init_key[j] + j;  /* non linear */
 93 |         mt[i] &= 0xffffffff;    /* for WORDSIZE > 32 machines */
 94 |         i++;
 95 |         j++;
 96 |         if (i >= static_cast<int>(N))
 97 |         {
 98 |             mt[0] = mt[N - 1];
 99 |             i = 1;
100 |         }
101 |         if (j >= key_length) j = 0;
102 |     }
103 |     for (k = N - 1; k; k--)
104 |     {
105 |         mt[i] = (mt[i] ^ ((mt[i - 1] ^ (mt[i - 1] >> 30)) * 1566083941)) - i; /* non linear */
106 |         mt[i] &= 0xffffffff; /* for WORDSIZE > 32 machines */
107 |         i++;
108 |         if (i >= static_cast<int>(N))
109 |         {
110 |             mt[0] = mt[N - 1];
111 |             i = 1;
112 |         }
113 |     }
114 | 
115 |     mt[0] = 0x80000000; /* MSB is 1; assuring non-zero initial array */
116 | }
117 | 
118 | /* generates a random number on [0,0xffffffff]-interval */
119 | unsigned int genrand_int32(void)
120 | {
121 |     unsigned int y;
122 |     static unsigned int mag01[2] = { 0x0, MATRIX_A };
123 | 
124 |     /* mag01[x] = x * MATRIX_A  for x=0,1 */
125 | 
126 |     if (mti >= static_cast<int>(N))
127 |     { /* generate N words at one time */
128 |         int kk;
129 | 
130 |         if (mti == N + 1) /* if init_genrand() has not been called, */
131 |         init_genrand(5489); /* a defat initial seed is used */
132 | 
133 |         for (kk = 0; kk < static_cast<int>(N - M); kk++)
134 |         {
135 |             y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
136 |             mt[kk] = mt[kk + M] ^ (y >> 1) ^ mag01[y & 0x1];
137 |         }
138 |         for (; kk < static_cast<int>(N - 1); kk++)
139 |         {
140 |             y = (mt[kk] & UPPER_MASK) | (mt[kk + 1] & LOWER_MASK);
141 |             mt[kk] = mt[kk + (M - N)] ^ (y >> 1) ^ mag01[y & 0x1];
142 |         }
143 |         y = (mt[N - 1] & UPPER_MASK) | (mt[0] & LOWER_MASK);
144 |         mt[N - 1] = mt[M - 1] ^ (y >> 1) ^ mag01[y & 0x1];
145 | 
146 |         mti = 0;
147 |     }
148 | 
149 |     y = mt[mti++];
150 | 
151 |     /* Tempering */
152 |     y ^= (y >> 11);
153 |     y ^= (y << 7) & 0x9d2c5680;
154 |     y ^= (y << 15) & 0xefc60000;
155 |     y ^= (y >> 18);
156 | 
157 |     return y;
158 | }
159 | 
160 | 
161 | 
162 | } // namespace mersenne
163 | 


--------------------------------------------------------------------------------
/test/test_cdp_variant_state.cu:
--------------------------------------------------------------------------------
 1 | /*
 2 | *  Copyright 2022 NVIDIA Corporation
 3 | *
 4 | *  Licensed under the Apache License, Version 2.0 (the "License");
 5 | *  you may not use this file except in compliance with the License.
 6 | *  You may obtain a copy of the License at
 7 | *
 8 | *      http://www.apache.org/licenses/LICENSE-2.0
 9 | *
10 | *  Unless required by applicable law or agreed to in writing, software
11 | *  distributed under the License is distributed on an "AS IS" BASIS,
12 | *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | *  See the License for the specific language governing permissions and
14 | *  limitations under the License.
15 | */
16 | 
17 | #include <cub/detail/detect_cuda_runtime.cuh>
18 | 
19 | #include <cstdlib>
20 | 
int main()
{
  // This test just checks that RDC is enabled and detected properly when using
  // the %PARAM% system to request CDP support (see the README.md file in
  // this directory).

  // %PARAM% TEST_CDP cdp 0:1

  // Value TEST_CDP must have for the detection macro to agree with the request.
#ifdef CUB_RDC_ENABLED
  const int expected_cdp = 1;
#else
  const int expected_cdp = 0;
#endif

  if (TEST_CDP == expected_cdp)
  {
    return EXIT_SUCCESS;
  }

  return EXIT_FAILURE;
}
35 | 


--------------------------------------------------------------------------------
/test/test_grid_barrier.cu:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  4 |  *
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  *
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /******************************************************************************
 30 |  * Test evaluation for software global barrier throughput
 31 |  ******************************************************************************/
 32 | 
 33 | // Ensure printing of CUDA runtime errors to console
 34 | #define CUB_STDERR
 35 | 
 36 | #include <stdio.h>
 37 | 
 38 | #include <cub/grid/grid_barrier.cuh>
 39 | 
 40 | #include "test_util.h"
 41 | 
 42 | using namespace cub;
 43 | 
 44 | 
 45 | //---------------------------------------------------------------------
 46 | // Test kernels
 47 | //---------------------------------------------------------------------
 48 | 
 49 | /**
 50 |  * Kernel that iterates through the specified number of software global barriers
 51 |  */
 52 | __global__ void Kernel(
 53 |     GridBarrier global_barrier,
 54 |     int iterations)
 55 | {
 56 |     for (int i = 0; i < iterations; i++)
 57 |     {
 58 |         global_barrier.Sync();
 59 |     }
 60 | }
 61 | 
 62 | 
 63 | //---------------------------------------------------------------------
 64 | // Main
 65 | //---------------------------------------------------------------------
 66 | 
 67 | /**
 68 |  * Main
 69 |  */
 70 | int main(int argc, char** argv)
 71 | {
 72 |     cudaError_t retval = cudaSuccess;
 73 | 
 74 |     // Defaults
 75 |     int iterations = 10000;
 76 |     int block_size = 128;
 77 |     int grid_size = -1;
 78 | 
 79 |     // Initialize command line
 80 |     CommandLineArgs args(argc, argv);
 81 | 
 82 |     // Get args
 83 |     args.GetCmdLineArgument("i", iterations);
 84 |     args.GetCmdLineArgument("grid-size", grid_size);
 85 |     args.GetCmdLineArgument("block-size", block_size);
 86 | 
 87 |     // Print usage
 88 |     if (args.CheckCmdLineFlag("help"))
 89 |     {
 90 |         printf("%s "
 91 |             "[--device=<device-id>]"
 92 |             "[--i=<iterations>]"
 93 |             "[--grid-size<grid-size>]"
 94 |             "[--block-size<block-size>]"
 95 |             "\n", argv[0]);
 96 |         exit(0);
 97 |     }
 98 | 
 99 |     // Initialize device
100 |     CubDebugExit(args.DeviceInit());
101 | 
102 |     // Get device ordinal
103 |     int device_ordinal;
104 |     CubDebugExit(cudaGetDevice(&device_ordinal));
105 | 
106 |     // Get device SM version
107 |     int sm_version = 0;
108 |     CubDebugExit(SmVersion(sm_version, device_ordinal));
109 | 
110 |     // Get SM properties
111 |     int sm_count, max_block_threads, max_sm_occupancy;
112 |     CubDebugExit(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal));
113 |     CubDebugExit(cudaDeviceGetAttribute(&max_block_threads, cudaDevAttrMaxThreadsPerBlock, device_ordinal));
114 |     CubDebugExit(MaxSmOccupancy(max_sm_occupancy, EmptyKernel<void>, 32));
115 | 
116 |     // Compute grid size and occupancy
117 |     int occupancy = CUB_MIN((max_block_threads / block_size), max_sm_occupancy);
118 | 
119 |     if (grid_size == -1)
120 |     {
121 |         grid_size = occupancy * sm_count;
122 |     }
123 |     else
124 |     {
125 |         occupancy = grid_size / sm_count;
126 |     }
127 | 
128 |     printf("Initializing software global barrier for Kernel<<<%d,%d>>> with %d occupancy\n",
129 |         grid_size, block_size, occupancy);
130 |     fflush(stdout);
131 | 
132 |     // Init global barrier
133 |     GridBarrierLifetime global_barrier;
134 |     global_barrier.Setup(grid_size);
135 | 
136 |     // Time kernel
137 |     GpuTimer gpu_timer;
138 |     gpu_timer.Start();
139 |     Kernel<<<grid_size, block_size>>>(global_barrier, iterations);
140 |     gpu_timer.Stop();
141 | 
142 |     retval = CubDebug(cudaDeviceSynchronize());
143 | 
144 |     // Output timing results
145 |     float avg_elapsed = gpu_timer.ElapsedMillis() / float(iterations);
146 |     printf("%d iterations, %f total elapsed millis, %f avg elapsed millis\n",
147 |         iterations,
148 |         gpu_timer.ElapsedMillis(),
149 |         avg_elapsed);
150 | 
151 |     return retval;
152 | }
153 | 


--------------------------------------------------------------------------------
/test/test_namespace_wrapped.cu:
--------------------------------------------------------------------------------
 1 | // Wrap thrust and cub in different enclosing namespaces
 2 | // (In practice, you probably want these to be the same, in which case just
 3 | // set THRUST_CUB_WRAPPED_NAMESPACE to set both).
 4 | #define THRUST_WRAPPED_NAMESPACE wrap_thrust
 5 | #define CUB_WRAPPED_NAMESPACE    wrap_cub
 6 | 
 7 | // Enable error checking:
 8 | #define CUB_STDERR
 9 | 
10 | #include <thrust/device_vector.h>
11 | #include <thrust/host_vector.h>
12 | #include <thrust/sort.h>
13 | 
14 | #include <cub/device/device_radix_sort.cuh>
15 | #include <cub/util_debug.cuh>
16 | 
17 | #include "test_util.h"
18 | 
19 | #include <cstdint>
20 | #include <cstdlib>
21 | 
// Runtime smoke test for wrapped Thrust/CUB namespaces: a handful of common
// utilities and one algorithm are reached both through the explicit
// ::wrap_thrust / ::wrap_cub names and through the *_NS_QUALIFIER macros.
// More extensive testing is performed by the header tests and the
// check_namespace.cmake test.
int main(int argc, char **argv)
{
  CommandLineArgs cmd_line(argc, argv);
  CubDebugExit(cmd_line.DeviceInit());

  const std::size_t num_items = 2048;

  // Fill a vector with random data:
  ::wrap_thrust::thrust::host_vector<int> h_keys(num_items);
  for (std::size_t idx = 0; idx < num_items; ++idx)
  {
    RandomBits(h_keys[idx]);
  }

  // Test the qualifier macro:
  THRUST_NS_QUALIFIER::device_vector<int> d_keys_in(h_keys);
  THRUST_NS_QUALIFIER::device_vector<int> d_keys_out(num_items);

  int *const keys_in_ptr  = ::wrap_thrust::thrust::raw_pointer_cast(d_keys_in.data());
  int *const keys_out_ptr = ::wrap_thrust::thrust::raw_pointer_cast(d_keys_out.data());

  std::size_t temp_storage_bytes = 0;

  // First call to DeviceRadixSort passes a null temporary-storage pointer:
  auto status = ::wrap_cub::cub::DeviceRadixSort::SortKeys(nullptr,
                                                           temp_storage_bytes,
                                                           keys_in_ptr,
                                                           keys_out_ptr,
                                                           num_items);
  CubDebugExit(status);

  // Allocate as many bytes as the first call reported:
  ::wrap_thrust::thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);

  // Test the CUB qualifier macro:
  status = CUB_NS_QUALIFIER::DeviceRadixSort::SortKeys(
    ::wrap_thrust::thrust::raw_pointer_cast(temp_storage.data()),
    temp_storage_bytes,
    keys_in_ptr,
    keys_out_ptr,
    num_items);
  CubDebugExit(status);

  // Verify output:
  const bool sorted =
    ::wrap_thrust::thrust::is_sorted(d_keys_out.cbegin(), d_keys_out.cend());
  if (!sorted)
  {
    std::cerr << "Output is not sorted!\n";
    return EXIT_FAILURE;
  }

  return EXIT_SUCCESS;
}
77 | 


--------------------------------------------------------------------------------
/test/test_thread_sort.cu:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 |  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | #include "test_util.h"
 29 | #include "cub/thread/thread_sort.cuh"
 30 | 
 31 | #include <thrust/device_vector.h>
 32 | #include <thrust/host_vector.h>
 33 | #include <thrust/sequence.h>
 34 | #include <thrust/shuffle.h>
 35 | #include <thrust/sort.h>
 36 | #include <thrust/random.h>
 37 | 
 38 | 
 39 | struct CustomLess
 40 | {
 41 |   template <typename DataType>
 42 |   __host__ __device__ bool operator()(DataType &lhs, DataType &rhs)
 43 |   {
 44 |     return lhs < rhs;
 45 |   }
 46 | };
 47 | 
 48 | 
 49 | template <typename KeyT,
 50 |           typename ValueT,
 51 |           int ItemsPerThread>
 52 | __global__ void kernel(const KeyT *keys_in,
 53 |                        KeyT *keys_out,
 54 |                        const ValueT *values_in,
 55 |                        ValueT *values_out)
 56 | {
 57 |   KeyT thread_keys[ItemsPerThread];
 58 |   KeyT thread_values[ItemsPerThread];
 59 | 
 60 |   const auto thread_offset = ItemsPerThread * threadIdx.x;
 61 |   keys_in += thread_offset;
 62 |   keys_out += thread_offset;
 63 |   values_in += thread_offset;
 64 |   values_out += thread_offset;
 65 | 
 66 |   for (int item = 0; item < ItemsPerThread; item++)
 67 |   {
 68 |     thread_keys[item] = keys_in[item];
 69 |     thread_values[item] = values_in[item];
 70 |   }
 71 | 
 72 |   cub::StableOddEvenSort(thread_keys, thread_values, CustomLess{});
 73 | 
 74 |   for (int item = 0; item < ItemsPerThread; item++)
 75 |   {
 76 |     keys_out[item] = thread_keys[item];
 77 |     values_out[item] = thread_values[item];
 78 |   }
 79 | }
 80 | 
 81 | 
 82 | template <typename KeyT,
 83 |           typename ValueT,
 84 |           int ItemsPerThread>
 85 | void Test()
 86 | {
 87 |   const unsigned int threads_in_block = 1024;
 88 |   const unsigned int elements = threads_in_block * ItemsPerThread;
 89 | 
 90 |   thrust::default_random_engine re;
 91 |   thrust::device_vector<std::uint8_t> data_source(elements);
 92 | 
 93 |   for (int iteration = 0; iteration < 10; iteration++)
 94 |   {
 95 |     thrust::sequence(data_source.begin(), data_source.end());
 96 |     thrust::shuffle(data_source.begin(), data_source.end(), re);
 97 |     thrust::device_vector<KeyT> in_keys(data_source);
 98 |     thrust::device_vector<KeyT> out_keys(elements);
 99 | 
100 |     thrust::shuffle(data_source.begin(), data_source.end(), re);
101 |     thrust::device_vector<ValueT> in_values(data_source);
102 |     thrust::device_vector<ValueT> out_values(elements);
103 | 
104 |     thrust::host_vector<KeyT> host_keys(in_keys);
105 |     thrust::host_vector<ValueT> host_values(in_values);
106 | 
107 |     kernel<KeyT, ValueT, ItemsPerThread><<<1, threads_in_block>>>(
108 |       thrust::raw_pointer_cast(in_keys.data()),
109 |       thrust::raw_pointer_cast(out_keys.data()),
110 |       thrust::raw_pointer_cast(in_values.data()),
111 |       thrust::raw_pointer_cast(out_values.data()));
112 | 
113 |     for (unsigned int tid = 0; tid < threads_in_block; tid++)
114 |     {
115 |       const auto thread_begin = tid * ItemsPerThread;
116 |       const auto thread_end = thread_begin + ItemsPerThread;
117 | 
118 |       thrust::sort_by_key(host_keys.begin() + thread_begin,
119 |                           host_keys.begin() + thread_end,
120 |                           host_values.begin() + thread_begin,
121 |                           CustomLess{});
122 |     }
123 | 
124 |     AssertEquals(host_keys, out_keys);
125 |     AssertEquals(host_values, out_values);
126 |   }
127 | }
128 | 
129 | 
// Runs the three-parameter Test for one key/value type pair with each of the
// ItemsPerThread values listed below, in this order.
template <typename KeyT,
          typename ValueT>
void Test()
{
  Test<KeyT, ValueT, 2>();
  Test<KeyT, ValueT, 3>();
  Test<KeyT, ValueT, 4>();
  Test<KeyT, ValueT, 5>();
  Test<KeyT, ValueT, 7>();
  Test<KeyT, ValueT, 8>();
  Test<KeyT, ValueT, 9>();
  Test<KeyT, ValueT, 11>();
}
143 | 
// Entry point: runs the test suite with 32-bit keys paired first with 32-bit
// and then with 64-bit values. Always returns 0 when the calls return.
int main()
{
  Test<std::uint32_t, std::uint32_t>();
  Test<std::uint32_t, std::uint64_t>();

  return 0;
}
151 | 


--------------------------------------------------------------------------------