├── .clang-format
├── .clang-tidy
├── .clangd
├── .git-blame-ignore-revs
├── .github
    ├── ISSUE_TEMPLATE
    │   └── config.yml
    └── workflows
    │   ├── mirror-main-branch-to-master-branch.yml
    │   └── push-to-legacy-repositories.yml
├── .gitignore
├── CHANGELOG.md
├── CMakeLists.txt
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE.TXT
├── README.md
├── benchmarks
    ├── CMakeLists.txt
    ├── README.md
    ├── bench
    │   ├── adjacent_difference
    │   │   └── subtract_left.cu
    │   ├── histogram
    │   │   ├── even.cu
    │   │   ├── histogram_common.cuh
    │   │   ├── multi
    │   │   │   ├── even.cu
    │   │   │   └── range.cu
    │   │   └── range.cu
    │   ├── merge_sort
    │   │   ├── keys.cu
    │   │   └── pairs.cu
    │   ├── partition
    │   │   ├── flagged.cu
    │   │   └── if.cu
    │   ├── radix_sort
    │   │   ├── keys.cu
    │   │   └── pairs.cu
    │   ├── reduce
    │   │   ├── base.cuh
    │   │   ├── by_key.cu
    │   │   ├── max.cu
    │   │   └── sum.cu
    │   ├── run_length_encode
    │   │   ├── encode.cu
    │   │   └── non_trivial_runs.cu
    │   ├── scan
    │   │   └── exclusive
    │   │   │   ├── base.cuh
    │   │   │   ├── by_key.cu
    │   │   │   ├── max.cu
    │   │   │   └── sum.cu
    │   ├── segmented_sort
    │   │   ├── large
    │   │   │   └── keys.cu
    │   │   ├── power_law
    │   │   │   └── keys.cu
    │   │   └── small
    │   │   │   └── keys.cu
    │   └── select
    │   │   ├── flagged.cu
    │   │   ├── if.cu
    │   │   └── unique_by_key.cu
    ├── docker
    │   ├── .gitignore
    │   └── recipe.py
    ├── nvbench_helper
    │   ├── CMakeLists.txt
    │   ├── look_back_helper.cuh
    │   ├── nvbench_helper.cu
    │   └── nvbench_helper.cuh
    └── scripts
    │   ├── .gitignore
    │   ├── analysis.ipynb
    │   ├── analyze.py
    │   ├── cub
    │       ├── __init__.py
    │       └── bench
    │       │   ├── __init__.py
    │       │   ├── bench.py
    │       │   ├── build.py
    │       │   ├── cmake.py
    │       │   ├── config.py
    │       │   ├── logger.py
    │       │   ├── score.py
    │       │   ├── search.py
    │       │   └── storage.py
    │   ├── search.py
    │   └── verify.py
├── cmake
    ├── AppendOptionIfAvailable.cmake
    ├── CPM.cmake
    ├── CubAddSubdir.cmake
    ├── CubBuildCompilerTargets.cmake
    ├── CubBuildTargetList.cmake
    ├── CubCompilerHacks.cmake
    ├── CubCudaConfig.cmake
    ├── CubHeaderTesting.cmake
    ├── CubInstallRules.cmake
    ├── CubUtilities.cmake
    └── header_test.in
├── cub
    ├── agent
    │   ├── agent_adjacent_difference.cuh
    │   ├── agent_batch_memcpy.cuh
    │   ├── agent_histogram.cuh
    │   ├── agent_merge_sort.cuh
    │   ├── agent_radix_sort_downsweep.cuh
    │   ├── agent_radix_sort_histogram.cuh
    │   ├── agent_radix_sort_onesweep.cuh
    │   ├── agent_radix_sort_upsweep.cuh
    │   ├── agent_reduce.cuh
    │   ├── agent_reduce_by_key.cuh
    │   ├── agent_rle.cuh
    │   ├── agent_scan.cuh
    │   ├── agent_scan_by_key.cuh
    │   ├── agent_segment_fixup.cuh
    │   ├── agent_segmented_radix_sort.cuh
    │   ├── agent_select_if.cuh
    │   ├── agent_spmv_orig.cuh
    │   ├── agent_sub_warp_merge_sort.cuh
    │   ├── agent_three_way_partition.cuh
    │   ├── agent_unique_by_key.cuh
    │   └── single_pass_scan_operators.cuh
    ├── block
    │   ├── block_adjacent_difference.cuh
    │   ├── block_discontinuity.cuh
    │   ├── block_exchange.cuh
    │   ├── block_histogram.cuh
    │   ├── block_load.cuh
    │   ├── block_merge_sort.cuh
    │   ├── block_radix_rank.cuh
    │   ├── block_radix_sort.cuh
    │   ├── block_raking_layout.cuh
    │   ├── block_reduce.cuh
    │   ├── block_run_length_decode.cuh
    │   ├── block_scan.cuh
    │   ├── block_shuffle.cuh
    │   ├── block_store.cuh
    │   ├── radix_rank_sort_operations.cuh
    │   └── specializations
    │   │   ├── block_histogram_atomic.cuh
    │   │   ├── block_histogram_sort.cuh
    │   │   ├── block_reduce_raking.cuh
    │   │   ├── block_reduce_raking_commutative_only.cuh
    │   │   ├── block_reduce_warp_reductions.cuh
    │   │   ├── block_scan_raking.cuh
    │   │   └── block_scan_warp_scans.cuh
    ├── cmake
    │   ├── cub-config-version.cmake
    │   ├── cub-config.cmake
    │   ├── cub-header-search.cmake
    │   └── cub-header-search.cmake.in
    ├── config.cuh
    ├── cub.cuh
    ├── detail
    │   ├── choose_offset.cuh
    │   ├── cpp_compatibility.cuh
    │   ├── detect_cuda_runtime.cuh
    │   ├── device_double_buffer.cuh
    │   ├── device_synchronize.cuh
    │   ├── exec_check_disable.cuh
    │   ├── strong_load.cuh
    │   ├── strong_store.cuh
    │   ├── temporary_storage.cuh
    │   ├── type_traits.cuh
    │   └── uninitialized_copy.cuh
    ├── device
    │   ├── device_adjacent_difference.cuh
    │   ├── device_copy.cuh
    │   ├── device_histogram.cuh
    │   ├── device_memcpy.cuh
    │   ├── device_merge_sort.cuh
    │   ├── device_partition.cuh
    │   ├── device_radix_sort.cuh
    │   ├── device_reduce.cuh
    │   ├── device_run_length_encode.cuh
    │   ├── device_scan.cuh
    │   ├── device_segmented_radix_sort.cuh
    │   ├── device_segmented_reduce.cuh
    │   ├── device_segmented_sort.cuh
    │   ├── device_select.cuh
    │   ├── device_spmv.cuh
    │   └── dispatch
    │   │   ├── dispatch_adjacent_difference.cuh
    │   │   ├── dispatch_batch_memcpy.cuh
    │   │   ├── dispatch_histogram.cuh
    │   │   ├── dispatch_merge_sort.cuh
    │   │   ├── dispatch_radix_sort.cuh
    │   │   ├── dispatch_reduce.cuh
    │   │   ├── dispatch_reduce_by_key.cuh
    │   │   ├── dispatch_rle.cuh
    │   │   ├── dispatch_scan.cuh
    │   │   ├── dispatch_scan_by_key.cuh
    │   │   ├── dispatch_segmented_sort.cuh
    │   │   ├── dispatch_select_if.cuh
    │   │   ├── dispatch_spmv_orig.cuh
    │   │   ├── dispatch_three_way_partition.cuh
    │   │   ├── dispatch_unique_by_key.cuh
    │   │   └── tuning
    │   │       ├── tuning_run_length_encode.cuh
    │   │       ├── tuning_scan.cuh
    │   │       └── tuning_select_if.cuh
    ├── grid
    │   ├── grid_barrier.cuh
    │   ├── grid_even_share.cuh
    │   ├── grid_mapping.cuh
    │   └── grid_queue.cuh
    ├── host
    │   └── mutex.cuh
    ├── iterator
    │   ├── arg_index_input_iterator.cuh
    │   ├── cache_modified_input_iterator.cuh
    │   ├── cache_modified_output_iterator.cuh
    │   ├── constant_input_iterator.cuh
    │   ├── counting_input_iterator.cuh
    │   ├── discard_output_iterator.cuh
    │   ├── tex_obj_input_iterator.cuh
    │   ├── tex_ref_input_iterator.cuh
    │   └── transform_input_iterator.cuh
    ├── thread
    │   ├── thread_load.cuh
    │   ├── thread_operators.cuh
    │   ├── thread_reduce.cuh
    │   ├── thread_scan.cuh
    │   ├── thread_search.cuh
    │   ├── thread_sort.cuh
    │   └── thread_store.cuh
    ├── util_allocator.cuh
    ├── util_arch.cuh
    ├── util_compiler.cuh
    ├── util_cpp_dialect.cuh
    ├── util_debug.cuh
    ├── util_deprecated.cuh
    ├── util_device.cuh
    ├── util_macro.cuh
    ├── util_math.cuh
    ├── util_namespace.cuh
    ├── util_ptx.cuh
    ├── util_type.cuh
    ├── version.cuh
    └── warp
    │   ├── specializations
    │       ├── warp_reduce_shfl.cuh
    │       ├── warp_reduce_smem.cuh
    │       ├── warp_scan_shfl.cuh
    │       └── warp_scan_smem.cuh
    │   ├── warp_exchange.cuh
    │   ├── warp_load.cuh
    │   ├── warp_merge_sort.cuh
    │   ├── warp_reduce.cuh
    │   ├── warp_scan.cuh
    │   └── warp_store.cuh
├── docs
    ├── .gitignore
    ├── VERSION.md
    ├── deps
    │   └── repo-deps.packman.xml
    ├── developer_overview.rst
    ├── gen_docs.sh
    ├── index.rst
    ├── repo.bat
    ├── repo.sh
    ├── repo.toml
    ├── test_overview.rst
    ├── tools
    │   ├── packman
    │   │   ├── bootstrap
    │   │   │   ├── configure.bat
    │   │   │   ├── download_file_from_url.ps1
    │   │   │   ├── fetch_file_from_packman_bootstrap.cmd
    │   │   │   ├── generate_temp_file_name.ps1
    │   │   │   ├── generate_temp_folder.ps1
    │   │   │   └── install_package.py
    │   │   ├── config.packman.xml
    │   │   ├── packman
    │   │   ├── packman.cmd
    │   │   ├── packmanconf.py
    │   │   ├── python.bat
    │   │   └── python.sh
    │   └── repoman
    │   │   ├── omni
    │   │       └── repo
    │   │       │   └── format
    │   │       │       └── .gitignore
    │   │   └── repoman.py
    └── tuning.rst
├── examples
    ├── CMakeLists.txt
    ├── block
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── example_block_radix_sort.cu
    │   ├── example_block_reduce.cu
    │   ├── example_block_reduce_dyn_smem.cu
    │   └── example_block_scan.cu
    ├── cmake
    │   ├── CMakeLists.txt
    │   └── add_subdir
    │   │   ├── CMakeLists.txt
    │   │   └── dummy.cu
    └── device
    │   ├── .gitignore
    │   ├── CMakeLists.txt
    │   ├── example_device_decoupled_look_back.cu
    │   ├── example_device_partition_flagged.cu
    │   ├── example_device_partition_if.cu
    │   ├── example_device_radix_sort.cu
    │   ├── example_device_radix_sort_custom.cu
    │   ├── example_device_reduce.cu
    │   ├── example_device_scan.cu
    │   ├── example_device_select_flagged.cu
    │   ├── example_device_select_if.cu
    │   ├── example_device_select_unique.cu
    │   └── example_device_sort_find_non_trivial_runs.cu
└── test
    ├── .gitignore
    ├── CMakeLists.txt
    ├── README.md
    ├── bfloat16.h
    ├── c2h
        ├── custom_type.cuh
        ├── generators.cu
        └── generators.cuh
    ├── catch2_runner.cu
    ├── catch2_test_block_adjacent_difference.cu
    ├── catch2_test_block_histogram.cu
    ├── catch2_test_block_load.cu
    ├── catch2_test_block_merge_sort.cu
    ├── catch2_test_block_radix_sort.cu
    ├── catch2_test_block_radix_sort.cuh
    ├── catch2_test_block_radix_sort_custom.cu
    ├── catch2_test_block_reduce.cu
    ├── catch2_test_block_run_length_decode.cu
    ├── catch2_test_block_scan.cu
    ├── catch2_test_block_shuffle.cu
    ├── catch2_test_block_store.cu
    ├── catch2_test_cdp_helper.h
    ├── catch2_test_cdp_wrapper.cu
    ├── catch2_test_device_decoupled_look_back.cu
    ├── catch2_test_device_radix_sort_custom.cu
    ├── catch2_test_helper.h
    ├── catch2_test_printing.cu
    ├── catch2_test_radix_operations.cu
    ├── catch2_test_util_type.cu
    ├── catch2_test_warp_exchange.cu
    ├── catch2_test_warp_load.cu
    ├── catch2_test_warp_mask.cu
    ├── catch2_test_warp_merge_sort.cu
    ├── catch2_test_warp_reduce.cu
    ├── catch2_test_warp_scan.cu
    ├── catch2_test_warp_store.cu
    ├── cmake
        ├── CMakeLists.txt
        ├── check_source_files.cmake
        └── test_install
        │   └── CMakeLists.txt
    ├── fill_striped.cuh
    ├── half.h
    ├── link_a.cu
    ├── link_b.cu
    ├── link_main.cpp
    ├── mersenne.h
    ├── test_allocator.cu
    ├── test_block_radix_rank.cu
    ├── test_cdp_variant_state.cu
    ├── test_device_adjacent_difference.cu
    ├── test_device_batch_copy.cu
    ├── test_device_batch_memcpy.cu
    ├── test_device_histogram.cu
    ├── test_device_merge_sort.cu
    ├── test_device_radix_sort.cu
    ├── test_device_reduce.cu
    ├── test_device_reduce_by_key.cu
    ├── test_device_run_length_encode.cu
    ├── test_device_scan.cu
    ├── test_device_scan_by_key.cu
    ├── test_device_segmented_sort.cu
    ├── test_device_select_if.cu
    ├── test_device_select_unique.cu
    ├── test_device_select_unique_by_key.cu
    ├── test_device_spmv.cu
    ├── test_device_three_way_partition.cu
    ├── test_grid_barrier.cu
    ├── test_iterator.cu
    ├── test_iterator_deprecated.cu
    ├── test_namespace_wrapped.cu
    ├── test_temporary_storage_layout.cu
    ├── test_thread_operators.cu
    ├── test_thread_sort.cu
    ├── test_util.h
    └── test_util_vec.h


/.clang-format:
--------------------------------------------------------------------------------
 1 | BasedOnStyle: LLVM
 2 | AccessModifierOffset: -2
 3 | AlignAfterOpenBracket: Align
 4 | AlignConsecutiveAssignments: true
 5 | AlignEscapedNewlines: Right
 6 | AlignOperands: true
 7 | AllowAllArgumentsOnNextLine: false
 8 | AllowAllConstructorInitializersOnNextLine: false
 9 | AllowAllParametersOfDeclarationOnNextLine: false
10 | AllowShortBlocksOnASingleLine: false
11 | AllowShortCaseLabelsOnASingleLine: false
12 | AllowShortFunctionsOnASingleLine: All
13 | AllowShortIfStatementsOnASingleLine: Never
14 | AllowShortLambdasOnASingleLine: All
15 | AllowShortLoopsOnASingleLine: false
16 | AlwaysBreakAfterReturnType: None
17 | AlwaysBreakTemplateDeclarations: Yes
18 | BinPackArguments: false
19 | BinPackParameters: false
20 | BreakBeforeBraces: Custom
21 | BraceWrapping:
22 |   AfterCaseLabel: false
23 |   AfterClass: true
24 |   AfterControlStatement: true
25 |   AfterEnum: true
26 |   AfterFunction: true
27 |   AfterNamespace: true
28 |   AfterStruct: true
29 |   AfterUnion: true
30 |   BeforeCatch: true
31 |   BeforeElse: true
32 |   IndentBraces: false
33 |   SplitEmptyFunction: false
34 |   SplitEmptyRecord: false
35 | BreakBeforeBinaryOperators: None
36 | BreakBeforeTernaryOperators: true
37 | BreakConstructorInitializers: BeforeComma
38 | BreakInheritanceList: BeforeComma
39 | ColumnLimit: 100
40 | CompactNamespaces: false
41 | ContinuationIndentWidth: 2
42 | IncludeBlocks:   Regroup
43 | IncludeCategories:
44 |   - Regex:           '^<cub'
45 |     Priority:        1
46 |   - Regex:           '^<thrust'
47 |     Priority:        2
48 |   - Regex:           '^<cuda'
49 |     Priority:        3
50 |   - Regex:           '^<[a-z_]*>$'
51 |     Priority:        4
52 | IndentCaseLabels: true
53 | IndentPPDirectives: None
54 | IndentWidth: 2
55 | KeepEmptyLinesAtTheStartOfBlocks: true
56 | MaxEmptyLinesToKeep: 1
57 | NamespaceIndentation: None
58 | PenaltyBreakAssignment: 30
59 | PenaltyBreakBeforeFirstCallParameter: 50
60 | PenaltyBreakComment: 0
61 | PenaltyBreakFirstLessLess: 0
62 | PenaltyBreakString: 70
63 | PenaltyBreakTemplateDeclaration: 0
64 | PenaltyExcessCharacter: 100
65 | PenaltyReturnTypeOnItsOwnLine: 90
66 | PointerAlignment: Right
67 | ReflowComments: true
68 | SortIncludes: CaseInsensitive
69 | SpaceAfterCStyleCast: false
70 | SpaceAfterLogicalNot: false
71 | SpaceAfterTemplateKeyword: true
72 | SpaceBeforeAssignmentOperators: true
73 | SpaceBeforeCpp11BracedList: false
74 | SpaceBeforeCtorInitializerColon: true
75 | SpaceBeforeInheritanceColon: true
76 | SpaceBeforeParens: ControlStatements
77 | SpaceBeforeRangeBasedForLoopColon: true
78 | SpaceInEmptyParentheses: false
79 | SpacesBeforeTrailingComments: 1
80 | SpacesInAngles: false
81 | SpacesInCStyleCastParentheses: false
82 | SpacesInParentheses: false
83 | SpacesInSquareBrackets: false
84 | Standard: c++11
85 | TabWidth: 2
86 | UseTab: Never
87 | 


--------------------------------------------------------------------------------
/.clang-tidy:
--------------------------------------------------------------------------------
 1 | ---
 2 | Checks:
 3 |       'modernize-*,
 4 |        -modernize-use-equals-default,
 5 |        -modernize-concat-nested-namespaces,
 6 |        -modernize-use-trailing-return-type'
 7 | 
 8 |       # -modernize-use-equals-default        # auto-fix is broken (doesn't insert =default correctly)
 9 |       # -modernize-concat-nested-namespaces  # auto-fix is broken (can delete code)
10 |       # -modernize-use-trailing-return-type  # just a preference
11 | 
12 | WarningsAsErrors: ''
13 | HeaderFilterRegex: ''
14 | AnalyzeTemporaryDtors: false
15 | FormatStyle:     none
16 | CheckOptions:
17 |  - key:             modernize-loop-convert.MaxCopySize
18 |    value:           '16'
19 |  - key:             modernize-loop-convert.MinConfidence
20 |    value:           reasonable
21 |  - key:             modernize-pass-by-value.IncludeStyle
22 |    value:           llvm
23 |  - key:             modernize-replace-auto-ptr.IncludeStyle
24 |    value:           llvm
25 |  - key:             modernize-use-nullptr.NullMacros
26 |    value:           'NULL'
27 | ...
28 | 


--------------------------------------------------------------------------------
/.clangd:
--------------------------------------------------------------------------------
 1 | # https://clangd.llvm.org/config
 2 | 
 3 | # Apply a config conditionally to all C files
 4 | If:
 5 |   PathMatch: .*\.(c|h)$
 6 | 
 7 | ---
 8 | 
 9 | # Apply a config conditionally to all C++ files
10 | If:
11 |   PathMatch: .*\.(c|h)pp
12 | 
13 | ---
14 | 
15 | # Apply a config conditionally to all CUDA files
16 | If:
17 |   PathMatch: .*\.cuh?
18 | CompileFlags:
19 |   Add:
20 |     # Allow variadic CUDA functions
21 |     - "-Xclang=-fcuda-allow-variadic-functions"
22 | 
23 | ---
24 | 
25 | # Tweak the clangd parse settings for all files
26 | CompileFlags:
27 |   Compiler: clang++
28 |   CompilationDatabase: .
29 |   Add:
30 |     - -x
31 |     - cuda
32 |     # report all errors
33 |     - "-ferror-limit=0"
34 |     - "-ftemplate-backtrace-limit=0"
35 |     - "-stdlib=libc++"
36 |   Remove:
37 |     - -stdpar
38 |     # strip CUDA fatbin args
39 |     - "-Xfatbin*"
40 |     - "-Xcompiler*"
41 |     - "-Xcudafe*"
42 |     - "-rdc=*"
43 |     - "-gpu=*"
44 |     - "--diag_suppress*"
45 |     # strip CUDA arch flags
46 |     - "-gencode*"
47 |     - "--generate-code*"
48 |     # strip gcc's -fcoroutines
49 |     - -fcoroutines
50 |     # strip CUDA flags unknown to clang
51 |     - "-ccbin*"
52 |     - "--compiler-options*"
53 |     - "--expt-extended-lambda"
54 |     - "--expt-relaxed-constexpr"
55 |     - "-forward-unknown-to-host-compiler"
56 |     - "-Werror=cross-execution-space-call"
57 | Diagnostics:
58 |   Suppress:
59 |     - "variadic_device_fn"
60 |     - "attributes_not_allowed"
61 |     # The NVHPC version of _NVCXX_EXPAND_PACK macro triggers this clang error.
62 |     # Temporarily suppressing it, but should probably fix
63 |     - "template_param_shadow"
64 | 


--------------------------------------------------------------------------------
/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
 1 | # Exclude these commits from git-blame and similar tools.
 2 | #
 3 | # To use this file, run the following command from the repo root:
 4 | #
 5 | # ```
 6 | # $ git config blame.ignoreRevsFile .git-blame-ignore-revs
 7 | # ```
 8 | #
 9 | # Include a brief comment with each commit added, for example:
10 | #
11 | # ```
12 | # d92d9f8baac5ec48a8f8718dd69f415a45efe372 # Initial clang-format
13 | # ```
14 | #
15 | # Only add commits that are pure formatting changes (e.g.
16 | # clang-format version changes, etc).
17 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 |   - name: Open Issue in CCCL Repository
4 |     url: https://github.com/NVIDIA/cccl/issues/new/choose
5 |     about:  This repository has moved! Please see the new home for CUB.
6 | 


--------------------------------------------------------------------------------
/.github/workflows/mirror-main-branch-to-master-branch.yml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - "main"
 5 | 
 6 | jobs:
 7 |   mirror-main-branch-to-master-branch:
 8 |     name: Mirror main branch to master branch
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |     - name: Mirror main branch to master branch
12 |       id: mirror
13 |       uses: google/mirror-branch-action@v1.0
14 |       with:
15 |         source: "main"
16 |         dest: "master"
17 |         github-token: ${{ secrets.GITHUB_TOKEN }}
18 | 


--------------------------------------------------------------------------------
/.github/workflows/push-to-legacy-repositories.yml:
--------------------------------------------------------------------------------
 1 | on: push
 2 | 
 3 | jobs:
 4 |   push-to-legacy-repositories:
 5 |     name: Push to legacy repositories
 6 |     runs-on: ubuntu-latest
 7 |     steps:
 8 |     - name: Push `main` to github.com/nvlabs/cub
 9 |       uses: wei/git-sync@v2
10 |       if: github.repository == 'nvidia/cub'
11 |       with:
12 |         source_repo: "nvidia/cub"
13 |         source_branch: "main"
14 |         destination_repo: "nvlabs/cub"
15 |         destination_branch: "main"
16 |         ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }}
17 |     - name: Push all tags to github.com/nvlabs/cub
18 |       uses: wei/git-sync@v2
19 |       if: github.repository == 'nvidia/cub'
20 |       with:
21 |         source_repo: "nvidia/cub"
22 |         source_branch: "refs/tags/*"
23 |         destination_repo: "nvlabs/cub"
24 |         destination_branch: "refs/tags/*"
25 |         ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }}
26 |     - name: Push `main` to github.com/thrust/cub
27 |       uses: wei/git-sync@v2
28 |       if: github.repository == 'nvidia/cub'
29 |       with:
30 |         source_repo: "nvidia/cub"
31 |         source_branch: "main"
32 |         destination_repo: "thrust/cub"
33 |         destination_branch: "main"
34 |         ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }}
35 |     - name: Push all tags to github.com/thrust/cub
36 |       uses: wei/git-sync@v2
37 |       if: github.repository == 'nvidia/cub'
38 |       with:
39 |         source_repo: "nvidia/cub"
40 |         source_branch: "refs/tags/*"
41 |         destination_repo: "thrust/cub"
42 |         destination_branch: "refs/tags/*"
43 |         ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }}
44 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .p4config
2 | *~
3 | \#*
4 | /build
5 | .cache
6 | .vscode
7 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | # 3.15 is the minimum.
  2 | # 3.17 for NVC++.
  3 | # 3.18.3 for C++17 + CUDA.
  4 | cmake_minimum_required(VERSION 3.15)
  5 | 
  6 | # Remove this when we use the new CUDA_ARCHITECTURES properties.
  7 | if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
  8 |   cmake_policy(SET CMP0104 OLD)
  9 | endif()
 10 | 
 11 | # CXX is only needed for AppendOptionIfAvailable.
 12 | project(CUB NONE)
 13 | 
 14 | # Determine whether CUB is the top-level project or included into
 15 | # another project via add_subdirectory().
 16 | if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}")
 17 |   set(CUB_TOPLEVEL_PROJECT ON)
 18 | else()
 19 |   set(CUB_TOPLEVEL_PROJECT OFF)
 20 | endif()
 21 | 
 22 | # This must be done before any languages are enabled:
 23 | if (CUB_TOPLEVEL_PROJECT)
 24 |   include(cmake/CubCompilerHacks.cmake)
 25 | endif()
 26 | 
 27 | # This must appear after our Compiler Hacks or else CMake will delete the cache
 28 | # and reconfigure from scratch.
 29 | # This must also appear before the installation rules, as it is required by the
 30 | # GNUInstallDirs CMake module.
 31 | enable_language(CXX)
 32 | 
 33 | # Thrust has its own copy of CUB install rules to handle packaging usecases
 34 | # where we want to install CUB headers but aren't actually building anything.
 35 | # In these cases the add_subdirectory(dependencies/cub) line in Thrust won't get
 36 | # called so we can't rely on CUB providing its own rules.
 37 | if (NOT CUB_IN_THRUST)
 38 |   option(CUB_ENABLE_INSTALL_RULES "Enable installation of CUB" ${CUB_TOPLEVEL_PROJECT})
 39 |   if (CUB_ENABLE_INSTALL_RULES)
 40 |     include(cmake/CubInstallRules.cmake)
 41 |   endif()
 42 | endif()
 43 | 
 44 | # Support adding CUB to a parent project via add_subdirectory.
 45 | # See examples/cmake/add_subdir/CMakeLists.txt for details.
 46 | if (NOT CUB_TOPLEVEL_PROJECT AND NOT CUB_IN_THRUST)
 47 |   include(cmake/CubAddSubdir.cmake)
 48 |   return()
 49 | endif()
 50 | 
 51 | option(CUB_ENABLE_HEADER_TESTING "Test that all public headers compile." ON)
 52 | option(CUB_ENABLE_TESTING "Build CUB testing suite." ON)
 53 | option(CUB_ENABLE_BENCHMARKS "Build CUB benchmarking suite." OFF)
 54 | option(CUB_ENABLE_TUNING "Build CUB tuning suite." OFF)
 55 | option(CUB_ENABLE_EXAMPLES "Build CUB examples." ON)
 56 | 
 57 | # This is needed for NVCXX QA, which requires a static set of executable names.
 58 | # Only a single dialect may be enabled when this is off.
 59 | option(CUB_ENABLE_CPP_DIALECT_IN_NAMES
 60 |   "Include C++ dialect information in target/object/etc names."
 61 |   ON
 62 | )
 63 | mark_as_advanced(CUB_ENABLE_CPP_DIALECT_IN_NAMES)
 64 | 
 65 | # This option is only used when CUB is built stand-alone; otherwise the Thrust
 66 | # option has the same effect.
 67 | if (NOT CUB_IN_THRUST)
 68 |   option(CUB_IGNORE_DEPRECATED_API
 69 |     "Suppress warnings about deprecated Thrust/CUB API."
 70 |     OFF
 71 |   )
 72 | endif()
 73 | 
 74 | # Check if we're actually building anything before continuing. If not, no need
 75 | # to search for deps, etc. This is a common approach for packagers that just
 76 | # need the install rules. See GH issue NVIDIA/thrust#1211.
 77 | if (NOT (CUB_ENABLE_HEADER_TESTING OR
 78 |          CUB_ENABLE_TESTING OR
 79 |          CUB_ENABLE_EXAMPLES))
 80 |   return()
 81 | endif()
 82 | 
 83 | include(cmake/AppendOptionIfAvailable.cmake)
 84 | include(cmake/CubBuildCompilerTargets.cmake)
 85 | include(cmake/CubBuildTargetList.cmake)
 86 | include(cmake/CubCudaConfig.cmake)
 87 | include(cmake/CubUtilities.cmake)
 88 | 
 89 | if ("" STREQUAL "${CMAKE_BUILD_TYPE}")
 90 |   set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build." FORCE)
 91 | 
 92 |   set_property(
 93 |     CACHE CMAKE_BUILD_TYPE
 94 |     PROPERTY STRINGS Debug Release RelWithDebInfo MinSizeRel
 95 |   )
 96 | endif ()
 97 | 
 98 | set(CMAKE_CXX_EXTENSIONS OFF)
 99 | 
100 | # Where to put build outputs. Use CMAKE_BINARY_DIR so they'll show up alongside
101 | # Thrust targets when building as part of Thrust.
102 | set(CUB_LIBRARY_OUTPUT_DIR "${CMAKE_BINARY_DIR}/lib")
103 | set(CUB_EXECUTABLE_OUTPUT_DIR "${CMAKE_BINARY_DIR}/bin")
104 | 
105 | cub_build_target_list()
106 | 
107 | if (CUB_ENABLE_HEADER_TESTING)
108 |   include(cmake/CubHeaderTesting.cmake)
109 | endif()
110 | 
111 | # Both testing and examples use ctest
112 | if (CUB_ENABLE_TESTING OR CUB_ENABLE_EXAMPLES)
113 |   include(CTest)
114 |   enable_testing()
115 | endif()
116 | 
117 | if (CUB_ENABLE_TESTING)
118 |   add_subdirectory(test)
119 | endif()
120 | 
121 | if (CUB_ENABLE_EXAMPLES)
122 |   add_subdirectory(examples)
123 | endif()
124 | 
125 | if (CUB_ENABLE_BENCHMARKS OR CUB_ENABLE_TUNING)
126 |   add_subdirectory(benchmarks)
127 | endif()
128 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # Code of Conduct
 3 | 
 4 | ## Overview
 5 | 
 6 | This document defines the Code of Conduct followed and enforced for NVIDIA C++
 7 |   Core Compute Libraries.
 8 | 
 9 | ### Intended Audience
10 | 
11 | * Community
12 | * Developers
13 | * Project Leads
14 | 
15 | ## Our Pledge
16 | 
17 | In the interest of fostering an open and welcoming environment, we as
18 |   contributors and maintainers pledge to making participation in our project and
19 |   our community a harassment-free experience for everyone, regardless of age,
20 |   body size, disability, ethnicity, sex characteristics, gender identity and
21 |   expression, level of experience, education, socio-economic status, nationality,
22 |   personal appearance, race, religion, or sexual identity and orientation.
23 | 
24 | ## Our Standards
25 | 
26 | Examples of behavior that contributes to creating a positive environment include:
27 | 
28 | - Using welcoming and inclusive language.
29 | - Being respectful of differing viewpoints and experiences.
30 | - Gracefully accepting constructive criticism.
31 | - Focusing on what is best for the community.
32 | - Showing empathy towards other community members.
33 | 
34 | Examples of unacceptable behavior by participants include:
35 | 
36 | - The use of sexualized language or imagery and unwelcome sexual attention or
37 |     advances.
38 | - Trolling, insulting/derogatory comments, and personal or political attacks.
39 | - Public or private harassment.
40 | - Publishing others’ private information, such as a physical or electronic
41 |     address, without explicit permission.
42 | - Other conduct which could reasonably be considered inappropriate.
43 | 
44 | ## Our Responsibilities
45 | 
46 | Project maintainers are responsible for clarifying the standards of acceptable
47 |   behavior and are expected to take appropriate and fair corrective action in
48 |   response to any instances of unacceptable behavior.
49 | 
50 | Project maintainers have the right and responsibility to remove, edit, or
51 |   reject comments, commits, code, wiki edits, issues, and other contributions
52 |   that are not aligned to this Code of Conduct, or to ban temporarily or
53 |   permanently any contributor for other behaviors that they deem inappropriate,
54 |   threatening, offensive, or harmful.
55 | 
56 | ## Scope
57 | 
58 | This Code of Conduct applies both within project spaces and in public spaces
59 |   when an individual is representing the project or its community.
60 | Examples of representing a project or community include using an official
61 |   project email address, posting via an official social media account, or acting
62 |   as an appointed representative at an online or offline event.
63 | Representation of a project may be further defined and clarified by project
64 |   maintainers.
65 | 
66 | ## Enforcement
67 | 
68 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
69 |   reported by contacting [cpp-conduct@nvidia.com](mailto:cpp-conduct@nvidia.com).
70 | All complaints will be reviewed and investigated and will result in a response
71 |   that is deemed necessary and appropriate to the circumstances.
72 | The project team is obligated to maintain confidentiality with regard to the
73 |   reporter of an incident.
74 | Further details of specific enforcement policies may be posted separately.
75 | 
76 | Project maintainers who do not follow or enforce the Code of Conduct in good
77 |   faith may face temporary or permanent repercussions as determined by other
78 |   members of the project’s leadership.
79 | 
80 | ## Attribution
81 | 
82 | This Code of Conduct was taken from the [NVIDIA RAPIDS] project, which was
83 |   adapted from the [Contributor Covenant version 1.4].
84 | 
85 | Please see this [FAQ] for answers to common questions about this Code of Conduct.
86 | 
87 | ## Contact
88 | 
89 | Please email [cpp-conduct@nvidia.com] for any Code of Conduct related matters.
90 | 
91 | 
92 | [cpp-conduct@nvidia.com]: mailto:cpp-conduct@nvidia.com
93 | 
94 | [FAQ]: https://www.contributor-covenant.org/faq
95 | 
96 | [NVIDIA RAPIDS]: https://docs.rapids.ai/resources/conduct/
97 | [Contributor Covenant version 1.4]: https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
98 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Table of Contents
 2 | 
 3 | 1. [Contributing to CUB](#contributing-to-cub)
 4 | 1. [CMake Options](#cmake-options)
 5 | 1. [Development Model](#development-model)
 6 | 
 7 | # Contributing to CUB
 8 | 
 9 | CUB uses Github to manage all open-source development, including bug tracking,
10 | pull requests, and design discussions. CUB is tightly coupled to the Thrust
11 | project, and a compatible version of Thrust is required when working on the
12 | development version of CUB.
13 | 
14 | To setup a CUB development branch, it is recommended to recursively clone the
15 | Thrust repository and use the CUB submodule at `dependencies/cub` to stage
16 | changes. CUB's tests and examples can be built by configuring Thrust with the
17 | CMake option `THRUST_INCLUDE_CUB_CMAKE=ON`.
18 | 
19 | This process is described in more detail in Thrust's
20 | [CONTRIBUTING.md](https://nvidia.github.io/thrust/contributing.html).
21 | 
22 | The CMake options in the following section may be used to customize CUB's build
23 | process. Note that some of these are controlled by Thrust for compatibility and
24 | may not have an effect when building CUB through the Thrust build system. This
25 | is pointed out in the documentation below where applicable.
26 | 
27 | # CMake Options
28 | 
29 | A CUB build is configured using CMake options. These may be passed to CMake
30 | using
31 | 
32 | ```
33 | cmake -D<option_name>=<value> [Thrust or CUB project source root]
34 | ```
35 | 
36 | or configured interactively with the `ccmake` or `cmake-gui` interfaces.
37 | 
38 | The configuration options for CUB are:
39 | 
40 | - `CMAKE_BUILD_TYPE={Release, Debug, RelWithDebInfo, MinSizeRel}`
41 |   - Standard CMake build option. Default: `RelWithDebInfo`
42 | - `CUB_ENABLE_HEADER_TESTING={ON, OFF}`
43 |   - Whether to test compile public headers. Default is `ON`.
44 | - `CUB_ENABLE_TESTING={ON, OFF}`
45 |   - Whether to build unit tests. Default is `ON`.
46 | - `CUB_ENABLE_EXAMPLES={ON, OFF}`
47 |   - Whether to build examples. Default is `ON`.
48 | - `CUB_ENABLE_DIALECT_CPPXX={ON, OFF}`
49 |   - Setting this has no effect when building CUB as a component of Thrust.
50 |     See Thrust's dialect options, which CUB will inherit.
51 |   - Toggle whether a specific C++ dialect will be targeted.
52 |   - Multiple dialects may be targeted in a single build.
53 |   - Possible values of `XX` are `{11, 14, 17}`.
54 |   - By default, only C++14 is enabled.
55 | - `CUB_ENABLE_COMPUTE_XX={ON, OFF}`
56 |   - Setting this has no effect when building CUB as a component of Thrust.
57 |     See Thrust's architecture options, which CUB will inherit.
58 |   - Controls the targeted CUDA architecture(s)
59 |   - Multiple options may be selected when using NVCC as the CUDA compiler.
60 |   - Valid values of `XX` are:
61 |     `{35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80}`
62 |   - Default value depends on `CUB_DISABLE_ARCH_BY_DEFAULT`:
63 | - `CUB_ENABLE_COMPUTE_FUTURE={ON, OFF}`
64 |   - Setting this has no effect when building CUB as a component of Thrust.
65 |     See Thrust's architecture options, which CUB will inherit.
66 |   - If enabled, CUDA objects will target the most recent virtual architecture
67 |     in addition to the real architectures specified by the
68 |     `CUB_ENABLE_COMPUTE_XX` options.
69 |   - Default value depends on `CUB_DISABLE_ARCH_BY_DEFAULT`:
70 | - `CUB_DISABLE_ARCH_BY_DEFAULT={ON, OFF}`
71 |   - Setting this has no effect when building CUB as a component of Thrust.
72 |     See Thrust's architecture options, which CUB will inherit.
73 |   - When `ON`, all `CUB_ENABLE_COMPUTE_*` options are initially `OFF`.
74 |   - Default: `OFF` (meaning all architectures are enabled by default)
75 | - `CUB_ENABLE_TESTS_WITH_RDC={ON, OFF}`
76 |   - Whether to enable Relocatable Device Code when building tests.
77 |     Default is `OFF`.
78 | - `CUB_ENABLE_EXAMPLES_WITH_RDC={ON, OFF}`
79 |   - Whether to enable Relocatable Device Code when building examples.
80 |     Default is `OFF`.
81 | - `CUB_ENABLE_INSTALL_RULES={ON, OFF}`
82 |   - Setting this has no effect when building CUB as a component of Thrust.
83 |     See Thrust's `THRUST_INSTALL_CUB_HEADERS` option, which controls this
84 |     behavior.
85 |   - If true, installation rules will be generated for CUB. Default is `ON` when
86 |     building CUB alone, and `OFF` when CUB is a subproject added via CMake's
87 |     `add_subdirectory`.
88 | 
89 | # Development Model
90 | 
91 | CUB follows the same development model as Thrust, described
92 | [here](https://nvidia.github.io/thrust/releases/versioning.html).
93 | 


--------------------------------------------------------------------------------
/LICENSE.TXT:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
 2 | Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 |    *  Redistributions of source code must retain the above copyright
 7 |       notice, this list of conditions and the following disclaimer.
 8 |    *  Redistributions in binary form must reproduce the above copyright
 9 |       notice, this list of conditions and the following disclaimer in the
10 |       documentation and/or other materials provided with the distribution.
11 |    *  Neither the name of the NVIDIA CORPORATION nor the
12 |       names of its contributors may be used to endorse or promote products
13 |       derived from this software without specific prior written permission.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | 


--------------------------------------------------------------------------------
/benchmarks/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | find_package(CUDAToolkit REQUIRED)
  2 | find_package(Python3 COMPONENTS Interpreter REQUIRED)
  3 | 
  4 | # Defer dependencies collection to nvbench helper 
  5 | add_subdirectory(nvbench_helper)
  6 | 
  7 | set(benches_root "${CMAKE_CURRENT_LIST_DIR}")
  8 | 
  9 | if(NOT CMAKE_BUILD_TYPE STREQUAL "Release")
 10 |   message(FATAL_ERROR "CUB benchmarks must be built in release mode.")
 11 | endif()
 12 | 
 13 | if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
 14 |   message(FATAL_ERROR "CMAKE_CUDA_ARCHITECTURES must be set to build CUB benchmarks.")
 15 | endif()
 16 | 
 17 | set(benches_meta_target cub.all.benches)
 18 | add_custom_target(${benches_meta_target})
 19 | 
 20 | function(get_recursive_subdirs subdirs)
 21 |   set(dirs)
 22 |   file(GLOB_RECURSE contents
 23 |     CONFIGURE_DEPENDS
 24 |     LIST_DIRECTORIES ON
 25 |     "${CMAKE_CURRENT_LIST_DIR}/bench/*"
 26 |   )
 27 | 
 28 |   foreach(test_dir IN LISTS contents)
 29 |     if(IS_DIRECTORY "${test_dir}")
 30 |       list(APPEND dirs "${test_dir}")
 31 |     endif()
 32 |   endforeach()
 33 | 
 34 |   set(${subdirs} "${dirs}" PARENT_SCOPE)
 35 | endfunction()
 36 | 
 37 | set(meta_path "${CMAKE_BINARY_DIR}/cub_bench_meta.csv")
 38 | file(REMOVE "${meta_path}")
 39 | 
 40 | set(ctk_version "${CUDAToolkit_VERSION}")
 41 | message(STATUS "CTK version: ${ctk_version}")
 42 | 
 43 | find_package(Git REQUIRED)
 44 | if(GIT_FOUND)
 45 |     execute_process(
 46 |         COMMAND ${GIT_EXECUTABLE} describe
 47 |         WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
 48 |         OUTPUT_VARIABLE cub_revision
 49 |         OUTPUT_STRIP_TRAILING_WHITESPACE)
 50 |     message(STATUS "Git revision: ${cub_revision}")
 51 | else()
 52 |     message(WARNING "Git not found. Unable to determine Git revision.")
 53 | endif()
 54 | 
 55 | file(APPEND "${meta_path}" "ctk_version,${ctk_version}\n")
 56 | file(APPEND "${meta_path}" "cub_revision,${cub_revision}\n")
 57 | 
 58 | function(get_bench_ranges src bench_name)
 59 |   file(READ "${src}" file_data)
 60 |   set(param_regex "//[ ]+%RANGE%[ ]+([^ ]+)[ ]+([^ ]+)[ ]+([^\n]*)")
 61 | 
 62 |   string(REGEX MATCHALL "${param_regex}" matches "${file_data}")
 63 | 
 64 |   set(ranges "")
 65 | 
 66 |   foreach(match IN LISTS matches)
 67 |     string(REGEX MATCH "${param_regex}" unused "${match}")
 68 | 
 69 |     set(def ${CMAKE_MATCH_1})
 70 |     set(label ${CMAKE_MATCH_2})
 71 |     set(range ${CMAKE_MATCH_3})
 72 |     set(ranges "${ranges}${def}|${label}=${range},")
 73 | 
 74 |     string(REPLACE ":" ";" range "${range}")
 75 |     list(LENGTH range range_len)
 76 | 
 77 |     if (NOT "${range_len}" STREQUAL 3)
 78 |       message(FATAL_ERROR "Range should be represented as 'start:end:step'")
 79 |     endif()
 80 |   endforeach()
 81 | 
 82 |   string(LENGTH "${ranges}" ranges_length)
 83 |   math(EXPR last_character_index "${ranges_length} - 1")
 84 |   string(SUBSTRING "${ranges}" 0 ${last_character_index} ranges)
 85 |   file(APPEND "${meta_path}" "${bench_name},${ranges}\n")
 86 | endfunction()
 87 | 
 88 | function(add_bench target_name bench_name bench_src)
 89 |   set(bench_target ${bench_name})
 90 |   set(${target_name} ${bench_target} PARENT_SCOPE)
 91 | 
 92 |   add_executable(${bench_target} "${bench_src}")
 93 |   target_link_libraries(${bench_target} PRIVATE nvbench_helper nvbench::main)
 94 |   set_target_properties(${bench_target}
 95 |     PROPERTIES
 96 |       ARCHIVE_OUTPUT_DIRECTORY "${CUB_LIBRARY_OUTPUT_DIR}"
 97 |       LIBRARY_OUTPUT_DIRECTORY "${CUB_LIBRARY_OUTPUT_DIR}"
 98 |       RUNTIME_OUTPUT_DIRECTORY "${CUB_EXECUTABLE_OUTPUT_DIR}"
 99 |   )
100 | endfunction()
101 | 
102 | function(add_bench_dir bench_dir)
103 |   file(GLOB bench_srcs CONFIGURE_DEPENDS "${bench_dir}/*.cu")
104 |   file(RELATIVE_PATH bench_prefix "${benches_root}" "${bench_dir}")
105 |   file(TO_CMAKE_PATH "${bench_prefix}" bench_prefix)
106 |   string(REPLACE "/" "." bench_prefix "${bench_prefix}")
107 | 
108 |   foreach(bench_src IN LISTS bench_srcs)
109 |     # base tuning
110 |     get_filename_component(bench_name "${bench_src}" NAME_WLE)
111 |     string(PREPEND bench_name "cub.${bench_prefix}.")
112 |     
113 |     set(base_bench_name "${bench_name}.base")
114 |     add_bench(base_bench_target ${base_bench_name} "${bench_src}")
115 |     add_dependencies(${benches_meta_target} ${base_bench_target})
116 |     target_compile_definitions(${base_bench_target} PRIVATE TUNE_BASE=1)
117 | 
118 |     # tuning
119 |     if (CUB_ENABLE_TUNING)
120 |       set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${bench_src}")
121 |       get_bench_ranges("${bench_src}" "${bench_name}")
122 |       set(tuning_name "${bench_name}.variant")
123 |       set(tuning_path "${CMAKE_BINARY_DIR}/${tuning_name}.h")
124 |       add_bench(bench_target ${tuning_name} "${bench_src}")
125 |       file(WRITE "${tuning_path}" "#pragma once\n")
126 |       target_compile_options(${bench_target} PRIVATE "-include${tuning_path}")
127 |     endif()
128 |   endforeach()
129 | endfunction()
130 | 
131 | get_recursive_subdirs(subdirs)
132 | 
133 | foreach(subdir IN LISTS subdirs)
134 |   add_bench_dir("${subdir}")
135 | endforeach()
136 | 


--------------------------------------------------------------------------------
/benchmarks/README.md:
--------------------------------------------------------------------------------
 1 | ## Benchmarks 
 2 | 
 3 | ### Ready
 4 | 
 5 | - radix sort
 6 |   - keys
 7 |   - pairs 
 8 | - merge sort
 9 |   - keys
10 |   - pairs 
11 | - reduce
12 |   - sum
13 |   - max
14 |   - by_key
15 | - scan
16 |   - sum
17 |   - max
18 |   - by key
19 | - select
20 |   - flagged
21 |   - if
22 | - partition
23 |   - flagged
24 |   - if
25 | - histogram 
26 |   - even
27 |   - range
28 |   - multi even
29 |   - multi range
30 | - rle 
31 |   - encode
32 |   - non trivial runs
33 | - adjacent difference 
34 |   - left
35 | 
36 | ### TODO
37 | 
38 | - segmented
39 | 


--------------------------------------------------------------------------------
/benchmarks/bench/histogram/histogram_common.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | #pragma once
29 | 
30 | #include <cub/device/device_histogram.cuh>
31 | 
32 | #if !TUNE_BASE
33 | 
34 | #if TUNE_LOAD == 0
35 | #define TUNE_LOAD_MODIFIER cub::LOAD_DEFAULT
36 | #elif TUNE_LOAD == 1
37 | #define TUNE_LOAD_MODIFIER cub::LOAD_LDG
38 | #else // TUNE_LOAD == 2
39 | #define TUNE_LOAD_MODIFIER cub::LOAD_CA
40 | #endif // TUNE_LOAD 
41 | 
42 | #if TUNE_MEM_PREFERENCE == 0
43 | constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::GMEM;
44 | #elif TUNE_MEM_PREFERENCE == 1
45 | constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::SMEM;
46 | #else // TUNE_MEM_PREFERENCE == 2
47 | constexpr cub::BlockHistogramMemoryPreference MEM_PREFERENCE = cub::BLEND;
48 | #endif // TUNE_MEM_PREFERENCE
49 | 
50 | 
51 | template <typename SampleT, int NUM_ACTIVE_CHANNELS>
52 | struct policy_hub_t
53 | {
54 |   template <int NOMINAL_ITEMS_PER_THREAD>
55 |   struct TScale
56 |   {
57 |     enum
58 |     {
59 |       V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int),
60 |       VALUE   = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1)
61 |     };
62 |   };
63 | 
64 |   struct policy_t : cub::ChainedPolicy<350, policy_t, policy_t>
65 |   {
66 |     using AgentHistogramPolicyT = cub::AgentHistogramPolicy<TUNE_THREADS,
67 |                                                             TScale<TUNE_ITEMS>::VALUE,
68 |                                                             cub::BLOCK_LOAD_DIRECT,
69 |                                                             TUNE_LOAD_MODIFIER,
70 |                                                             TUNE_RLE_COMPRESS,
71 |                                                             MEM_PREFERENCE,
72 |                                                             TUNE_WORK_STEALING>;
73 |   };
74 | 
75 |   using MaxPolicy = policy_t;
76 | };
77 | #endif // !TUNE_BASE
78 | 
79 | template <class SampleT, class OffsetT>
80 | SampleT get_upper_level(OffsetT bins, OffsetT elements)
81 | {
82 |   if constexpr (cuda::std::is_integral_v<SampleT>)
83 |   {
84 |     if constexpr (sizeof(SampleT) < sizeof(OffsetT))
85 |     {
86 |       const SampleT max_key = std::numeric_limits<SampleT>::max();
87 |       return static_cast<SampleT>(std::min(bins, static_cast<OffsetT>(max_key)));
88 |     }
89 |     else
90 |     {
91 |       return static_cast<SampleT>(bins);
92 |     }
93 |   }
94 | 
95 |   return static_cast<SampleT>(elements);
96 | }
97 | 


--------------------------------------------------------------------------------
/benchmarks/bench/reduce/base.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | #include <cub/device/device_reduce.cuh>
 29 | 
 30 | #ifndef TUNE_BASE
 31 | #define TUNE_ITEMS_PER_VEC_LOAD (1 << TUNE_ITEMS_PER_VEC_LOAD_POW2)
 32 | #endif
 33 | 
 34 | #if !TUNE_BASE
 35 | template <typename AccumT, typename OffsetT>
 36 | struct policy_hub_t
 37 | {
 38 |   struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
 39 |   {
 40 |     static constexpr int threads_per_block  = TUNE_THREADS_PER_BLOCK;
 41 |     static constexpr int items_per_thread   = TUNE_ITEMS_PER_THREAD;
 42 |     static constexpr int items_per_vec_load = TUNE_ITEMS_PER_VEC_LOAD;
 43 | 
 44 |     using ReducePolicy = cub::AgentReducePolicy<threads_per_block,
 45 |                                                 items_per_thread,
 46 |                                                 AccumT,
 47 |                                                 items_per_vec_load,
 48 |                                                 cub::BLOCK_REDUCE_WARP_REDUCTIONS,
 49 |                                                 cub::LOAD_DEFAULT>;
 50 | 
 51 |     // SingleTilePolicy
 52 |     using SingleTilePolicy = ReducePolicy;
 53 | 
 54 |     // SegmentedReducePolicy
 55 |     using SegmentedReducePolicy = ReducePolicy;
 56 |   };
 57 | 
 58 |   using MaxPolicy = policy_t;
 59 | };
 60 | #endif // !TUNE_BASE
 61 | 
 62 | template <typename T, typename OffsetT>
 63 | void reduce(nvbench::state &state, nvbench::type_list<T, OffsetT>)
 64 | {
 65 |   using accum_t     = T;
 66 |   using input_it_t  = const T *;
 67 |   using output_it_t = T *;
 68 |   using offset_t    = typename cub::detail::ChooseOffsetT<OffsetT>::Type;
 69 |   using output_t    = T;
 70 |   using init_t      = T;
 71 | #if !TUNE_BASE
 72 |   using policy_t = policy_hub_t<accum_t, offset_t>;
 73 |   using dispatch_t =
 74 |     cub::DispatchReduce<input_it_t, output_it_t, offset_t, op_t, init_t, accum_t, policy_t>;
 75 | #else // TUNE_BASE
 76 |   using dispatch_t = cub::DispatchReduce<input_it_t, output_it_t, offset_t, op_t, init_t, accum_t>;
 77 | #endif // TUNE_BASE
 78 | 
 79 |   // Retrieve axis parameters
 80 |   const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
 81 |   thrust::device_vector<T> in(elements);
 82 |   thrust::device_vector<T> out(1);
 83 | 
 84 |   gen(seed_t{}, in);
 85 | 
 86 |   input_it_t d_in   = thrust::raw_pointer_cast(in.data());
 87 |   output_it_t d_out = thrust::raw_pointer_cast(out.data());
 88 | 
 89 |   // Enable throughput calculations and add "Size" column to results.
 90 |   state.add_element_count(elements);
 91 |   state.add_global_memory_reads<T>(elements, "Size");
 92 |   state.add_global_memory_writes<T>(1);
 93 | 
 94 |   // Allocate temporary storage:
 95 |   std::size_t temp_size;
 96 |   dispatch_t::Dispatch(nullptr,
 97 |                        temp_size,
 98 |                        d_in,
 99 |                        d_out,
100 |                        static_cast<offset_t>(elements),
101 |                        op_t{},
102 |                        init_t{},
103 |                        0 /* stream */);
104 | 
105 |   thrust::device_vector<nvbench::uint8_t> temp(temp_size);
106 |   auto *temp_storage = thrust::raw_pointer_cast(temp.data());
107 | 
108 |   state.exec([&](nvbench::launch &launch) {
109 |     dispatch_t::Dispatch(temp_storage,
110 |                          temp_size,
111 |                          d_in,
112 |                          d_out,
113 |                          static_cast<offset_t>(elements),
114 |                          op_t{},
115 |                          init_t{},
116 |                          launch.get_stream());
117 |   });
118 | }
119 | 
120 | NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types))
121 |   .set_name("cub::DeviceReduce::Reduce")
122 |   .set_type_axes_names({"T{ct}", "OffsetT{ct}"})
123 |   .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4));
124 | 


--------------------------------------------------------------------------------
/benchmarks/bench/reduce/max.cu:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | #include <nvbench_helper.cuh>
29 | 
30 | // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
31 | // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
32 | // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
33 | 
34 | using op_t = max_t;
35 | #include "base.cuh"
36 | 


--------------------------------------------------------------------------------
/benchmarks/bench/reduce/sum.cu:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | #include <nvbench_helper.cuh>
29 | 
30 | // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1
31 | // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32
32 | // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1
33 | 
34 | using op_t = cub::Sum;
35 | #include "base.cuh"


--------------------------------------------------------------------------------
/benchmarks/bench/scan/exclusive/max.cu:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | // %RANGE% TUNE_ITEMS ipt 7:24:1
29 | // %RANGE% TUNE_THREADS tpb 128:1024:32
30 | // %RANGE% TUNE_MAGIC_NS ns 0:2048:4
31 | // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1
32 | // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5
33 | // %RANGE% TUNE_TRANSPOSE trp 0:1:1
34 | // %RANGE% TUNE_LOAD ld 0:2:1
35 | 
36 | #include <nvbench_helper.cuh>
37 | 
38 | using op_t = max_t;
39 | #include "base.cuh"
40 | 


--------------------------------------------------------------------------------
/benchmarks/bench/scan/exclusive/sum.cu:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | #include <nvbench_helper.cuh>
29 | 
30 | // %RANGE% TUNE_ITEMS ipt 7:24:1
31 | // %RANGE% TUNE_THREADS tpb 128:1024:32
32 | // %RANGE% TUNE_MAGIC_NS ns 0:2048:4
33 | // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1
34 | // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5
35 | // %RANGE% TUNE_TRANSPOSE trp 0:1:1
36 | // %RANGE% TUNE_LOAD ld 0:2:1
37 | 
38 | using op_t = cub::Sum;
39 | #include "base.cuh"
40 | 


--------------------------------------------------------------------------------
/benchmarks/docker/.gitignore:
--------------------------------------------------------------------------------
1 | Dockerfile
2 | 


--------------------------------------------------------------------------------
/benchmarks/docker/recipe.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import hpccm
 4 | 
 5 | hpccm.config.set_container_format('docker')
 6 | 
 7 | Stage0 += hpccm.primitives.baseimage(image='nvidia/cuda:12.1.0-devel-ubuntu22.04')
 8 | Stage0 += hpccm.building_blocks.apt_get(ospackages=['git', 'tmux', 'gcc', 'g++', 'vim', 'python3', 'python-is-python3', 'ninja-build'])
 9 | Stage0 += hpccm.building_blocks.llvm(version='15', extra_tools=True, toolset=True)
10 | Stage0 += hpccm.building_blocks.cmake(eula=True, version='3.26.3')
11 | Stage0 += hpccm.building_blocks.nsight_compute(eula=True, version='2023.1.1')
12 | Stage0 += hpccm.building_blocks.pip(packages=['fpzip', 'numpy', 'pandas'], pip='pip3')
13 | 
14 | 


--------------------------------------------------------------------------------
/benchmarks/nvbench_helper/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Fetch nvbench
 2 | CPMAddPackage("gh:NVIDIA/nvbench#main")
 3 | 
 4 | add_library(nvbench_helper SHARED nvbench_helper.cuh nvbench_helper.cu)
 5 | target_link_libraries(nvbench_helper PUBLIC CUB::CUB 
 6 |                                             Thrust::Thrust 
 7 |                                             CUB::libcudacxx 
 8 |                                             nvbench::nvbench
 9 |                                      PRIVATE CUDA::curand)
10 | 
11 | target_include_directories(nvbench_helper PUBLIC "${CMAKE_CURRENT_LIST_DIR}")
12 | 


--------------------------------------------------------------------------------
/benchmarks/nvbench_helper/look_back_helper.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | #pragma once
29 | 
30 | #if !TUNE_BASE
31 | #include <nvbench_helper.cuh>
32 | #include <cub/agent/single_pass_scan_operators.cuh>
33 | 
34 | #if !defined(TUNE_MAGIC_NS) || !defined(TUNE_L2_WRITE_LATENCY_NS) || !defined(TUNE_DELAY_CONSTRUCTOR_ID)
35 | #error "TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS, and TUNE_DELAY_CONSTRUCTOR_ID must be defined"
36 | #endif
37 | 
38 | using delay_constructors = nvbench::type_list<
39 |   cub::detail::no_delay_constructor_t<TUNE_L2_WRITE_LATENCY_NS>,
40 |   cub::detail::fixed_delay_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
41 |   cub::detail::exponential_backoff_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
42 |   cub::detail::exponential_backoff_jitter_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
43 |   cub::detail::exponential_backoff_jitter_window_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
44 |   cub::detail::exponential_backon_jitter_window_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
45 |   cub::detail::exponential_backon_jitter_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>,
46 |   cub::detail::exponential_backon_constructor_t<TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS>>;
47 | 
48 | using delay_constructor_t = nvbench::tl::get<TUNE_DELAY_CONSTRUCTOR_ID, delay_constructors>;
49 | #endif // !TUNE_BASE
50 | 


--------------------------------------------------------------------------------
/benchmarks/scripts/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | *.pyc
3 | *.pyo
4 | *.pyd
5 | 


--------------------------------------------------------------------------------
/benchmarks/scripts/cub/__init__.py:
--------------------------------------------------------------------------------
1 | from . import bench
2 | 


--------------------------------------------------------------------------------
/benchmarks/scripts/cub/bench/__init__.py:
--------------------------------------------------------------------------------
1 | from .config import *
2 | from .storage import *
3 | from .bench import Bench
4 | from .cmake import CMake
5 | from .score import *
6 | from .search import *
7 | 


--------------------------------------------------------------------------------
/benchmarks/scripts/cub/bench/build.py:
--------------------------------------------------------------------------------
1 | 
2 | class Build:
3 |   def __init__(self, code, elapsed):
4 |       self.code = code
5 |       self.elapsed = elapsed
6 | 
7 |   def __repr__(self):
8 |       return "Build(code = {}, elapsed = {:.4f}s)".format(self.code, self.elapsed)
9 | 


--------------------------------------------------------------------------------
/benchmarks/scripts/cub/bench/cmake.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import signal
  4 | import subprocess
  5 | 
  6 | from .build import Build
  7 | from .config import Config
  8 | from .storage import Storage
  9 | from .logger import *
 10 | 
 11 | 
 12 | def create_builds_table(conn):
 13 |     with conn:
 14 |         conn.execute("""
 15 |         CREATE TABLE IF NOT EXISTS builds (
 16 |             ctk TEXT NOT NULL,
 17 |             cub TEXT NOT NULL,
 18 |             bench TEXT NOT NULL,
 19 |             code TEXT NOT NULL,
 20 |             elapsed REAL
 21 |         );
 22 |         """)
 23 | 
 24 | 
 25 | class CMakeCache:
 26 |   _instance = None
 27 | 
 28 |   def __new__(cls, *args, **kwargs):
 29 |       if cls._instance is None:
 30 |           cls._instance = super().__new__(cls, *args, **kwargs)
 31 |           create_builds_table(Storage().connection())
 32 |       return cls._instance
 33 | 
 34 |   def pull_build(self, bench):
 35 |       config = Config()
 36 |       ctk = config.ctk
 37 |       cub = config.cub
 38 |       conn = Storage().connection()
 39 | 
 40 |       with conn:
 41 |           query = "SELECT code, elapsed FROM builds WHERE ctk = ? AND cub = ? AND bench = ?;"
 42 |           result = conn.execute(query, (ctk, cub, bench.label())).fetchone()
 43 | 
 44 |           if result:
 45 |               code, elapsed = result
 46 |               return Build(int(code), float(elapsed))
 47 | 
 48 |           return result
 49 | 
 50 |   def push_build(self, bench, build):
 51 |       config = Config()
 52 |       ctk = config.ctk
 53 |       cub = config.cub
 54 |       conn = Storage().connection()
 55 | 
 56 |       with conn:
 57 |           conn.execute("INSERT INTO builds (ctk, cub, bench, code, elapsed) VALUES (?, ?, ?, ?, ?);",
 58 |                        (ctk, cub, bench.label(), build.code, build.elapsed))
 59 | 
 60 | 
 61 | class CMake:
 62 |   def __init__(self):
 63 |     pass 
 64 | 
 65 |   def do_build(self, bench, timeout):
 66 |       logger = Logger()
 67 | 
 68 |       try:
 69 |           if not bench.is_base():
 70 |               with open(bench.exe_name() + ".h", "w") as f:
 71 |                   f.writelines(bench.definitions())
 72 | 
 73 |           cmd = ["cmake", "--build", ".", "--target", bench.exe_name()]
 74 |           logger.info("starting build for {}: {}".format(bench.label(), " ".join(cmd)))
 75 | 
 76 |           begin = time.time()
 77 |           p = subprocess.Popen(cmd,
 78 |                               start_new_session=True,
 79 |                               stdout=subprocess.DEVNULL,
 80 |                               stderr=subprocess.DEVNULL)
 81 |           p.wait(timeout=timeout)
 82 |           elapsed = time.time() - begin
 83 |           logger.info("finished build for {} ({}) in {}s".format(bench.label(), p.returncode, elapsed))
 84 | 
 85 |           return Build(p.returncode, elapsed)
 86 |       except subprocess.TimeoutExpired:
 87 |           logger.info("build for {} reached timeout of {}s".format(bench.label(), timeout))
 88 |           os.killpg(os.getpgid(p.pid), signal.SIGTERM)
 89 |           return Build(424242, float('inf'))
 90 | 
 91 |   def build(self, bench):
 92 |       logger = Logger()
 93 |       timeout = None
 94 | 
 95 |       cache = CMakeCache()
 96 | 
 97 |       if bench.is_base():
 98 |           # Only base build can be pulled from cache
 99 |           build = cache.pull_build(bench)
100 | 
101 |           if build:
102 |               logger.info("found cached base build for {}".format(bench.label()))
103 |               if bench.is_base():
104 |                   if not os.path.exists("bin/{}".format(bench.exe_name())):
105 |                       self.do_build(bench, None)
106 | 
107 |               return build
108 |       else:
109 |           base_build = self.build(bench.get_base())
110 | 
111 |           if base_build.code != 0:
112 |               raise Exception("Base build failed")
113 | 
114 |           timeout = base_build.elapsed * 10
115 | 
116 |       build = self.do_build(bench, timeout)
117 |       cache.push_build(bench, build)
118 |       return build
119 | 
120 |   def clean():
121 |       cmd = ["cmake", "--build", ".", "--target", "clean"]
122 |       p = subprocess.Popen(cmd, stdout=subprocess.DEVNULL,
123 |                           stderr=subprocess.DEVNULL)
124 |       p.wait()
125 | 
126 |       if p.returncode != 0:
127 |           raise Exception("Unable to clean build directory")
128 | 
129 | 


--------------------------------------------------------------------------------
/benchmarks/scripts/cub/bench/config.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import itertools
  4 | 
  5 | 
  6 | class Range:
  7 |     def __init__(self, definition, label, low, high, step):
  8 |         self.definition = definition
  9 |         self.label = label
 10 |         self.low = low
 11 |         self.high = high
 12 |         self.step = step
 13 | 
 14 | 
 15 | class RangePoint:
 16 |     def __init__(self, definition, label, value):
 17 |         self.definition = definition
 18 |         self.label = label
 19 |         self.value = value
 20 | 
 21 | 
 22 | class VariantPoint:
 23 |     def __init__(self, range_points):
 24 |         self.range_points = range_points
 25 |     
 26 |     def label(self):
 27 |         if self.is_base():
 28 |             return 'base'
 29 |         return '.'.join(["{}_{}".format(point.label, point.value) for point in self.range_points])
 30 |     
 31 |     def is_base(self):
 32 |         return len(self.range_points) == 0
 33 |     
 34 |     def tuning(self):
 35 |         if self.is_base():
 36 |             return ""
 37 |         
 38 |         tuning = "#pragma once\n\n"
 39 |         for point in self.range_points:
 40 |             tuning += "#define {} {}\n".format(point.definition, point.value)
 41 |         return tuning
 42 | 
 43 | 
 44 | class BasePoint(VariantPoint):
 45 |     def __init__(self):
 46 |         VariantPoint.__init__(self, [])
 47 | 
 48 | 
 49 | def parse_ranges(columns):
 50 |     ranges = []
 51 |     for column in columns:
 52 |         definition, label_range = column.split('|')
 53 |         label, range = label_range.split('=')
 54 |         start, end, step = [int(x) for x in range.split(':')]
 55 |         ranges.append(Range(definition, label, start, end + 1, step))
 56 | 
 57 |     return ranges
 58 | 
 59 | 
 60 | def parse_meta():
 61 |     if not os.path.isfile("cub_bench_meta.csv"):
 62 |         print("cub_bench_meta.csv not found", file=sys.stderr)
 63 |         print("make sure to run the script from the CUB build directory",
 64 |               file=sys.stderr)
 65 | 
 66 |     benchmarks = {}
 67 |     ctk_version = "0.0.0"
 68 |     cub_revision = "0.0-0-0000"
 69 |     with open("cub_bench_meta.csv", "r") as f:
 70 |         lines = f.readlines()
 71 |         for line in lines:
 72 |             columns = line.split(',')
 73 |             name = columns[0]
 74 | 
 75 |             if name == "ctk_version":
 76 |                 ctk_version = columns[1].rstrip()
 77 |             elif name == "cub_revision":
 78 |                 cub_revision = columns[1].rstrip()
 79 |             else:
 80 |                 benchmarks[name] = parse_ranges(columns[1:])
 81 | 
 82 |     return ctk_version, cub_revision, benchmarks
 83 | 
 84 | 
 85 | class Config:
 86 |     _instance = None
 87 | 
 88 |     def __new__(cls, *args, **kwargs):
 89 |         if cls._instance is None:
 90 |             cls._instance = super().__new__(cls, *args, **kwargs)
 91 |             cls._instance.ctk, cls._instance.cub, cls._instance.benchmarks = parse_meta()
 92 |         return cls._instance
 93 |     
 94 |     def label_to_variant_point(self, algname, label):
 95 |         if label == "base":
 96 |             return BasePoint()
 97 | 
 98 |         label_to_definition = {}
 99 |         for param_space in self.benchmarks[algname]:
100 |             label_to_definition[param_space.label] = param_space.definition
101 |         
102 |         points = []
103 |         for point in label.split('.'):
104 |             label, value = point.split('_')
105 |             points.append(RangePoint(label_to_definition[label], label, int(value)))
106 |         
107 |         return VariantPoint(points)
108 | 
109 |     def variant_space(self, algname):
110 |         variants = []
111 |         for param_space in self.benchmarks[algname]:
112 |             variants.append([])
113 |             for value in range(param_space.low, param_space.high, param_space.step):
114 |                 variants[-1].append(RangePoint(param_space.definition, param_space.label, value))
115 | 
116 |         return (VariantPoint(points) for points in itertools.product(*variants))
117 | 
118 |     def variant_space_size(self, algname):
119 |         num_variants = 1
120 |         for param_space in self.benchmarks[algname]:
121 |             num_variants = num_variants * len(range(param_space.low, param_space.high, param_space.step))
122 |         return num_variants
123 | 


--------------------------------------------------------------------------------
/benchmarks/scripts/cub/bench/logger.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | class Logger:
 4 |     _instance = None
 5 | 
 6 |     def __new__(cls, *args, **kwargs):
 7 |         if cls._instance is None:
 8 |             cls._instance = super().__new__(cls, *args, **kwargs)
 9 |             logger = logging.getLogger()
10 |             logger.setLevel(logging.DEBUG)
11 |             file_handler = logging.FileHandler('cub_bench_meta.log')
12 |             file_handler.setFormatter(logging.Formatter('%(asctime)s: %(message)s'))
13 |             logger.addHandler(file_handler)
14 |             cls._instance.logger = logger
15 | 
16 |         return cls._instance
17 | 
18 |     def info(self, message):
19 |         self.logger.info(message)
20 | 


--------------------------------------------------------------------------------
/benchmarks/scripts/cub/bench/score.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import numpy as np
 3 | 
 4 | 
 5 | def importance_function(x):
 6 |     return 1 - math.exp(-x)
 7 | 
 8 | 
 9 | def x_by_importance(y):
10 |     return -math.log(1 - y)
11 | 
12 | 
13 | def compute_weights(num_values):
14 |     least_importance = 0.6
15 |     most_importance = 0.999
16 | 
17 |     assert(least_importance < most_importance)
18 |     assert(least_importance >= 0 and least_importance < 1)
19 |     assert(most_importance > 0 and most_importance < 1)
20 | 
21 |     begin = x_by_importance(least_importance)
22 |     end = x_by_importance(most_importance)
23 | 
24 |     rng = end - begin
25 |     step = rng / num_values
26 | 
27 |     weights = np.array([begin + x * step for x in range(num_values)])
28 |     weights = weights / sum(weights)
29 | 
30 |     return weights
31 | 
32 | 
33 | def io_weights(values):
34 |     return compute_weights(len(values))
35 | 
36 | 
37 | def ei_weights(values):
38 |     return np.ones(len(values))
39 | 
40 | 
41 | def compute_axes_ids(rt_axes_values):
42 |     rt_axes_ids = {}
43 | 
44 |     axis_id = 0
45 |     for rt_axis in rt_axes_values:
46 |         rt_axes_ids[rt_axis] = axis_id
47 |         axis_id = axis_id + 1
48 |     
49 |     return rt_axes_ids
50 | 
51 | 
52 | def compute_weight_matrix(rt_axes_values, rt_axes_ids):
53 |     rt_axes_weights = {}
54 | 
55 |     first_rt_axis = True
56 |     first_rt_axis_name = None
57 |     for rt_axis in rt_axes_values:
58 |         if first_rt_axis:
59 |             first_rt_axis_name = rt_axis
60 |             first_rt_axis = False
61 |         values = rt_axes_values[rt_axis]
62 |         rt_axes_values[rt_axis] = values
63 |         if '{io}' in rt_axis:
64 |             rt_axes_weights[rt_axis] = io_weights(values)
65 |         else:
66 |             rt_axes_weights[rt_axis] = ei_weights(values)
67 | 
68 |     num_rt_axes = len(rt_axes_ids)
69 |     for rt_axis in rt_axes_weights:
70 |         shape = [1] * num_rt_axes 
71 |         shape[rt_axes_ids[rt_axis]] = -1
72 |         rt_axes_weights[rt_axis] = rt_axes_weights[rt_axis].reshape(*shape)
73 |     
74 |     weights_matrix = rt_axes_weights[first_rt_axis_name]
75 |     for rt_axis in rt_axes_weights:
76 |         if rt_axis == first_rt_axis_name:
77 |             continue
78 | 
79 |         weights_matrix = weights_matrix * rt_axes_weights[rt_axis]
80 |     
81 |     return weights_matrix / np.sum(weights_matrix)
82 | 
83 | 
84 | def get_workload_coordinates(rt_workload, rt_axes_values, rt_axes_ids):
85 |     coordinates = [0] * len(rt_axes_ids)
86 |     for point in rt_workload:
87 |         rt_axis, rt_value = point.split('=')
88 |         coordinates[rt_axes_ids[rt_axis]] = rt_axes_values[rt_axis].index(rt_value)
89 |     return coordinates
90 | 
91 | def get_workload_weight(rt_workload, rt_axes_values, rt_axes_ids, weights_matrix):
92 |     coordinates = get_workload_coordinates(rt_workload, rt_axes_values, rt_axes_ids)
93 |     return weights_matrix[tuple(coordinates)]
94 | 


--------------------------------------------------------------------------------
/benchmarks/scripts/cub/bench/search.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import argparse
  3 | import numpy as np
  4 | 
  5 | from .bench import Bench, BaseBench
  6 | from .config import Config
  7 | from .storage import Storage
  8 | from .cmake import CMake
  9 | 
 10 | 
 11 | def list_axes(benchmarks, sub_space):
 12 |     print("### Axes")
 13 | 
 14 |     axes = {}
 15 | 
 16 |     for algname in benchmarks:
 17 |         bench = BaseBench(algname)
 18 |         for axis in bench.axes_values(sub_space, True) + bench.axes_values(sub_space, False):
 19 |             for point in axis:
 20 |                 axis, value = point.split('=')
 21 |                 if axis in axes:
 22 |                     axes[axis].add(value)
 23 |                 else:
 24 |                     axes[axis] = {value}
 25 | 
 26 |     for axis in axes:
 27 |         print("  * `{}`".format(axis))
 28 | 
 29 |         for value in axes[axis]:
 30 |             print("    * `{}`".format(value))
 31 | 
 32 | 
 33 | def list_benches():
 34 |     print("### Benchmarks")
 35 | 
 36 |     config = Config()
 37 | 
 38 |     for algname in config.benchmarks:
 39 |         space_size = config.variant_space_size(algname)
 40 |         print("  * `{}`: {} variants: ".format(algname, space_size))
 41 | 
 42 |         for param_space in config.benchmarks[algname]:
 43 |             param_name = param_space.label
 44 |             param_rng = (param_space.low, param_space.high, param_space.step)
 45 |             print("    * `{}`: {}".format(param_name, param_rng))
 46 | 
 47 | 
 48 | def parse_sub_space(args):
 49 |     sub_space = {}
 50 |     for axis in args:
 51 |         name, value = axis.split('=')
 52 | 
 53 |         if '[' in value:
 54 |             value = value.replace('[', '').replace(']', '')
 55 |             values = value.split(',')
 56 |         else:
 57 |             values = [value]
 58 |         sub_space[name] = values
 59 | 
 60 |     return sub_space
 61 | 
 62 | 
 63 | def parse_arguments():
 64 |     parser = argparse.ArgumentParser(
 65 |         description="Runs benchmarks and stores results in a database.")
 66 |     parser.add_argument('-R', type=str, default='.*',
 67 |                         help="Regex for benchmarks selection.")
 68 |     parser.add_argument('-a', '--args', action='append',
 69 |                         type=str, help="Parameter in the format `Param=Value`.")
 70 |     parser.add_argument(
 71 |         '--list-axes', action=argparse.BooleanOptionalAction, help="Show available parameters.")
 72 |     parser.add_argument(
 73 |         '--list-benches', action=argparse.BooleanOptionalAction, help="Show available benchmarks.")
 74 |     return parser.parse_args()
 75 | 
 76 | 
 77 | def run_benches(benchmarks, workload_sub_space, regex, seeker):
 78 |     pattern = re.compile(regex)
 79 | 
 80 |     for algname in benchmarks:
 81 |         if pattern.match(algname):
 82 |             bench = BaseBench(algname)
 83 |             ct_workload_space = bench.ct_workload_space(workload_sub_space)
 84 |             rt_workload_space = bench.rt_workload_space(workload_sub_space)
 85 |             seeker(algname, ct_workload_space, rt_workload_space)
 86 | 
 87 | 
 88 | def search(seeker):
 89 |     args = parse_arguments()
 90 | 
 91 |     if not Storage().exists():
 92 |         CMake().clean()
 93 | 
 94 |     config = Config()
 95 |     print("ctk: ", config.ctk)
 96 |     print("cub: ", config.cub)
 97 | 
 98 |     workload_sub_space = {}
 99 | 
100 |     if args.args:
101 |         workload_sub_space = parse_sub_space(args.args)
102 | 
103 |     if args.list_axes:
104 |         list_axes(config.benchmarks, workload_sub_space)
105 |         return
106 | 
107 |     if args.list_benches:
108 |         list_benches()
109 |         return
110 | 
111 |     run_benches(config.benchmarks, workload_sub_space, args.R, seeker)
112 | 
113 | 
114 | class MedianCenterEstimator:
115 |     def __init__(self):
116 |         pass
117 | 
118 |     def __call__(self, samples):
119 |         if len(samples) == 0:
120 |             return float("inf")
121 | 
122 |         return float(np.median(samples))
123 | 
124 | 
125 | class BruteForceSeeker:
126 |     def __init__(self, base_center_estimator, variant_center_estimator):
127 |         self.base_center_estimator = base_center_estimator
128 |         self.variant_center_estimator = variant_center_estimator
129 | 
130 |     def __call__(self, algname, ct_workload_space, rt_workload_space):
131 |         variants = Config().variant_space(algname)
132 | 
133 |         for ct_workload in ct_workload_space:
134 |             for variant in variants:
135 |                 bench = Bench(algname, variant, list(ct_workload))
136 |                 if bench.build():
137 |                     score = bench.score(ct_workload, 
138 |                                         rt_workload_space, 
139 |                                         self.base_center_estimator, 
140 |                                         self.variant_center_estimator)
141 | 
142 |                     print(bench.label(), score)
143 | 


--------------------------------------------------------------------------------
/benchmarks/scripts/cub/bench/storage.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import fpzip
 3 | import sqlite3
 4 | import numpy as np
 5 | import pandas as pd
 6 | 
 7 | 
 8 | db_name = "cub_bench_meta.db"
 9 | 
10 | 
11 | def blob_to_samples(blob):
12 |     return np.squeeze(fpzip.decompress(blob))
13 | 
14 | 
15 | class StorageBase:
16 |     def __init__(self, db_path):
17 |         self.conn = sqlite3.connect(db_path)
18 | 
19 |     def connection(self):
20 |         return self.conn
21 | 
22 |     def exists(self):
23 |         return os.path.exists(db_name)
24 | 
25 |     def algnames(self):
26 |         with self.conn:
27 |             result = self.conn.execute("""
28 |             SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'cub.bench.%';
29 |             """).fetchall()
30 | 
31 |         algnames = [algname[0] for algname in result]
32 |         return algnames
33 | 
34 |     def alg_to_df(self, algname):
35 |         with self.conn:
36 |             df = pd.read_sql_query("SELECT * FROM \"{}\"".format(algname), self.conn)
37 |             df['samples'] = df['samples'].apply(blob_to_samples)
38 | 
39 |         return df
40 |     
41 |     def store_df(self, algname, df):
42 |         df['samples'] = df['samples'].apply(fpzip.compress)
43 |         df.to_sql(algname, self.conn, if_exists='replace', index=False)
44 | 
45 | 
46 | class Storage:
47 |     _instance = None
48 | 
49 |     def __new__(cls, *args, **kwargs):
50 |         if cls._instance is None:
51 |             cls._instance = super().__new__(cls, *args, **kwargs)
52 |             cls._instance.base = StorageBase(db_name)
53 |         return cls._instance
54 | 
55 |     def connection(self):
56 |         return self.base.connection()
57 | 
58 |     def exists(self):
59 |         return self.base.exists()
60 | 
61 |     def algnames(self):
62 |         return self.base.algnames()
63 | 
64 |     def alg_to_df(self, algname):
65 |         return self.base.alg_to_df(algname)
66 | 


--------------------------------------------------------------------------------
/benchmarks/scripts/search.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import cub.bench as bench
 4 | 
 5 | 
 6 | # TODO:
 7 | # - driver version
 8 | # - host compiler + version
 9 | # - gpu clocks / pm
10 | # - ecc
11 | 
12 | 
13 | def main():
14 |     center_estimator = bench.MedianCenterEstimator()
15 |     bench.search(bench.BruteForceSeeker(center_estimator, center_estimator))
16 | 
17 | 
18 | if __name__ == "__main__":
19 |     main()
20 | 


--------------------------------------------------------------------------------
/benchmarks/scripts/verify.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | import argparse
 5 | import cub.bench 
 6 | 
 7 | 
 8 | def parse_arguments():
 9 |     parser = argparse.ArgumentParser(description='Verify tuning variant')
10 |     parser.add_argument('--variant', type=str, help='Variant to verify', default=None, required=True)
11 | 
12 |     variant = parser.parse_known_args()[0].variant
13 |     sys.argv.remove('--variant={}'.format(variant))
14 | 
15 |     return variant
16 | 
17 | 
18 | def workload_header(ct_workload_space, rt_workload_space):
19 |     for ct_workload in ct_workload_space:
20 |         for rt_workload in rt_workload_space:
21 |             workload_point = ct_workload + rt_workload
22 |             return ", ".join([x.split('=')[0] for x in workload_point])
23 | 
24 | 
25 | def workload_entry(ct_workload, rt_workload):
26 |     workload_point = ct_workload + rt_workload
27 |     return ", ".join([x.split('=')[1] for x in workload_point])
28 | 
29 | 
30 | class VerifySeeker:
31 |     def __init__(self, variant_label):
32 |         self.label = variant_label
33 |         self.estimator = cub.bench.MedianCenterEstimator()
34 | 
35 |     def __call__(self, algname, ct_workload_space, rt_workload_space):
36 |         variant_point = cub.bench.Config().label_to_variant_point(algname, self.label)
37 | 
38 |         print("{}, MinS, MedianS, MaxS".format(workload_header(ct_workload_space, rt_workload_space)))
39 |         for ct_workload in ct_workload_space:
40 |             bench = cub.bench.Bench(algname, variant_point, list(ct_workload))
41 |             if bench.build():
42 |                 base = bench.get_base()
43 |                 for rt_workload in rt_workload_space:
44 |                     workload_point = ct_workload + rt_workload
45 |                     base_samples, base_elapsed = base.do_run(workload_point, None)
46 |                     variant_samples, _ = bench.do_run(workload_point, base_elapsed * 10)
47 |                     min_speedup = min(base_samples) / min(variant_samples)
48 |                     median_speedup = self.estimator(base_samples) / self.estimator(variant_samples)
49 |                     max_speedup = max(base_samples) / max(variant_samples)
50 |                     point_str = workload_entry(ct_workload, rt_workload)
51 |                     print("{}, {}, {}, {}".format(point_str, min_speedup, median_speedup, max_speedup))
52 | 
53 | 
54 | def main():
55 |     cub.bench.search(VerifySeeker(parse_arguments()))
56 | 
57 | 
58 | if __name__ == "__main__":
59 |     main()
60 | 


--------------------------------------------------------------------------------
/cmake/AppendOptionIfAvailable.cmake:
--------------------------------------------------------------------------------
 1 | include_guard(GLOBAL)
 2 | include(CheckCXXCompilerFlag)
 3 | 
 4 | macro (APPEND_OPTION_IF_AVAILABLE _FLAG _LIST)
 5 | 
 6 | string(MAKE_C_IDENTIFIER "CXX_FLAG_${_FLAG}" _VAR)
 7 | check_cxx_compiler_flag(${_FLAG} ${_VAR})
 8 | 
 9 | if (${${_VAR}})
10 |   list(APPEND ${_LIST} ${_FLAG})
11 | endif ()
12 | 
13 | endmacro ()
14 | 


--------------------------------------------------------------------------------
/cmake/CubAddSubdir.cmake:
--------------------------------------------------------------------------------
1 | find_package(CUB REQUIRED CONFIG
2 |   NO_DEFAULT_PATH # Only check the explicit path in HINTS:
3 |   HINTS "${CMAKE_CURRENT_LIST_DIR}/.."
4 | )
5 | 


--------------------------------------------------------------------------------
/cmake/CubCompilerHacks.cmake:
--------------------------------------------------------------------------------
 1 | # Set up compiler paths and apply temporary hacks to support NVC++.
 2 | # This file must be included before enabling any languages.
 3 | 
 4 | # Temporary hacks to make NVC++ work; this requires you to define
 5 | # `CMAKE_CUDA_COMPILER_ID=NVCXX` and `CMAKE_CUDA_COMPILER_FORCED=ON`.
 6 | if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
 7 |   # If using NVC++, don't set CXX compiler
 8 |   if (NOT "${CMAKE_CXX_COMPILER}" STREQUAL "")
 9 |     unset(CMAKE_CXX_COMPILER CACHE)
10 |     message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have"
11 |       " specified a different ISO C++ compiler; NVC++ acts as both, so please"
12 |       " unset the CMAKE_CXX_COMPILER variable."
13 |     )
14 |   endif()
15 | 
16 |   # We don't set CMAKE_CUDA_HOST_COMPILER for NVC++; if we do, CMake tries to
17 |   # pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to NVC++, which it doesn't
18 |   # understand.
19 |   if (NOT "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "")
20 |     unset(CMAKE_CUDA_HOST_COMPILER CACHE)
21 |     message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have"
22 |       " specified a different host ISO C++ compiler; NVC++ acts as both, so"
23 |       " please unset the CMAKE_CUDA_HOST_COMPILER variable."
24 |     )
25 |   endif()
26 | 
27 |   set(CMAKE_CXX_COMPILER "${CMAKE_CUDA_COMPILER}")
28 |   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -cuda")
29 |   set(CMAKE_CUDA_HOST_LINK_LAUNCHER "${CMAKE_CUDA_COMPILER}")
30 |   set(CMAKE_CUDA_LINK_EXECUTABLE
31 |     "<CMAKE_CUDA_HOST_LINK_LAUNCHER> <FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
32 | endif ()
33 | 
34 | # We don't set CMAKE_CUDA_HOST_COMPILER for NVC++; if we do, CMake tries to
35 | # pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to NVC++, which it doesn't
36 | # understand.
37 | if ((NOT "NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}"))
38 |   if (NOT ("${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "" OR
39 |     "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "${CMAKE_CXX_COMPILER}"))
40 |     set(tmp "${CMAKE_CUDA_HOST_COMPILER}")
41 |     unset(CMAKE_CUDA_HOST_COMPILER CACHE)
42 |     message(FATAL_ERROR
43 |       "For convenience, CUB's test harness uses CMAKE_CXX_COMPILER for the "
44 |       "CUDA host compiler. Refusing to overwrite specified "
45 |       "CMAKE_CUDA_HOST_COMPILER -- please reconfigure without setting this "
46 |       "variable. Currently:\n"
47 |       "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}\n"
48 |       "CMAKE_CUDA_HOST_COMPILER=${tmp}"
49 |     )
50 |   endif ()
51 |   set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
52 | endif ()
53 | 
54 | # Temporary hacks to make NVC++ work; this requires you to define
55 | # `CMAKE_CUDA_COMPILER_ID=NVCXX` and `CMAKE_CUDA_COMPILER_FORCED=ON`.
56 | if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
57 |   # Need 3.17 for the properties used below.
58 |   cmake_minimum_required(VERSION 3.17)
59 | 
60 |   set(CMAKE_CUDA_STANDARD_DEFAULT 03)
61 | 
62 |   set(CMAKE_CUDA03_STANDARD_COMPILE_OPTION "-std=c++03")
63 |   set(CMAKE_CUDA03_EXTENSION_COMPILE_OPTION "-std=c++03")
64 |   set(CMAKE_CUDA03_STANDARD__HAS_FULL_SUPPORT TRUE)
65 |   set_property(GLOBAL PROPERTY CMAKE_CUDA03_KNOWN_FEATURES)
66 | 
67 |   set(CMAKE_CUDA11_STANDARD_COMPILE_OPTION "-std=c++11")
68 |   set(CMAKE_CUDA11_EXTENSION_COMPILE_OPTION "-std=c++11")
69 |   set(CMAKE_CUDA11_STANDARD__HAS_FULL_SUPPORT TRUE)
70 |   set_property(GLOBAL PROPERTY CMAKE_CUDA11_KNOWN_FEATURES)
71 | 
72 |   set(CMAKE_CUDA14_STANDARD_COMPILE_OPTION "-std=c++14")
73 |   set(CMAKE_CUDA14_EXTENSION_COMPILE_OPTION "-std=c++14")
74 |   set(CMAKE_CUDA14_STANDARD__HAS_FULL_SUPPORT TRUE)
75 |   set_property(GLOBAL PROPERTY CMAKE_CUDA14_KNOWN_FEATURES)
76 | 
77 |   set(CMAKE_CUDA17_STANDARD_COMPILE_OPTION "-std=c++17")
78 |   set(CMAKE_CUDA17_EXTENSION_COMPILE_OPTION "-std=c++17")
79 |   set(CMAKE_CUDA17_STANDARD__HAS_FULL_SUPPORT TRUE)
80 |   set_property(GLOBAL PROPERTY CMAKE_CUDA17_KNOWN_FEATURES)
81 | 
82 |   include(Internal/FeatureTesting)
83 |   include(Compiler/CMakeCommonCompilerMacros)
84 |   cmake_record_cuda_compile_features()
85 | 
86 |   set(CMAKE_CUDA_COMPILE_FEATURES
87 |     ${CMAKE_CUDA03_COMPILE_FEATURES}
88 |     ${CMAKE_CUDA11_COMPILE_FEATURES}
89 |     ${CMAKE_CUDA14_COMPILE_FEATURES}
90 |     ${CMAKE_CUDA17_COMPILE_FEATURES}
91 |     ${CMAKE_CUDA20_COMPILE_FEATURES}
92 |   )
93 | endif ()
94 | 


--------------------------------------------------------------------------------
/cmake/CubCudaConfig.cmake:
--------------------------------------------------------------------------------
 1 | enable_language(CUDA)
 2 | 
 3 | if (NOT CUB_IN_THRUST)
 4 |   message(FATAL_ERROR
 5 |     "Building CUB as a standalone project is no longer supported. "
 6 |     "Use the Thrust repo instead.")
 7 | endif()
 8 | 
 9 | set(CUB_CUDA_FLAGS_BASE "${THRUST_CUDA_FLAGS_BASE}")
10 | set(CUB_CUDA_FLAGS_RDC "${THRUST_CUDA_FLAGS_RDC}")
11 | set(CUB_CUDA_FLAGS_NO_RDC "${THRUST_CUDA_FLAGS_NO_RDC}")
12 | 
13 | # Update the enabled architectures list from thrust
14 | foreach (arch IN LISTS THRUST_KNOWN_COMPUTE_ARCHS)
15 |   if (THRUST_ENABLE_COMPUTE_${arch})
16 |     set(CUB_ENABLE_COMPUTE_${arch} True)
17 |     string(APPEND arch_message " sm_${arch}")
18 |   else()
19 |     set(CUB_ENABLE_COMPUTE_${arch} False)
20 |   endif()
21 | endforeach()
22 | 
23 | message(STATUS ${arch_message})
24 | 
25 | #
26 | # RDC options:
27 | #
28 | 
29 | # RDC is off by default in NVCC and on by default in NVC++. Turning off RDC
30 | # isn't currently supported by NVC++. So, we default to RDC off for NVCC and
31 | # RDC on for NVC++.
32 | set(option_init OFF)
33 | if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
34 |   set(option_init ON)
35 | endif()
36 | 
37 | option(CUB_ENABLE_TESTS_WITH_RDC
38 |   "Build all CUB tests with RDC; tests that require RDC are not affected by this option."
39 |   ${option_init}
40 | )
41 | 
42 | option(CUB_ENABLE_EXAMPLES_WITH_RDC
43 |   "Build all CUB examples with RDC; examples which require RDC are not affected by this option."
44 |   ${option_init}
45 | )
46 | 
47 | # Check for RDC/SM compatibility and error/warn if necessary
48 | set(rdc_supported True)
49 | foreach (arch IN LISTS no_rdc_archs)
50 |   if (CUB_ENABLE_COMPUTE_${arch})
51 |     set(rdc_supported False)
52 |     break()
53 |   endif()
54 | endforeach()
55 | 
56 | set(rdc_opts
57 |   CUB_ENABLE_TESTS_WITH_RDC
58 |   CUB_ENABLE_EXAMPLES_WITH_RDC
59 | )
60 | set(rdc_requested False)
61 | foreach (rdc_opt IN LISTS rdc_opts)
62 |   if (${rdc_opt})
63 |     set(rdc_requested True)
64 |     break()
65 |   endif()
66 | endforeach()
67 | 
68 | if (rdc_requested AND NOT rdc_supported)
69 |   string(JOIN ", " no_rdc ${no_rdc_archs})
70 |   string(JOIN "\n" opts ${rdc_opts})
71 |   message(FATAL_ERROR
72 |     "Architectures {${no_rdc}} do not support RDC and are incompatible with "
73 |     "these options:\n${opts}"
74 |   )
75 | endif()
76 | 
77 | 
78 | # 
79 | # Clang CUDA options 
80 | #
81 | if ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
82 |   set(CUB_CUDA_FLAGS_BASE "${CUB_CUDA_FLAGS_BASE} -Wno-unknown-cuda-version -Xclang=-fcuda-allow-variadic-functions")
83 | endif()
84 | 
85 | 
86 | # By default RDC is not used:
87 | set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_NO_RDC}")
88 | 


--------------------------------------------------------------------------------
/cmake/CubHeaderTesting.cmake:
--------------------------------------------------------------------------------
 1 | # For every public header, build a translation unit containing `#include <header>`
 2 | # to let the compiler try to figure out warnings in that header if it is not otherwise
 3 | # included in tests, and also to verify if the headers are modular enough.
 4 | # .inl files are not globbed for, because they are not supposed to be used as public
 5 | # entrypoints.
 6 | 
 7 | # Meta target for all configs' header builds:
 8 | add_custom_target(cub.all.headers)
 9 | 
10 | file(GLOB_RECURSE headers
11 |   RELATIVE "${CUB_SOURCE_DIR}/cub"
12 |   CONFIGURE_DEPENDS
13 |   cub/*.cuh
14 | )
15 | 
16 | set(headertest_srcs)
17 | foreach (header IN LISTS headers)
18 |   set(headertest_src "headers/${header}.cu")
19 |   configure_file("${CUB_SOURCE_DIR}/cmake/header_test.in" "${headertest_src}")
20 |   list(APPEND headertest_srcs "${headertest_src}")
21 | endforeach()
22 | 
23 | function(cub_add_header_test label definitions)
24 |   foreach(cub_target IN LISTS CUB_TARGETS)
25 |     cub_get_target_property(config_prefix ${cub_target} PREFIX)
26 | 
27 |     set(headertest_target ${config_prefix}.headers.${label})
28 |     add_library(${headertest_target} OBJECT ${headertest_srcs})
29 |     target_link_libraries(${headertest_target} PUBLIC ${cub_target})
30 |     target_compile_definitions(${headertest_target} PRIVATE ${definitions})
31 |     cub_clone_target_properties(${headertest_target} ${cub_target})
32 | 
33 |     if (CUB_IN_THRUST)
34 |       thrust_fix_clang_nvcc_build_for(${headertest_target})
35 |     endif()
36 | 
37 |     add_dependencies(cub.all.headers ${headertest_target})
38 |     add_dependencies(${config_prefix}.all ${headertest_target})
39 |   endforeach()
40 | endfunction()
41 | 
42 | # Wrap Thrust/CUB in a custom namespace to check proper use of ns macros:
43 | set(header_definitions 
44 |   "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" 
45 |   "CUB_WRAPPED_NAMESPACE=wrapped_cub")
46 | cub_add_header_test(base "${header_definitions}")
47 | 
48 | list(APPEND header_definitions "CUB_DISABLE_BF16_SUPPORT")
49 | cub_add_header_test(bf16 "${header_definitions}")
50 | 
51 | 


--------------------------------------------------------------------------------
/cmake/CubInstallRules.cmake:
--------------------------------------------------------------------------------
 1 | # Thrust manages its own copy of these rules. Update ThrustInstallRules.cmake
 2 | # if modifying this file.
 3 | if (CUB_IN_THRUST)
 4 |   return()
 5 | endif()
 6 | 
 7 | # Bring in CMAKE_INSTALL_LIBDIR
 8 | include(GNUInstallDirs)
 9 | 
10 | # CUB is a header library; no need to build anything before installing:
11 | set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY TRUE)
12 | 
13 | install(DIRECTORY "${CUB_SOURCE_DIR}/cub"
14 |   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
15 |   FILES_MATCHING
16 |     PATTERN "*.cuh"
17 | )
18 | 
19 | install(DIRECTORY "${CUB_SOURCE_DIR}/cub/cmake/"
20 |   DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cub"
21 |   PATTERN *.cmake.in EXCLUDE
22 | )
23 | # Need to configure a file to store the infix specified in
24 | # CMAKE_INSTALL_INCLUDEDIR since it can be defined by the user
25 | set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/cub")
26 | configure_file("${CUB_SOURCE_DIR}/cub/cmake/cub-header-search.cmake.in"
27 |   "${CUB_BINARY_DIR}/cub/cmake/cub-header-search.cmake"
28 |   @ONLY)
29 | install(FILES "${CUB_BINARY_DIR}/cub/cmake/cub-header-search.cmake"
30 |   DESTINATION "${install_location}")
31 | 


--------------------------------------------------------------------------------
/cmake/CubUtilities.cmake:
--------------------------------------------------------------------------------
 1 | # Enable RDC for a CUDA target. Encapsulates compiler hacks:
 2 | function(cub_enable_rdc_for_cuda_target target_name)
 3 |   if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
 4 |     set_target_properties(${target_name} PROPERTIES
 5 |       COMPILE_FLAGS "-gpu=rdc"
 6 |     )
 7 |   elseif ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
 8 |   else()
 9 |     set_target_properties(${target_name} PROPERTIES
10 |       CUDA_SEPARABLE_COMPILATION ON
11 |     )
12 |   endif()
13 | endfunction()
14 | 


--------------------------------------------------------------------------------
/cmake/header_test.in:
--------------------------------------------------------------------------------
 1 | // This source file checks that:
 2 | // 1) Header <cub/${header}> compiles without error.
 3 | // 2) Common macro collisions with platform/system headers are avoided.
 4 | 
 5 | // Define CUB_MACRO_CHECK(macro, header), which emits a diagnostic indicating
 6 | // a potential macro collision and halts.
 7 | //
 8 | // Use raw platform checks instead of the CUB_HOST_COMPILER macros since we
 9 | // don't want to #include any headers other than the one being tested.
10 | //
11 | // This is only implemented for MSVC/GCC/Clang.
12 | #if defined(_MSC_VER) // MSVC
13 | 
14 | // Fake up an error for MSVC
15 | #define CUB_MACRO_CHECK_IMPL(msg)                                              \
16 |   /* Print message that looks like an error: */                                \
17 |   __pragma(message(__FILE__ ":" CUB_MACRO_CHECK_IMPL0(__LINE__)                \
18 |                    ": error: " #msg))                                          \
19 |   /* abort compilation due to static_assert or syntax error: */                \
20 |   static_assert(false, #msg);
21 | #define CUB_MACRO_CHECK_IMPL0(x) CUB_MACRO_CHECK_IMPL1(x)
22 | #define CUB_MACRO_CHECK_IMPL1(x) #x
23 | 
24 | #elif defined(__clang__) || defined(__GNUC__)
25 | 
26 | // GCC/clang are easy:
27 | #define CUB_MACRO_CHECK_IMPL(msg) CUB_MACRO_CHECK_IMPL0(GCC error #msg)
28 | #define CUB_MACRO_CHECK_IMPL0(expr) _Pragma(#expr)
29 | 
30 | #endif
31 | 
32 | // Hacky way to build a string, but it works on all tested platforms.
33 | #define CUB_MACRO_CHECK(MACRO, HEADER)                                         \
34 |   CUB_MACRO_CHECK_IMPL(Identifier MACRO should not be used from CUB            \
35 |                        headers due to conflicts with HEADER macros.)
36 | 
37 | // complex.h conflicts
38 | #define I CUB_MACRO_CHECK('I', complex.h)
39 | 
40 | // windows.h conflicts
41 | #define small CUB_MACRO_CHECK('small', windows.h)
42 | // We can't enable these checks without breaking some builds -- some standard
43 | // library implementations unconditionally `#undef` these macros, which then
44 | // causes random failures later.
45 | // Leaving these commented out as a warning: Here be dragons.
46 | //#define min(...) CUB_MACRO_CHECK('min', windows.h)
47 | //#define max(...) CUB_MACRO_CHECK('max', windows.h)
48 | 
49 | // termios.h conflicts (NVIDIA/thrust#1547)
50 | #define B0 CUB_MACRO_CHECK("B0", termios.h)
51 | 
52 | #include <cub/${header}>
53 | 
54 | #if defined(CUB_DISABLE_BF16_SUPPORT)
55 | #if defined(__CUDA_BF16_TYPES_EXIST__)
56 | #error CUB should not include cuda_bf16.h when BF16 support is disabled
57 | #endif
58 | #endif
59 | 


--------------------------------------------------------------------------------
/cub/block/specializations/block_histogram_atomic.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 4 |  * 
 5 |  * Redistribution and use in source and binary forms, with or without
 6 |  * modification, are permitted provided that the following conditions are met:
 7 |  *     * Redistributions of source code must retain the above copyright
 8 |  *       notice, this list of conditions and the following disclaimer.
 9 |  *     * Redistributions in binary form must reproduce the above copyright
10 |  *       notice, this list of conditions and the following disclaimer in the
11 |  *       documentation and/or other materials provided with the distribution.
12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
13 |  *       names of its contributors may be used to endorse or promote products
14 |  *       derived from this software without specific prior written permission.
15 |  * 
16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  *
27 |  ******************************************************************************/
28 | 
29 | /**
30 |  * \file
31 |  * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
32 |  */
33 | 
34 | #pragma once
35 | 
36 | #include "../../config.cuh"
37 | 
38 | CUB_NAMESPACE_BEGIN
39 | 
40 | 
41 | /**
42 |  * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
43 |  */
44 | template <int BINS>
45 | struct BlockHistogramAtomic
46 | {
47 |     /// Shared memory storage layout type
48 |     struct TempStorage {};
49 | 
50 | 
51 |     /// Constructor
52 |     __device__ __forceinline__ BlockHistogramAtomic(
53 |         TempStorage &temp_storage)
54 |     {}
55 | 
56 | 
57 |     /// Composite data onto an existing histogram
58 |     template <
59 |         typename            T,
60 |         typename            CounterT,     
61 |         int                 ITEMS_PER_THREAD>
62 |     __device__ __forceinline__ void Composite(
63 |         T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
64 |         CounterT             histogram[BINS])                 ///< [out] Reference to shared/device-accessible memory histogram
65 |     {
66 |         // Update histogram
67 |         #pragma unroll
68 |         for (int i = 0; i < ITEMS_PER_THREAD; ++i)
69 |         {
70 |               atomicAdd(histogram + items[i], 1);
71 |         }
72 |     }
73 | 
74 | };
75 | 
76 | CUB_NAMESPACE_END
77 | 
78 | 


--------------------------------------------------------------------------------
/cub/cmake/cub-config-version.cmake:
--------------------------------------------------------------------------------
 1 | # Parse version information from version.cuh:
 2 | include("${CMAKE_CURRENT_LIST_DIR}/cub-header-search.cmake")
 3 | 
 4 | file(READ "${_CUB_VERSION_INCLUDE_DIR}/cub/version.cuh" CUB_VERSION_HEADER)
 5 | string(REGEX MATCH "#define[ \t]+CUB_VERSION[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}")
 6 | set(CUB_VERSION_FLAT ${CMAKE_MATCH_1})
 7 | # Note that CUB calls this the PATCH number, CMake calls it the TWEAK number:
 8 | string(REGEX MATCH "#define[ \t]+CUB_PATCH_NUMBER[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}")
 9 | set(CUB_VERSION_TWEAK ${CMAKE_MATCH_1})
10 | 
11 | math(EXPR CUB_VERSION_MAJOR "${CUB_VERSION_FLAT} / 100000")
12 | math(EXPR CUB_VERSION_MINOR "(${CUB_VERSION_FLAT} / 100) % 1000")
13 | math(EXPR CUB_VERSION_PATCH "${CUB_VERSION_FLAT} % 100") # CUB: "subminor" CMake: "patch"
14 | 
15 | set(CUB_VERSION "${CUB_VERSION_MAJOR}.${CUB_VERSION_MINOR}.${CUB_VERSION_PATCH}.${CUB_VERSION_TWEAK}")
16 | 
17 | set(PACKAGE_VERSION ${CUB_VERSION})
18 | set(PACKAGE_VERSION_COMPATIBLE FALSE)
19 | set(PACKAGE_VERSION_EXACT FALSE)
20 | set(PACKAGE_VERSION_UNSUITABLE FALSE)
21 | 
22 | if(PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION)
23 |   if(CUB_VERSION_MAJOR VERSION_EQUAL PACKAGE_FIND_VERSION_MAJOR AND
24 |      CUB_VERSION_MINOR VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MINOR)
25 |     set(PACKAGE_VERSION_COMPATIBLE TRUE)
26 |   endif()
27 | 
28 |   if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
29 |     set(PACKAGE_VERSION_EXACT TRUE)
30 |   endif()
31 | endif()
32 | 


--------------------------------------------------------------------------------
/cub/cmake/cub-config.cmake:
--------------------------------------------------------------------------------
  1 | #
  2 | # find_package(CUB) config file.
  3 | #
  4 | # Defines a CUB::CUB target that may be linked from user projects to include
  5 | # CUB.
  6 | 
  7 | if (TARGET CUB::CUB)
  8 |   return()
  9 | endif()
 10 | 
 11 | # Minimum supported libcudacxx version:
 12 | set(cub_libcudacxx_version 1.8.0)
 13 | 
 14 | function(_cub_declare_interface_alias alias_name ugly_name)
 15 |   # 1) Only IMPORTED and ALIAS targets can be placed in a namespace.
 16 |   # 2) When an IMPORTED library is linked to another target, its include
 17 |   #    directories are treated as SYSTEM includes.
 18 |   # 3) nvcc will automatically check the CUDA Toolkit include path *before* the
 19 |   #    system includes. This means that the Toolkit CUB will *always* be used
 20 |   #    during compilation, and the include paths of an IMPORTED CUB::CUB
 21 |   #    target will never have any effect.
 22 |   # 4) This behavior can be fixed by setting the property NO_SYSTEM_FROM_IMPORTED
 23 |   #    on EVERY target that links to CUB::CUB. This would be a burden and a
 24 |   #    footgun for our users. Forgetting this would silently pull in the wrong CUB!
 25 |   # 5) A workaround is to make a non-IMPORTED library outside of the namespace,
 26 |   #    configure it, and then ALIAS it into the namespace (or ALIAS and then
 27 |   #    configure, that seems to work too).
 28 |   add_library(${ugly_name} INTERFACE)
 29 |   add_library(${alias_name} ALIAS ${ugly_name})
 30 | endfunction()
 31 | 
 32 | #
 33 | # Setup some internal cache variables
 34 | #
 35 | 
 36 | # Pull in the include dir detected by cub-config-version.cmake
 37 | set(_CUB_INCLUDE_DIR "${_CUB_VERSION_INCLUDE_DIR}"
 38 |   CACHE INTERNAL "Location of CUB headers."
 39 |   FORCE
 40 | )
 41 | unset(_CUB_VERSION_INCLUDE_DIR CACHE) # Clear tmp variable from cache
 42 | 
 43 | if (${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY)
 44 |   set(_CUB_QUIET ON CACHE INTERNAL "Quiet mode enabled for CUB find_package calls." FORCE)
 45 |   set(_CUB_QUIET_FLAG "QUIET" CACHE INTERNAL "" FORCE)
 46 | else()
 47 |   set(_CUB_QUIET OFF CACHE INTERNAL "Quiet mode enabled for CUB find_package calls." FORCE)
 48 |   set(_CUB_QUIET_FLAG "" CACHE INTERNAL "" FORCE)
 49 | endif()
 50 | 
 51 | #
 52 | # Setup dependencies
 53 | #
 54 | 
 55 | if (NOT TARGET CUB::libcudacxx)
 56 |   if (TARGET Thrust::libcudacxx)
 57 |     # Prefer the same libcudacxx as Thrust, if available:
 58 |     _cub_declare_interface_alias(CUB::libcudacxx _CUB_libcudacxx)
 59 |     target_link_libraries(_CUB_libcudacxx INTERFACE Thrust::libcudacxx)
 60 |   else()
 61 |     if (NOT TARGET libcudacxx::libcudacxx)
 62 |       # First do a non-required search for any co-packaged versions.
 63 |       # These are preferred.
 64 |       find_package(libcudacxx ${cub_libcudacxx_version} CONFIG
 65 |         ${_CUB_QUIET_FLAG}
 66 |         NO_DEFAULT_PATH # Only check the explicit HINTS below:
 67 |         HINTS
 68 |           "${_CUB_INCLUDE_DIR}/../libcudacxx"           # Source layout
 69 |           "${_CUB_CMAKE_DIR}/.."                        # Install layout
 70 |       )
 71 | 
 72 |       # A second required search allows externally packaged to be used and fails if
 73 |       # no suitable package exists.
 74 |       find_package(libcudacxx ${cub_libcudacxx_version} CONFIG
 75 |         REQUIRED
 76 |         ${_CUB_QUIET_FLAG}
 77 |       )
 78 |     endif()
 79 |     _cub_declare_interface_alias(CUB::libcudacxx _CUB_libcudacxx)
 80 |     target_link_libraries(_CUB_libcudacxx INTERFACE libcudacxx::libcudacxx)
 81 |   endif()
 82 | endif()
 83 | 
 84 | #
 85 | # Setup targets
 86 | #
 87 | 
 88 | _cub_declare_interface_alias(CUB::CUB _CUB_CUB)
 89 | target_include_directories(_CUB_CUB INTERFACE "${_CUB_INCLUDE_DIR}")
 90 | target_link_libraries(_CUB_CUB INTERFACE CUB::libcudacxx)
 91 | 
 92 | if (CUB_IGNORE_DEPRECATED_API OR THRUST_IGNORE_DEPRECATED_API)
 93 |   target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_API")
 94 | endif()
 95 | 
 96 | if (CUB_IGNORE_DEPRECATED_CPP_DIALECT OR
 97 |     THRUST_IGNORE_DEPRECATED_CPP_DIALECT)
 98 |   target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_DIALECT")
 99 | endif()
100 | 
101 | if (CUB_IGNORE_DEPRECATED_CPP_11 OR
102 |     THRUST_IGNORE_DEPRECATED_CPP_11)
103 |   target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_11")
104 | endif()
105 | 
106 | if (CUB_IGNORE_DEPRECATED_COMPILER OR
107 |     THRUST_IGNORE_DEPRECATED_COMPILER)
108 |   target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_COMPILER")
109 | endif()
110 | 
111 | #
112 | # Standardize version info
113 | #
114 | 
115 | set(CUB_VERSION ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} CACHE INTERNAL "" FORCE)
116 | set(CUB_VERSION_MAJOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MAJOR} CACHE INTERNAL "" FORCE)
117 | set(CUB_VERSION_MINOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MINOR} CACHE INTERNAL "" FORCE)
118 | set(CUB_VERSION_PATCH ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_PATCH} CACHE INTERNAL "" FORCE)
119 | set(CUB_VERSION_TWEAK ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_TWEAK} CACHE INTERNAL "" FORCE)
120 | set(CUB_VERSION_COUNT ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_COUNT} CACHE INTERNAL "" FORCE)
121 | 
122 | include(FindPackageHandleStandardArgs)
123 | if (NOT CUB_CONFIG)
124 |   set(CUB_CONFIG "${CMAKE_CURRENT_LIST_FILE}")
125 | endif()
126 | find_package_handle_standard_args(CUB CONFIG_MODE)
127 | 


--------------------------------------------------------------------------------
/cub/cmake/cub-header-search.cmake:
--------------------------------------------------------------------------------
1 | # Parse version information from version.h in source tree
2 | set(_CUB_VERSION_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}/../..")
3 | if(EXISTS "${_CUB_VERSION_INCLUDE_DIR}/cub/version.cuh")
4 |   set(_CUB_VERSION_INCLUDE_DIR "${_CUB_VERSION_INCLUDE_DIR}" CACHE FILEPATH "" FORCE) # Clear old result
5 |   set_property(CACHE _CUB_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL)
6 | endif()
7 | 


--------------------------------------------------------------------------------
/cub/cmake/cub-header-search.cmake.in:
--------------------------------------------------------------------------------
 1 | # Parse version information from version.h:
 2 | unset(_CUB_VERSION_INCLUDE_DIR CACHE) # Clear old result to force search
 3 | 
 4 | # Find CMAKE_INSTALL_INCLUDEDIR=@CMAKE_INSTALL_INCLUDEDIR@ directory"
 5 | set(from_install_prefix "@install_location@")
 6 | 
 7 | # Transform to a list of directories, replace each directoy with "../"
 8 | # and convert back to a string
 9 | string(REGEX REPLACE "/" ";" from_install_prefix "${from_install_prefix}")
10 | list(TRANSFORM from_install_prefix REPLACE ".+" "../")
11 | list(JOIN from_install_prefix "" from_install_prefix)
12 | 
13 | find_path(_CUB_VERSION_INCLUDE_DIR cub/version.cuh
14 |   NO_CMAKE_FIND_ROOT_PATH # Don't allow CMake to re-root the search
15 |   NO_DEFAULT_PATH # Only search explicit paths below:
16 |   PATHS
17 |     "${CMAKE_CURRENT_LIST_DIR}/${from_install_prefix}/@CMAKE_INSTALL_INCLUDEDIR@"
18 | )
19 | set_property(CACHE _CUB_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL)
20 | 


--------------------------------------------------------------------------------
/cub/config.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | /**
29 |  * \file
30 |  * Static configuration header for the CUB project.
31 |  */
32 | 
33 | #pragma once
34 | 
35 | #include "util_arch.cuh"
36 | #include "util_compiler.cuh"
37 | #include "util_cpp_dialect.cuh"
38 | #include "util_deprecated.cuh"
39 | #include "util_macro.cuh"
40 | #include "util_namespace.cuh"
41 | 


--------------------------------------------------------------------------------
/cub/cub.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  4 |  *
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  *
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /**
 30 |  * \file
 31 |  * CUB umbrella include file
 32 |  */
 33 | 
 34 | #pragma once
 35 | 
 36 | // Static configuration
 37 | #include "config.cuh"
 38 | 
 39 | // Block
 40 | #include "block/block_adjacent_difference.cuh"
 41 | #include "block/block_discontinuity.cuh"
 42 | #include "block/block_exchange.cuh"
 43 | #include "block/block_histogram.cuh"
 44 | #include "block/block_load.cuh"
 45 | #include "block/block_merge_sort.cuh"
 46 | #include "block/block_radix_rank.cuh"
 47 | #include "block/block_radix_sort.cuh"
 48 | #include "block/block_reduce.cuh"
 49 | #include "block/block_scan.cuh"
 50 | #include "block/block_store.cuh"
 51 | //#include "block/block_shift.cuh"
 52 | 
 53 | // Device
 54 | #include "device/device_adjacent_difference.cuh"
 55 | #include "device/device_copy.cuh"
 56 | #include "device/device_histogram.cuh"
 57 | #include "device/device_memcpy.cuh"
 58 | #include "device/device_merge_sort.cuh"
 59 | #include "device/device_partition.cuh"
 60 | #include "device/device_radix_sort.cuh"
 61 | #include "device/device_reduce.cuh"
 62 | #include "device/device_run_length_encode.cuh"
 63 | #include "device/device_scan.cuh"
 64 | #include "device/device_segmented_radix_sort.cuh"
 65 | #include "device/device_segmented_reduce.cuh"
 66 | #include "device/device_segmented_sort.cuh"
 67 | #include "device/device_select.cuh"
 68 | #include "device/device_spmv.cuh"
 69 | 
 70 | // Grid
 71 | //#include "grid/grid_barrier.cuh"
 72 | #include "grid/grid_even_share.cuh"
 73 | #include "grid/grid_mapping.cuh"
 74 | #include "grid/grid_queue.cuh"
 75 | 
 76 | // Thread
 77 | #include "thread/thread_load.cuh"
 78 | #include "thread/thread_operators.cuh"
 79 | #include "thread/thread_reduce.cuh"
 80 | #include "thread/thread_scan.cuh"
 81 | #include "thread/thread_store.cuh"
 82 | 
 83 | // Warp
 84 | #include "warp/warp_exchange.cuh"
 85 | #include "warp/warp_load.cuh"
 86 | #include "warp/warp_merge_sort.cuh"
 87 | #include "warp/warp_reduce.cuh"
 88 | #include "warp/warp_scan.cuh"
 89 | #include "warp/warp_store.cuh"
 90 | 
 91 | // Iterator
 92 | #include "iterator/arg_index_input_iterator.cuh"
 93 | #include "iterator/cache_modified_input_iterator.cuh"
 94 | #include "iterator/cache_modified_output_iterator.cuh"
 95 | #include "iterator/constant_input_iterator.cuh"
 96 | #include "iterator/counting_input_iterator.cuh"
 97 | #include "iterator/discard_output_iterator.cuh"
 98 | #include "iterator/tex_obj_input_iterator.cuh"
 99 | #include "iterator/tex_ref_input_iterator.cuh"
100 | #include "iterator/transform_input_iterator.cuh"
101 | 
102 | // Util
103 | #include "util_allocator.cuh"
104 | #include "util_arch.cuh"
105 | #include "util_debug.cuh"
106 | #include "util_device.cuh"
107 | #include "util_macro.cuh"
108 | #include "util_ptx.cuh"
109 | #include "util_type.cuh"
110 | 


--------------------------------------------------------------------------------
/cub/detail/choose_offset.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | #pragma once
29 | 
30 | #include <cub/config.cuh>
31 | 
32 | #include <cstdint>
33 | #include <type_traits>
34 | 
35 | CUB_NAMESPACE_BEGIN
36 | 
37 | namespace detail
38 | {
39 | 
40 | /**
41 |  * ChooseOffsetT checks NumItemsT, the type of the num_items parameter, and
42 |  * selects the offset type based on it.
43 |  */
44 | template <typename NumItemsT>
45 | struct ChooseOffsetT
46 | {
47 |   // NumItemsT must be an integral type (but not bool).
48 |   static_assert(
49 |     std::is_integral<NumItemsT>::value &&
50 |       !std::is_same<typename std::remove_cv<NumItemsT>::type, bool>::value,
51 |     "NumItemsT must be an integral type, but not bool");
52 | 
53 |   // Unsigned integer type for global offsets.
54 |   using Type = typename std::conditional<sizeof(NumItemsT) <= 4,
55 |                                          std::uint32_t,
56 |                                          unsigned long long>::type;
57 | };
58 | 
59 | } // namespace detail
60 | 
61 | CUB_NAMESPACE_END
62 | 
63 | 


--------------------------------------------------------------------------------
/cub/detail/cpp_compatibility.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 | *  Copyright 2022 NVIDIA Corporation
 3 | *
 4 | *  Licensed under the Apache License, Version 2.0 (the "License");
 5 | *  you may not use this file except in compliance with the License.
 6 | *  You may obtain a copy of the License at
 7 | *
 8 | *      http://www.apache.org/licenses/LICENSE-2.0
 9 | *
10 | *  Unless required by applicable law or agreed to in writing, software
11 | *  distributed under the License is distributed on an "AS IS" BASIS,
12 | *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | *  See the License for the specific language governing permissions and
14 | *  limitations under the License.
15 | */
16 | 
17 | 
18 | #pragma once
19 | 
20 | #include <cub/util_cpp_dialect.cuh>
21 | 
22 | #if CUB_CPP_DIALECT >= 2017 && __cpp_if_constexpr
23 | #  define CUB_IF_CONSTEXPR if constexpr
24 | #  define CUB_ELSE_IF_CONSTEXPR else if constexpr
25 | #else
26 | #  define CUB_IF_CONSTEXPR if
27 | #  define CUB_ELSE_IF_CONSTEXPR else if
28 | #endif
29 | 


--------------------------------------------------------------------------------
/cub/detail/detect_cuda_runtime.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  4 |  *
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  *
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /**
 30 |  * \file
 31 |  * Utilities for CUDA dynamic parallelism.
 32 |  */
 33 | 
 34 | #pragma once
 35 | 
 36 | #include <cub/util_namespace.cuh>
 37 | 
 38 | #include <cuda_runtime_api.h>
 39 | 
 40 | CUB_NAMESPACE_BEGIN
 41 | namespace detail
 42 | {
 43 | 
 44 | #ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes:
 45 | 
 46 | /**
 47 |  * \def CUB_DISABLE_CDP
 48 |  *
 49 |  * If defined, support for device-side usage of CUB is disabled.
 50 |  */
 51 | #define CUB_DISABLE_CDP
 52 | 
 53 | /**
 54 |  * \def CUB_RDC_ENABLED
 55 |  *
 56 |  * Defined if RDC is enabled and CUB_DISABLE_CDP is not defined.
 57 |  */
 58 | #define CUB_RDC_ENABLED
 59 | 
 60 | /**
 61 |  * \def CUB_RUNTIME_FUNCTION
 62 |  *
 63 |  * Execution space for functions that can use the CUDA runtime API (`__host__`
 64 |  * when RDC is off, `__host__ __device__` when RDC is on).
 65 |  */
 66 | #define CUB_RUNTIME_FUNCTION
 67 | 
 68 | /**
 69 |  * \def CUB_RUNTIME_ENABLED
 70 |  *
 71 |  * Whether or not the active compiler pass is allowed to invoke device kernels
 72 |  * or methods from the CUDA runtime API.
 73 |  *
 74 |  * This macro should not be used in CUB, as it depends on `__CUDA_ARCH__`
 75 |  * and is not compatible with `NV_IF_TARGET`. It is provided for legacy
 76 |  * purposes only.
 77 |  *
 78 |  * Replace any usages with `CUB_RDC_ENABLED` and `NV_IF_TARGET`.
 79 |  */
 80 | #define CUB_RUNTIME_ENABLED
 81 | 
 82 | #else // Non-doxygen pass:
 83 | 
 84 | #ifndef CUB_RUNTIME_FUNCTION
 85 | 
 86 | #if defined(__CUDACC_RDC__) && !defined(CUB_DISABLE_CDP)
 87 | 
 88 | #define CUB_RDC_ENABLED
 89 | #define CUB_RUNTIME_FUNCTION __host__ __device__
 90 | 
 91 | #else // RDC disabled:
 92 | 
 93 | #define CUB_RUNTIME_FUNCTION __host__
 94 | 
 95 | #endif // RDC enabled
 96 | 
 97 | #if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__)
 98 | // Legacy only -- do not use in new code.
 99 | #define CUB_RUNTIME_ENABLED
100 | #endif
101 | 
102 | #endif // CUB_RUNTIME_FUNCTION predefined
103 | 
104 | #ifdef CUB_RDC_ENABLED
105 | // Detect available version of CDP:
106 | #if __CUDACC_VER_MAJOR__ < 12 || defined(CUDA_FORCE_CDP1_IF_SUPPORTED)
107 | #define CUB_DETAIL_CDPv1
108 | #else
109 | #define CUB_DETAIL_CDPv2
110 | #endif
111 | #endif
112 | 
113 | #endif // Do not document
114 | 
115 | } // namespace detail
116 | CUB_NAMESPACE_END
117 | 


--------------------------------------------------------------------------------
/cub/detail/device_double_buffer.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright 2021 NVIDIA Corporation
 3 |  *
 4 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 5 |  *  you may not use this file except in compliance with the License.
 6 |  *  You may obtain a copy of the License at
 7 |  *
 8 |  *      http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  *  Unless required by applicable law or agreed to in writing, software
11 |  *  distributed under the License is distributed on an "AS IS" BASIS,
12 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  *  See the License for the specific language governing permissions and
14 |  *  limitations under the License.
15 |  */
16 | 
17 | #pragma once
18 | 
19 | #include <cub/util_namespace.cuh>
20 | 
21 | 
22 | CUB_NAMESPACE_BEGIN
23 | 
24 | namespace detail
25 | {
26 | 
27 | 
28 | /**
29 |  * @brief It's a double-buffer storage wrapper for multi-pass stream
30 |  *        transformations that require more than one storage array for
31 |  *        streaming intermediate results back and forth.
32 |  *
33 |  * Many multi-pass computations require a pair of "ping-pong" storage buffers
34 |  * (e.g., one for reading from and the other for writing to, and then
35 |  * vice-versa for the subsequent pass). This structure wraps a set of device
36 |  * buffers.
37 |  *
38 |  * Unlike `cub::DoubleBuffer` this class doesn't provide a "selector" member
39 |  * to track which buffer is "current". The main reason for this class existence
40 |  * is the performance difference. Since `cub::DoubleBuffer` relies on the
41 |  * runtime variable to index pointers arrays, they are placed in the local
42 |  * memory instead of registers. Local memory accesses significantly affect
43 |  * performance. On the contrary, this class swaps pointer, so all operations
44 |  * can be performed in registers.
45 |  */
46 | template <typename T>
47 | class device_double_buffer
48 | {
49 |   /// Pair of device buffer pointers
50 |   T *m_current_buffer {};
51 |   T *m_alternate_buffer {};
52 | 
53 | public:
54 |   /**
55 |    * @param d_current
56 |    *   The currently valid buffer
57 |    *
58 |    * @param d_alternate
59 |    *   Alternate storage buffer of the same size as @p d_current
60 |    */
61 |   __host__ __device__ __forceinline__ device_double_buffer(T *current,
62 |                                                            T *alternate)
63 |       : m_current_buffer(current)
64 |       , m_alternate_buffer(alternate)
65 |   {}
66 | 
67 |   /// \brief Return pointer to the currently valid buffer
68 |   __host__ __device__ __forceinline__ T *current() const
69 |   {
70 |     return m_current_buffer;
71 |   }
72 | 
73 |   /// \brief Return pointer to the currently invalid buffer
74 |   __host__ __device__ __forceinline__ T *alternate() const
75 |   {
76 |     return m_alternate_buffer;
77 |   }
78 | 
79 |   __host__ __device__ void swap()
80 |   {
81 |     T *tmp             = m_current_buffer;
82 |     m_current_buffer   = m_alternate_buffer;
83 |     m_alternate_buffer = tmp;
84 |   }
85 | };
86 | 
87 | 
88 | } // namespace detail
89 | 
90 | CUB_NAMESPACE_END
91 | 


--------------------------------------------------------------------------------
/cub/detail/device_synchronize.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 |  *  Copyright 2021 NVIDIA Corporation
 3 |  *
 4 |  *  Licensed under the Apache License, Version 2.0 (the "License");
 5 |  *  you may not use this file except in compliance with the License.
 6 |  *  You may obtain a copy of the License at
 7 |  *
 8 |  *      http://www.apache.org/licenses/LICENSE-2.0
 9 |  *
10 |  *  Unless required by applicable law or agreed to in writing, software
11 |  *  distributed under the License is distributed on an "AS IS" BASIS,
12 |  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  *  See the License for the specific language governing permissions and
14 |  *  limitations under the License.
15 |  */
16 | 
17 | #pragma once
18 | 
19 | #include <cub/detail/detect_cuda_runtime.cuh>
20 | #include <cub/detail/exec_check_disable.cuh>
21 | #include <cub/util_arch.cuh>
22 | #include <cub/util_namespace.cuh>
23 | 
24 | #include <nv/target>
25 | 
26 | #include <cuda_runtime_api.h>
27 | 
28 | CUB_NAMESPACE_BEGIN
29 | 
30 | namespace detail
31 | {
32 | 
33 | /**
34 |  * Call `cudaDeviceSynchronize()` using the proper API for the current CUB and
35 |  * CUDA configuration.
36 |  */
37 | CUB_EXEC_CHECK_DISABLE
38 | CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize()
39 | {
40 |   cudaError_t result = cudaErrorNotSupported;
41 | 
42 |   // Device-side sync is only available under CDPv1:
43 | #if defined(CUB_DETAIL_CDPv1)
44 | 
45 | #if ((__CUDACC_VER_MAJOR__ > 11) ||                                            \
46 |      ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6)))
47 |   // CUDA >= 11.6
48 | #define CUB_TMP_DEVICE_SYNC_IMPL                                               \
49 |   result = __cudaDeviceSynchronizeDeprecationAvoidance();
50 | #else // CUDA < 11.6:
51 | #define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize();
52 | #endif
53 | 
54 | #else // CDPv2 or no CDP:
55 | 
56 | #define CUB_TMP_DEVICE_SYNC_IMPL /* unavailable */
57 | 
58 | #endif // CDP version
59 | 
60 |   NV_IF_TARGET(NV_IS_HOST,
61 |                (result = cudaDeviceSynchronize();),
62 |                (CUB_TMP_DEVICE_SYNC_IMPL));
63 | 
64 | #undef CUB_TMP_DEVICE_SYNC_IMPL
65 | 
66 |   return result;
67 | }
68 | 
69 | } // namespace detail
70 | 
71 | CUB_NAMESPACE_END
72 | 


--------------------------------------------------------------------------------
/cub/detail/exec_check_disable.cuh:
--------------------------------------------------------------------------------
 1 | /*
 2 | *  Copyright 2021 NVIDIA Corporation
 3 | *
 4 | *  Licensed under the Apache License, Version 2.0 (the "License");
 5 | *  you may not use this file except in compliance with the License.
 6 | *  You may obtain a copy of the License at
 7 | *
 8 | *      http://www.apache.org/licenses/LICENSE-2.0
 9 | *
10 | *  Unless required by applicable law or agreed to in writing, software
11 | *  distributed under the License is distributed on an "AS IS" BASIS,
12 | *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | *  See the License for the specific language governing permissions and
14 | *  limitations under the License.
15 | */
16 | 
17 | #pragma once
18 | 
19 | #include <cub/util_compiler.cuh>
20 | 
21 | /**
22 |  * @def CUB_EXEC_CHECK_DISABLE
23 |  * Wrapper around `#pragma nv_exec_check_disable`.
24 |  */
25 | 
26 | // #pragma nv_exec_check_disable is only recognized by NVCC.
27 | #if defined(__CUDACC__) && \
28 |     !defined(_NVHPC_CUDA) && \
29 |     !(defined(__CUDA__) && defined(__clang__))
30 | 
31 | #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
32 | #define CUB_EXEC_CHECK_DISABLE __pragma("nv_exec_check_disable")
33 | #else // // !MSVC
34 | #define CUB_EXEC_CHECK_DISABLE _Pragma("nv_exec_check_disable")
35 | #endif // MSVC
36 | 
37 | #else // !NVCC
38 | 
39 | #define CUB_EXEC_CHECK_DISABLE
40 | 
41 | #endif // NVCC
42 | 


--------------------------------------------------------------------------------
/cub/detail/type_traits.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | /**
29 |  * \file
30 |  * Wrappers and extensions around <type_traits> utilities.
31 |  */
32 | 
33 | #pragma once
34 | 
35 | #include <cub/util_cpp_dialect.cuh>
36 | #include <cub/util_namespace.cuh>
37 | 
38 | #include <cuda/std/type_traits>
39 | 
40 | 
41 | CUB_NAMESPACE_BEGIN
42 | namespace detail {
43 | 
44 | template <typename Invokable, typename... Args>
45 | using invoke_result_t =
46 | #if CUB_CPP_DIALECT < 2017
47 |   typename ::cuda::std::result_of<Invokable(Args...)>::type;
48 | #else // 2017+
49 |   ::cuda::std::invoke_result_t<Invokable, Args...>;
50 | #endif
51 | 
52 | /// The type of intermediate accumulator (according to P2322R6)
53 | template <typename Invokable, typename InitT, typename InputT>
54 | using accumulator_t = 
55 |   typename ::cuda::std::decay<invoke_result_t<Invokable, InitT, InputT>>::type;
56 | 
57 | } // namespace detail
58 | CUB_NAMESPACE_END
59 | 


--------------------------------------------------------------------------------
/cub/detail/uninitialized_copy.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
16 |  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
18 |  * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | #pragma once
29 | 
30 | #include <cub/config.cuh>
31 | 
32 | #include <cuda/std/type_traits>
33 | 
34 | CUB_NAMESPACE_BEGIN
35 | 
36 | 
37 | namespace detail
38 | {
39 | 
40 | #if defined(_NVHPC_CUDA)
41 | template <typename T, typename U>
42 | __host__ __device__ void uninitialized_copy(T *ptr, U &&val)
43 | {
44 |   // NVBug 3384810
45 |   new (ptr) T(::cuda::std::forward<U>(val));
46 | }
47 | #else
48 | template <typename T,
49 |           typename U,
50 |           typename ::cuda::std::enable_if<
51 |             ::cuda::std::is_trivially_copyable<T>::value, 
52 |             int
53 |           >::type = 0>
54 | __host__ __device__ void uninitialized_copy(T *ptr, U &&val)
55 | {
56 |   *ptr = ::cuda::std::forward<U>(val);
57 | }
58 | 
59 | template <typename T, 
60 |          typename U,
61 |          typename ::cuda::std::enable_if<
62 |            !::cuda::std::is_trivially_copyable<T>::value,
63 |            int
64 |          >::type = 0>
65 | __host__ __device__ void uninitialized_copy(T *ptr, U &&val)
66 | {
67 |   new (ptr) T(::cuda::std::forward<U>(val));
68 | }
69 | #endif
70 | 
71 | } // namespace detail
72 | 
73 | 
74 | CUB_NAMESPACE_END
75 | 
76 | 


--------------------------------------------------------------------------------
/cub/grid/grid_mapping.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  4 |  *
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  *
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /**
 30 |  * \file
 31 |  * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
 32 |  */
 33 | 
 34 | #pragma once
 35 | 
 36 | #include "../config.cuh"
 37 | 
 38 | CUB_NAMESPACE_BEGIN
 39 | 
 40 | 
 41 | /**
 42 |  * \addtogroup GridModule
 43 |  * @{
 44 |  */
 45 | 
 46 | 
 47 | /******************************************************************************
 48 |  * Mapping policies
 49 |  *****************************************************************************/
 50 | 
 51 | 
 52 | /**
 53 |  * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
 54 |  */
 55 | enum GridMappingStrategy
 56 | {
 57 |     /**
 58 |      * \brief An a "raking" access pattern in which each thread block is
 59 |      * assigned a consecutive sequence of input tiles
 60 |      *
 61 |      * \par Overview
 62 |      * The input is evenly partitioned into \p p segments, where \p p is
 63 |      * constant and corresponds loosely to the number of thread blocks that may
 64 |      * actively reside on the target device. Each segment is comprised of
 65 |      * consecutive tiles, where a tile is a small, constant-sized unit of input
 66 |      * to be processed to completion before the thread block terminates or
 67 |      * obtains more work.  The kernel invokes \p p thread blocks, each
 68 |      * of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements
 69 |      * in tile-size increments.
 70 |      */
 71 |     GRID_MAPPING_RAKE,
 72 | 
 73 |     /**
 74 |      * \brief An a "strip mining" access pattern in which the input tiles assigned
 75 |      * to each thread block are separated by a stride equal to the the extent of
 76 |      * the grid.
 77 |      *
 78 |      * \par Overview
 79 |      * The input is evenly partitioned into \p p sets, where \p p is
 80 |      * constant and corresponds loosely to the number of thread blocks that may
 81 |      * actively reside on the target device. Each set is comprised of
 82 |      * data tiles separated by stride \p tiles, where a tile is a small,
 83 |      * constant-sized unit of input to be processed to completion before the
 84 |      * thread block terminates or obtains more work.  The kernel invokes \p p
 85 |      * thread blocks, each of which iteratively consumes a segment of
 86 |      * <em>n</em>/<em>p</em> elements in tile-size increments.
 87 |      */
 88 |     GRID_MAPPING_STRIP_MINE,
 89 | 
 90 |     /**
 91 |      * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
 92 |      *
 93 |      * \par Overview
 94 |      * The input is treated as a queue to be dynamically consumed by a grid of
 95 |      * thread blocks.  Work is atomically dequeued in tiles, where a tile is a
 96 |      * unit of input to be processed to completion before the thread block
 97 |      * terminates or obtains more work.  The grid size \p p is constant,
 98 |      * loosely corresponding to the number of thread blocks that may actively
 99 |      * reside on the target device.
100 |      */
101 |     GRID_MAPPING_DYNAMIC,
102 | };
103 | 
104 | 
105 | /** @} */       // end group GridModule
106 | 
107 | CUB_NAMESPACE_END
108 | 
109 | 


--------------------------------------------------------------------------------
/cub/host/mutex.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 3 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
 4 |  *
 5 |  * Redistribution and use in source and binary forms, with or without
 6 |  * modification, are permitted provided that the following conditions are met:
 7 |  *     * Redistributions of source code must retain the above copyright
 8 |  *       notice, this list of conditions and the following disclaimer.
 9 |  *     * Redistributions in binary form must reproduce the above copyright
10 |  *       notice, this list of conditions and the following disclaimer in the
11 |  *       documentation and/or other materials provided with the distribution.
12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
13 |  *       names of its contributors may be used to endorse or promote products
14 |  *       derived from this software without specific prior written permission.
15 |  *
16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  *
27 |  ******************************************************************************/
28 | 
29 | /**
30 |  * \file
31 |  * Simple portable mutex
32 |  */
33 | 
34 | #pragma once
35 | 
36 | #include <mutex>
37 | 
38 | #include <cub/config.cuh>
39 | #include <cub/util_deprecated.cuh>
40 | 
41 | 
42 | CUB_NAMESPACE_BEGIN
43 | 
44 | 
45 | /**
46 |  * Wraps std::mutex 
47 |  * @deprecated [Since CUB 2.1.0] The `cub::Mutex` is deprecated and will be removed 
48 |  *             in a future release. Use `std::mutex` instead.
49 |  */
50 | struct CUB_DEPRECATED Mutex
51 | {
52 |     std::mutex mtx;
53 | 
54 |     void Lock()
55 |     {
56 |         mtx.lock();
57 |     }
58 | 
59 |     void Unlock()
60 |     {
61 |         mtx.unlock();
62 |     }
63 | };
64 | 
65 | 
66 | CUB_NAMESPACE_END
67 | 


--------------------------------------------------------------------------------
/cub/iterator/tex_ref_input_iterator.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  4 |  *
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  *
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /**
 30 |  * \file
 31 |  * Random-access iterator types
 32 |  */
 33 | 
 34 | #pragma once
 35 | 
 36 | #include <cub/config.cuh>
 37 | #include <cub/iterator/tex_obj_input_iterator.cuh>
 38 | 
 39 | #include <cstddef>
 40 | 
 41 | CUB_NAMESPACE_BEGIN
 42 | 
 43 | /**
 44 |  * \addtogroup UtilIterator
 45 |  * @{
 46 |  */
 47 | 
 48 | /**
 49 |  * \brief A random-access input wrapper for dereferencing array values through texture cache.
 50 |  *
 51 |  * \deprecated [Since 1.13.0] The CUDA texture management APIs used by
 52 |  * TexRefInputIterator are deprecated. Use cub::TexObjInputIterator instead.
 53 |  *
 54 |  * \par Overview
 55 |  * - TexRefInputIterator wraps a native device pointer of type <tt>ValueType*</tt>. References
 56 |  *   to elements are to be loaded through texture cache.
 57 |  * - Can be used to load any data type from memory through texture cache.
 58 |  * - Can be manipulated and exchanged within and between host and device
 59 |  *   functions, can only be constructed within host functions, and can only be
 60 |  *   dereferenced within device functions.
 61 |  * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture
 62 |  *   reference.  Only one TexRefInputIterator instance can be bound at any given time for a
 63 |  *   specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host
 64 |  *   thread, and (4) compilation .o unit.
 65 |  * - With regard to nested/dynamic parallelism, TexRefInputIterator iterators may only be
 66 |  *   created by the host thread and used by a top-level kernel (i.e. the one which is launched
 67 |  *   from the host).
 68 |  * - Compatible with Thrust API v1.7 or newer.
 69 |  *
 70 |  * \par Snippet
 71 |  * The code snippet below illustrates the use of \p TexRefInputIterator to
 72 |  * dereference a device array of doubles through texture cache.
 73 |  * \par
 74 |  * \code
 75 |  * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_ref_input_iterator.cuh>
 76 |  *
 77 |  * // Declare, allocate, and initialize a device array
 78 |  * int num_items;   // e.g., 7
 79 |  * double *d_in;    // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
 80 |  *
 81 |  * // Create an iterator wrapper
 82 |  * cub::TexRefInputIterator<double, __LINE__> itr;
 83 |  * itr.BindTexture(d_in, sizeof(double) * num_items);
 84 |  * ...
 85 |  *
 86 |  * // Within device code:
 87 |  * printf("%f\n", itr[0]);      // 8.0
 88 |  * printf("%f\n", itr[1]);      // 6.0
 89 |  * printf("%f\n", itr[6]);      // 9.0
 90 |  *
 91 |  * ...
 92 |  * itr.UnbindTexture();
 93 |  *
 94 |  * \endcode
 95 |  *
 96 |  * \tparam T                    The value type of this iterator
 97 |  * \tparam UNIQUE_ID            A globally-unique identifier (within the compilation unit) to name the underlying texture reference
 98 |  * \tparam OffsetT              The difference type of this iterator (Default: \p ptrdiff_t)
 99 |  */
100 | template <
101 |     typename    T,
102 |     int         /*UNIQUE_ID*/,
103 |     typename    OffsetT = std::ptrdiff_t>
104 | using TexRefInputIterator CUB_DEPRECATED = cub::TexObjInputIterator<T, OffsetT>;
105 | 
106 | /** @} */       // end group UtilIterator
107 | 
108 | CUB_NAMESPACE_END
109 | 


--------------------------------------------------------------------------------
/cub/thread/thread_sort.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | #pragma once
 29 | 
 30 | #include "../config.cuh"
 31 | #include "../util_ptx.cuh"
 32 | #include "../util_type.cuh"
 33 | 
 34 | CUB_NAMESPACE_BEGIN
 35 | 
 36 | 
 37 | template <typename T>
 38 | __device__ __forceinline__ void Swap(T &lhs, T &rhs)
 39 | {
 40 |   T temp = lhs;
 41 |   lhs    = rhs;
 42 |   rhs    = temp;
 43 | }
 44 | 
 45 | 
 46 | /**
 47 |  * @brief Sorts data using odd-even sort method
 48 |  *
 49 |  * The sorting method is stable. Further details can be found in:
 50 |  * A. Nico Habermann. Parallel neighbor sort (or the glory of the induction
 51 |  * principle). Technical Report AD-759 248, Carnegie Mellon University, 1972.
 52 |  *
 53 |  * @tparam KeyT
 54 |  *   Key type
 55 |  *
 56 |  * @tparam ValueT
 57 |  *   Value type. If `cub::NullType` is used as `ValueT`, only keys are sorted.
 58 |  *
 59 |  * @tparam CompareOp
 60 |  *   functor type having member `bool operator()(KeyT lhs, KeyT rhs)`
 61 |  *
 62 |  * @tparam ITEMS_PER_THREAD
 63 |  *   The number of items per thread
 64 |  *
 65 |  * @param[in,out] keys
 66 |  *   Keys to sort
 67 |  *
 68 |  * @param[in,out] items
 69 |  *   Values to sort
 70 |  *
 71 |  * @param[in] compare_op
 72 |  *   Comparison function object which returns true if the first argument is
 73 |  *   ordered before the second
 74 |  */
 75 | template <typename KeyT,
 76 |           typename ValueT,
 77 |           typename CompareOp,
 78 |           int ITEMS_PER_THREAD>
 79 | __device__ __forceinline__ void
 80 | StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD],
 81 |                   ValueT (&items)[ITEMS_PER_THREAD],
 82 |                   CompareOp compare_op)
 83 | {
 84 |   constexpr bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
 85 | 
 86 |   #pragma unroll
 87 |   for (int i = 0; i < ITEMS_PER_THREAD; ++i)
 88 |   {
 89 |   #pragma unroll
 90 |     for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
 91 |     {
 92 |       if (compare_op(keys[j + 1], keys[j]))
 93 |       {
 94 |         Swap(keys[j], keys[j + 1]);
 95 |         if (!KEYS_ONLY)
 96 |         {
 97 |           Swap(items[j], items[j + 1]);
 98 |         }
 99 |       }
100 |     } // inner loop
101 |   }   // outer loop
102 | }
103 | 
104 | 
105 | CUB_NAMESPACE_END
106 | 


--------------------------------------------------------------------------------
/cub/util_compiler.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 |  *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | /**
29 |  * \file
30 |  * Detect compiler information.
31 |  */
32 | 
33 | #pragma once
34 | 
35 | // enumerate host compilers we know about
36 | #define CUB_HOST_COMPILER_UNKNOWN 0
37 | #define CUB_HOST_COMPILER_MSVC 1
38 | #define CUB_HOST_COMPILER_GCC 2
39 | #define CUB_HOST_COMPILER_CLANG 3
40 | 
41 | // enumerate device compilers we know about
42 | #define CUB_DEVICE_COMPILER_UNKNOWN 0
43 | #define CUB_DEVICE_COMPILER_MSVC 1
44 | #define CUB_DEVICE_COMPILER_GCC 2
45 | #define CUB_DEVICE_COMPILER_NVCC 3
46 | #define CUB_DEVICE_COMPILER_CLANG 4
47 | 
48 | // figure out which host compiler we're using
49 | #if defined(_MSC_VER)
50 | #  define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC
51 | #  define CUB_MSVC_VERSION _MSC_VER
52 | #  define CUB_MSVC_VERSION_FULL _MSC_FULL_VER
53 | #elif defined(__clang__)
54 | #  define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG
55 | #  define CUB_CLANG_VERSION                                                    \
56 |     (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
57 | #elif defined(__GNUC__)
58 | #  define CUB_HOST_COMPILER CUB_HOST_COMPILER_GCC
59 | #  define CUB_GCC_VERSION                                                      \
60 |     (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
61 | #else
62 | #  define CUB_HOST_COMPILER CUB_HOST_COMPILER_UNKNOWN
63 | #endif // CUB_HOST_COMPILER
64 | 
65 | // figure out which device compiler we're using
66 | #if defined(__CUDACC__) || defined(_NVHPC_CUDA)
67 | #  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC
68 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
69 | #  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC
70 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC
71 | #  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_GCC
72 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG
73 | // CUDA-capable clang should behave similar to NVCC.
74 | #  if defined(__CUDA__)
75 | #    define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC
76 | #  else
77 | #    define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_CLANG
78 | #  endif
79 | #else
80 | #  define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_UNKNOWN
81 | #endif
82 | 


--------------------------------------------------------------------------------
/cub/util_deprecated.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 |  *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | /**
29 |  * \file
30 |  * Define CUB_DEPRECATED macro.
31 |  */
32 | 
33 | #pragma once
34 | 
35 | 
36 | #include <cub/detail/type_traits.cuh>
37 | #include <cub/util_compiler.cuh>
38 | #include <cub/util_cpp_dialect.cuh>
39 | #include <cub/util_debug.cuh>
40 | 
41 | 
42 | #if defined(THRUST_IGNORE_DEPRECATED_API) && !defined(CUB_IGNORE_DEPRECATED_API)
43 | #  define CUB_IGNORE_DEPRECATED_API
44 | #endif
45 | 
46 | #ifdef CUB_IGNORE_DEPRECATED_API
47 | #  define CUB_DEPRECATED
48 | #  define CUB_DEPRECATED_BECAUSE(MSG)
49 | #elif CUB_CPP_DIALECT >= 2014
50 | #  define CUB_DEPRECATED [[deprecated]]
51 | #  define CUB_DEPRECATED_BECAUSE(MSG) [[deprecated(MSG)]]
52 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
53 | #  define CUB_DEPRECATED __declspec(deprecated)
54 | #  define CUB_DEPRECATED_BECAUSE(MSG) __declspec(deprecated(MSG))
55 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG
56 | #  define CUB_DEPRECATED __attribute__((deprecated))
57 | #  define CUB_DEPRECATED_BECAUSE(MSG) __attribute__((deprecated(MSG)))
58 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC
59 | #  define CUB_DEPRECATED __attribute__((deprecated))
60 | #  define CUB_DEPRECATED_BECAUSE(MSG) __attribute__((deprecated(MSG)))
61 | #else
62 | #  define CUB_DEPRECATED
63 | #  define CUB_DEPRECATED_BECAUSE(MSG)
64 | #endif
65 | 
66 | #define CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED                         \
67 |   CUB_DEPRECATED_BECAUSE(                                                      \
68 |     "CUB no longer accepts `debug_synchronous` parameter. "                    \
69 |     "Define CUB_DEBUG_SYNC instead, or silence this message with "             \
70 |     "CUB_IGNORE_DEPRECATED_API.")
71 | 
72 | #define CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG                                \
73 |   if (debug_synchronous)                                                       \
74 |   {                                                                            \
75 |     _CubLog("%s\n",                                                            \
76 |             "CUB no longer accepts `debug_synchronous` parameter. "            \
77 |             "Define CUB_DEBUG_SYNC instead.");                                 \
78 |   }
79 | 
80 | 


--------------------------------------------------------------------------------
/cub/util_macro.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  3 |  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
  4 |  * 
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  *     * Redistributions of source code must retain the above copyright
  8 |  *       notice, this list of conditions and the following disclaimer.
  9 |  *     * Redistributions in binary form must reproduce the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer in the
 11 |  *       documentation and/or other materials provided with the distribution.
 12 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 13 |  *       names of its contributors may be used to endorse or promote products
 14 |  *       derived from this software without specific prior written permission.
 15 |  * 
 16 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 17 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 18 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 19 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 20 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 21 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 22 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 23 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 25 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 |  *
 27 |  ******************************************************************************/
 28 | 
 29 | /******************************************************************************
 30 |  * Common C/C++ macro utilities
 31 |  ******************************************************************************/
 32 | 
 33 | #pragma once
 34 | 
 35 | #include <cuda/std/utility>
 36 | 
 37 | #include "util_namespace.cuh"
 38 | 
 39 | CUB_NAMESPACE_BEGIN
 40 | 
 41 | 
 42 | /**
 43 |  * \addtogroup UtilModule
 44 |  * @{
 45 |  */
 46 | 
 47 | #ifndef CUB_ALIGN
 48 |     #if defined(_WIN32) || defined(_WIN64)
 49 |         /// Align struct
 50 |         #define CUB_ALIGN(bytes) __declspec(align(32))
 51 |     #else
 52 |         /// Align struct
 53 |         #define CUB_ALIGN(bytes) __attribute__((aligned(bytes)))
 54 |     #endif
 55 | #endif
 56 | 
 57 | #define CUB_PREVENT_MACRO_SUBSTITUTION
 58 | 
 59 | template <typename T, typename U>
 60 | constexpr __host__ __device__ auto min CUB_PREVENT_MACRO_SUBSTITUTION(T &&t,
 61 |                                                                       U &&u)
 62 |   -> decltype(t < u ? ::cuda::std::forward<T>(t) : ::cuda::std::forward<U>(u))
 63 | {
 64 |   return t < u ? ::cuda::std::forward<T>(t) : ::cuda::std::forward<U>(u);
 65 | }
 66 | 
 67 | template <typename T, typename U>
 68 | constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t,
 69 |                                                                       U &&u)
 70 |   -> decltype(t < u ? ::cuda::std::forward<U>(u) : ::cuda::std::forward<T>(t))
 71 | {
 72 |   return t < u ? ::cuda::std::forward<U>(u) : ::cuda::std::forward<T>(t);
 73 | }
 74 | 
 75 | #ifndef CUB_MAX
 76 |     /// Select maximum(a, b)
 77 |     #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a))
 78 | #endif
 79 | 
 80 | #ifndef CUB_MIN
 81 |     /// Select minimum(a, b)
 82 |     #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a))
 83 | #endif
 84 | 
 85 | #ifndef CUB_QUOTIENT_FLOOR
 86 |     /// Quotient of x/y rounded down to nearest integer
 87 |     #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y))
 88 | #endif
 89 | 
 90 | #ifndef CUB_QUOTIENT_CEILING
 91 |     /// Quotient of x/y rounded up to nearest integer
 92 |     #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
 93 | #endif
 94 | 
 95 | #ifndef CUB_ROUND_UP_NEAREST
 96 |     /// x rounded up to the nearest multiple of y
 97 |     #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y)
 98 | #endif
 99 | 
100 | #ifndef CUB_ROUND_DOWN_NEAREST
101 |     /// x rounded down to the nearest multiple of y
102 |     #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y)
103 | #endif
104 | 
105 | 
106 | #ifndef CUB_STATIC_ASSERT
107 |     #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
108 |         #define CUB_CAT_(a, b) a ## b
109 |         #define CUB_CAT(a, b) CUB_CAT_(a, b)
110 |     #endif // DOXYGEN_SHOULD_SKIP_THIS
111 | 
112 |     /// Static assert
113 |     #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1]
114 | #endif
115 | 
116 | /** @} */       // end group UtilModule
117 | 
118 | CUB_NAMESPACE_END
119 | 


--------------------------------------------------------------------------------
/cub/util_math.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 16 |  *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 |  *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | /**
 29 |  * \file
 30 |  * Define helper math functions.
 31 |  */
 32 | 
 33 | #pragma once
 34 | 
 35 | #include <type_traits>
 36 | 
 37 | #include "util_namespace.cuh"
 38 | #include "util_macro.cuh"
 39 | 
 40 | CUB_NAMESPACE_BEGIN
 41 | 
 42 | namespace detail
 43 | {
 44 | 
 45 | template <typename T>
 46 | using is_integral_or_enum =
 47 |   std::integral_constant<bool,
 48 |                          std::is_integral<T>::value || std::is_enum<T>::value>;
 49 | 
 50 | __host__ __device__ __forceinline__ constexpr  std::size_t
 51 | VshmemSize(std::size_t max_shmem,
 52 |            std::size_t shmem_per_block,
 53 |            std::size_t num_blocks)
 54 | {
 55 |   return shmem_per_block > max_shmem ? shmem_per_block * num_blocks : 0;
 56 | }
 57 | 
 58 | }
 59 | 
 60 | /**
 61 |  * Divide n by d, round up if any remainder, and return the result.
 62 |  *
 63 |  * Effectively performs `(n + d - 1) / d`, but is robust against the case where
 64 |  * `(n + d - 1)` would overflow.
 65 |  */
 66 | template <typename NumeratorT, typename DenominatorT>
 67 | __host__ __device__ __forceinline__ constexpr NumeratorT
 68 | DivideAndRoundUp(NumeratorT n, DenominatorT d)
 69 | {
 70 |   static_assert(cub::detail::is_integral_or_enum<NumeratorT>::value &&
 71 |                 cub::detail::is_integral_or_enum<DenominatorT>::value,
 72 |                 "DivideAndRoundUp is only intended for integral types.");
 73 | 
 74 |   // Static cast to undo integral promotion.
 75 |   return static_cast<NumeratorT>(n / d + (n % d != 0 ? 1 : 0));
 76 | }
 77 | 
 78 | constexpr __device__ __host__ int
 79 | Nominal4BItemsToItemsCombined(int nominal_4b_items_per_thread, int combined_bytes)
 80 | {
 81 |   return (cub::min)(nominal_4b_items_per_thread,
 82 |                     (cub::max)(1,
 83 |                                nominal_4b_items_per_thread * 8 /
 84 |                                combined_bytes));
 85 | }
 86 | 
 87 | template <typename T>
 88 | constexpr __device__ __host__ int
 89 | Nominal4BItemsToItems(int nominal_4b_items_per_thread)
 90 | {
 91 |   return (cub::min)(nominal_4b_items_per_thread,
 92 |                     (cub::max)(1,
 93 |                                nominal_4b_items_per_thread * 4 /
 94 |                                  static_cast<int>(sizeof(T))));
 95 | }
 96 | 
 97 | template <typename ItemT>
 98 | constexpr __device__ __host__ int
 99 | Nominal8BItemsToItems(int nominal_8b_items_per_thread)
100 | {
101 |   return sizeof(ItemT) <= 8u
102 |            ? nominal_8b_items_per_thread
103 |            : (cub::min)(nominal_8b_items_per_thread,
104 |                         (cub::max)(1,
105 |                                    ((nominal_8b_items_per_thread * 8) +
106 |                                     static_cast<int>(sizeof(ItemT)) - 1) /
107 |                                      static_cast<int>(sizeof(ItemT))));
108 | }
109 | 
110 | /**
111 |  * \brief Computes the midpoint of the integers
112 |  *
113 |  * Extra operation is performed in order to prevent overflow.
114 |  *
115 |  * \return Half the sum of \p begin and \p end
116 |  */
117 | template <typename T>
118 | constexpr __device__ __host__ T MidPoint(T begin, T end)
119 | {
120 |   return begin + (end - begin) / 2;
121 | }
122 | 
123 | CUB_NAMESPACE_END
124 | 


--------------------------------------------------------------------------------
/cub/version.cuh:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | /*! \file version.cuh
29 |  *  \brief Compile-time macros encoding CUB release version
30 |  *
31 |  *         <cub/version.h> is the only CUB header that is guaranteed to
32 |  *         change with every CUB release.
33 |  *
34 |  */
35 | 
36 | #pragma once
37 | 
38 | /*! \def CUB_VERSION
39 |  *  \brief The preprocessor macro \p CUB_VERSION encodes the version
40 |  *         number of the CUB library.
41 |  *
42 |  *         <tt>CUB_VERSION % 100</tt> is the sub-minor version.
43 |  *         <tt>CUB_VERSION / 100 % 1000</tt> is the minor version.
44 |  *         <tt>CUB_VERSION / 100000</tt> is the major version.
45 |  */
46 | #define CUB_VERSION 200200
47 | 
48 | /*! \def CUB_MAJOR_VERSION
49 |  *  \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the
50 |  *         major version number of the CUB library.
51 |  */
52 | #define CUB_MAJOR_VERSION     (CUB_VERSION / 100000)
53 | 
54 | /*! \def CUB_MINOR_VERSION
55 |  *  \brief The preprocessor macro \p CUB_MINOR_VERSION encodes the
56 |  *         minor version number of the CUB library.
57 |  */
58 | #define CUB_MINOR_VERSION     (CUB_VERSION / 100 % 1000)
59 | 
60 | /*! \def CUB_SUBMINOR_VERSION
61 |  *  \brief The preprocessor macro \p CUB_SUBMINOR_VERSION encodes the
62 |  *         sub-minor version number of the CUB library.
63 |  */
64 | #define CUB_SUBMINOR_VERSION  (CUB_VERSION % 100)
65 | 
66 | /*! \def CUB_PATCH_NUMBER
67 |  *  \brief The preprocessor macro \p CUB_PATCH_NUMBER encodes the
68 |  *         patch number of the CUB library.
69 |  */
70 | #define CUB_PATCH_NUMBER 0
71 | 


--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | _build
2 | _repo
3 | api
4 | *png
5 | 


--------------------------------------------------------------------------------
/docs/VERSION.md:
--------------------------------------------------------------------------------
1 | 104.0


--------------------------------------------------------------------------------
/docs/deps/repo-deps.packman.xml:
--------------------------------------------------------------------------------
1 | <project toolsVersion="5.0">
2 |   <dependency name="repo_man" linkPath="../_repo/deps/repo_man">
3 |     <package name="repo_man" version="1.10.1"/>
4 |   </dependency>
5 |   <dependency name="repo_docs" linkPath="../_repo/deps/repo_docs">
6 |     <package name="repo_docs" version="0.10.4"/>
7 |   </dependency>
8 | </project>
9 | 


--------------------------------------------------------------------------------
/docs/gen_docs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/env bash
 2 | 
 3 | 
 4 | mkdir -p img
 5 | 
 6 | if [ ! -n "$(find img -name '*.png')" ]; then
 7 |     wget -q https://docs.nvidia.com/cuda/_static/Logo_and_CUDA.png -O img/logo.png
 8 | 
 9 |     # Parse files and collects unique names ending with .png
10 |     imgs=$(grep -R -o -h '[[:alpha:][:digit:]_]*.png' ../cub)
11 |     imgs="${imgs}\ncub_overview.png\nnested_composition.png\ntile.png\nblocked.png\nstriped.png"
12 | 
13 |     for img in $(echo -e ${imgs} | sort | uniq)
14 |     do 
15 |         echo ${img}
16 |         wget -q https://nvlabs.github.io/cub/${img} -O img/${img}
17 |     done  
18 | fi
19 | 
20 | ./repo.sh docs
21 | 


--------------------------------------------------------------------------------
/docs/repo.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | 
 3 | call "%~dp0tools\packman\python.bat" %~dp0tools\repoman\repoman.py %*
 4 | if %errorlevel% neq 0 ( goto Error )
 5 | 
 6 | :Success
 7 | exit /b 0
 8 | 
 9 | :Error
10 | exit /b %errorlevel%
11 | 


--------------------------------------------------------------------------------
/docs/repo.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -e
4 | 
5 | SCRIPT_DIR=$(dirname ${BASH_SOURCE})
6 | cd "$SCRIPT_DIR"
7 | 
8 | exec "tools/packman/python.sh" tools/repoman/repoman.py $@
9 | 


--------------------------------------------------------------------------------
/docs/tools/packman/bootstrap/download_file_from_url.ps1:
--------------------------------------------------------------------------------
 1 | <#
 2 | Copyright 2019 NVIDIA CORPORATION
 3 | 
 4 | Licensed under the Apache License, Version 2.0 (the "License");
 5 | you may not use this file except in compliance with the License.
 6 | You may obtain a copy of the License at
 7 | 
 8 |     http://www.apache.org/licenses/LICENSE-2.0
 9 | 
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | #>
16 | 
17 | param(
18 | [Parameter(Mandatory=$true)][string]$source=$null,
19 | [string]$output="out.exe"
20 | )
21 | $filename = $output
22 | 
23 | $triesLeft = 4
24 | $delay = 2
25 | do
26 | {
27 |     $triesLeft -= 1
28 | 
29 |     try
30 |     {
31 |         Write-Host "Downloading from bootstrap.packman.nvidia.com ..."
32 |         $wc = New-Object net.webclient
33 |         $wc.Downloadfile($source, $fileName)
34 |         exit 0
35 |     }
36 |     catch
37 |     {
38 |         Write-Host "Error downloading $source!"
39 |         Write-Host $_.Exception|format-list -force
40 |         if ($triesLeft)
41 |         {
42 |             Write-Host "Retrying in $delay seconds ..."
43 |             Start-Sleep -seconds $delay
44 |         }
45 |         $delay = $delay * $delay
46 |     }
47 | } while ($triesLeft -gt 0)
48 | # We only get here if the retries have been exhausted, remove any left-overs:
49 | if (Test-Path $fileName)
50 | {
51 |     Remove-Item $fileName
52 | }
53 | exit 1


--------------------------------------------------------------------------------
/docs/tools/packman/bootstrap/fetch_file_from_packman_bootstrap.cmd:
--------------------------------------------------------------------------------
 1 | :: Copyright 2019 NVIDIA CORPORATION
 2 | ::
 3 | :: Licensed under the Apache License, Version 2.0 (the "License");
 4 | :: you may not use this file except in compliance with the License.
 5 | :: You may obtain a copy of the License at
 6 | ::
 7 | ::    http://www.apache.org/licenses/LICENSE-2.0
 8 | ::
 9 | :: Unless required by applicable law or agreed to in writing, software
10 | :: distributed under the License is distributed on an "AS IS" BASIS,
11 | :: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | :: See the License for the specific language governing permissions and
13 | :: limitations under the License.
14 | 
15 | :: You need to specify <package-name> <target-path> as input to this command
16 | @setlocal
17 | @set PACKAGE_NAME=%1
18 | @set TARGET_PATH=%2
19 | 
20 | @echo Fetching %PACKAGE_NAME% ...
21 | 
22 | @powershell -ExecutionPolicy ByPass -NoLogo -NoProfile -File "%~dp0download_file_from_url.ps1" ^
23 |     -source "http://bootstrap.packman.nvidia.com/%PACKAGE_NAME%" -output %TARGET_PATH%
24 | :: A bug in powershell prevents the errorlevel code from being set when using the -File execution option
25 | :: We must therefore do our own failure analysis, basically make sure the file exists:
26 | @if not exist %TARGET_PATH% goto ERROR_DOWNLOAD_FAILED
27 | 
28 | @endlocal
29 | @exit /b 0
30 | 
31 | :ERROR_DOWNLOAD_FAILED
32 | @echo Failed to download file from S3
33 | @echo Most likely because endpoint cannot be reached or file %PACKAGE_NAME% doesn't exist
34 | @endlocal
35 | @exit /b 1


--------------------------------------------------------------------------------
/docs/tools/packman/config.packman.xml:
--------------------------------------------------------------------------------
1 | <config remotes="cloudfront">
2 |     <remote2 name="cloudfront">
3 |         <transport actions="download" protocol="https" packageLocation="d4i3qtqj3r0z5.cloudfront.net/${name}@${version}" />
4 |     </remote2>
5 | </config>
6 | 


--------------------------------------------------------------------------------
/docs/tools/packman/packman.cmd:
--------------------------------------------------------------------------------
 1 | :: Reset errorlevel status (don't inherit from caller) [xxxxxxxxxxx]
 2 | @call :ECHO_AND_RESET_ERROR
 3 | :: You can remove the call below if you do your own manual configuration of the dev machines
 4 | call "%~dp0\bootstrap\configure.bat"
 5 | 
 6 | if %errorlevel% neq 0 ( exit /b %errorlevel% )
 7 | :: Everything below is mandatory
 8 | if not defined PM_PYTHON goto :PYTHON_ENV_ERROR
 9 | if not defined PM_MODULE goto :MODULE_ENV_ERROR
10 | 
11 | :: Generate temporary path for variable file
12 | for /f "delims=" %%a in ('powershell -ExecutionPolicy ByPass -NoLogo -NoProfile ^
13 | -File "%~dp0bootstrap\generate_temp_file_name.ps1"') do set PM_VAR_PATH=%%a
14 | 
15 | if %1.==. (
16 | 	set PM_VAR_PATH_ARG=
17 | ) else (
18 | 	set PM_VAR_PATH_ARG=--var-path="%PM_VAR_PATH%"
19 | )
20 | 
21 | "%PM_PYTHON%" -S -s -u -E "%PM_MODULE%" %* %PM_VAR_PATH_ARG%
22 | if %errorlevel% neq 0 ( exit /b %errorlevel% )
23 | 
24 | :: Marshall environment variables into the current environment if they have been generated and remove temporary file
25 | if exist "%PM_VAR_PATH%" (
26 | 	for /F "usebackq tokens=*" %%A in ("%PM_VAR_PATH%") do set "%%A"
27 | )
28 | if %errorlevel% neq 0 ( goto :VAR_ERROR )
29 | 
30 | if exist "%PM_VAR_PATH%" (
31 | 	del /F "%PM_VAR_PATH%"
32 | )
33 | if %errorlevel% neq 0 ( goto :VAR_ERROR )
34 | 
35 | set PM_VAR_PATH=
36 | goto :eof
37 | 
38 | :: Subroutines below
39 | :PYTHON_ENV_ERROR
40 | @echo User environment variable PM_PYTHON is not set! Please configure machine for packman or call configure.bat.
41 | exit /b 1
42 | 
43 | :MODULE_ENV_ERROR
44 | @echo User environment variable PM_MODULE is not set! Please configure machine for packman or call configure.bat.
45 | exit /b 1
46 | 
47 | :VAR_ERROR
48 | @echo Error while processing and setting environment variables!
49 | exit /b 1
50 | 
51 | :ECHO_AND_RESET_ERROR
52 | @echo off
53 | if /I "%PM_VERBOSITY%"=="debug" (
54 | 	@echo on
55 | )
56 | exit /b 0
57 | 


--------------------------------------------------------------------------------
/docs/tools/packman/packmanconf.py:
--------------------------------------------------------------------------------
  1 | # Use this file to bootstrap packman into your Python environment (3.7.x). Simply
  2 | # add the path by doing sys.insert to where packmanconf.py is located and then execute:
  3 | #
  4 | # >>> import packmanconf
  5 | # >>> packmanconf.init()
  6 | #
  7 | # It will use the configured remote(s) and the version of packman in the same folder,
  8 | # giving you full access to the packman API via the following module
  9 | #
 10 | # >> import packmanapi
 11 | # >> dir(packmanapi)
 12 | 
 13 | import os
 14 | import platform
 15 | import sys
 16 | 
 17 | 
 18 | def init():
 19 |     """Call this function to initialize the packman configuration.
 20 | 
 21 |     Calls to the packman API will work after successfully calling this function.
 22 | 
 23 |     Note:
 24 |         This function only needs to be called once during the execution of your
 25 |         program. Calling it repeatedly is harmless but wasteful.
 26 |         Compatibility with your Python interpreter is checked and upon failure
 27 |         the function will report what is required.
 28 | 
 29 |     Example:
 30 |         >>> import packmanconf
 31 |         >>> packmanconf.init()
 32 |         >>> import packmanapi
 33 |         >>> packmanapi.set_verbosity_level(packmanapi.VERBOSITY_HIGH)
 34 |     """
 35 |     major = sys.version_info[0]
 36 |     minor = sys.version_info[1]
 37 |     if major != 3 or minor != 7:
 38 |         raise RuntimeError(
 39 |             f"This version of packman requires Python 3.7.x, but {major}.{minor} was provided"
 40 |         )
 41 |     conf_dir = os.path.dirname(os.path.abspath(__file__))
 42 |     os.environ["PM_INSTALL_PATH"] = conf_dir
 43 |     packages_root = get_packages_root(conf_dir)
 44 |     version = get_version(conf_dir)
 45 |     module_dir = get_module_dir(conf_dir, packages_root, version)
 46 |     sys.path.insert(1, module_dir)
 47 | 
 48 | 
 49 | def get_packages_root(conf_dir: str) -> str:
 50 |     root = os.getenv("PM_PACKAGES_ROOT")
 51 |     if not root:
 52 |         platform_name = platform.system()
 53 |         if platform_name == "Windows":
 54 |             drive, _ = os.path.splitdrive(conf_dir)
 55 |             root = os.path.join(drive, "packman-repo")
 56 |         elif platform_name == "Darwin":
 57 |             # macOS
 58 |             root = "/Library/Caches/packman"
 59 |         elif platform_name == "Linux":
 60 |             root = "/var/tmp/packman"
 61 |         else:
 62 |             raise RuntimeError(f"Unsupported platform '{platform_name}'")
 63 |     # make sure the path exists:
 64 |     os.makedirs(root, exist_ok=True)
 65 |     return root
 66 | 
 67 | 
 68 | def get_module_dir(conf_dir, packages_root: str, version: str) -> str:
 69 |     module_dir = os.path.join(packages_root, "packman-common", version)
 70 |     if not os.path.exists(module_dir):
 71 |         import tempfile
 72 | 
 73 |         tf = tempfile.NamedTemporaryFile(delete=False)
 74 |         target_name = tf.name
 75 |         tf.close()
 76 |         url = f"http://bootstrap.packman.nvidia.com/packman-common@{version}.zip"
 77 |         print(f"Downloading '{url}' ...")
 78 |         import urllib.request
 79 | 
 80 |         urllib.request.urlretrieve(url, target_name)
 81 |         from importlib.machinery import SourceFileLoader
 82 | 
 83 |         # import module from path provided
 84 |         script_path = os.path.join(conf_dir, "bootstrap", "install_package.py")
 85 |         ip = SourceFileLoader("install_package", script_path).load_module()
 86 |         print("Unpacking ...")
 87 |         ip.install_package(target_name, module_dir)
 88 |         os.unlink(tf.name)
 89 |     return module_dir
 90 | 
 91 | 
 92 | def get_version(conf_dir: str):
 93 |     path = os.path.join(conf_dir, "packman")
 94 |     if not os.path.exists(path):  # in dev repo fallback
 95 |         path += ".sh"
 96 |     with open(path, "rt", encoding="utf8") as launch_file:
 97 |         for line in launch_file.readlines():
 98 |             if line.startswith("PM_PACKMAN_VERSION"):
 99 |                 _, value = line.split("=")
100 |                 return value.strip()
101 |     raise RuntimeError(f"Unable to find 'PM_PACKMAN_VERSION' in '{path}'")
102 | 


--------------------------------------------------------------------------------
/docs/tools/packman/python.bat:
--------------------------------------------------------------------------------
 1 | :: Copyright 2019-2020 NVIDIA CORPORATION
 2 | ::
 3 | :: Licensed under the Apache License, Version 2.0 (the "License");
 4 | :: you may not use this file except in compliance with the License.
 5 | :: You may obtain a copy of the License at
 6 | ::
 7 | ::    http://www.apache.org/licenses/LICENSE-2.0
 8 | ::
 9 | :: Unless required by applicable law or agreed to in writing, software
10 | :: distributed under the License is distributed on an "AS IS" BASIS,
11 | :: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | :: See the License for the specific language governing permissions and
13 | :: limitations under the License.
14 | 
15 | @echo off
16 | setlocal
17 | 
18 | call "%~dp0\packman" init
19 | set "PYTHONPATH=%PM_MODULE_DIR%;%PYTHONPATH%"
20 | set PYTHONNOUSERSITE=1
21 | "%PM_PYTHON%" -u %*
22 | 


--------------------------------------------------------------------------------
/docs/tools/packman/python.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Copyright 2019-2020 NVIDIA CORPORATION
 4 | 
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | 
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | 
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | set -e
18 | 
19 | PACKMAN_CMD="$(dirname "${BASH_SOURCE}")/packman"
20 | if [ ! -f "$PACKMAN_CMD" ]; then
21 |     PACKMAN_CMD="${PACKMAN_CMD}.sh"
22 | fi
23 | source "$PACKMAN_CMD" init
24 | export PYTHONPATH="${PM_MODULE_DIR}:${PYTHONPATH}"
25 | export PYTHONNOUSERSITE=1
26 | 
27 | # workaround for our python not shipping with certs
28 | if [[ -z ${SSL_CERT_DIR:-} ]]; then
29 |     export SSL_CERT_DIR=/etc/ssl/certs/
30 | fi
31 | 
32 | "${PM_PYTHON}" -u "$@"
33 | 


--------------------------------------------------------------------------------
/docs/tools/repoman/omni/repo/format/.gitignore:
--------------------------------------------------------------------------------
1 | ﻿# Dummy omni.repo.format Python module so we don't have to pull down the format package.
2 | 
3 | # Ignore everything in this directory, except this file to ensure the folder is created.
4 | *
5 | !.gitignore


--------------------------------------------------------------------------------
/docs/tools/repoman/repoman.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import io
 4 | import contextlib
 5 | import packmanapi
 6 | 
 7 | REPO_ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../..")
 8 | REPO_DEPS_FILE = os.path.join(REPO_ROOT, "deps/repo-deps.packman.xml")
 9 | 
10 | 
11 | def bootstrap():
12 |     """
13 |     Bootstrap all omni.repo modules.
14 | 
15 |     Pull with packman from repo.packman.xml and add them all to python sys.path to enable importing.
16 |     """
17 |     #with contextlib.redirect_stdout(io.StringIO()):
18 |     deps = packmanapi.pull(REPO_DEPS_FILE)
19 |     for dep_path in deps.values():
20 |         if dep_path not in sys.path:
21 |             sys.path.append(dep_path)
22 | 
23 | 
24 | if __name__ == "__main__":
25 |     bootstrap()
26 |     import omni.repo.man
27 | 
28 |     omni.repo.man.main(REPO_ROOT)
29 | 


--------------------------------------------------------------------------------
/examples/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Create meta targets that build all examples for a single configuration:
 2 | foreach(cub_target IN LISTS CUB_TARGETS)
 3 |   cub_get_target_property(config_prefix ${cub_target} PREFIX)
 4 |   set(config_meta_target ${config_prefix}.examples)
 5 |   add_custom_target(${config_meta_target})
 6 |   add_dependencies(${config_prefix}.all ${config_meta_target})
 7 | endforeach()
 8 | 
 9 | # Update flags to reflect RDC options. See note in CubCudaConfig.cmake --
10 | # these flag variables behave unintuitively:
11 | if (CUB_ENABLE_EXAMPLES_WITH_RDC)
12 |   set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_RDC}")
13 | else()
14 |   set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_NO_RDC}")
15 | endif()
16 | 
17 | ## cub_add_example
18 | #
19 | # Add an example executable and register it with ctest.
20 | #
21 | # target_name_var: Variable name to overwrite with the name of the example
22 | #   target. Useful for post-processing target information per-backend.
23 | # example_name: The name of the example minus "<config_prefix>.example." For
24 | #   instance, examples/vector.cu will be "vector", and examples/cuda/copy.cu
25 | #   would be "cuda.copy".
26 | # example_src: The source file that implements the example.
27 | # cub_target: The reference cub target with configuration information.
28 | #
29 | function(cub_add_example target_name_var example_name example_src cub_target)
30 |   cub_get_target_property(config_prefix ${cub_target} PREFIX)
31 | 
32 |   # The actual name of the test's target:
33 |   set(example_target ${config_prefix}.example.${example_name})
34 |   set(${target_name_var} ${example_target} PARENT_SCOPE)
35 | 
36 |   # Related target names:
37 |   set(config_meta_target ${config_prefix}.examples)
38 |   set(example_meta_target cub.all.example.${example_name})
39 | 
40 |   add_executable(${example_target} "${example_src}")
41 |   target_link_libraries(${example_target} ${cub_target})
42 |   cub_clone_target_properties(${example_target} ${cub_target})
43 |   target_include_directories(${example_target} PRIVATE "${CUB_SOURCE_DIR}/examples")
44 | 
45 |   if (CUB_IN_THRUST)
46 |     thrust_fix_clang_nvcc_build_for(${example_target})
47 |   endif()
48 | 
49 |   # Add to the active configuration's meta target
50 |   add_dependencies(${config_meta_target} ${example_target})
51 | 
52 |   # Meta target that builds examples with this name for all configurations:
53 |   if (NOT TARGET ${example_meta_target})
54 |     add_custom_target(${example_meta_target})
55 |   endif()
56 |   add_dependencies(${example_meta_target} ${example_target})
57 | 
58 |   if (CUB_ENABLE_EXAMPLES_WITH_RDC)
59 |     cub_enable_rdc_for_cuda_target(${example_target})
60 |   endif()
61 | 
62 |   add_test(NAME ${example_target}
63 |     COMMAND "$<TARGET_FILE:${example_target}>"
64 |   )
65 | endfunction()
66 | 
67 | add_subdirectory(cmake)
68 | add_subdirectory(block)
69 | add_subdirectory(device)
70 | 


--------------------------------------------------------------------------------
/examples/block/.gitignore:
--------------------------------------------------------------------------------
1 | /bin
2 | /Debug
3 | /Release
4 | /cuda55.sdf
5 | /cuda55.suo
6 | /cuda60.sdf
7 | /cuda60.suo
8 | 


--------------------------------------------------------------------------------
/examples/block/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | file(GLOB_RECURSE example_srcs
 2 |   RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
 3 |   CONFIGURE_DEPENDS
 4 |   example_*.cu
 5 | )
 6 | 
 7 | foreach (cub_target IN LISTS CUB_TARGETS)
 8 |   foreach (example_src IN LISTS example_srcs)
 9 |     get_filename_component(example_name "${example_src}" NAME_WE)
10 |     string(REGEX REPLACE
11 |       "^example_block_" "block."
12 |       example_name "${example_name}"
13 |     )
14 |     cub_add_example(target_name ${example_name} "${example_src}" ${cub_target})
15 |   endforeach()
16 | endforeach()
17 | 


--------------------------------------------------------------------------------
/examples/cmake/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_test(
 2 |   NAME cub.example.cmake.add_subdir
 3 |   COMMAND "${CMAKE_COMMAND}"
 4 |     --log-level=VERBOSE
 5 |     -G "${CMAKE_GENERATOR}"
 6 |     -S "${CMAKE_CURRENT_SOURCE_DIR}/add_subdir"
 7 |     -B "${CMAKE_CURRENT_BINARY_DIR}/add_subdir"
 8 |     -D "CUB_ROOT=${CUB_SOURCE_DIR}"
 9 |     -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}"
10 |     -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
11 | )
12 | 


--------------------------------------------------------------------------------
/examples/cmake/add_subdir/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # This example demonstrates / tests adding CUB via a CMake add_subdirectory
 2 | # call from a parent project.
 3 | 
 4 | cmake_minimum_required(VERSION 3.15)
 5 | 
 6 | # Silence warnings about empty CUDA_ARCHITECTURES properties on example targets:
 7 | if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
 8 |   cmake_policy(SET CMP0104 OLD)
 9 | endif()
10 | 
11 | project(CubAddSubDirExample CUDA)
12 | 
13 | # Use your project's checkout of CUB here, for most cases
14 | # `add_subdirectory(cub)` will be sufficient.
15 | add_subdirectory("${CUB_ROOT}" cub)
16 | 
17 | # Link the CUB::CUB target to your project's targets
18 | add_executable(HelloCUB dummy.cu)
19 | target_link_libraries(HelloCUB CUB::CUB)
20 | 
21 | #
22 | # Validation
23 | #
24 | 
25 | function(assert_target target_name)
26 |   if (NOT TARGET "${target_name}")
27 |     message(FATAL_ERROR "Target '${target_name}' not defined.")
28 |   endif()
29 | endfunction()
30 | 
31 | assert_target(CUB::CUB)
32 | assert_target(HelloCUB)
33 | 


--------------------------------------------------------------------------------
/examples/cmake/add_subdir/dummy.cu:
--------------------------------------------------------------------------------
1 | #include <cub/config.cuh>
2 | 
3 | #include <iostream>
4 | 
5 | int main()
6 | {
7 |   std::cout << "Hello from CUB version " << CUB_VERSION << ":\n";
8 | }
9 | 


--------------------------------------------------------------------------------
/examples/device/.gitignore:
--------------------------------------------------------------------------------
1 | /bin
2 | /Debug
3 | /ipch
4 | /Release
5 | /cuda55.sdf
6 | /cuda55.suo
7 | /cuda60.sdf
8 | /cuda60.suo
9 | 


--------------------------------------------------------------------------------
/examples/device/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | file(GLOB_RECURSE example_srcs
 2 |   RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
 3 |   CONFIGURE_DEPENDS
 4 |   example_*.cu
 5 | )
 6 | 
 7 | foreach (cub_target IN LISTS CUB_TARGETS)
 8 |   foreach (example_src IN LISTS example_srcs)
 9 |     get_filename_component(example_name "${example_src}" NAME_WE)
10 |     string(REGEX REPLACE
11 |       "^example_device_" "device."
12 |       example_name "${example_name}"
13 |     )
14 |     cub_add_example(target_name ${example_name} "${example_src}" ${cub_target})
15 |   endforeach()
16 | endforeach()
17 | 


--------------------------------------------------------------------------------
/examples/device/example_device_decoupled_look_back.cu:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2011-2023, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | #include <cub/device/device_scan.cuh>
 29 | 
 30 | #include <thrust/device_vector.h>
 31 | 
 32 | #include <iostream>
 33 | 
 34 | template <class ScanTileStateT>
 35 | __global__ void init_kernel(ScanTileStateT tile_state, int blocks_in_grid)
 36 | {
 37 |   tile_state.InitializeStatus(blocks_in_grid);
 38 | }
 39 | 
 40 | template <class MessageT>
 41 | __global__ void decoupled_look_back_kernel(cub::ScanTileState<MessageT> tile_state)
 42 | {
 43 |   using scan_op_t         = cub::Sum;
 44 |   using scan_tile_state_t = cub::ScanTileState<MessageT>;
 45 |   using tile_prefix_op    = cub::TilePrefixCallbackOp<MessageT, scan_op_t, scan_tile_state_t>;
 46 |   using temp_storage_t    = typename tile_prefix_op::TempStorage;
 47 | 
 48 |   // Allocate temp storage in shared memory
 49 |   __shared__ temp_storage_t temp_storage;
 50 | 
 51 |   scan_op_t scan_op{};
 52 |   const unsigned int threads_in_warp = 32;
 53 |   const unsigned int tid             = threadIdx.x;
 54 | 
 55 |   // Construct prefix op
 56 |   tile_prefix_op prefix(tile_state, temp_storage, scan_op);
 57 |   const unsigned int tile_idx = prefix.GetTileIdx();
 58 | 
 59 |   // Compute block aggregate
 60 |   MessageT block_aggregate = blockIdx.x;
 61 | 
 62 |   if (tile_idx == 0)
 63 |   {
 64 |     // There are no blocks to look back to, immediately set the inclusive state
 65 |     if (tid == 0)
 66 |     {
 67 |       tile_state.SetInclusive(tile_idx, block_aggregate);
 68 |       printf("tile %d: inclusive = %d\n", tile_idx, block_aggregate);
 69 |     }
 70 |   }
 71 |   else
 72 |   {
 73 |     // Only the first warp in the block can perform the look back
 74 |     const unsigned int warp_id = tid / threads_in_warp;
 75 | 
 76 |     if (warp_id == 0)
 77 |     {
 78 |       // Perform the decoupled look-back
 79 |       // Invocation of the prefix will block until the look-back is complete.
 80 |       MessageT exclusive_prefix = prefix(block_aggregate);
 81 | 
 82 |       if (tid == 0)
 83 |       {
 84 |         MessageT inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
 85 |         printf("tile %d: exclusive = %d inclusive = %d\n",
 86 |                tile_idx,
 87 |                exclusive_prefix,
 88 |                inclusive_prefix);
 89 |       }
 90 |     }
 91 |   }
 92 | }
 93 | 
 94 | template <class MessageT>
 95 | void decoupled_look_back_example(int blocks_in_grid)
 96 | {
 97 |   using scan_tile_state_t = cub::ScanTileState<MessageT>;
 98 | 
 99 |   // Query temporary storage requirements
100 |   std::size_t temp_storage_bytes{};
101 |   scan_tile_state_t::AllocationSize(blocks_in_grid, temp_storage_bytes);
102 | 
103 |   // Allocate temporary storage
104 |   thrust::device_vector<std::uint8_t> temp_storage(temp_storage_bytes);
105 |   std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data());
106 | 
107 |   // Initialize temporary storage
108 |   scan_tile_state_t tile_status;
109 |   tile_status.Init(blocks_in_grid, d_temp_storage, temp_storage_bytes);
110 |   const unsigned int threads_in_init_block = 256;
111 |   const unsigned int blocks_in_init_grid   = cub::DivideAndRoundUp(blocks_in_grid,
112 |                                                                  threads_in_init_block);
113 |   init_kernel<<<blocks_in_init_grid, threads_in_init_block>>>(tile_status, blocks_in_grid);
114 | 
115 |   // Launch decoupled look-back
116 |   const unsigned int threads_in_block = 256;
117 |   decoupled_look_back_kernel<<<blocks_in_grid, threads_in_block>>>(tile_status);
118 | 
119 |   // Wait for kernel to finish
120 |   cudaDeviceSynchronize();
121 | }
122 | 
123 | int main() { decoupled_look_back_example<int>(14); }
124 | 


--------------------------------------------------------------------------------
/test/.gitignore:
--------------------------------------------------------------------------------
1 | /bin
2 | /link_main.obj
3 | /dummy/
4 | 


--------------------------------------------------------------------------------
/test/README.md:
--------------------------------------------------------------------------------
  1 | # Test Parametrization
  2 | 
  3 | Some of CUB's tests are very slow to build and are capable of exhausting RAM
  4 | during compilation/linking. To avoid such issues, large tests are split into
  5 | multiple executables to take advantage of parallel computation and reduce memory
  6 | usage.
  7 | 
  8 | CUB facilitates this by checking for special `%PARAM%` comments in each test's
  9 | source code, and then uses this information to generate multiple executables
 10 | with different configurations.
 11 | 
 12 | ## Using `%PARAM%`
 13 | 
 14 | The `%PARAM%` hint provides an automated method of generating multiple test
 15 | executables from a single source file. To use it, add one or more special
 16 | comments to the test source file:
 17 | 
 18 | ```cpp
 19 | // %PARAM% [definition] [label] [values]
 20 | ```
 21 | 
 22 | CMake will parse the source file and extract these comments, using them to
 23 | generate multiple test executables for the full cartesian product of values.
 24 | 
 25 | - `definition` will be used as a preprocessor definition name. By convention,
 26 |   these begin with `TEST_`.
 27 | - `label` is a short, human-readable label that will be used in the test
 28 |   executable's name to identify the test variant.
 29 | - `values` is a colon-separated list of values used during test generation. Only
 30 |   numeric values have been tested.
 31 | 
 32 | ## Special Labels
 33 | 
 34 | ### CDP / RDC Testing
 35 | 
 36 | If a `label` is `cdp`, it is assumed that the parameter is used to explicitly
 37 | test variants built with and without CDP support. The `values` for such a
 38 | parameter must be `0:1`, with `0` indicating CDP disabled (RDC off) and `1`
 39 | indicating CDP enabled (RDC on).
 40 | 
 41 | Tests that do not contain a variant labeled `cdp` will only enable RDC if
 42 | the CMake variable `CUB_ENABLE_TESTS_WITH_RDC` is true.
 43 | 
 44 | ## Example
 45 | 
 46 | For example, if `test_baz.cu` contains the following lines:
 47 | 
 48 | ```cpp
 49 | // %PARAM% TEST_FOO foo 0:1:2
 50 | // %PARAM% TEST_CDP cdp 0:1
 51 | ```
 52 | 
 53 | Six executables and CTest targets will be generated with unique definitions
 54 | (only c++17 targets shown):
 55 | 
 56 | | Executable Name                  | Preprocessor Definitions    | RDC State |
 57 | |----------------------------------|-----------------------------|-----------|
 58 | | `cub.cpp17.test.baz.foo_0.cdp_0` | `-DTEST_FOO=0 -DTEST_CDP=0` | Disabled  |
 59 | | `cub.cpp17.test.baz.foo_0.cdp_1` | `-DTEST_FOO=0 -DTEST_CDP=1` | Enabled   |
 60 | | `cub.cpp17.test.baz.foo_1.cdp_0` | `-DTEST_FOO=1 -DTEST_CDP=0` | Disabled  |
 61 | | `cub.cpp17.test.baz.foo_1.cdp_1` | `-DTEST_FOO=1 -DTEST_CDP=1` | Enabled   |
 62 | | `cub.cpp17.test.baz.foo_2.cdp_0` | `-DTEST_FOO=2 -DTEST_CDP=0` | Disabled  |
 63 | | `cub.cpp17.test.baz.foo_2.cdp_1` | `-DTEST_FOO=2 -DTEST_CDP=1` | Enabled   |
 64 | 
 65 | ## Changing `%PARAM%` Hints
 66 | 
 67 | Since CMake does not automatically reconfigure the build when source files are
 68 | modified, CMake will need to be rerun manually whenever the `%PARAM%` comments
 69 | change.
 70 | 
 71 | ## Building and Running Split Tests
 72 | 
 73 | CMake will generate individual build and test targets for each test variant, and
 74 | also provides build "metatargets" that compile all variants of a given test.
 75 | 
 76 | The variants follow the usual naming convention for CUB's tests, but include a
 77 | suffix that differentiates them (e.g. `.foo_X.bar_Y` in the example above).
 78 | 
 79 | ### Individual Test Variants
 80 | 
 81 | Continuing with the `test_baz.cu` example, the test variant that uses
 82 | `-DTEST_FOO=1 -DTEST_BAR=4` can be built and run alone:
 83 | 
 84 | ```bash
 85 | # Build a single variant:
 86 | make cub.cpp17.test.baz.foo_1.bar_4
 87 | 
 88 | # Run a single variant
 89 | bin/cub.cpp17.test.baz.foo_1.bar_4
 90 | 
 91 | # Run a single variant using CTest regex:
 92 | ctest -R cub\.cpp17\.test\.baz\.foo_1\.bar_4
 93 | ```
 94 | 
 95 | ### All Variants of a Test
 96 | 
 97 | Using a metatarget and the proper regex, all variants of a test can be built and
 98 | executed without listing all variants explicitly:
 99 | 
100 | ```bash
101 | # Build all variants using the `.all` metatarget
102 | make cub.cpp17.test.baz.all
103 | 
104 | # Run all variants:
105 | ctest -R cub\.cpp17\.test\.baz\.
106 | ```
107 | 
108 | ## Debugging
109 | 
110 | Running CMake with `--log-level=VERBOSE` will print out extra information about
111 | all detected test variants.
112 | 
113 | ## Additional Info
114 | 
115 | Ideally, only parameters that directly influence kernel template instantiations
116 | should be split out in this way. If changing a parameter doesn't change the
117 | kernel template type, the same kernel will be compiled into multiple
118 | executables. This defeats the purpose of splitting up the test since the
119 | compiler will generate redundant code across the new split executables.
120 | 
121 | The best candidate parameters for splitting are input value types, rather than
122 | integral parameters like BLOCK_THREADS, etc. Splitting by value type allows more
123 | infrastructure (data generation, validation) to be reused. Splitting other
124 | parameters can cause build times to increase since type-related infrastructure
125 | has to be rebuilt for each test variant.
126 | 


--------------------------------------------------------------------------------
/test/c2h/generators.cuh:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 | * Copyright (c) 2011-2022, NVIDIA CORPORATION.  All rights reserved.
  3 | *
  4 | * Redistribution and use in source and binary forms, with or without
  5 | * modification, are permitted provided that the following conditions are met:
  6 | *     * Redistributions of source code must retain the above copyright
  7 | *       notice, this list of conditions and the following disclaimer.
  8 | *     * Redistributions in binary form must reproduce the above copyright
  9 | *       notice, this list of conditions and the following disclaimer in the
 10 | *       documentation and/or other materials provided with the distribution.
 11 | *     * Neither the name of the NVIDIA CORPORATION nor the
 12 | *       names of its contributors may be used to endorse or promote products
 13 | *       derived from this software without specific prior written permission.
 14 | *
 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 | *
 26 | ******************************************************************************/
 27 | 
 28 | #pragma once
 29 | 
 30 | #include <limits>
 31 | 
 32 | #include <thrust/device_vector.h>
 33 | 
 34 | #include <c2h/custom_type.cuh>
 35 | 
 36 | namespace c2h
 37 | {
 38 | 
 39 | namespace detail
 40 | {
 41 | 
 42 | template <class T>
 43 | class value_wrapper_t
 44 | {
 45 |   T m_val{};
 46 | 
 47 | public:
 48 |   explicit value_wrapper_t(T val) : m_val(val) {}
 49 |   explicit value_wrapper_t(int val) : m_val(static_cast<T>(val)) {}
 50 |   T get() const { return m_val; }
 51 | };
 52 | 
 53 | }
 54 | 
 55 | class seed_t : public detail::value_wrapper_t<unsigned long long int> 
 56 | {
 57 |   using value_wrapper_t::value_wrapper_t;
 58 | };
 59 | 
 60 | class modulo_t : public detail::value_wrapper_t<std::size_t> 
 61 | {
 62 |   using value_wrapper_t::value_wrapper_t;
 63 | };
 64 | 
 65 | namespace detail
 66 | {
 67 |   
 68 | void gen(seed_t seed,
 69 |          char* data,
 70 |          c2h::custom_type_state_t min,
 71 |          c2h::custom_type_state_t max,
 72 |          std::size_t elements,
 73 |          std::size_t element_size);
 74 | 
 75 | }
 76 | 
 77 | template <template <typename> class... Ps>
 78 | void gen(
 79 |   seed_t seed,
 80 |   thrust::device_vector<c2h::custom_type_t<Ps...>> &data,
 81 |   c2h::custom_type_t<Ps...> min = std::numeric_limits<c2h::custom_type_t<Ps...>>::lowest(),
 82 |   c2h::custom_type_t<Ps...> max = std::numeric_limits<c2h::custom_type_t<Ps...>>::max())
 83 | {
 84 |   detail::gen(
 85 |       seed, 
 86 |       reinterpret_cast<char*>(thrust::raw_pointer_cast(data.data())),
 87 |       min,
 88 |       max,
 89 |       data.size(),
 90 |       sizeof(c2h::custom_type_t<Ps...>));
 91 | }
 92 | 
 93 | template <typename T>
 94 | void gen(seed_t seed,
 95 |          thrust::device_vector<T> &data,
 96 |          T min = std::numeric_limits<T>::min(),
 97 |          T max = std::numeric_limits<T>::max());
 98 | 
 99 | template <typename T>
100 | void gen(modulo_t mod, thrust::device_vector<T> &data);
101 | 
102 | } // c2h
103 | 
104 | 


--------------------------------------------------------------------------------
/test/catch2_runner.cu:
--------------------------------------------------------------------------------
1 | #define CUB_CONFIG_MAIN
2 | #include "catch2_test_helper.h"
3 | 
4 | 


--------------------------------------------------------------------------------
/test/catch2_test_printing.cu:
--------------------------------------------------------------------------------
 1 | #include "test_util.h"
 2 | 
 3 | #include "catch2_test_helper.h"
 4 | 
 5 | template <typename T>
 6 | std::string print(T val) 
 7 | {
 8 |   std::stringstream ss;
 9 |   ss << val;
10 |   return ss.str();
11 | }
12 | 
13 | #if CUB_IS_INT128_ENABLED
14 | TEST_CASE("Test utils can print __int128", "[test][utils]")
15 | {
16 |   REQUIRE( print(__int128_t{0}) == "0" );
17 |   REQUIRE( print(__int128_t{42}) == "42" );
18 |   REQUIRE( print(__int128_t{-1}) == "-1" );
19 |   REQUIRE( print(__int128_t{-42}) == "-42" );
20 |   REQUIRE( print(-1 * (__int128_t{1} << 120)) == "-1329227995784915872903807060280344576" );
21 | }
22 | 
23 | TEST_CASE("Test utils can print __uint128", "[test][utils]")
24 | {
25 |   REQUIRE( print(__uint128_t{0}) == "0" );
26 |   REQUIRE( print(__uint128_t{1}) == "1" );
27 |   REQUIRE( print(__uint128_t{42}) == "42" );
28 |   REQUIRE( print(__uint128_t{1} << 120) == "1329227995784915872903807060280344576" );
29 | }
30 | #endif
31 | 
32 | TEST_CASE("Test utils can print KeyValuePair", "[test][utils]")
33 | {
34 |   REQUIRE( print(cub::KeyValuePair<int, int>{42, -42}) == "(42,-42)" );
35 | }
36 | 
37 | 


--------------------------------------------------------------------------------
/test/catch2_test_util_type.cu:
--------------------------------------------------------------------------------
 1 | /******************************************************************************
 2 |  * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
 3 |  *
 4 |  * Redistribution and use in source and binary forms, with or without
 5 |  * modification, are permitted provided that the following conditions are met:
 6 |  *     * Redistributions of source code must retain the above copyright
 7 |  *       notice, this list of conditions and the following disclaimer.
 8 |  *     * Redistributions in binary form must reproduce the above copyright
 9 |  *       notice, this list of conditions and the following disclaimer in the
10 |  *       documentation and/or other materials provided with the distribution.
11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
12 |  *       names of its contributors may be used to endorse or promote products
13 |  *       derived from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 |  *
26 |  ******************************************************************************/
27 | 
28 | #include <cub/iterator/counting_input_iterator.cuh>
29 | #include <cub/iterator/discard_output_iterator.cuh>
30 | #include <cub/util_type.cuh>
31 | 
32 | #include <cuda/std/type_traits>
33 | 
34 | // Has to go after all cub headers. Otherwise, this test won't catch unused
35 | // variables in cub kernels.
36 | #include "catch2_test_helper.h"
37 | 
38 | CUB_TEST("Tests non_void_value_t", "[util][type]")
39 | {
40 |   using fallback_t        = float;
41 |   using void_fancy_it     = cub::DiscardOutputIterator<std::size_t>;
42 |   using non_void_fancy_it = cub::CountingInputIterator<int>;
43 | 
44 |   // falls back for const void*
45 |   STATIC_REQUIRE(
46 |     ::cuda::std::is_same<fallback_t, //
47 |                          cub::detail::non_void_value_t<const void *, fallback_t>>::value);
48 |   // falls back for const volatile void*
49 |   STATIC_REQUIRE(
50 |     ::cuda::std::is_same<fallback_t, //
51 |                          cub::detail::non_void_value_t<const volatile void *, fallback_t>>::value);
52 |   // falls back for volatile void*
53 |   STATIC_REQUIRE(
54 |     ::cuda::std::is_same<fallback_t, //
55 |                          cub::detail::non_void_value_t<volatile void *, fallback_t>>::value);
56 |   // falls back for void*
57 |   STATIC_REQUIRE(::cuda::std::is_same<fallback_t, //
58 |                                       cub::detail::non_void_value_t<void *, fallback_t>>::value);
59 |   // works for int*
60 |   STATIC_REQUIRE(::cuda::std::is_same<int, //
61 |                                       cub::detail::non_void_value_t<int *, void>>::value);
62 |   // falls back for fancy iterator with a void value type
63 |   STATIC_REQUIRE(
64 |     ::cuda::std::is_same<fallback_t, //
65 |                          cub::detail::non_void_value_t<void_fancy_it, fallback_t>>::value);
66 |   // works for a fancy iterator that has int as value type
67 |   STATIC_REQUIRE(
68 |     ::cuda::std::is_same<int, //
69 |                          cub::detail::non_void_value_t<non_void_fancy_it, fallback_t>>::value);
70 | }
71 | 


--------------------------------------------------------------------------------
/test/catch2_test_warp_mask.cu:
--------------------------------------------------------------------------------
  1 | /******************************************************************************
  2 |  * Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
  3 |  *
  4 |  * Redistribution and use in source and binary forms, with or without
  5 |  * modification, are permitted provided that the following conditions are met:
  6 |  *     * Redistributions of source code must retain the above copyright
  7 |  *       notice, this list of conditions and the following disclaimer.
  8 |  *     * Redistributions in binary form must reproduce the above copyright
  9 |  *       notice, this list of conditions and the following disclaimer in the
 10 |  *       documentation and/or other materials provided with the distribution.
 11 |  *     * Neither the name of the NVIDIA CORPORATION nor the
 12 |  *       names of its contributors may be used to endorse or promote products
 13 |  *       derived from this software without specific prior written permission.
 14 |  *
 15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 16 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 17 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 18 |  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 19 |  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 20 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 21 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 22 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 24 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 25 |  *
 26 |  ******************************************************************************/
 27 | 
 28 | #include <cub/util_ptx.cuh>
 29 | 
 30 | // Has to go after all cub headers. Otherwise, this test won't catch unused
 31 | // variables in cub kernels.
 32 | #include "catch2_test_helper.h"
 33 | 
 34 | template <int logical_warp_threads>
 35 | struct total_warps_t
 36 | {
 37 | private:
 38 |   static constexpr unsigned int total_warps = (cub::PowerOfTwo<logical_warp_threads>::VALUE)
 39 |                                                 ? CUB_WARP_THREADS(0) / logical_warp_threads
 40 |                                                 : 1;
 41 | 
 42 | public:
 43 |   static constexpr unsigned int value() { return total_warps; }
 44 | };
 45 | 
 46 | bool is_lane_involved(unsigned int member_mask, unsigned int lane)
 47 | {
 48 |   return member_mask & (1 << lane);
 49 | }
 50 | 
 51 | using logical_warp_threads      = c2h::iota<1, 32>;
 52 | using power_of_two_warp_threads = c2h::enum_type_list<int, 1, 2, 4, 8, 16, 32>;
 53 | 
 54 | CUB_TEST("Warp mask ignores lanes before current logical warp",
 55 |          "[mask][warp]",
 56 |          power_of_two_warp_threads)
 57 | {
 58 |   constexpr int logical_warp_thread  = c2h::get<0, TestType>::value;
 59 |   constexpr unsigned int total_warps = total_warps_t<logical_warp_thread>::value();
 60 | 
 61 |   for (unsigned int warp_id = 0; warp_id < total_warps; warp_id++)
 62 |   {
 63 |     const unsigned int warp_mask  = cub::WarpMask<logical_warp_thread>(warp_id);
 64 |     const unsigned int warp_begin = logical_warp_thread * warp_id;
 65 | 
 66 |     for (unsigned int prev_warp_lane = 0; prev_warp_lane < warp_begin; prev_warp_lane++)
 67 |     {
 68 |       REQUIRE_FALSE(is_lane_involved(warp_mask, prev_warp_lane));
 69 |     }
 70 |   }
 71 | }
 72 | 
 73 | CUB_TEST("Warp mask involves lanes of current logical warp", "[mask][warp]", logical_warp_threads)
 74 | {
 75 |   constexpr int logical_warp_thread  = c2h::get<0, TestType>::value;
 76 |   constexpr unsigned int total_warps = total_warps_t<logical_warp_thread>::value();
 77 | 
 78 |   for (unsigned int warp_id = 0; warp_id < total_warps; warp_id++)
 79 |   {
 80 |     const unsigned int warp_mask  = cub::WarpMask<logical_warp_thread>(warp_id);
 81 |     const unsigned int warp_begin = logical_warp_thread * warp_id;
 82 |     const unsigned int warp_end   = warp_begin + logical_warp_thread;
 83 | 
 84 |     for (unsigned int warp_lane = warp_begin; warp_lane < warp_end; warp_lane++)
 85 |     {
 86 |       REQUIRE(is_lane_involved(warp_mask, warp_lane));
 87 |     }
 88 |   }
 89 | }
 90 | 
 91 | CUB_TEST("Warp mask ignores lanes after current logical warp", "[mask][warp]", logical_warp_threads)
 92 | {
 93 |   constexpr int logical_warp_thread  = c2h::get<0, TestType>::value;
 94 |   constexpr unsigned int total_warps = total_warps_t<logical_warp_thread>::value();
 95 | 
 96 |   for (unsigned int warp_id = 0; warp_id < total_warps; warp_id++)
 97 |   {
 98 |     const unsigned int warp_mask  = cub::WarpMask<logical_warp_thread>(warp_id);
 99 |     const unsigned int warp_begin = logical_warp_thread * warp_id;
100 |     const unsigned int warp_end   = warp_begin + logical_warp_thread;
101 | 
102 |     for (unsigned int post_warp_lane = warp_end; post_warp_lane < CUB_WARP_THREADS(0);
103 |          post_warp_lane++)
104 |     {
105 |       REQUIRE_FALSE(is_lane_involved(warp_mask, post_warp_lane));
106 |     }
107 |   }
108 | }
109 | 


--------------------------------------------------------------------------------
/test/cmake/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | if (NOT CUB_IN_THRUST) # Thrust has its own checks for this:
 2 |   # Test that we can use `find_package` on an installed CUB:
 3 |   add_test(
 4 |     NAME cub.test.cmake.test_install
 5 |     COMMAND "${CMAKE_COMMAND}"
 6 |       --log-level=VERBOSE
 7 |       -G "${CMAKE_GENERATOR}"
 8 |       -S "${CMAKE_CURRENT_SOURCE_DIR}/test_install"
 9 |       -B "${CMAKE_CURRENT_BINARY_DIR}/test_install"
10 |       -D "CUB_BINARY_DIR=${CUB_BINARY_DIR}"
11 |       -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
12 |       -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}"
13 |       -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
14 |   )
15 | endif()
16 | 
17 | # Check source code for issues that can be found by pattern matching:
18 | add_test(
19 |   NAME cub.test.cmake.check_source_files
20 |   COMMAND
21 |     "${CMAKE_COMMAND}"
22 |       -D "CUB_SOURCE_DIR=${CUB_SOURCE_DIR}"
23 |       -P "${CMAKE_CURRENT_LIST_DIR}/check_source_files.cmake"
24 | )
25 | 


--------------------------------------------------------------------------------
/test/cmake/test_install/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Test that an installation of the project can be located by find_package() call
 2 | # with appropriate prefix settings.
 3 | #
 4 | # Expects CUB_BINARY_DIR to be set to an existing cub build directory.
 5 | 
 6 | cmake_minimum_required(VERSION 3.15)
 7 | 
 8 | project(CubTestInstall CXX CUDA)
 9 | 
10 | # This will eventually get deleted recursively -- keep that in mind if modifying
11 | set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/install_prefix/")
12 | 
13 | function(do_manual_install)
14 |   # Inspired by the VTK-m install tests, we can just glob up all of the
15 |   # cmake_install.cmake, include (ie. run) them, and they'll effectively
16 |   # install the project into the current value of CMAKE_INSTALL_PREFIX.
17 | 
18 |   # Gather all of the install files from CUB's root:
19 |   file(GLOB_RECURSE install_files
20 |     LIST_DIRECTORIES False
21 |     "${CUB_BINARY_DIR}/cmake_install.cmake"
22 |   )
23 | 
24 |   message(STATUS "Locating install files...")
25 |   foreach (install_file IN LISTS install_files)
26 |     message(STATUS "  * ${install_file}")
27 |   endforeach()
28 | 
29 |   message(STATUS "Building install tree...")
30 |   foreach(install_file IN LISTS install_files)
31 |     include("${install_file}")
32 |   endforeach()
33 | endfunction()
34 | 
35 | function(do_cleanup)
36 |   message(STATUS "Removing ${CMAKE_INSTALL_PREFIX}")
37 |   file(REMOVE_RECURSE "${CMAKE_INSTALL_PREFIX}")
38 | endfunction()
39 | 
40 | function(assert_boolean var_name expect)
41 |   if (expect)
42 |     if (NOT ${var_name})
43 |       message(FATAL_ERROR "'${var_name}' is false, expected true.")
44 |     endif()
45 |   else()
46 |     if (${var_name})
47 |       message(FATAL_ERROR "'${var_name}' is true, expected false.")
48 |     endif()
49 |   endif()
50 | endfunction()
51 | 
52 | function(assert_target target_name)
53 |   if (NOT TARGET "${target_name}")
54 |     message(FATAL_ERROR "Target '${target_name}' not defined.")
55 |   endif()
56 | endfunction()
57 | 
58 | function(find_installed_project)
59 |   set(CMAKE_PREFIX_PATH "${CMAKE_INSTALL_PREFIX}")
60 |   find_package(CUB CONFIG)
61 | 
62 |   if (NOT CUB_FOUND)
63 |     message(FATAL_ERROR
64 |       "find_package(CUB) failed. "
65 |       "CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}"
66 |     )
67 |   endif()
68 | 
69 |   # Test some internal config vars to check that this is the expected install:
70 |   # TODO The cmake_path (3.19) command will provide more robust ways to do this
71 | 
72 |   # Escape regex special characters in the install prefix, see
73 |   # https://gitlab.kitware.com/cmake/cmake/-/issues/18580
74 |   string(REGEX REPLACE "([][+.*()^])" "\\\\\\1"
75 |     prefix_regex
76 |     "${CMAKE_INSTALL_PREFIX}"
77 |   )
78 |   if (NOT _CUB_INCLUDE_DIR MATCHES "^${prefix_regex}")
79 |     message(FATAL_ERROR
80 |       "Found CUB in unexpected location: "
81 |       " * _CUB_INCLUDE_DIR=${_CUB_INCLUDE_DIR} "
82 |       " * ExpectedPrefix=${CMAKE_INSTALL_DIR}"
83 |     )
84 |   endif()
85 | 
86 |   assert_target(CUB::CUB)
87 | 
88 | endfunction()
89 | 
90 | do_cleanup() # Prepare for new installation
91 | do_manual_install()
92 | find_installed_project()
93 | do_cleanup() # Clean up if successful
94 | 


--------------------------------------------------------------------------------
/test/link_a.cu:
--------------------------------------------------------------------------------
 1 | #include <cub/cub.cuh>
 2 | 
 3 | void a()
 4 | {
 5 |     printf("a() called\n");
 6 | 
 7 |     cub::DoubleBuffer<unsigned int>     d_keys;
 8 |     cub::DoubleBuffer<cub::NullType>    d_values;
 9 |     size_t                              temp_storage_bytes = 0;
10 |     cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024);
11 | }
12 | 


--------------------------------------------------------------------------------
/test/link_b.cu:
--------------------------------------------------------------------------------
 1 | #include <cub/cub.cuh>
 2 | 
 3 | void b()
 4 | {
 5 |     printf("b() called\n");
 6 | 
 7 |     cub::DoubleBuffer<unsigned int>     d_keys;
 8 |     cub::DoubleBuffer<cub::NullType>    d_values;
 9 |     size_t                              temp_storage_bytes = 0;
10 |     cub::DeviceRadixSort::SortPairs(NULL, temp_storage_bytes, d_keys, d_values, 1024);
11 | }
12 | 


--------------------------------------------------------------------------------
/test/link_main.cpp:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | extern void a();
 4 | extern void b();
 5 | 
 6 | int main()
 7 | {
 8 |     printf("hello world\n");
 9 |     return 0;
10 | }
11 | 


--------------------------------------------------------------------------------
/test/test_cdp_variant_state.cu:
--------------------------------------------------------------------------------
 1 | /*
 2 | *  Copyright 2022 NVIDIA Corporation
 3 | *
 4 | *  Licensed under the Apache License, Version 2.0 (the "License");
 5 | *  you may not use this file except in compliance with the License.
 6 | *  You may obtain a copy of the License at
 7 | *
 8 | *      http://www.apache.org/licenses/LICENSE-2.0
 9 | *
10 | *  Unless required by applicable law or agreed to in writing, software
11 | *  distributed under the License is distributed on an "AS IS" BASIS,
12 | *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | *  See the License for the specific language governing permissions and
14 | *  limitations under the License.
15 | */
16 | 
17 | #include <cub/detail/detect_cuda_runtime.cuh>
18 | 
19 | #include <cstdlib>
20 | 
21 | int main()
22 | {
23 |   // This test just checks that RDC is enabled and detected properly when using
24 |   // the %PARAM% system to request CDP support (see the README.md file in
25 |   // this directory).
26 | 
27 |   // %PARAM% TEST_CDP cdp 0:1
28 | 
29 | #ifdef CUB_RDC_ENABLED
30 |   return (TEST_CDP == 1) ? EXIT_SUCCESS : EXIT_FAILURE;
31 | #else
32 |   return (TEST_CDP == 0) ? EXIT_SUCCESS : EXIT_FAILURE;
33 | #endif
34 | }
35 | 


--------------------------------------------------------------------------------
/test/test_namespace_wrapped.cu:
--------------------------------------------------------------------------------
 1 | // Wrap thrust and cub in different enclosing namespaces
 2 | // (In practice, you probably want these to be the same, in which case just
 3 | // set THRUST_CUB_WRAPPED_NAMESPACE to set both).
 4 | #define THRUST_WRAPPED_NAMESPACE wrap_thrust
 5 | #define CUB_WRAPPED_NAMESPACE    wrap_cub
 6 | 
 7 | // Enable error checking:
 8 | #define CUB_STDERR
 9 | 
10 | #include <thrust/device_vector.h>
11 | #include <thrust/host_vector.h>
12 | #include <thrust/sort.h>
13 | 
14 | #include <cub/device/device_radix_sort.cuh>
15 | #include <cub/util_debug.cuh>
16 | 
17 | #include "test_util.h"
18 | 
19 | #include <cstdint>
20 | #include <cstdlib>
21 | 
22 | // Test that we can use a few common utilities and algorithms from wrapped
23 | // Thrust/CUB namespaces at runtime. More extensive testing is performed by the
24 | // header tests and the check_namespace.cmake test.
25 | int main(int argc, char **argv)
26 | {
27 |   CommandLineArgs args(argc, argv);
28 |   CubDebugExit(args.DeviceInit());
29 | 
30 |   const std::size_t n = 2048;
31 | 
32 |   // Fill a vector with random data:
33 |   ::wrap_thrust::thrust::host_vector<int> h_input(n);
34 |   for (auto &val : h_input)
35 |   {
36 |     RandomBits(val);
37 |   }
38 | 
39 |   // Test the qualifier macro:
40 |   THRUST_NS_QUALIFIER::device_vector<int> d_input(h_input);
41 |   THRUST_NS_QUALIFIER::device_vector<int> d_output(n);
42 | 
43 |   std::size_t temp_storage_bytes{};
44 | 
45 |   // Sort with DeviceRadixSort:
46 |   auto error = ::wrap_cub::cub::DeviceRadixSort::SortKeys(
47 |     nullptr,
48 |     temp_storage_bytes,
49 |     ::wrap_thrust::thrust::raw_pointer_cast(d_input.data()),
50 |     ::wrap_thrust::thrust::raw_pointer_cast(d_output.data()),
51 |     static_cast<std::size_t>(n));
52 | 
53 |   CubDebugExit(error);
54 | 
55 |   ::wrap_thrust::thrust::device_vector<std::uint8_t> temp_storage(
56 |     temp_storage_bytes);
57 | 
58 |   // Test the CUB qualifier macro:
59 |   error = CUB_NS_QUALIFIER::DeviceRadixSort::SortKeys(
60 |     ::wrap_thrust::thrust::raw_pointer_cast(temp_storage.data()),
61 |     temp_storage_bytes,
62 |     ::wrap_thrust::thrust::raw_pointer_cast(d_input.data()),
63 |     ::wrap_thrust::thrust::raw_pointer_cast(d_output.data()),
64 |     static_cast<std::size_t>(n));
65 | 
66 |   CubDebugExit(error);
67 | 
68 |   // Verify output:
69 |   if (!::wrap_thrust::thrust::is_sorted(d_output.cbegin(), d_output.cend()))
70 |   {
71 |     std::cerr << "Output is not sorted!\n";
72 |     return EXIT_FAILURE;
73 |   }
74 | 
75 |   return EXIT_SUCCESS;
76 | }
77 | 


--------------------------------------------------------------------------------