├── .clang-format ├── .clang-tidy ├── .clangd ├── .git-blame-ignore-revs ├── .github ├── ISSUE_TEMPLATE │ └── config.yml └── workflows │ ├── mirror-main-branch-to-master-branch.yml │ └── push-to-legacy-repositories.yml ├── .gitignore ├── CHANGELOG.md ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.TXT ├── README.md ├── benchmarks ├── CMakeLists.txt ├── README.md ├── bench │ ├── adjacent_difference │ │ └── subtract_left.cu │ ├── histogram │ │ ├── even.cu │ │ ├── histogram_common.cuh │ │ ├── multi │ │ │ ├── even.cu │ │ │ └── range.cu │ │ └── range.cu │ ├── merge_sort │ │ ├── keys.cu │ │ └── pairs.cu │ ├── partition │ │ ├── flagged.cu │ │ └── if.cu │ ├── radix_sort │ │ ├── keys.cu │ │ └── pairs.cu │ ├── reduce │ │ ├── base.cuh │ │ ├── by_key.cu │ │ ├── max.cu │ │ └── sum.cu │ ├── run_length_encode │ │ ├── encode.cu │ │ └── non_trivial_runs.cu │ ├── scan │ │ └── exclusive │ │ │ ├── base.cuh │ │ │ ├── by_key.cu │ │ │ ├── max.cu │ │ │ └── sum.cu │ ├── segmented_sort │ │ ├── large │ │ │ └── keys.cu │ │ ├── power_law │ │ │ └── keys.cu │ │ └── small │ │ │ └── keys.cu │ └── select │ │ ├── flagged.cu │ │ ├── if.cu │ │ └── unique_by_key.cu ├── docker │ ├── .gitignore │ └── recipe.py ├── nvbench_helper │ ├── CMakeLists.txt │ ├── look_back_helper.cuh │ ├── nvbench_helper.cu │ └── nvbench_helper.cuh └── scripts │ ├── .gitignore │ ├── analysis.ipynb │ ├── analyze.py │ ├── cub │ ├── __init__.py │ └── bench │ │ ├── __init__.py │ │ ├── bench.py │ │ ├── build.py │ │ ├── cmake.py │ │ ├── config.py │ │ ├── logger.py │ │ ├── score.py │ │ ├── search.py │ │ └── storage.py │ ├── search.py │ └── verify.py ├── cmake ├── AppendOptionIfAvailable.cmake ├── CPM.cmake ├── CubAddSubdir.cmake ├── CubBuildCompilerTargets.cmake ├── CubBuildTargetList.cmake ├── CubCompilerHacks.cmake ├── CubCudaConfig.cmake ├── CubHeaderTesting.cmake ├── CubInstallRules.cmake ├── CubUtilities.cmake └── header_test.in ├── cub ├── agent │ ├── agent_adjacent_difference.cuh │ ├── 
agent_batch_memcpy.cuh │ ├── agent_histogram.cuh │ ├── agent_merge_sort.cuh │ ├── agent_radix_sort_downsweep.cuh │ ├── agent_radix_sort_histogram.cuh │ ├── agent_radix_sort_onesweep.cuh │ ├── agent_radix_sort_upsweep.cuh │ ├── agent_reduce.cuh │ ├── agent_reduce_by_key.cuh │ ├── agent_rle.cuh │ ├── agent_scan.cuh │ ├── agent_scan_by_key.cuh │ ├── agent_segment_fixup.cuh │ ├── agent_segmented_radix_sort.cuh │ ├── agent_select_if.cuh │ ├── agent_spmv_orig.cuh │ ├── agent_sub_warp_merge_sort.cuh │ ├── agent_three_way_partition.cuh │ ├── agent_unique_by_key.cuh │ └── single_pass_scan_operators.cuh ├── block │ ├── block_adjacent_difference.cuh │ ├── block_discontinuity.cuh │ ├── block_exchange.cuh │ ├── block_histogram.cuh │ ├── block_load.cuh │ ├── block_merge_sort.cuh │ ├── block_radix_rank.cuh │ ├── block_radix_sort.cuh │ ├── block_raking_layout.cuh │ ├── block_reduce.cuh │ ├── block_run_length_decode.cuh │ ├── block_scan.cuh │ ├── block_shuffle.cuh │ ├── block_store.cuh │ ├── radix_rank_sort_operations.cuh │ └── specializations │ │ ├── block_histogram_atomic.cuh │ │ ├── block_histogram_sort.cuh │ │ ├── block_reduce_raking.cuh │ │ ├── block_reduce_raking_commutative_only.cuh │ │ ├── block_reduce_warp_reductions.cuh │ │ ├── block_scan_raking.cuh │ │ └── block_scan_warp_scans.cuh ├── cmake │ ├── cub-config-version.cmake │ ├── cub-config.cmake │ ├── cub-header-search.cmake │ └── cub-header-search.cmake.in ├── config.cuh ├── cub.cuh ├── detail │ ├── choose_offset.cuh │ ├── cpp_compatibility.cuh │ ├── detect_cuda_runtime.cuh │ ├── device_double_buffer.cuh │ ├── device_synchronize.cuh │ ├── exec_check_disable.cuh │ ├── strong_load.cuh │ ├── strong_store.cuh │ ├── temporary_storage.cuh │ ├── type_traits.cuh │ └── uninitialized_copy.cuh ├── device │ ├── device_adjacent_difference.cuh │ ├── device_copy.cuh │ ├── device_histogram.cuh │ ├── device_memcpy.cuh │ ├── device_merge_sort.cuh │ ├── device_partition.cuh │ ├── device_radix_sort.cuh │ ├── device_reduce.cuh │ ├── 
device_run_length_encode.cuh │ ├── device_scan.cuh │ ├── device_segmented_radix_sort.cuh │ ├── device_segmented_reduce.cuh │ ├── device_segmented_sort.cuh │ ├── device_select.cuh │ ├── device_spmv.cuh │ └── dispatch │ │ ├── dispatch_adjacent_difference.cuh │ │ ├── dispatch_batch_memcpy.cuh │ │ ├── dispatch_histogram.cuh │ │ ├── dispatch_merge_sort.cuh │ │ ├── dispatch_radix_sort.cuh │ │ ├── dispatch_reduce.cuh │ │ ├── dispatch_reduce_by_key.cuh │ │ ├── dispatch_rle.cuh │ │ ├── dispatch_scan.cuh │ │ ├── dispatch_scan_by_key.cuh │ │ ├── dispatch_segmented_sort.cuh │ │ ├── dispatch_select_if.cuh │ │ ├── dispatch_spmv_orig.cuh │ │ ├── dispatch_three_way_partition.cuh │ │ ├── dispatch_unique_by_key.cuh │ │ └── tuning │ │ ├── tuning_run_length_encode.cuh │ │ ├── tuning_scan.cuh │ │ └── tuning_select_if.cuh ├── grid │ ├── grid_barrier.cuh │ ├── grid_even_share.cuh │ ├── grid_mapping.cuh │ └── grid_queue.cuh ├── host │ └── mutex.cuh ├── iterator │ ├── arg_index_input_iterator.cuh │ ├── cache_modified_input_iterator.cuh │ ├── cache_modified_output_iterator.cuh │ ├── constant_input_iterator.cuh │ ├── counting_input_iterator.cuh │ ├── discard_output_iterator.cuh │ ├── tex_obj_input_iterator.cuh │ ├── tex_ref_input_iterator.cuh │ └── transform_input_iterator.cuh ├── thread │ ├── thread_load.cuh │ ├── thread_operators.cuh │ ├── thread_reduce.cuh │ ├── thread_scan.cuh │ ├── thread_search.cuh │ ├── thread_sort.cuh │ └── thread_store.cuh ├── util_allocator.cuh ├── util_arch.cuh ├── util_compiler.cuh ├── util_cpp_dialect.cuh ├── util_debug.cuh ├── util_deprecated.cuh ├── util_device.cuh ├── util_macro.cuh ├── util_math.cuh ├── util_namespace.cuh ├── util_ptx.cuh ├── util_type.cuh ├── version.cuh └── warp │ ├── specializations │ ├── warp_reduce_shfl.cuh │ ├── warp_reduce_smem.cuh │ ├── warp_scan_shfl.cuh │ └── warp_scan_smem.cuh │ ├── warp_exchange.cuh │ ├── warp_load.cuh │ ├── warp_merge_sort.cuh │ ├── warp_reduce.cuh │ ├── warp_scan.cuh │ └── warp_store.cuh ├── docs ├── .gitignore 
├── VERSION.md ├── deps │ └── repo-deps.packman.xml ├── developer_overview.rst ├── gen_docs.sh ├── index.rst ├── repo.bat ├── repo.sh ├── repo.toml ├── test_overview.rst ├── tools │ ├── packman │ │ ├── bootstrap │ │ │ ├── configure.bat │ │ │ ├── download_file_from_url.ps1 │ │ │ ├── fetch_file_from_packman_bootstrap.cmd │ │ │ ├── generate_temp_file_name.ps1 │ │ │ ├── generate_temp_folder.ps1 │ │ │ └── install_package.py │ │ ├── config.packman.xml │ │ ├── packman │ │ ├── packman.cmd │ │ ├── packmanconf.py │ │ ├── python.bat │ │ └── python.sh │ └── repoman │ │ ├── omni │ │ └── repo │ │ │ └── format │ │ │ └── .gitignore │ │ └── repoman.py └── tuning.rst ├── examples ├── CMakeLists.txt ├── block │ ├── .gitignore │ ├── CMakeLists.txt │ ├── example_block_radix_sort.cu │ ├── example_block_reduce.cu │ ├── example_block_reduce_dyn_smem.cu │ └── example_block_scan.cu ├── cmake │ ├── CMakeLists.txt │ └── add_subdir │ │ ├── CMakeLists.txt │ │ └── dummy.cu └── device │ ├── .gitignore │ ├── CMakeLists.txt │ ├── example_device_decoupled_look_back.cu │ ├── example_device_partition_flagged.cu │ ├── example_device_partition_if.cu │ ├── example_device_radix_sort.cu │ ├── example_device_radix_sort_custom.cu │ ├── example_device_reduce.cu │ ├── example_device_scan.cu │ ├── example_device_select_flagged.cu │ ├── example_device_select_if.cu │ ├── example_device_select_unique.cu │ └── example_device_sort_find_non_trivial_runs.cu └── test ├── .gitignore ├── CMakeLists.txt ├── README.md ├── bfloat16.h ├── c2h ├── custom_type.cuh ├── generators.cu └── generators.cuh ├── catch2_runner.cu ├── catch2_test_block_adjacent_difference.cu ├── catch2_test_block_histogram.cu ├── catch2_test_block_load.cu ├── catch2_test_block_merge_sort.cu ├── catch2_test_block_radix_sort.cu ├── catch2_test_block_radix_sort.cuh ├── catch2_test_block_radix_sort_custom.cu ├── catch2_test_block_reduce.cu ├── catch2_test_block_run_length_decode.cu ├── catch2_test_block_scan.cu ├── catch2_test_block_shuffle.cu ├── 
catch2_test_block_store.cu ├── catch2_test_cdp_helper.h ├── catch2_test_cdp_wrapper.cu ├── catch2_test_device_decoupled_look_back.cu ├── catch2_test_device_radix_sort_custom.cu ├── catch2_test_helper.h ├── catch2_test_printing.cu ├── catch2_test_radix_operations.cu ├── catch2_test_util_type.cu ├── catch2_test_warp_exchange.cu ├── catch2_test_warp_load.cu ├── catch2_test_warp_mask.cu ├── catch2_test_warp_merge_sort.cu ├── catch2_test_warp_reduce.cu ├── catch2_test_warp_scan.cu ├── catch2_test_warp_store.cu ├── cmake ├── CMakeLists.txt ├── check_source_files.cmake └── test_install │ └── CMakeLists.txt ├── fill_striped.cuh ├── half.h ├── link_a.cu ├── link_b.cu ├── link_main.cpp ├── mersenne.h ├── test_allocator.cu ├── test_block_radix_rank.cu ├── test_cdp_variant_state.cu ├── test_device_adjacent_difference.cu ├── test_device_batch_copy.cu ├── test_device_batch_memcpy.cu ├── test_device_histogram.cu ├── test_device_merge_sort.cu ├── test_device_radix_sort.cu ├── test_device_reduce.cu ├── test_device_reduce_by_key.cu ├── test_device_run_length_encode.cu ├── test_device_scan.cu ├── test_device_scan_by_key.cu ├── test_device_segmented_sort.cu ├── test_device_select_if.cu ├── test_device_select_unique.cu ├── test_device_select_unique_by_key.cu ├── test_device_spmv.cu ├── test_device_three_way_partition.cu ├── test_grid_barrier.cu ├── test_iterator.cu ├── test_iterator_deprecated.cu ├── test_namespace_wrapped.cu ├── test_temporary_storage_layout.cu ├── test_thread_operators.cu ├── test_thread_sort.cu ├── test_util.h └── test_util_vec.h /.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: LLVM 2 | AccessModifierOffset: -2 3 | AlignAfterOpenBracket: Align 4 | AlignConsecutiveAssignments: true 5 | AlignEscapedNewlines: Right 6 | AlignOperands: true 7 | AllowAllArgumentsOnNextLine: false 8 | AllowAllConstructorInitializersOnNextLine: false 9 | AllowAllParametersOfDeclarationOnNextLine: false 10 | 
AllowShortBlocksOnASingleLine: false 11 | AllowShortCaseLabelsOnASingleLine: false 12 | AllowShortFunctionsOnASingleLine: All 13 | AllowShortIfStatementsOnASingleLine: Never 14 | AllowShortLambdasOnASingleLine: All 15 | AllowShortLoopsOnASingleLine: false 16 | AlwaysBreakAfterReturnType: None 17 | AlwaysBreakTemplateDeclarations: Yes 18 | BinPackArguments: false 19 | BinPackParameters: false 20 | BreakBeforeBraces: Custom 21 | BraceWrapping: 22 | AfterCaseLabel: false 23 | AfterClass: true 24 | AfterControlStatement: true 25 | AfterEnum: true 26 | AfterFunction: true 27 | AfterNamespace: true 28 | AfterStruct: true 29 | AfterUnion: true 30 | BeforeCatch: true 31 | BeforeElse: true 32 | IndentBraces: false 33 | SplitEmptyFunction: false 34 | SplitEmptyRecord: false 35 | BreakBeforeBinaryOperators: None 36 | BreakBeforeTernaryOperators: true 37 | BreakConstructorInitializers: BeforeComma 38 | BreakInheritanceList: BeforeComma 39 | ColumnLimit: 100 40 | CompactNamespaces: false 41 | ContinuationIndentWidth: 2 42 | IncludeBlocks: Regroup 43 | IncludeCategories: 44 | - Regex: '^$' 51 | Priority: 4 52 | IndentCaseLabels: true 53 | IndentPPDirectives: None 54 | IndentWidth: 2 55 | KeepEmptyLinesAtTheStartOfBlocks: true 56 | MaxEmptyLinesToKeep: 1 57 | NamespaceIndentation: None 58 | PenaltyBreakAssignment: 30 59 | PenaltyBreakBeforeFirstCallParameter: 50 60 | PenaltyBreakComment: 0 61 | PenaltyBreakFirstLessLess: 0 62 | PenaltyBreakString: 70 63 | PenaltyBreakTemplateDeclaration: 0 64 | PenaltyExcessCharacter: 100 65 | PenaltyReturnTypeOnItsOwnLine: 90 66 | PointerAlignment: Right 67 | ReflowComments: true 68 | SortIncludes: CaseInsensitive 69 | SpaceAfterCStyleCast: false 70 | SpaceAfterLogicalNot: false 71 | SpaceAfterTemplateKeyword: true 72 | SpaceBeforeAssignmentOperators: true 73 | SpaceBeforeCpp11BracedList: false 74 | SpaceBeforeCtorInitializerColon: true 75 | SpaceBeforeInheritanceColon: true 76 | SpaceBeforeParens: ControlStatements 77 | 
SpaceBeforeRangeBasedForLoopColon: true 78 | SpaceInEmptyParentheses: false 79 | SpacesBeforeTrailingComments: 1 80 | SpacesInAngles: false 81 | SpacesInCStyleCastParentheses: false 82 | SpacesInParentheses: false 83 | SpacesInSquareBrackets: false 84 | Standard: c++11 85 | TabWidth: 2 86 | UseTab: Never 87 | -------------------------------------------------------------------------------- /.clang-tidy: -------------------------------------------------------------------------------- 1 | --- 2 | Checks: 3 | 'modernize-*, 4 | -modernize-use-equals-default, 5 | -modernize-concat-nested-namespaces, 6 | -modernize-use-trailing-return-type' 7 | 8 | # -modernize-use-equals-default # auto-fix is broken (doesn't insert =default correctly) 9 | # -modernize-concat-nested-namespaces # auto-fix is broken (can delete code) 10 | # -modernize-use-trailing-return-type # just a preference 11 | 12 | WarningsAsErrors: '' 13 | HeaderFilterRegex: '' 14 | AnalyzeTemporaryDtors: false 15 | FormatStyle: none 16 | CheckOptions: 17 | - key: modernize-loop-convert.MaxCopySize 18 | value: '16' 19 | - key: modernize-loop-convert.MinConfidence 20 | value: reasonable 21 | - key: modernize-pass-by-value.IncludeStyle 22 | value: llvm 23 | - key: modernize-replace-auto-ptr.IncludeStyle 24 | value: llvm 25 | - key: modernize-use-nullptr.NullMacros 26 | value: 'NULL' 27 | ... 28 | -------------------------------------------------------------------------------- /.clangd: -------------------------------------------------------------------------------- 1 | # https://clangd.llvm.org/config 2 | 3 | # Apply a config conditionally to all C files 4 | If: 5 | PathMatch: .*\.(c|h)$ 6 | 7 | --- 8 | 9 | # Apply a config conditionally to all C++ files 10 | If: 11 | PathMatch: .*\.(c|h)pp 12 | 13 | --- 14 | 15 | # Apply a config conditionally to all CUDA files 16 | If: 17 | PathMatch: .*\.cuh? 
18 | CompileFlags: 19 | Add: 20 | # Allow variadic CUDA functions 21 | - "-Xclang=-fcuda-allow-variadic-functions" 22 | 23 | --- 24 | 25 | # Tweak the clangd parse settings for all files 26 | CompileFlags: 27 | Compiler: clang++ 28 | CompilationDatabase: . 29 | Add: 30 | - -x 31 | - cuda 32 | # report all errors 33 | - "-ferror-limit=0" 34 | - "-ftemplate-backtrace-limit=0" 35 | - "-stdlib=libc++" 36 | Remove: 37 | - -stdpar 38 | # strip CUDA fatbin args 39 | - "-Xfatbin*" 40 | - "-Xcompiler*" 41 | - "-Xcudafe*" 42 | - "-rdc=*" 43 | - "-gpu=*" 44 | - "--diag_suppress*" 45 | # strip CUDA arch flags 46 | - "-gencode*" 47 | - "--generate-code*" 48 | # strip gcc's -fcoroutines 49 | - -fcoroutines 50 | # strip CUDA flags unknown to clang 51 | - "-ccbin*" 52 | - "--compiler-options*" 53 | - "--expt-extended-lambda" 54 | - "--expt-relaxed-constexpr" 55 | - "-forward-unknown-to-host-compiler" 56 | - "-Werror=cross-execution-space-call" 57 | Diagnostics: 58 | Suppress: 59 | - "variadic_device_fn" 60 | - "attributes_not_allowed" 61 | # The NVHPC version of _NVCXX_EXPAND_PACK macro triggers this clang error. 62 | # Temporarily suppressing it, but should probably fix 63 | - "template_param_shadow" 64 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # Exclude these commits from git-blame and similar tools. 2 | # 3 | # To use this file, run the following command from the repo root: 4 | # 5 | # ``` 6 | # $ git config blame.ignoreRevsFile .git-blame-ignore-revs 7 | # ``` 8 | # 9 | # Include a brief comment with each commit added, for example: 10 | # 11 | # ``` 12 | # d92d9f8baac5ec48a8f8718dd69f415a45efe372 # Initial clang-format 13 | # ``` 14 | # 15 | # Only add commits that are pure formatting changes (e.g. 16 | # clang-format version changes, etc). 
17 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Open Issue in CCCL Repository 4 | url: https://github.com/NVIDIA/cccl/issues/new/choose 5 | about: This repository has moved! Please see the new home for CUB. 6 | -------------------------------------------------------------------------------- /.github/workflows/mirror-main-branch-to-master-branch.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - "main" 5 | 6 | jobs: 7 | mirror-main-branch-to-master-branch: 8 | name: Mirror main branch to master branch 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Mirror main branch to master branch 12 | id: mirror 13 | uses: google/mirror-branch-action@v1.0 14 | with: 15 | source: "main" 16 | dest: "master" 17 | github-token: ${{ secrets.GITHUB_TOKEN }} 18 | -------------------------------------------------------------------------------- /.github/workflows/push-to-legacy-repositories.yml: -------------------------------------------------------------------------------- 1 | on: push 2 | 3 | jobs: 4 | push-to-legacy-repositories: 5 | name: Push to legacy repositories 6 | runs-on: ubuntu-latest 7 | steps: 8 | - name: Push `main` to github.com/nvlabs/cub 9 | uses: wei/git-sync@v2 10 | if: github.repository == 'nvidia/cub' 11 | with: 12 | source_repo: "nvidia/cub" 13 | source_branch: "main" 14 | destination_repo: "nvlabs/cub" 15 | destination_branch: "main" 16 | ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} 17 | - name: Push all tags to github.com/nvlabs/cub 18 | uses: wei/git-sync@v2 19 | if: github.repository == 'nvidia/cub' 20 | with: 21 | source_repo: "nvidia/cub" 22 | source_branch: "refs/tags/*" 23 | destination_repo: "nvlabs/cub" 24 | destination_branch: "refs/tags/*" 25 | 
ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} 26 | - name: Push `main` to github.com/thrust/cub 27 | uses: wei/git-sync@v2 28 | if: github.repository == 'nvidia/cub' 29 | with: 30 | source_repo: "nvidia/cub" 31 | source_branch: "main" 32 | destination_repo: "thrust/cub" 33 | destination_branch: "main" 34 | ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} 35 | - name: Push all tags to github.com/thrust/cub 36 | uses: wei/git-sync@v2 37 | if: github.repository == 'nvidia/cub' 38 | with: 39 | source_repo: "nvidia/cub" 40 | source_branch: "refs/tags/*" 41 | destination_repo: "thrust/cub" 42 | destination_branch: "refs/tags/*" 43 | ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .p4config 2 | *~ 3 | \#* 4 | /build 5 | .cache 6 | .vscode 7 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # 3.15 is the minimum. 2 | # 3.17 for NVC++. 3 | # 3.18.3 for C++17 + CUDA. 4 | cmake_minimum_required(VERSION 3.15) 5 | 6 | # Remove this when we use the new CUDA_ARCHITECTURES properties. 7 | if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) 8 | cmake_policy(SET CMP0104 OLD) 9 | endif() 10 | 11 | # CXX is only needed for AppendOptionIfAvailable. 12 | project(CUB NONE) 13 | 14 | # Determine whether CUB is the top-level project or included into 15 | # another project via add_subdirectory(). 
16 | if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}") 17 | set(CUB_TOPLEVEL_PROJECT ON) 18 | else() 19 | set(CUB_TOPLEVEL_PROJECT OFF) 20 | endif() 21 | 22 | # This must be done before any languages are enabled: 23 | if (CUB_TOPLEVEL_PROJECT) 24 | include(cmake/CubCompilerHacks.cmake) 25 | endif() 26 | 27 | # This must appear after our Compiler Hacks or else CMake will delete the cache 28 | # and reconfigure from scratch. 29 | # This must also appear before the installation rules, as it is required by the 30 | # GNUInstallDirs CMake module. 31 | enable_language(CXX) 32 | 33 | # Thrust has its own copy of CUB install rules to handle packaging usecases 34 | # where we want to install CUB headers but aren't actually building anything. 35 | # In these cases the add_subdirectory(dependencies/cub) line in Thrust won't get 36 | # called so we can't rely on CUB providing its own rules. 37 | if (NOT CUB_IN_THRUST) 38 | option(CUB_ENABLE_INSTALL_RULES "Enable installation of CUB" ${CUB_TOPLEVEL_PROJECT}) 39 | if (CUB_ENABLE_INSTALL_RULES) 40 | include(cmake/CubInstallRules.cmake) 41 | endif() 42 | endif() 43 | 44 | # Support adding CUB to a parent project via add_subdirectory. 45 | # See examples/cmake/add_subdir/CMakeLists.txt for details. 46 | if (NOT CUB_TOPLEVEL_PROJECT AND NOT CUB_IN_THRUST) 47 | include(cmake/CubAddSubdir.cmake) 48 | return() 49 | endif() 50 | 51 | option(CUB_ENABLE_HEADER_TESTING "Test that all public headers compile." ON) 52 | option(CUB_ENABLE_TESTING "Build CUB testing suite." ON) 53 | option(CUB_ENABLE_BENCHMARKS "Build CUB benchmarking suite." OFF) 54 | option(CUB_ENABLE_TUNING "Build CUB tuning suite." OFF) 55 | option(CUB_ENABLE_EXAMPLES "Build CUB examples." ON) 56 | 57 | # This is needed for NVCXX QA, which requires a static set of executable names. 58 | # Only a single dialect may be enabled when this is off. 59 | option(CUB_ENABLE_CPP_DIALECT_IN_NAMES 60 | "Include C++ dialect information in target/object/etc names." 
61 | ON 62 | ) 63 | mark_as_advanced(CUB_ENABLE_CPP_DIALECT_IN_NAMES) 64 | 65 | # This option is only used when CUB is built stand-alone; otherwise the Thrust 66 | # option has the same effect. 67 | if (NOT CUB_IN_THRUST) 68 | option(CUB_IGNORE_DEPRECATED_API 69 | "Suppress warnings about deprecated Thrust/CUB API." 70 | OFF 71 | ) 72 | endif() 73 | 74 | # Check if we're actually building anything before continuing. If not, no need 75 | # to search for deps, etc. This is a common approach for packagers that just 76 | # need the install rules. See GH issue NVIDIA/thrust#1211. 77 | if (NOT (CUB_ENABLE_HEADER_TESTING OR 78 | CUB_ENABLE_TESTING OR 79 | CUB_ENABLE_EXAMPLES)) 80 | return() 81 | endif() 82 | 83 | include(cmake/AppendOptionIfAvailable.cmake) 84 | include(cmake/CubBuildCompilerTargets.cmake) 85 | include(cmake/CubBuildTargetList.cmake) 86 | include(cmake/CubCudaConfig.cmake) 87 | include(cmake/CubUtilities.cmake) 88 | 89 | if ("" STREQUAL "${CMAKE_BUILD_TYPE}") 90 | set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build." FORCE) 91 | 92 | set_property( 93 | CACHE CMAKE_BUILD_TYPE 94 | PROPERTY STRINGS Debug Release RelWithDebInfo MinSizeRel 95 | ) 96 | endif () 97 | 98 | set(CMAKE_CXX_EXTENSIONS OFF) 99 | 100 | # Where to put build outputs. Use CMAKE_BINARY_DIR so they'll show up alongside 101 | # Thrust targets when building as part of Thrust. 
102 | set(CUB_LIBRARY_OUTPUT_DIR "${CMAKE_BINARY_DIR}/lib") 103 | set(CUB_EXECUTABLE_OUTPUT_DIR "${CMAKE_BINARY_DIR}/bin") 104 | 105 | cub_build_target_list() 106 | 107 | if (CUB_ENABLE_HEADER_TESTING) 108 | include(cmake/CubHeaderTesting.cmake) 109 | endif() 110 | 111 | # Both testing and examples use ctest 112 | if (CUB_ENABLE_TESTING OR CUB_ENABLE_EXAMPLES) 113 | include(CTest) 114 | enable_testing() 115 | endif() 116 | 117 | if (CUB_ENABLE_TESTING) 118 | add_subdirectory(test) 119 | endif() 120 | 121 | if (CUB_ENABLE_EXAMPLES) 122 | add_subdirectory(examples) 123 | endif() 124 | 125 | if (CUB_ENABLE_BENCHMARKS OR CUB_ENABLE_TUNING) 126 | add_subdirectory(benchmarks) 127 | endif() 128 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 2 | # Code of Conduct 3 | 4 | ## Overview 5 | 6 | This document defines the Code of Conduct followed and enforced for NVIDIA C++ 7 | Core Compute Libraries. 8 | 9 | ### Intended Audience 10 | 11 | * Community 12 | * Developers 13 | * Project Leads 14 | 15 | ## Our Pledge 16 | 17 | In the interest of fostering an open and welcoming environment, we as 18 | contributors and maintainers pledge to making participation in our project and 19 | our community a harassment-free experience for everyone, regardless of age, 20 | body size, disability, ethnicity, sex characteristics, gender identity and 21 | expression, level of experience, education, socio-economic status, nationality, 22 | personal appearance, race, religion, or sexual identity and orientation. 23 | 24 | ## Our Standards 25 | 26 | Examples of behavior that contributes to creating a positive environment include: 27 | 28 | - Using welcoming and inclusive language. 29 | - Being respectful of differing viewpoints and experiences. 30 | - Gracefully accepting constructive criticism. 31 | - Focusing on what is best for the community. 
32 | - Showing empathy towards other community members. 33 | 34 | Examples of unacceptable behavior by participants include: 35 | 36 | - The use of sexualized language or imagery and unwelcome sexual attention or 37 | advances. 38 | - Trolling, insulting/derogatory comments, and personal or political attacks. 39 | - Public or private harassment. 40 | - Publishing others’ private information, such as a physical or electronic 41 | address, without explicit permission. 42 | - Other conduct which could reasonably be considered inappropriate. 43 | 44 | ## Our Responsibilities 45 | 46 | Project maintainers are responsible for clarifying the standards of acceptable 47 | behavior and are expected to take appropriate and fair corrective action in 48 | response to any instances of unacceptable behavior. 49 | 50 | Project maintainers have the right and responsibility to remove, edit, or 51 | reject comments, commits, code, wiki edits, issues, and other contributions 52 | that are not aligned to this Code of Conduct, or to ban temporarily or 53 | permanently any contributor for other behaviors that they deem inappropriate, 54 | threatening, offensive, or harmful. 55 | 56 | ## Scope 57 | 58 | This Code of Conduct applies both within project spaces and in public spaces 59 | when an individual is representing the project or its community. 60 | Examples of representing a project or community include using an official 61 | project email address, posting via an official social media account, or acting 62 | as an appointed representative at an online or offline event. 63 | Representation of a project may be further defined and clarified by project 64 | maintainers. 65 | 66 | ## Enforcement 67 | 68 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 69 | reported by contacting [cpp-conduct@nvidia.com](mailto:cpp-conduct@nvidia.com). 
70 | All complaints will be reviewed and investigated and will result in a response 71 | that is deemed necessary and appropriate to the circumstances. 72 | The project team is obligated to maintain confidentiality with regard to the 73 | reporter of an incident. 74 | Further details of specific enforcement policies may be posted separately. 75 | 76 | Project maintainers who do not follow or enforce the Code of Conduct in good 77 | faith may face temporary or permanent repercussions as determined by other 78 | members of the project’s leadership. 79 | 80 | ## Attribution 81 | 82 | This Code of Conduct was taken from the [NVIDIA RAPIDS] project, which was 83 | adapted from the [Contributor Covenant version 1.4]. 84 | 85 | Please see this [FAQ] for answers to common questions about this Code of Conduct. 86 | 87 | ## Contact 88 | 89 | Please email [cpp-conduct@nvidia.com] for any Code of Conduct related matters. 90 | 91 | 92 | [cpp-conduct@nvidia.com]: mailto:cpp-conduct@nvidia.com 93 | 94 | [FAQ]: https://www.contributor-covenant.org/faq 95 | 96 | [NVIDIA RAPIDS]: https://docs.rapids.ai/resources/conduct/ 97 | [Contributor Covenant version 1.4]: https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 98 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Table of Contents 2 | 3 | 1. [Contributing to CUB](#contributing-to-cub) 4 | 1. [CMake Options](#cmake-options) 5 | 1. [Development Model](#development-model) 6 | 7 | # Contributing to CUB 8 | 9 | CUB uses Github to manage all open-source development, including bug tracking, 10 | pull requests, and design discussions. CUB is tightly coupled to the Thrust 11 | project, and a compatible version of Thrust is required when working on the 12 | development version of CUB. 
13 | 14 | To setup a CUB development branch, it is recommended to recursively clone the 15 | Thrust repository and use the CUB submodule at `dependencies/cub` to stage 16 | changes. CUB's tests and examples can be built by configuring Thrust with the 17 | CMake option `THRUST_INCLUDE_CUB_CMAKE=ON`. 18 | 19 | This process is described in more detail in Thrust's 20 | [CONTRIBUTING.md](https://nvidia.github.io/thrust/contributing.html). 21 | 22 | The CMake options in the following section may be used to customize CUB's build 23 | process. Note that some of these are controlled by Thrust for compatibility and 24 | may not have an effect when building CUB through the Thrust build system. This 25 | is pointed out in the documentation below where applicable. 26 | 27 | # CMake Options 28 | 29 | A CUB build is configured using CMake options. These may be passed to CMake 30 | using 31 | 32 | ``` 33 | cmake -D= [Thrust or CUB project source root] 34 | ``` 35 | 36 | or configured interactively with the `ccmake` or `cmake-gui` interfaces. 37 | 38 | The configuration options for CUB are: 39 | 40 | - `CMAKE_BUILD_TYPE={Release, Debug, RelWithDebInfo, MinSizeRel}` 41 | - Standard CMake build option. Default: `RelWithDebInfo` 42 | - `CUB_ENABLE_HEADER_TESTING={ON, OFF}` 43 | - Whether to test compile public headers. Default is `ON`. 44 | - `CUB_ENABLE_TESTING={ON, OFF}` 45 | - Whether to build unit tests. Default is `ON`. 46 | - `CUB_ENABLE_EXAMPLES={ON, OFF}` 47 | - Whether to build examples. Default is `ON`. 48 | - `CUB_ENABLE_DIALECT_CPPXX={ON, OFF}` 49 | - Setting this has no effect when building CUB as a component of Thrust. 50 | See Thrust's dialect options, which CUB will inherit. 51 | - Toggle whether a specific C++ dialect will be targeted. 52 | - Multiple dialects may be targeted in a single build. 53 | - Possible values of `XX` are `{11, 14, 17}`. 54 | - By default, only C++14 is enabled. 
55 | - `CUB_ENABLE_COMPUTE_XX={ON, OFF}` 56 | - Setting this has no effect when building CUB as a component of Thrust. 57 | See Thrust's architecture options, which CUB will inherit. 58 | - Controls the targeted CUDA architecture(s) 59 | - Multiple options may be selected when using NVCC as the CUDA compiler. 60 | - Valid values of `XX` are: 61 | `{35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80}` 62 | - Default value depends on `CUB_DISABLE_ARCH_BY_DEFAULT`: 63 | - `CUB_ENABLE_COMPUTE_FUTURE={ON, OFF}` 64 | - Setting this has no effect when building CUB as a component of Thrust. 65 | See Thrust's architecture options, which CUB will inherit. 66 | - If enabled, CUDA objects will target the most recent virtual architecture 67 | in addition to the real architectures specified by the 68 | `CUB_ENABLE_COMPUTE_XX` options. 69 | - Default value depends on `CUB_DISABLE_ARCH_BY_DEFAULT`: 70 | - `CUB_DISABLE_ARCH_BY_DEFAULT={ON, OFF}` 71 | - Setting this has no effect when building CUB as a component of Thrust. 72 | See Thrust's architecture options, which CUB will inherit. 73 | - When `ON`, all `CUB_ENABLE_COMPUTE_*` options are initially `OFF`. 74 | - Default: `OFF` (meaning all architectures are enabled by default) 75 | - `CUB_ENABLE_TESTS_WITH_RDC={ON, OFF}` 76 | - Whether to enable Relocatable Device Code when building tests. 77 | Default is `OFF`. 78 | - `CUB_ENABLE_EXAMPLES_WITH_RDC={ON, OFF}` 79 | - Whether to enable Relocatable Device Code when building examples. 80 | Default is `OFF`. 81 | - `CUB_ENABLE_INSTALL_RULES={ON, OFF}` 82 | - Setting this has no effect when building CUB as a component of Thrust. 83 | See Thrust's `THRUST_INSTALL_CUB_HEADERS` option, which controls this 84 | behavior. 85 | - If true, installation rules will be generated for CUB. Default is `ON` when 86 | building CUB alone, and `OFF` when CUB is a subproject added via CMake's 87 | `add_subdirectory`. 
88 | 89 | # Development Model 90 | 91 | CUB follows the same development model as Thrust, described 92 | [here](https://nvidia.github.io/thrust/releases/versioning.html). 93 | -------------------------------------------------------------------------------- /LICENSE.TXT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010-2011, Duane Merrill. All rights reserved. 2 | Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | * Neither the name of the NVIDIA CORPORATION nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | -------------------------------------------------------------------------------- /benchmarks/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(CUDAToolkit REQUIRED) 2 | find_package(Python3 COMPONENTS Interpreter REQUIRED) 3 | 4 | # Defer dependencies collection to nvbench helper 5 | add_subdirectory(nvbench_helper) 6 | 7 | set(benches_root "${CMAKE_CURRENT_LIST_DIR}") 8 | 9 | if(NOT CMAKE_BUILD_TYPE STREQUAL "Release") 10 | message(FATAL_ERROR "CUB benchmarks must be built in release mode.") 11 | endif() 12 | 13 | if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) 14 | message(FATAL_ERROR "CMAKE_CUDA_ARCHITECTURES must be set to build CUB benchmarks.") 15 | endif() 16 | 17 | set(benches_meta_target cub.all.benches) 18 | add_custom_target(${benches_meta_target}) 19 | 20 | function(get_recursive_subdirs subdirs) 21 | set(dirs) 22 | file(GLOB_RECURSE contents 23 | CONFIGURE_DEPENDS 24 | LIST_DIRECTORIES ON 25 | "${CMAKE_CURRENT_LIST_DIR}/bench/*" 26 | ) 27 | 28 | foreach(test_dir IN LISTS contents) 29 | if(IS_DIRECTORY "${test_dir}") 30 | list(APPEND dirs "${test_dir}") 31 | endif() 32 | endforeach() 33 | 34 | set(${subdirs} "${dirs}" PARENT_SCOPE) 35 | endfunction() 36 | 37 | set(meta_path "${CMAKE_BINARY_DIR}/cub_bench_meta.csv") 38 | file(REMOVE "${meta_path}") 39 | 40 | set(ctk_version "${CUDAToolkit_VERSION}") 41 | message(STATUS "CTK version: ${ctk_version}") 42 | 43 | find_package(Git REQUIRED) 44 | if(GIT_FOUND) 45 | execute_process( 46 | COMMAND ${GIT_EXECUTABLE} describe 47 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} 48 | OUTPUT_VARIABLE cub_revision 49 | OUTPUT_STRIP_TRAILING_WHITESPACE) 50 | message(STATUS "Git revision: ${cub_revision}") 51 | else() 52 | message(WARNING "Git not found. 
# Scans the benchmark source `src` for tuning annotations of the form
#   // %RANGE% <definition> <label> <start:end:step>
# and appends a row "<bench_name>,<def>|<label>=<range>,..." describing every
# tunable parameter to the metadata CSV at ${meta_path}.
function(get_bench_ranges src bench_name)
  file(READ "${src}" file_data)
  set(param_regex "//[ ]+%RANGE%[ ]+([^ ]+)[ ]+([^ ]+)[ ]+([^\n]*)")

  string(REGEX MATCHALL "${param_regex}" matches "${file_data}")

  set(ranges "")

  foreach(match IN LISTS matches)
    # Re-run the single-match regex so CMAKE_MATCH_<n> holds this match's
    # capture groups (MATCHALL does not populate them per match).
    string(REGEX MATCH "${param_regex}" unused "${match}")

    set(def ${CMAKE_MATCH_1})
    set(label ${CMAKE_MATCH_2})
    set(range ${CMAKE_MATCH_3})
    set(ranges "${ranges}${def}|${label}=${range},")

    # Validate that the range has exactly three start:end:step components.
    string(REPLACE ":" ";" range "${range}")
    list(LENGTH range range_len)

    if (NOT "${range_len}" STREQUAL 3)
      message(FATAL_ERROR "Range should be represented as 'start:end:step'")
    endif()
  endforeach()

  # Drop the trailing comma left by the accumulation loop above.
  string(LENGTH "${ranges}" ranges_length)
  math(EXPR last_character_index "${ranges_length} - 1")
  string(SUBSTRING "${ranges}" 0 ${last_character_index} ranges)
  file(APPEND "${meta_path}" "${bench_name},${ranges}\n")
endfunction()
string(REPLACE "/" "." bench_prefix "${bench_prefix}") 107 | 108 | foreach(bench_src IN LISTS bench_srcs) 109 | # base tuning 110 | get_filename_component(bench_name "${bench_src}" NAME_WLE) 111 | string(PREPEND bench_name "cub.${bench_prefix}.") 112 | 113 | set(base_bench_name "${bench_name}.base") 114 | add_bench(base_bench_target ${base_bench_name} "${bench_src}") 115 | add_dependencies(${benches_meta_target} ${base_bench_target}) 116 | target_compile_definitions(${base_bench_target} PRIVATE TUNE_BASE=1) 117 | 118 | # tuning 119 | if (CUB_ENABLE_TUNING) 120 | set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${bench_src}") 121 | get_bench_ranges("${bench_src}" "${bench_name}") 122 | set(tuning_name "${bench_name}.variant") 123 | set(tuning_path "${CMAKE_BINARY_DIR}/${tuning_name}.h") 124 | add_bench(bench_target ${tuning_name} "${bench_src}") 125 | file(WRITE "${tuning_path}" "#pragma once\n") 126 | target_compile_options(${bench_target} PRIVATE "-include${tuning_path}") 127 | endif() 128 | endforeach() 129 | endfunction() 130 | 131 | get_recursive_subdirs(subdirs) 132 | 133 | foreach(subdir IN LISTS subdirs) 134 | add_bench_dir("${subdir}") 135 | endforeach() 136 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | ## Benchmarks 2 | 3 | ### Ready 4 | 5 | - radix sort 6 | - keys 7 | - pairs 8 | - merge sort 9 | - keys 10 | - pairs 11 | - reduce 12 | - sum 13 | - max 14 | - by_key 15 | - scan 16 | - sum 17 | - max 18 | - by key 19 | - select 20 | - flagged 21 | - if 22 | - partition 23 | - flagged 24 | - if 25 | - histogram 26 | - even 27 | - range 28 | - multi even 29 | - multi range 30 | - rle 31 | - encode 32 | - non trivial runs 33 | - adjacent difference 34 | - left 35 | 36 | ### TODO 37 | 38 | - segmented 39 | -------------------------------------------------------------------------------- 
/benchmarks/bench/histogram/histogram_common.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// Returns the upper bound of the sample level range used by the histogram
// benchmarks. For integral sample types the upper level is `bins`, clamped to
// the largest value representable by SampleT when SampleT is narrower than
// OffsetT; otherwise (floating-point samples) it is `elements`.
// NOTE(review): template parameter list reconstructed from usage — confirm
// against the repository source.
template <typename SampleT, typename OffsetT>
SampleT get_upper_level(OffsetT bins, OffsetT elements)
{
  if constexpr (cuda::std::is_integral_v<SampleT>)
  {
    if constexpr (sizeof(SampleT) < sizeof(OffsetT))
    {
      // A narrow SampleT cannot represent bin counts above its own maximum,
      // so clamp before the narrowing cast.
      const SampleT max_key = std::numeric_limits<SampleT>::max();
      return static_cast<SampleT>(std::min(bins, static_cast<OffsetT>(max_key)));
    }
    else
    {
      return static_cast<SampleT>(bins);
    }
  }

  // Non-integral samples: span the whole [0, elements) value range.
  return static_cast<SampleT>(elements);
}
-------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | #include 29 | 30 | #ifndef TUNE_BASE 31 | #define TUNE_ITEMS_PER_VEC_LOAD (1 << TUNE_ITEMS_PER_VEC_LOAD_POW2) 32 | #endif 33 | 34 | #if !TUNE_BASE 35 | template 36 | struct policy_hub_t 37 | { 38 | struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> 39 | { 40 | static constexpr int threads_per_block = TUNE_THREADS_PER_BLOCK; 41 | static constexpr int items_per_thread = TUNE_ITEMS_PER_THREAD; 42 | static constexpr int items_per_vec_load = TUNE_ITEMS_PER_VEC_LOAD; 43 | 44 | using ReducePolicy = cub::AgentReducePolicy; 50 | 51 | // SingleTilePolicy 52 | using SingleTilePolicy = ReducePolicy; 53 | 54 | // SegmentedReducePolicy 55 | using SegmentedReducePolicy = ReducePolicy; 56 | }; 57 | 58 | using MaxPolicy = policy_t; 59 | }; 60 | #endif // !TUNE_BASE 61 | 62 | template 63 | void reduce(nvbench::state &state, nvbench::type_list) 64 | { 65 | using accum_t = T; 66 | using input_it_t = const T *; 67 | using output_it_t = T *; 68 | using offset_t = typename cub::detail::ChooseOffsetT::Type; 69 | using output_t = T; 70 | using init_t = T; 71 | #if !TUNE_BASE 72 | using policy_t = policy_hub_t; 73 | using dispatch_t = 74 | cub::DispatchReduce; 75 | #else // TUNE_BASE 76 | using dispatch_t = cub::DispatchReduce; 77 | #endif // TUNE_BASE 78 | 79 | // Retrieve axis parameters 80 | const auto elements = static_cast(state.get_int64("Elements{io}")); 81 | thrust::device_vector in(elements); 82 | thrust::device_vector out(1); 83 | 84 | gen(seed_t{}, in); 85 | 86 | input_it_t d_in = thrust::raw_pointer_cast(in.data()); 87 | output_it_t d_out = thrust::raw_pointer_cast(out.data()); 88 | 89 | // Enable throughput calculations and add "Size" column to results. 
90 | state.add_element_count(elements); 91 | state.add_global_memory_reads(elements, "Size"); 92 | state.add_global_memory_writes(1); 93 | 94 | // Allocate temporary storage: 95 | std::size_t temp_size; 96 | dispatch_t::Dispatch(nullptr, 97 | temp_size, 98 | d_in, 99 | d_out, 100 | static_cast(elements), 101 | op_t{}, 102 | init_t{}, 103 | 0 /* stream */); 104 | 105 | thrust::device_vector temp(temp_size); 106 | auto *temp_storage = thrust::raw_pointer_cast(temp.data()); 107 | 108 | state.exec([&](nvbench::launch &launch) { 109 | dispatch_t::Dispatch(temp_storage, 110 | temp_size, 111 | d_in, 112 | d_out, 113 | static_cast(elements), 114 | op_t{}, 115 | init_t{}, 116 | launch.get_stream()); 117 | }); 118 | } 119 | 120 | NVBENCH_BENCH_TYPES(reduce, NVBENCH_TYPE_AXES(all_types, offset_types)) 121 | .set_name("cub::DeviceReduce::Reduce") 122 | .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) 123 | .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)); 124 | -------------------------------------------------------------------------------- /benchmarks/bench/reduce/max.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #include 29 | 30 | // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 31 | // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 32 | // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 33 | 34 | using op_t = max_t; 35 | #include "base.cuh" 36 | -------------------------------------------------------------------------------- /benchmarks/bench/reduce/sum.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | #include 29 | 30 | // %RANGE% TUNE_ITEMS_PER_THREAD ipt 7:24:1 31 | // %RANGE% TUNE_THREADS_PER_BLOCK tpb 128:1024:32 32 | // %RANGE% TUNE_ITEMS_PER_VEC_LOAD_POW2 ipv 1:2:1 33 | 34 | using op_t = cub::Sum; 35 | #include "base.cuh" -------------------------------------------------------------------------------- /benchmarks/bench/scan/exclusive/max.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | // %RANGE% TUNE_ITEMS ipt 7:24:1 29 | // %RANGE% TUNE_THREADS tpb 128:1024:32 30 | // %RANGE% TUNE_MAGIC_NS ns 0:2048:4 31 | // %RANGE% TUNE_DELAY_CONSTRUCTOR_ID dcid 0:7:1 32 | // %RANGE% TUNE_L2_WRITE_LATENCY_NS l2w 0:1200:5 33 | // %RANGE% TUNE_TRANSPOSE trp 0:1:1 34 | // %RANGE% TUNE_LOAD ld 0:2:1 35 | 36 | #include 37 | 38 | using op_t = max_t; 39 | #include "base.cuh" 40 | -------------------------------------------------------------------------------- /benchmarks/bench/scan/exclusive/sum.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#!/usr/bin/env python
# HPC Container Maker (hpccm) recipe that generates the Dockerfile for the CUB
# benchmark/tuning environment: CUDA 12.1 devel on Ubuntu 22.04 with LLVM 15,
# CMake 3.26.3, Nsight Compute 2023.1.1, and the Python tuning dependencies.

import hpccm

# Emit Docker syntax (rather than Singularity) for the generated recipe.
hpccm.config.set_container_format('docker')

# NOTE(review): `Stage0` is not defined in this file; the `hpccm` command-line
# tool injects it into the recipe's globals before executing this script.
Stage0 += hpccm.primitives.baseimage(image='nvidia/cuda:12.1.0-devel-ubuntu22.04')
Stage0 += hpccm.building_blocks.apt_get(ospackages=['git', 'tmux', 'gcc', 'g++', 'vim', 'python3', 'python-is-python3', 'ninja-build'])
Stage0 += hpccm.building_blocks.llvm(version='15', extra_tools=True, toolset=True)
Stage0 += hpccm.building_blocks.cmake(eula=True, version='3.26.3')
Stage0 += hpccm.building_blocks.nsight_compute(eula=True, version='2023.1.1')
Stage0 += hpccm.building_blocks.pip(packages=['fpzip', 'numpy', 'pandas'], pip='pip3')
CUB::libcudacxx 8 | nvbench::nvbench 9 | PRIVATE CUDA::curand) 10 | 11 | target_include_directories(nvbench_helper PUBLIC "${CMAKE_CURRENT_LIST_DIR}") 12 | -------------------------------------------------------------------------------- /benchmarks/nvbench_helper/look_back_helper.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #pragma once 29 | 30 | #if !TUNE_BASE 31 | #include 32 | #include 33 | 34 | #if !defined(TUNE_MAGIC_NS) || !defined(TUNE_L2_WRITE_LATENCY_NS) || !defined(TUNE_DELAY_CONSTRUCTOR_ID) 35 | #error "TUNE_MAGIC_NS, TUNE_L2_WRITE_LATENCY_NS, and TUNE_DELAY_CONSTRUCTOR_ID must be defined" 36 | #endif 37 | 38 | using delay_constructors = nvbench::type_list< 39 | cub::detail::no_delay_constructor_t, 40 | cub::detail::fixed_delay_constructor_t, 41 | cub::detail::exponential_backoff_constructor_t, 42 | cub::detail::exponential_backoff_jitter_constructor_t, 43 | cub::detail::exponential_backoff_jitter_window_constructor_t, 44 | cub::detail::exponential_backon_jitter_window_constructor_t, 45 | cub::detail::exponential_backon_jitter_constructor_t, 46 | cub::detail::exponential_backon_constructor_t>; 47 | 48 | using delay_constructor_t = nvbench::tl::get; 49 | #endif // !TUNE_BASE 50 | -------------------------------------------------------------------------------- /benchmarks/scripts/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.pyc 3 | *.pyo 4 | *.pyd 5 | -------------------------------------------------------------------------------- /benchmarks/scripts/cub/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
def create_builds_table(conn):
    """Create the persistent cache table for build results (idempotent).

    Each row records the outcome of compiling one benchmark executable for a
    particular CTK / CUB revision pair, so that unchanged base builds can be
    skipped on subsequent tuning runs.
    """
    with conn:
        conn.execute("""
        CREATE TABLE IF NOT EXISTS builds (
            ctk TEXT NOT NULL,
            cub TEXT NOT NULL,
            bench TEXT NOT NULL,
            code TEXT NOT NULL,
            elapsed REAL
        );
        """)


class CMakeCache:
    """Singleton facade over the `builds` table in the benchmark database."""
    _instance = None

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls, *args, **kwargs)
            create_builds_table(Storage().connection())
        return cls._instance

    def pull_build(self, bench):
        """Return the cached Build for `bench`, or None on a cache miss."""
        config = Config()
        ctk = config.ctk
        cub = config.cub
        conn = Storage().connection()

        with conn:
            query = "SELECT code, elapsed FROM builds WHERE ctk = ? AND cub = ? AND bench = ?;"
            result = conn.execute(query, (ctk, cub, bench.label())).fetchone()

        if result:
            code, elapsed = result
            return Build(int(code), float(elapsed))

        # Explicitly signal a cache miss instead of leaking the raw fetchone()
        # result (which happened to be None anyway, but was unclear).
        return None

    def push_build(self, bench, build):
        """Record the result of building `bench` in the cache."""
        config = Config()
        ctk = config.ctk
        cub = config.cub
        conn = Storage().connection()

        with conn:
            conn.execute("INSERT INTO builds (ctk, cub, bench, code, elapsed) VALUES (?, ?, ?, ?, ?);",
                         (ctk, cub, bench.label(), build.code, build.elapsed))


class CMake:
    def __init__(self):
        pass

    def do_build(self, bench, timeout):
        """Compile one benchmark target.

        Returns a Build holding the exit code and elapsed seconds. `timeout`
        is in seconds (None means unlimited); on expiry the whole build
        process group is terminated and the result is reported as exit code
        424242 with infinite elapsed time.
        """
        logger = Logger()

        try:
            if not bench.is_base():
                # Variants are parameterized through a generated header next
                # to the executable name.
                with open(bench.exe_name() + ".h", "w") as f:
                    f.writelines(bench.definitions())

            cmd = ["cmake", "--build", ".", "--target", bench.exe_name()]
            logger.info("starting build for {}: {}".format(bench.label(), " ".join(cmd)))

            begin = time.time()
            # start_new_session puts cmake into its own process group so the
            # timeout handler below can kill cmake and all of its children.
            p = subprocess.Popen(cmd,
                                 start_new_session=True,
                                 stdout=subprocess.DEVNULL,
                                 stderr=subprocess.DEVNULL)
            p.wait(timeout=timeout)
            elapsed = time.time() - begin
            logger.info("finished build for {} ({}) in {}s".format(bench.label(), p.returncode, elapsed))

            return Build(p.returncode, elapsed)
        except subprocess.TimeoutExpired:
            logger.info("build for {} reached timeout of {}s".format(bench.label(), timeout))
            os.killpg(os.getpgid(p.pid), signal.SIGTERM)
            return Build(424242, float('inf'))

    def build(self, bench):
        """Build `bench`, consulting the cache for base builds.

        Variant builds get a timeout of 10x their base build time; a failed
        base build aborts the run with an exception.
        """
        logger = Logger()
        timeout = None

        cache = CMakeCache()

        if bench.is_base():
            # Only base build can be pulled from cache
            build = cache.pull_build(bench)

            if build:
                logger.info("found cached base build for {}".format(bench.label()))
                if bench.is_base():
                    if not os.path.exists("bin/{}".format(bench.exe_name())):
                        # Cached result but the binary was cleaned away:
                        # rebuild it in place without updating the cache.
                        self.do_build(bench, None)

                return build
        else:
            base_build = self.build(bench.get_base())

            if base_build.code != 0:
                raise Exception("Base build failed")

            timeout = base_build.elapsed * 10

        build = self.do_build(bench, timeout)
        cache.push_build(bench, build)
        return build

    def clean(self):
        # Fix: `clean` is invoked as an instance method (`CMake().clean()` in
        # search.py) but was declared without `self`, which raised
        # "clean() takes 0 positional arguments but 1 was given".
        cmd = ["cmake", "--build", ".", "--target", "clean"]
        p = subprocess.Popen(cmd, stdout=subprocess.DEVNULL,
                             stderr=subprocess.DEVNULL)
        p.wait()

        if p.returncode != 0:
            raise Exception("Unable to clean build directory")
def parse_meta():
    """Read `cub_bench_meta.csv` from the current (build) directory.

    Returns a (ctk_version, cub_revision, benchmarks) triple where
    `benchmarks` maps algorithm name to its list of tuning-parameter Ranges.
    Exits with status 1 if the metadata file is missing.
    """
    if not os.path.isfile("cub_bench_meta.csv"):
        print("cub_bench_meta.csv not found", file=sys.stderr)
        print("make sure to run the script from the CUB build directory",
              file=sys.stderr)
        # Fix: execution previously fell through to open() below and died
        # with an unrelated FileNotFoundError traceback; fail cleanly.
        sys.exit(1)

    benchmarks = {}
    ctk_version = "0.0.0"
    cub_revision = "0.0-0-0000"
    with open("cub_bench_meta.csv", "r") as f:
        for line in f:
            columns = line.split(',')
            name = columns[0]

            if name == "ctk_version":
                ctk_version = columns[1].rstrip()
            elif name == "cub_revision":
                cub_revision = columns[1].rstrip()
            else:
                benchmarks[name] = parse_ranges(columns[1:])

    return ctk_version, cub_revision, benchmarks


class Config:
    """Process-wide singleton holding benchmark metadata, parsed once."""
    _instance = None

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls, *args, **kwargs)
            cls._instance.ctk, cls._instance.cub, cls._instance.benchmarks = parse_meta()
        return cls._instance

    def label_to_variant_point(self, algname, label):
        """Reconstruct a VariantPoint from its textual label (e.g. 'p_1.q_2')."""
        if label == "base":
            return BasePoint()

        label_to_definition = {}
        for param_space in self.benchmarks[algname]:
            label_to_definition[param_space.label] = param_space.definition

        points = []
        for point in label.split('.'):
            label, value = point.split('_')
            points.append(RangePoint(label_to_definition[label], label, int(value)))

        return VariantPoint(points)

    def variant_space(self, algname):
        """Return all tuning variants (cartesian product of parameter ranges).

        Fix: return a list rather than a generator expression so callers may
        iterate the variant space more than once. BruteForceSeeker iterates
        it once per compile-time workload; a generator was exhausted after
        the first pass and silently skipped the remaining workloads.
        """
        variants = []
        for param_space in self.benchmarks[algname]:
            variants.append([RangePoint(param_space.definition, param_space.label, value)
                             for value in range(param_space.low, param_space.high, param_space.step)])

        return [VariantPoint(points) for points in itertools.product(*variants)]

    def variant_space_size(self, algname):
        """Number of variants in the tuning space of `algname`."""
        num_variants = 1
        for param_space in self.benchmarks[algname]:
            num_variants = num_variants * len(range(param_space.low, param_space.high, param_space.step))
        return num_variants
def io_weights(values):
    """Weights for '{io}' axes: importance grows with position in the list."""
    return compute_weights(len(values))


def ei_weights(values):
    """Uniform weights for ordinary axes: every value matters equally."""
    return np.ones(len(values))


def compute_axes_ids(rt_axes_values):
    """Map each runtime axis name to a stable dimension index (0, 1, ...)."""
    # Dict insertion order of rt_axes_values fixes the dimension order.
    return {rt_axis: axis_id for axis_id, rt_axis in enumerate(rt_axes_values)}


def compute_weight_matrix(rt_axes_values, rt_axes_ids):
    """Build a normalized N-D weight tensor over the runtime axes.

    Each axis contributes a 1-D weight vector (importance-based for axes
    whose name contains '{io}', uniform otherwise). The vectors are reshaped
    for broadcasting, combined into an outer product, and normalized so the
    whole matrix sums to 1.
    """
    rt_axes_weights = {}

    first_rt_axis_name = None
    for rt_axis in rt_axes_values:
        if first_rt_axis_name is None:
            first_rt_axis_name = rt_axis
        # Fix: the original re-assigned rt_axes_values[rt_axis] to itself,
        # a dead mutation of the caller's dict; dropped.
        values = rt_axes_values[rt_axis]
        if '{io}' in rt_axis:
            rt_axes_weights[rt_axis] = io_weights(values)
        else:
            rt_axes_weights[rt_axis] = ei_weights(values)

    # Reshape each axis' weight vector so it broadcasts along its own
    # dimension only, e.g. shape (-1, 1, 1) for axis 0 of a 3-axis space.
    num_rt_axes = len(rt_axes_ids)
    for rt_axis in rt_axes_weights:
        shape = [1] * num_rt_axes
        shape[rt_axes_ids[rt_axis]] = -1
        rt_axes_weights[rt_axis] = rt_axes_weights[rt_axis].reshape(*shape)

    weights_matrix = rt_axes_weights[first_rt_axis_name]
    for rt_axis in rt_axes_weights:
        if rt_axis == first_rt_axis_name:
            continue

        weights_matrix = weights_matrix * rt_axes_weights[rt_axis]

    return weights_matrix / np.sum(weights_matrix)


def get_workload_coordinates(rt_workload, rt_axes_values, rt_axes_ids):
    """Translate a workload (list of 'Axis=Value' strings) into matrix indices."""
    coordinates = [0] * len(rt_axes_ids)
    for point in rt_workload:
        rt_axis, rt_value = point.split('=')
        coordinates[rt_axes_ids[rt_axis]] = rt_axes_values[rt_axis].index(rt_value)
    return coordinates


def get_workload_weight(rt_workload, rt_axes_values, rt_axes_ids, weights_matrix):
    """Look up the relative importance of one workload point."""
    coordinates = get_workload_coordinates(rt_workload, rt_axes_values, rt_axes_ids)
    return weights_matrix[tuple(coordinates)]
class MedianCenterEstimator:
    """Collapses a list of samples into its median; inf when there is no data."""

    def __init__(self):
        pass

    def __call__(self, samples):
        if len(samples) == 0:
            # No samples: treat as infinitely slow so it never wins a comparison.
            return float("inf")

        return float(np.median(samples))


class BruteForceSeeker:
    """Exhaustively builds and scores every tuning variant for each workload."""

    def __init__(self, base_center_estimator, variant_center_estimator):
        self.base_center_estimator = base_center_estimator
        self.variant_center_estimator = variant_center_estimator

    def __call__(self, algname, ct_workload_space, rt_workload_space):
        for ct_workload in ct_workload_space:
            # Fix: obtain the variant space inside the loop. It was previously
            # hoisted above the loop, but Config.variant_space returns a
            # generator expression, so it was exhausted after the first
            # ct_workload and every remaining workload was silently skipped.
            variants = Config().variant_space(algname)

            for variant in variants:
                bench = Bench(algname, variant, list(ct_workload))
                if bench.build():
                    score = bench.score(ct_workload,
                                        rt_workload_space,
                                        self.base_center_estimator,
                                        self.variant_center_estimator)

                    print(bench.label(), score)
def parse_arguments():
    """Extract the --variant argument and strip it from sys.argv.

    The remaining argv is later consumed by cub.bench.search's own argument
    parser, so the flag must be removed in either of the spellings argparse
    accepts ('--variant=X' or '--variant X').
    """
    parser = argparse.ArgumentParser(description='Verify tuning variant')
    parser.add_argument('--variant', type=str, help='Variant to verify', default=None, required=True)

    variant = parser.parse_known_args()[0].variant

    # Fix: the original only removed the '--variant=X' spelling and raised
    # ValueError when the user wrote '--variant X'.
    joined = '--variant={}'.format(variant)
    if joined in sys.argv:
        sys.argv.remove(joined)
    else:
        idx = sys.argv.index('--variant')
        del sys.argv[idx:idx + 2]

    return variant


def workload_header(ct_workload_space, rt_workload_space):
    """Column header (axis names) derived from the first workload point."""
    for ct_workload in ct_workload_space:
        for rt_workload in rt_workload_space:
            workload_point = ct_workload + rt_workload
            return ", ".join([x.split('=')[0] for x in workload_point])


def workload_entry(ct_workload, rt_workload):
    """Row entry (axis values) for one concrete workload point."""
    workload_point = ct_workload + rt_workload
    return ", ".join([x.split('=')[1] for x in workload_point])


class VerifySeeker:
    """Seeker that compares one tuning variant against the base benchmark.

    For every workload point it prints the min/median/max speedup of the
    variant's samples relative to the base implementation's samples.
    """

    def __init__(self, variant_label):
        self.label = variant_label
        self.estimator = cub.bench.MedianCenterEstimator()

    def __call__(self, algname, ct_workload_space, rt_workload_space):
        variant_point = cub.bench.Config().label_to_variant_point(algname, self.label)

        print("{}, MinS, MedianS, MaxS".format(workload_header(ct_workload_space, rt_workload_space)))
        for ct_workload in ct_workload_space:
            bench = cub.bench.Bench(algname, variant_point, list(ct_workload))
            if bench.build():
                base = bench.get_base()
                for rt_workload in rt_workload_space:
                    workload_point = ct_workload + rt_workload
                    base_samples, base_elapsed = base.do_run(workload_point, None)
                    # Give the variant 10x the base elapsed time before timing out.
                    variant_samples, _ = bench.do_run(workload_point, base_elapsed * 10)
                    min_speedup = min(base_samples) / min(variant_samples)
                    median_speedup = self.estimator(base_samples) / self.estimator(variant_samples)
                    max_speedup = max(base_samples) / max(variant_samples)
                    point_str = workload_entry(ct_workload, rt_workload)
                    print("{}, {}, {}, {}".format(point_str, min_speedup, median_speedup, max_speedup))


def main():
    cub.bench.search(VerifySeeker(parse_arguments()))


if __name__ == "__main__":
    main()
explicit path in HINTS: 3 | HINTS "${CMAKE_CURRENT_LIST_DIR}/.." 4 | ) 5 | -------------------------------------------------------------------------------- /cmake/CubCompilerHacks.cmake: -------------------------------------------------------------------------------- 1 | # Set up compiler paths and apply temporary hacks to support NVC++. 2 | # This file must be included before enabling any languages. 3 | 4 | # Temporary hacks to make NVC++ work; this requires you to define 5 | # `CMAKE_CUDA_COMPILER_ID=NVCXX` and `CMAKE_CUDA_COMPILER_FORCED=ON`. 6 | if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") 7 | # If using NVC++, don't set CXX compiler 8 | if (NOT "${CMAKE_CXX_COMPILER}" STREQUAL "") 9 | unset(CMAKE_CXX_COMPILER CACHE) 10 | message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have" 11 | " specified a different ISO C++ compiler; NVC++ acts as both, so please" 12 | " unset the CMAKE_CXX_COMPILER variable." 13 | ) 14 | endif() 15 | 16 | # We don't set CMAKE_CUDA_HOST_COMPILER for NVC++; if we do, CMake tries to 17 | # pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to NVC++, which it doesn't 18 | # understand. 19 | if (NOT "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "") 20 | unset(CMAKE_CUDA_HOST_COMPILER CACHE) 21 | message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have" 22 | " specified a different host ISO C++ compiler; NVC++ acts as both, so" 23 | " please unset the CMAKE_CUDA_HOST_COMPILER variable." 24 | ) 25 | endif() 26 | 27 | set(CMAKE_CXX_COMPILER "${CMAKE_CUDA_COMPILER}") 28 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -cuda") 29 | set(CMAKE_CUDA_HOST_LINK_LAUNCHER "${CMAKE_CUDA_COMPILER}") 30 | set(CMAKE_CUDA_LINK_EXECUTABLE 31 | " -o ") 32 | endif () 33 | 34 | # We don't set CMAKE_CUDA_HOST_COMPILER for NVC++; if we do, CMake tries to 35 | # pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to NVC++, which it doesn't 36 | # understand. 
37 | if ((NOT "NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")) 38 | if (NOT ("${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "" OR 39 | "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "${CMAKE_CXX_COMPILER}")) 40 | set(tmp "${CMAKE_CUDA_HOST_COMPILER}") 41 | unset(CMAKE_CUDA_HOST_COMPILER CACHE) 42 | message(FATAL_ERROR 43 | "For convenience, CUB's test harness uses CMAKE_CXX_COMPILER for the " 44 | "CUDA host compiler. Refusing to overwrite specified " 45 | "CMAKE_CUDA_HOST_COMPILER -- please reconfigure without setting this " 46 | "variable. Currently:\n" 47 | "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}\n" 48 | "CMAKE_CUDA_HOST_COMPILER=${tmp}" 49 | ) 50 | endif () 51 | set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}") 52 | endif () 53 | 54 | # Temporary hacks to make NVC++ work; this requires you to define 55 | # `CMAKE_CUDA_COMPILER_ID=NVCXX` and `CMAKE_CUDA_COMPILER_FORCED=ON`. 56 | if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") 57 | # Need 3.17 for the properties used below. 58 | cmake_minimum_required(VERSION 3.17) 59 | 60 | set(CMAKE_CUDA_STANDARD_DEFAULT 03) 61 | 62 | set(CMAKE_CUDA03_STANDARD_COMPILE_OPTION "-std=c++03") 63 | set(CMAKE_CUDA03_EXTENSION_COMPILE_OPTION "-std=c++03") 64 | set(CMAKE_CUDA03_STANDARD__HAS_FULL_SUPPORT TRUE) 65 | set_property(GLOBAL PROPERTY CMAKE_CUDA03_KNOWN_FEATURES) 66 | 67 | set(CMAKE_CUDA11_STANDARD_COMPILE_OPTION "-std=c++11") 68 | set(CMAKE_CUDA11_EXTENSION_COMPILE_OPTION "-std=c++11") 69 | set(CMAKE_CUDA11_STANDARD__HAS_FULL_SUPPORT TRUE) 70 | set_property(GLOBAL PROPERTY CMAKE_CUDA11_KNOWN_FEATURES) 71 | 72 | set(CMAKE_CUDA14_STANDARD_COMPILE_OPTION "-std=c++14") 73 | set(CMAKE_CUDA14_EXTENSION_COMPILE_OPTION "-std=c++14") 74 | set(CMAKE_CUDA14_STANDARD__HAS_FULL_SUPPORT TRUE) 75 | set_property(GLOBAL PROPERTY CMAKE_CUDA14_KNOWN_FEATURES) 76 | 77 | set(CMAKE_CUDA17_STANDARD_COMPILE_OPTION "-std=c++17") 78 | set(CMAKE_CUDA17_EXTENSION_COMPILE_OPTION "-std=c++17") 79 | set(CMAKE_CUDA17_STANDARD__HAS_FULL_SUPPORT TRUE) 80 | 
set_property(GLOBAL PROPERTY CMAKE_CUDA17_KNOWN_FEATURES) 81 | 82 | include(Internal/FeatureTesting) 83 | include(Compiler/CMakeCommonCompilerMacros) 84 | cmake_record_cuda_compile_features() 85 | 86 | set(CMAKE_CUDA_COMPILE_FEATURES 87 | ${CMAKE_CUDA03_COMPILE_FEATURES} 88 | ${CMAKE_CUDA11_COMPILE_FEATURES} 89 | ${CMAKE_CUDA14_COMPILE_FEATURES} 90 | ${CMAKE_CUDA17_COMPILE_FEATURES} 91 | ${CMAKE_CUDA20_COMPILE_FEATURES} 92 | ) 93 | endif () 94 | -------------------------------------------------------------------------------- /cmake/CubCudaConfig.cmake: -------------------------------------------------------------------------------- 1 | enable_language(CUDA) 2 | 3 | if (NOT CUB_IN_THRUST) 4 | message(FATAL_ERROR 5 | "Building CUB as a standalone project is no longer supported. " 6 | "Use the Thrust repo instead.") 7 | endif() 8 | 9 | set(CUB_CUDA_FLAGS_BASE "${THRUST_CUDA_FLAGS_BASE}") 10 | set(CUB_CUDA_FLAGS_RDC "${THRUST_CUDA_FLAGS_RDC}") 11 | set(CUB_CUDA_FLAGS_NO_RDC "${THRUST_CUDA_FLAGS_NO_RDC}") 12 | 13 | # Update the enabled architectures list from thrust 14 | foreach (arch IN LISTS THRUST_KNOWN_COMPUTE_ARCHS) 15 | if (THRUST_ENABLE_COMPUTE_${arch}) 16 | set(CUB_ENABLE_COMPUTE_${arch} True) 17 | string(APPEND arch_message " sm_${arch}") 18 | else() 19 | set(CUB_ENABLE_COMPUTE_${arch} False) 20 | endif() 21 | endforeach() 22 | 23 | message(STATUS ${arch_message}) 24 | 25 | # 26 | # RDC options: 27 | # 28 | 29 | # RDC is off by default in NVCC and on by default in NVC++. Turning off RDC 30 | # isn't currently supported by NVC++. So, we default to RDC off for NVCC and 31 | # RDC on for NVC++. 32 | set(option_init OFF) 33 | if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") 34 | set(option_init ON) 35 | endif() 36 | 37 | option(CUB_ENABLE_TESTS_WITH_RDC 38 | "Build all CUB tests with RDC; tests that require RDC are not affected by this option." 
39 | ${option_init} 40 | ) 41 | 42 | option(CUB_ENABLE_EXAMPLES_WITH_RDC 43 | "Build all CUB examples with RDC; examples which require RDC are not affected by this option." 44 | ${option_init} 45 | ) 46 | 47 | # Check for RDC/SM compatibility and error/warn if necessary 48 | set(rdc_supported True) 49 | foreach (arch IN LISTS no_rdc_archs) 50 | if (CUB_ENABLE_COMPUTE_${arch}) 51 | set(rdc_supported False) 52 | break() 53 | endif() 54 | endforeach() 55 | 56 | set(rdc_opts 57 | CUB_ENABLE_TESTS_WITH_RDC 58 | CUB_ENABLE_EXAMPLES_WITH_RDC 59 | ) 60 | set(rdc_requested False) 61 | foreach (rdc_opt IN LISTS rdc_opts) 62 | if (${rdc_opt}) 63 | set(rdc_requested True) 64 | break() 65 | endif() 66 | endforeach() 67 | 68 | if (rdc_requested AND NOT rdc_supported) 69 | string(JOIN ", " no_rdc ${no_rdc_archs}) 70 | string(JOIN "\n" opts ${rdc_opts}) 71 | message(FATAL_ERROR 72 | "Architectures {${no_rdc}} do not support RDC and are incompatible with " 73 | "these options:\n${opts}" 74 | ) 75 | endif() 76 | 77 | 78 | # 79 | # Clang CUDA options 80 | # 81 | if ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}") 82 | set(CUB_CUDA_FLAGS_BASE "${CUB_CUDA_FLAGS_BASE} -Wno-unknown-cuda-version -Xclang=-fcuda-allow-variadic-functions") 83 | endif() 84 | 85 | 86 | # By default RDC is not used: 87 | set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_NO_RDC}") 88 | -------------------------------------------------------------------------------- /cmake/CubHeaderTesting.cmake: -------------------------------------------------------------------------------- 1 | # For every public header, build a translation unit containing `#include
` 2 | # to let the compiler try to figure out warnings in that header if it is not otherwise 3 | # included in tests, and also to verify if the headers are modular enough. 4 | # .inl files are not globbed for, because they are not supposed to be used as public 5 | # entrypoints. 6 | 7 | # Meta target for all configs' header builds: 8 | add_custom_target(cub.all.headers) 9 | 10 | file(GLOB_RECURSE headers 11 | RELATIVE "${CUB_SOURCE_DIR}/cub" 12 | CONFIGURE_DEPENDS 13 | cub/*.cuh 14 | ) 15 | 16 | set(headertest_srcs) 17 | foreach (header IN LISTS headers) 18 | set(headertest_src "headers/${header}.cu") 19 | configure_file("${CUB_SOURCE_DIR}/cmake/header_test.in" "${headertest_src}") 20 | list(APPEND headertest_srcs "${headertest_src}") 21 | endforeach() 22 | 23 | function(cub_add_header_test label definitions) 24 | foreach(cub_target IN LISTS CUB_TARGETS) 25 | cub_get_target_property(config_prefix ${cub_target} PREFIX) 26 | 27 | set(headertest_target ${config_prefix}.headers.${label}) 28 | add_library(${headertest_target} OBJECT ${headertest_srcs}) 29 | target_link_libraries(${headertest_target} PUBLIC ${cub_target}) 30 | target_compile_definitions(${headertest_target} PRIVATE ${definitions}) 31 | cub_clone_target_properties(${headertest_target} ${cub_target}) 32 | 33 | if (CUB_IN_THRUST) 34 | thrust_fix_clang_nvcc_build_for(${headertest_target}) 35 | endif() 36 | 37 | add_dependencies(cub.all.headers ${headertest_target}) 38 | add_dependencies(${config_prefix}.all ${headertest_target}) 39 | endforeach() 40 | endfunction() 41 | 42 | # Wrap Thrust/CUB in a custom namespace to check proper use of ns macros: 43 | set(header_definitions 44 | "THRUST_WRAPPED_NAMESPACE=wrapped_thrust" 45 | "CUB_WRAPPED_NAMESPACE=wrapped_cub") 46 | cub_add_header_test(base "${header_definitions}") 47 | 48 | list(APPEND header_definitions "CUB_DISABLE_BF16_SUPPORT") 49 | cub_add_header_test(bf16 "${header_definitions}") 50 | 51 | 
-------------------------------------------------------------------------------- /cmake/CubInstallRules.cmake: -------------------------------------------------------------------------------- 1 | # Thrust manages its own copy of these rules. Update ThrustInstallRules.cmake 2 | # if modifying this file. 3 | if (CUB_IN_THRUST) 4 | return() 5 | endif() 6 | 7 | # Bring in CMAKE_INSTALL_LIBDIR 8 | include(GNUInstallDirs) 9 | 10 | # CUB is a header library; no need to build anything before installing: 11 | set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY TRUE) 12 | 13 | install(DIRECTORY "${CUB_SOURCE_DIR}/cub" 14 | DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" 15 | FILES_MATCHING 16 | PATTERN "*.cuh" 17 | ) 18 | 19 | install(DIRECTORY "${CUB_SOURCE_DIR}/cub/cmake/" 20 | DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cub" 21 | PATTERN *.cmake.in EXCLUDE 22 | ) 23 | # Need to configure a file to store the infix specified in 24 | # CMAKE_INSTALL_INCLUDEDIR since it can be defined by the user 25 | set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/cub") 26 | configure_file("${CUB_SOURCE_DIR}/cub/cmake/cub-header-search.cmake.in" 27 | "${CUB_BINARY_DIR}/cub/cmake/cub-header-search.cmake" 28 | @ONLY) 29 | install(FILES "${CUB_BINARY_DIR}/cub/cmake/cub-header-search.cmake" 30 | DESTINATION "${install_location}") 31 | -------------------------------------------------------------------------------- /cmake/CubUtilities.cmake: -------------------------------------------------------------------------------- 1 | # Enable RDC for a CUDA target. 
# Enable relocatable device code (RDC) for a CUDA target.
# Encapsulates the per-compiler differences behind a single call.
function(cub_enable_rdc_for_cuda_target target_name)
  if ("${CMAKE_CUDA_COMPILER_ID}" STREQUAL "NVCXX")
    # nvc++ spells RDC as a -gpu flag rather than a CMake target property:
    set_target_properties(${target_name} PROPERTIES
      COMPILE_FLAGS "-gpu=rdc"
    )
  elseif ("${CMAKE_CUDA_COMPILER_ID}" STREQUAL "Clang")
    # Intentionally a no-op for clang-cuda.
    # NOTE(review): presumably RDC is unsupported or handled elsewhere for
    # clang builds -- confirm before relying on this.
  else()
    # NVCC: use CMake's native separable-compilation support.
    set_target_properties(${target_name} PROPERTIES
      CUDA_SEPARABLE_COMPILATION ON
    )
  endif()
endfunction()
33 | #define CUB_MACRO_CHECK(MACRO, HEADER) \ 34 | CUB_MACRO_CHECK_IMPL(Identifier MACRO should not be used from CUB \ 35 | headers due to conflicts with HEADER macros.) 36 | 37 | // complex.h conflicts 38 | #define I CUB_MACRO_CHECK('I', complex.h) 39 | 40 | // windows.h conflicts 41 | #define small CUB_MACRO_CHECK('small', windows.h) 42 | // We can't enable these checks without breaking some builds -- some standard 43 | // library implementations unconditionally `#undef` these macros, which then 44 | // causes random failures later. 45 | // Leaving these commented out as a warning: Here be dragons. 46 | //#define min(...) CUB_MACRO_CHECK('min', windows.h) 47 | //#define max(...) CUB_MACRO_CHECK('max', windows.h) 48 | 49 | // termios.h conflicts (NVIDIA/thrust#1547) 50 | #define B0 CUB_MACRO_CHECK("B0", termios.h) 51 | 52 | #include 53 | 54 | #if defined(CUB_DISABLE_BF16_SUPPORT) 55 | #if defined(__CUDA_BF16_TYPES_EXIST__) 56 | #error CUB should not include cuda_bf16.h when BF16 support is disabled 57 | #endif 58 | #endif 59 | -------------------------------------------------------------------------------- /cub/block/specializations/block_histogram_atomic.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../config.cuh" 37 | 38 | CUB_NAMESPACE_BEGIN 39 | 40 | 41 | /** 42 | * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
43 | */ 44 | template 45 | struct BlockHistogramAtomic 46 | { 47 | /// Shared memory storage layout type 48 | struct TempStorage {}; 49 | 50 | 51 | /// Constructor 52 | __device__ __forceinline__ BlockHistogramAtomic( 53 | TempStorage &temp_storage) 54 | {} 55 | 56 | 57 | /// Composite data onto an existing histogram 58 | template < 59 | typename T, 60 | typename CounterT, 61 | int ITEMS_PER_THREAD> 62 | __device__ __forceinline__ void Composite( 63 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 64 | CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram 65 | { 66 | // Update histogram 67 | #pragma unroll 68 | for (int i = 0; i < ITEMS_PER_THREAD; ++i) 69 | { 70 | atomicAdd(histogram + items[i], 1); 71 | } 72 | } 73 | 74 | }; 75 | 76 | CUB_NAMESPACE_END 77 | 78 | -------------------------------------------------------------------------------- /cub/cmake/cub-config-version.cmake: -------------------------------------------------------------------------------- 1 | # Parse version information from version.cuh: 2 | include("${CMAKE_CURRENT_LIST_DIR}/cub-header-search.cmake") 3 | 4 | file(READ "${_CUB_VERSION_INCLUDE_DIR}/cub/version.cuh" CUB_VERSION_HEADER) 5 | string(REGEX MATCH "#define[ \t]+CUB_VERSION[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}") 6 | set(CUB_VERSION_FLAT ${CMAKE_MATCH_1}) 7 | # Note that CUB calls this the PATCH number, CMake calls it the TWEAK number: 8 | string(REGEX MATCH "#define[ \t]+CUB_PATCH_NUMBER[ \t]+([0-9]+)" DUMMY "${CUB_VERSION_HEADER}") 9 | set(CUB_VERSION_TWEAK ${CMAKE_MATCH_1}) 10 | 11 | math(EXPR CUB_VERSION_MAJOR "${CUB_VERSION_FLAT} / 100000") 12 | math(EXPR CUB_VERSION_MINOR "(${CUB_VERSION_FLAT} / 100) % 1000") 13 | math(EXPR CUB_VERSION_PATCH "${CUB_VERSION_FLAT} % 100") # CUB: "subminor" CMake: "patch" 14 | 15 | set(CUB_VERSION "${CUB_VERSION_MAJOR}.${CUB_VERSION_MINOR}.${CUB_VERSION_PATCH}.${CUB_VERSION_TWEAK}") 16 | 17 | set(PACKAGE_VERSION 
# Declare an INTERFACE library named `ugly_name` and ALIAS it into the
# namespace as `alias_name`.
#
# Why not a plain IMPORTED target:
#   1) Only IMPORTED and ALIAS targets may be placed in a namespace.
#   2) Include directories of an IMPORTED library become SYSTEM includes for
#      anything that links it.
#   3) nvcc searches the CUDA Toolkit include path *before* system includes,
#      so the Toolkit's bundled CUB would always shadow an IMPORTED CUB::CUB.
#   4) That could be patched by setting NO_SYSTEM_FROM_IMPORTED on *every*
#      consumer of CUB::CUB -- a footgun: forgetting it silently pulls in the
#      wrong CUB.
#   5) Workaround: build a regular (non-IMPORTED) library outside of the
#      namespace and ALIAS it in (configuring before or after the ALIAS both
#      work).
function(_cub_declare_interface_alias alias_name ugly_name)
  add_library(${ugly_name} INTERFACE)
  add_library(${alias_name} ALIAS ${ugly_name})
endfunction()
74 | find_package(libcudacxx ${cub_libcudacxx_version} CONFIG 75 | REQUIRED 76 | ${_CUB_QUIET_FLAG} 77 | ) 78 | endif() 79 | _cub_declare_interface_alias(CUB::libcudacxx _CUB_libcudacxx) 80 | target_link_libraries(_CUB_libcudacxx INTERFACE libcudacxx::libcudacxx) 81 | endif() 82 | endif() 83 | 84 | # 85 | # Setup targets 86 | # 87 | 88 | _cub_declare_interface_alias(CUB::CUB _CUB_CUB) 89 | target_include_directories(_CUB_CUB INTERFACE "${_CUB_INCLUDE_DIR}") 90 | target_link_libraries(_CUB_CUB INTERFACE CUB::libcudacxx) 91 | 92 | if (CUB_IGNORE_DEPRECATED_API OR THRUST_IGNORE_DEPRECATED_API) 93 | target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_API") 94 | endif() 95 | 96 | if (CUB_IGNORE_DEPRECATED_CPP_DIALECT OR 97 | THRUST_IGNORE_DEPRECATED_CPP_DIALECT) 98 | target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_DIALECT") 99 | endif() 100 | 101 | if (CUB_IGNORE_DEPRECATED_CPP_11 OR 102 | THRUST_IGNORE_DEPRECATED_CPP_11) 103 | target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_CPP_11") 104 | endif() 105 | 106 | if (CUB_IGNORE_DEPRECATED_COMPILER OR 107 | THRUST_IGNORE_DEPRECATED_COMPILER) 108 | target_compile_definitions(_CUB_CUB INTERFACE "CUB_IGNORE_DEPRECATED_COMPILER") 109 | endif() 110 | 111 | # 112 | # Standardize version info 113 | # 114 | 115 | set(CUB_VERSION ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} CACHE INTERNAL "" FORCE) 116 | set(CUB_VERSION_MAJOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MAJOR} CACHE INTERNAL "" FORCE) 117 | set(CUB_VERSION_MINOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MINOR} CACHE INTERNAL "" FORCE) 118 | set(CUB_VERSION_PATCH ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_PATCH} CACHE INTERNAL "" FORCE) 119 | set(CUB_VERSION_TWEAK ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_TWEAK} CACHE INTERNAL "" FORCE) 120 | set(CUB_VERSION_COUNT ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_COUNT} CACHE INTERNAL "" FORCE) 121 | 122 | include(FindPackageHandleStandardArgs) 123 | if (NOT CUB_CONFIG) 124 | set(CUB_CONFIG 
# Locate the directory containing cub/version.cuh in the source tree.  This
# file lives in <root>/cub/cmake/, so the include root is two levels up.
set(_CUB_VERSION_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}/../..")
if(EXISTS "${_CUB_VERSION_INCLUDE_DIR}/cub/version.cuh")
  # Force-write the cache entry so any stale result from a previous configure
  # is replaced, then mark it INTERNAL to hide it from cache editors.
  set(_CUB_VERSION_INCLUDE_DIR "${_CUB_VERSION_INCLUDE_DIR}" CACHE FILEPATH "" FORCE) # Clear old result
  set_property(CACHE _CUB_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL)
endif()
-------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Static configuration header for the CUB project. 
31 | */ 32 | 33 | #pragma once 34 | 35 | #include "util_arch.cuh" 36 | #include "util_compiler.cuh" 37 | #include "util_cpp_dialect.cuh" 38 | #include "util_deprecated.cuh" 39 | #include "util_macro.cuh" 40 | #include "util_namespace.cuh" 41 | -------------------------------------------------------------------------------- /cub/cub.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * CUB umbrella include file 32 | */ 33 | 34 | #pragma once 35 | 36 | // Static configuration 37 | #include "config.cuh" 38 | 39 | // Block 40 | #include "block/block_adjacent_difference.cuh" 41 | #include "block/block_discontinuity.cuh" 42 | #include "block/block_exchange.cuh" 43 | #include "block/block_histogram.cuh" 44 | #include "block/block_load.cuh" 45 | #include "block/block_merge_sort.cuh" 46 | #include "block/block_radix_rank.cuh" 47 | #include "block/block_radix_sort.cuh" 48 | #include "block/block_reduce.cuh" 49 | #include "block/block_scan.cuh" 50 | #include "block/block_store.cuh" 51 | //#include "block/block_shift.cuh" 52 | 53 | // Device 54 | #include "device/device_adjacent_difference.cuh" 55 | #include "device/device_copy.cuh" 56 | #include "device/device_histogram.cuh" 57 | #include "device/device_memcpy.cuh" 58 | #include "device/device_merge_sort.cuh" 59 | #include "device/device_partition.cuh" 60 | #include "device/device_radix_sort.cuh" 61 | #include "device/device_reduce.cuh" 62 | #include "device/device_run_length_encode.cuh" 63 | #include "device/device_scan.cuh" 64 | #include "device/device_segmented_radix_sort.cuh" 65 | #include "device/device_segmented_reduce.cuh" 66 | #include "device/device_segmented_sort.cuh" 67 | #include "device/device_select.cuh" 68 | #include 
"device/device_spmv.cuh" 69 | 70 | // Grid 71 | //#include "grid/grid_barrier.cuh" 72 | #include "grid/grid_even_share.cuh" 73 | #include "grid/grid_mapping.cuh" 74 | #include "grid/grid_queue.cuh" 75 | 76 | // Thread 77 | #include "thread/thread_load.cuh" 78 | #include "thread/thread_operators.cuh" 79 | #include "thread/thread_reduce.cuh" 80 | #include "thread/thread_scan.cuh" 81 | #include "thread/thread_store.cuh" 82 | 83 | // Warp 84 | #include "warp/warp_exchange.cuh" 85 | #include "warp/warp_load.cuh" 86 | #include "warp/warp_merge_sort.cuh" 87 | #include "warp/warp_reduce.cuh" 88 | #include "warp/warp_scan.cuh" 89 | #include "warp/warp_store.cuh" 90 | 91 | // Iterator 92 | #include "iterator/arg_index_input_iterator.cuh" 93 | #include "iterator/cache_modified_input_iterator.cuh" 94 | #include "iterator/cache_modified_output_iterator.cuh" 95 | #include "iterator/constant_input_iterator.cuh" 96 | #include "iterator/counting_input_iterator.cuh" 97 | #include "iterator/discard_output_iterator.cuh" 98 | #include "iterator/tex_obj_input_iterator.cuh" 99 | #include "iterator/tex_ref_input_iterator.cuh" 100 | #include "iterator/transform_input_iterator.cuh" 101 | 102 | // Util 103 | #include "util_allocator.cuh" 104 | #include "util_arch.cuh" 105 | #include "util_debug.cuh" 106 | #include "util_device.cuh" 107 | #include "util_macro.cuh" 108 | #include "util_ptx.cuh" 109 | #include "util_type.cuh" 110 | -------------------------------------------------------------------------------- /cub/detail/choose_offset.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #pragma once 29 | 30 | #include 31 | 32 | #include 33 | #include 34 | 35 | CUB_NAMESPACE_BEGIN 36 | 37 | namespace detail 38 | { 39 | 40 | /** 41 | * ChooseOffsetT checks NumItemsT, the type of the num_items parameter, and 42 | * selects the offset type based on it. 
/**
 * ChooseOffsetT checks NumItemsT, the type of the num_items parameter, and
 * selects the offset type based on it.
 *
 * @tparam NumItemsT  Integral type of the caller's num_items argument
 *                    (bool is rejected).
 */
template <typename NumItemsT>
struct ChooseOffsetT
{
  // NumItemsT must be an integral type (but not bool).
  static_assert(std::is_integral<NumItemsT>::value &&
                  !std::is_same<typename std::remove_cv<NumItemsT>::type,
                                bool>::value,
                "NumItemsT must be an integral type, but not bool");

  // Unsigned integer type for global offsets: 32-bit when the caller's
  // num_items type occupies at most 4 bytes, 64-bit otherwise.
  using Type = typename std::conditional<sizeof(NumItemsT) <= 4,
                                         std::uint32_t,
                                         unsigned long long>::type;
};
3 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Utilities for CUDA dynamic parallelism. 
#ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes:

/**
 * \def CUB_DISABLE_CDP
 *
 * If defined, support for device-side usage of CUB is disabled.
 */
#define CUB_DISABLE_CDP

/**
 * \def CUB_RDC_ENABLED
 *
 * Defined if RDC is enabled and CUB_DISABLE_CDP is not defined.
 */
#define CUB_RDC_ENABLED

/**
 * \def CUB_RUNTIME_FUNCTION
 *
 * Execution space for functions that can use the CUDA runtime API (`__host__`
 * when RDC is off, `__host__ __device__` when RDC is on).
 */
#define CUB_RUNTIME_FUNCTION

/**
 * \def CUB_RUNTIME_ENABLED
 *
 * Whether or not the active compiler pass is allowed to invoke device kernels
 * or methods from the CUDA runtime API.
 *
 * This macro should not be used in CUB, as it depends on `__CUDA_ARCH__`
 * and is not compatible with `NV_IF_TARGET`. It is provided for legacy
 * purposes only.
 *
 * Replace any usages with `CUB_RDC_ENABLED` and `NV_IF_TARGET`.
 */
#define CUB_RUNTIME_ENABLED

#else // Non-doxygen pass:

// Guarded so users (or a build system) may predefine CUB_RUNTIME_FUNCTION to
// override the detection below:
#ifndef CUB_RUNTIME_FUNCTION

// RDC available and CDP not explicitly disabled -> runtime API is usable from
// device code as well:
#if defined(__CUDACC_RDC__) && !defined(CUB_DISABLE_CDP)

#define CUB_RDC_ENABLED
#define CUB_RUNTIME_FUNCTION __host__ __device__

#else // RDC disabled:

#define CUB_RUNTIME_FUNCTION __host__

#endif // RDC enabled

#if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__)
// Legacy only -- do not use in new code.
#define CUB_RUNTIME_ENABLED
#endif

#endif // CUB_RUNTIME_FUNCTION predefined

#ifdef CUB_RDC_ENABLED
// Detect available version of CDP: CUDA 12+ removes CDPv1 unless the
// CUDA_FORCE_CDP1_IF_SUPPORTED escape hatch is defined.
#if __CUDACC_VER_MAJOR__ < 12 || defined(CUDA_FORCE_CDP1_IF_SUPPORTED)
#define CUB_DETAIL_CDPv1
#else
#define CUB_DETAIL_CDPv2
#endif
#endif

#endif // Do not document
37 | * 38 | * Unlike `cub::DoubleBuffer` this class doesn't provide a "selector" member 39 | * to track which buffer is "current". The main reason for this class existence 40 | * is the performance difference. Since `cub::DoubleBuffer` relies on the 41 | * runtime variable to index pointers arrays, they are placed in the local 42 | * memory instead of registers. Local memory accesses significantly affect 43 | * performance. On the contrary, this class swaps pointer, so all operations 44 | * can be performed in registers. 45 | */ 46 | template 47 | class device_double_buffer 48 | { 49 | /// Pair of device buffer pointers 50 | T *m_current_buffer {}; 51 | T *m_alternate_buffer {}; 52 | 53 | public: 54 | /** 55 | * @param d_current 56 | * The currently valid buffer 57 | * 58 | * @param d_alternate 59 | * Alternate storage buffer of the same size as @p d_current 60 | */ 61 | __host__ __device__ __forceinline__ device_double_buffer(T *current, 62 | T *alternate) 63 | : m_current_buffer(current) 64 | , m_alternate_buffer(alternate) 65 | {} 66 | 67 | /// \brief Return pointer to the currently valid buffer 68 | __host__ __device__ __forceinline__ T *current() const 69 | { 70 | return m_current_buffer; 71 | } 72 | 73 | /// \brief Return pointer to the currently invalid buffer 74 | __host__ __device__ __forceinline__ T *alternate() const 75 | { 76 | return m_alternate_buffer; 77 | } 78 | 79 | __host__ __device__ void swap() 80 | { 81 | T *tmp = m_current_buffer; 82 | m_current_buffer = m_alternate_buffer; 83 | m_alternate_buffer = tmp; 84 | } 85 | }; 86 | 87 | 88 | } // namespace detail 89 | 90 | CUB_NAMESPACE_END 91 | -------------------------------------------------------------------------------- /cub/detail/device_synchronize.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in 
compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | 26 | #include 27 | 28 | CUB_NAMESPACE_BEGIN 29 | 30 | namespace detail 31 | { 32 | 33 | /** 34 | * Call `cudaDeviceSynchronize()` using the proper API for the current CUB and 35 | * CUDA configuration. 36 | */ 37 | CUB_EXEC_CHECK_DISABLE 38 | CUB_RUNTIME_FUNCTION inline cudaError_t device_synchronize() 39 | { 40 | cudaError_t result = cudaErrorNotSupported; 41 | 42 | // Device-side sync is only available under CDPv1: 43 | #if defined(CUB_DETAIL_CDPv1) 44 | 45 | #if ((__CUDACC_VER_MAJOR__ > 11) || \ 46 | ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 6))) 47 | // CUDA >= 11.6 48 | #define CUB_TMP_DEVICE_SYNC_IMPL \ 49 | result = __cudaDeviceSynchronizeDeprecationAvoidance(); 50 | #else // CUDA < 11.6: 51 | #define CUB_TMP_DEVICE_SYNC_IMPL result = cudaDeviceSynchronize(); 52 | #endif 53 | 54 | #else // CDPv2 or no CDP: 55 | 56 | #define CUB_TMP_DEVICE_SYNC_IMPL /* unavailable */ 57 | 58 | #endif // CDP version 59 | 60 | NV_IF_TARGET(NV_IS_HOST, 61 | (result = cudaDeviceSynchronize();), 62 | (CUB_TMP_DEVICE_SYNC_IMPL)); 63 | 64 | #undef CUB_TMP_DEVICE_SYNC_IMPL 65 | 66 | return result; 67 | } 68 | 69 | } // namespace detail 70 | 71 | CUB_NAMESPACE_END 72 | -------------------------------------------------------------------------------- /cub/detail/exec_check_disable.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 
2021 NVIDIA Corporation 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | #pragma once 18 | 19 | #include 20 | 21 | /** 22 | * @def CUB_EXEC_CHECK_DISABLE 23 | * Wrapper around `#pragma nv_exec_check_disable`. 24 | */ 25 | 26 | // #pragma nv_exec_check_disable is only recognized by NVCC. 27 | #if defined(__CUDACC__) && \ 28 | !defined(_NVHPC_CUDA) && \ 29 | !(defined(__CUDA__) && defined(__clang__)) 30 | 31 | #if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC 32 | #define CUB_EXEC_CHECK_DISABLE __pragma("nv_exec_check_disable") 33 | #else // // !MSVC 34 | #define CUB_EXEC_CHECK_DISABLE _Pragma("nv_exec_check_disable") 35 | #endif // MSVC 36 | 37 | #else // !NVCC 38 | 39 | #define CUB_EXEC_CHECK_DISABLE 40 | 41 | #endif // NVCC 42 | -------------------------------------------------------------------------------- /cub/detail/type_traits.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Wrappers and extensions around utilities. 
31 | */ 32 | 33 | #pragma once 34 | 35 | #include 36 | #include 37 | 38 | #include 39 | 40 | 41 | CUB_NAMESPACE_BEGIN 42 | namespace detail { 43 | 44 | template 45 | using invoke_result_t = 46 | #if CUB_CPP_DIALECT < 2017 47 | typename ::cuda::std::result_of::type; 48 | #else // 2017+ 49 | ::cuda::std::invoke_result_t; 50 | #endif 51 | 52 | /// The type of intermediate accumulator (according to P2322R6) 53 | template 54 | using accumulator_t = 55 | typename ::cuda::std::decay>::type; 56 | 57 | } // namespace detail 58 | CUB_NAMESPACE_END 59 | -------------------------------------------------------------------------------- /cub/detail/uninitialized_copy.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #pragma once 29 | 30 | #include 31 | 32 | #include 33 | 34 | CUB_NAMESPACE_BEGIN 35 | 36 | 37 | namespace detail 38 | { 39 | 40 | #if defined(_NVHPC_CUDA) 41 | template 42 | __host__ __device__ void uninitialized_copy(T *ptr, U &&val) 43 | { 44 | // NVBug 3384810 45 | new (ptr) T(::cuda::std::forward(val)); 46 | } 47 | #else 48 | template ::value, 52 | int 53 | >::type = 0> 54 | __host__ __device__ void uninitialized_copy(T *ptr, U &&val) 55 | { 56 | *ptr = ::cuda::std::forward(val); 57 | } 58 | 59 | template ::value, 63 | int 64 | >::type = 0> 65 | __host__ __device__ void uninitialized_copy(T *ptr, U &&val) 66 | { 67 | new (ptr) T(::cuda::std::forward(val)); 68 | } 69 | #endif 70 | 71 | } // namespace detail 72 | 73 | 74 | CUB_NAMESPACE_END 75 | 76 | -------------------------------------------------------------------------------- /cub/grid/grid_mapping.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 
32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../config.cuh" 37 | 38 | CUB_NAMESPACE_BEGIN 39 | 40 | 41 | /** 42 | * \addtogroup GridModule 43 | * @{ 44 | */ 45 | 46 | 47 | /****************************************************************************** 48 | * Mapping policies 49 | *****************************************************************************/ 50 | 51 | 52 | /** 53 | * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 54 | */ 55 | enum GridMappingStrategy 56 | { 57 | /** 58 | * \brief An a "raking" access pattern in which each thread block is 59 | * assigned a consecutive sequence of input tiles 60 | * 61 | * \par Overview 62 | * The input is evenly partitioned into \p p segments, where \p p is 63 | * constant and corresponds loosely to the number of thread blocks that may 64 | * actively reside on the target device. Each segment is comprised of 65 | * consecutive tiles, where a tile is a small, constant-sized unit of input 66 | * to be processed to completion before the thread block terminates or 67 | * obtains more work. The kernel invokes \p p thread blocks, each 68 | * of which iteratively consumes a segment of n/p elements 69 | * in tile-size increments. 70 | */ 71 | GRID_MAPPING_RAKE, 72 | 73 | /** 74 | * \brief An a "strip mining" access pattern in which the input tiles assigned 75 | * to each thread block are separated by a stride equal to the the extent of 76 | * the grid. 77 | * 78 | * \par Overview 79 | * The input is evenly partitioned into \p p sets, where \p p is 80 | * constant and corresponds loosely to the number of thread blocks that may 81 | * actively reside on the target device. Each set is comprised of 82 | * data tiles separated by stride \p tiles, where a tile is a small, 83 | * constant-sized unit of input to be processed to completion before the 84 | * thread block terminates or obtains more work. 
The kernel invokes \p p 85 | * thread blocks, each of which iteratively consumes a segment of 86 | * n/p elements in tile-size increments. 87 | */ 88 | GRID_MAPPING_STRIP_MINE, 89 | 90 | /** 91 | * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. 92 | * 93 | * \par Overview 94 | * The input is treated as a queue to be dynamically consumed by a grid of 95 | * thread blocks. Work is atomically dequeued in tiles, where a tile is a 96 | * unit of input to be processed to completion before the thread block 97 | * terminates or obtains more work. The grid size \p p is constant, 98 | * loosely corresponding to the number of thread blocks that may actively 99 | * reside on the target device. 100 | */ 101 | GRID_MAPPING_DYNAMIC, 102 | }; 103 | 104 | 105 | /** @} */ // end group GridModule 106 | 107 | CUB_NAMESPACE_END 108 | 109 | -------------------------------------------------------------------------------- /cub/host/mutex.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple portable mutex 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | 38 | #include 39 | #include 40 | 41 | 42 | CUB_NAMESPACE_BEGIN 43 | 44 | 45 | /** 46 | * Wraps std::mutex 47 | * @deprecated [Since CUB 2.1.0] The `cub::Mutex` is deprecated and will be removed 48 | * in a future release. Use `std::mutex` instead. 
49 | */ 50 | struct CUB_DEPRECATED Mutex 51 | { 52 | std::mutex mtx; 53 | 54 | void Lock() 55 | { 56 | mtx.lock(); 57 | } 58 | 59 | void Unlock() 60 | { 61 | mtx.unlock(); 62 | } 63 | }; 64 | 65 | 66 | CUB_NAMESPACE_END 67 | -------------------------------------------------------------------------------- /cub/iterator/tex_ref_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include 40 | 41 | CUB_NAMESPACE_BEGIN 42 | 43 | /** 44 | * \addtogroup UtilIterator 45 | * @{ 46 | */ 47 | 48 | /** 49 | * \brief A random-access input wrapper for dereferencing array values through texture cache. 50 | * 51 | * \deprecated [Since 1.13.0] The CUDA texture management APIs used by 52 | * TexRefInputIterator are deprecated. Use cub::TexObjInputIterator instead. 53 | * 54 | * \par Overview 55 | * - TexRefInputIterator wraps a native device pointer of type ValueType*. References 56 | * to elements are to be loaded through texture cache. 57 | * - Can be used to load any data type from memory through texture cache. 58 | * - Can be manipulated and exchanged within and between host and device 59 | * functions, can only be constructed within host functions, and can only be 60 | * dereferenced within device functions. 61 | * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture 62 | * reference. Only one TexRefInputIterator instance can be bound at any given time for a 63 | * specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host 64 | * thread, and (4) compilation .o unit. 
65 | * - With regard to nested/dynamic parallelism, TexRefInputIterator iterators may only be 66 | * created by the host thread and used by a top-level kernel (i.e. the one which is launched 67 | * from the host). 68 | * - Compatible with Thrust API v1.7 or newer. 69 | * 70 | * \par Snippet 71 | * The code snippet below illustrates the use of \p TexRefInputIterator to 72 | * dereference a device array of doubles through texture cache. 73 | * \par 74 | * \code 75 | * #include // or equivalently 76 | * 77 | * // Declare, allocate, and initialize a device array 78 | * int num_items; // e.g., 7 79 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] 80 | * 81 | * // Create an iterator wrapper 82 | * cub::TexRefInputIterator itr; 83 | * itr.BindTexture(d_in, sizeof(double) * num_items); 84 | * ... 85 | * 86 | * // Within device code: 87 | * printf("%f\n", itr[0]); // 8.0 88 | * printf("%f\n", itr[1]); // 6.0 89 | * printf("%f\n", itr[6]); // 9.0 90 | * 91 | * ... 92 | * itr.UnbindTexture(); 93 | * 94 | * \endcode 95 | * 96 | * \tparam T The value type of this iterator 97 | * \tparam UNIQUE_ID A globally-unique identifier (within the compilation unit) to name the underlying texture reference 98 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 99 | */ 100 | template < 101 | typename T, 102 | int /*UNIQUE_ID*/, 103 | typename OffsetT = std::ptrdiff_t> 104 | using TexRefInputIterator CUB_DEPRECATED = cub::TexObjInputIterator; 105 | 106 | /** @} */ // end group UtilIterator 107 | 108 | CUB_NAMESPACE_END 109 | -------------------------------------------------------------------------------- /cub/thread/thread_sort.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | #pragma once 29 | 30 | #include "../config.cuh" 31 | #include "../util_ptx.cuh" 32 | #include "../util_type.cuh" 33 | 34 | CUB_NAMESPACE_BEGIN 35 | 36 | 37 | template 38 | __device__ __forceinline__ void Swap(T &lhs, T &rhs) 39 | { 40 | T temp = lhs; 41 | lhs = rhs; 42 | rhs = temp; 43 | } 44 | 45 | 46 | /** 47 | * @brief Sorts data using odd-even sort method 48 | * 49 | * The sorting method is stable. Further details can be found in: 50 | * A. Nico Habermann. Parallel neighbor sort (or the glory of the induction 51 | * principle). Technical Report AD-759 248, Carnegie Mellon University, 1972. 52 | * 53 | * @tparam KeyT 54 | * Key type 55 | * 56 | * @tparam ValueT 57 | * Value type. If `cub::NullType` is used as `ValueT`, only keys are sorted. 58 | * 59 | * @tparam CompareOp 60 | * functor type having member `bool operator()(KeyT lhs, KeyT rhs)` 61 | * 62 | * @tparam ITEMS_PER_THREAD 63 | * The number of items per thread 64 | * 65 | * @param[in,out] keys 66 | * Keys to sort 67 | * 68 | * @param[in,out] items 69 | * Values to sort 70 | * 71 | * @param[in] compare_op 72 | * Comparison function object which returns true if the first argument is 73 | * ordered before the second 74 | */ 75 | template 79 | __device__ __forceinline__ void 80 | StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], 81 | ValueT (&items)[ITEMS_PER_THREAD], 82 | CompareOp compare_op) 83 | { 84 | constexpr bool KEYS_ONLY = std::is_same::value; 85 | 86 | #pragma unroll 87 | for (int i = 0; i < ITEMS_PER_THREAD; ++i) 88 | { 89 | #pragma unroll 90 | for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) 91 | { 92 | if (compare_op(keys[j + 1], keys[j])) 93 | { 94 | Swap(keys[j], keys[j + 1]); 95 | if (!KEYS_ONLY) 96 | { 97 | Swap(items[j], items[j + 1]); 98 | } 99 | } 100 | } // inner loop 101 | } // outer loop 102 | } 103 | 104 | 105 | CUB_NAMESPACE_END 106 | 
-------------------------------------------------------------------------------- /cub/util_compiler.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Detect compiler information. 31 | */ 32 | 33 | #pragma once 34 | 35 | // enumerate host compilers we know about 36 | #define CUB_HOST_COMPILER_UNKNOWN 0 37 | #define CUB_HOST_COMPILER_MSVC 1 38 | #define CUB_HOST_COMPILER_GCC 2 39 | #define CUB_HOST_COMPILER_CLANG 3 40 | 41 | // enumerate device compilers we know about 42 | #define CUB_DEVICE_COMPILER_UNKNOWN 0 43 | #define CUB_DEVICE_COMPILER_MSVC 1 44 | #define CUB_DEVICE_COMPILER_GCC 2 45 | #define CUB_DEVICE_COMPILER_NVCC 3 46 | #define CUB_DEVICE_COMPILER_CLANG 4 47 | 48 | // figure out which host compiler we're using 49 | #if defined(_MSC_VER) 50 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_MSVC 51 | # define CUB_MSVC_VERSION _MSC_VER 52 | # define CUB_MSVC_VERSION_FULL _MSC_FULL_VER 53 | #elif defined(__clang__) 54 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_CLANG 55 | # define CUB_CLANG_VERSION \ 56 | (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__) 57 | #elif defined(__GNUC__) 58 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_GCC 59 | # define CUB_GCC_VERSION \ 60 | (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) 61 | #else 62 | # define CUB_HOST_COMPILER CUB_HOST_COMPILER_UNKNOWN 63 | #endif // CUB_HOST_COMPILER 64 | 65 | // figure out which device compiler we're using 66 | #if defined(__CUDACC__) || defined(_NVHPC_CUDA) 67 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC 68 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC 69 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_MSVC 70 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC 71 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_GCC 72 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG 73 | // CUDA-capable clang should behave similar to NVCC. 
74 | # if defined(__CUDA__) 75 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_NVCC 76 | # else 77 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_CLANG 78 | # endif 79 | #else 80 | # define CUB_DEVICE_COMPILER CUB_DEVICE_COMPILER_UNKNOWN 81 | #endif 82 | -------------------------------------------------------------------------------- /cub/util_deprecated.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Define CUB_DEPRECATED macro. 31 | */ 32 | 33 | #pragma once 34 | 35 | 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | 42 | #if defined(THRUST_IGNORE_DEPRECATED_API) && !defined(CUB_IGNORE_DEPRECATED_API) 43 | # define CUB_IGNORE_DEPRECATED_API 44 | #endif 45 | 46 | #ifdef CUB_IGNORE_DEPRECATED_API 47 | # define CUB_DEPRECATED 48 | # define CUB_DEPRECATED_BECAUSE(MSG) 49 | #elif CUB_CPP_DIALECT >= 2014 50 | # define CUB_DEPRECATED [[deprecated]] 51 | # define CUB_DEPRECATED_BECAUSE(MSG) [[deprecated(MSG)]] 52 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC 53 | # define CUB_DEPRECATED __declspec(deprecated) 54 | # define CUB_DEPRECATED_BECAUSE(MSG) __declspec(deprecated(MSG)) 55 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG 56 | # define CUB_DEPRECATED __attribute__((deprecated)) 57 | # define CUB_DEPRECATED_BECAUSE(MSG) __attribute__((deprecated(MSG))) 58 | #elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC 59 | # define CUB_DEPRECATED __attribute__((deprecated)) 60 | # define CUB_DEPRECATED_BECAUSE(MSG) __attribute__((deprecated(MSG))) 61 | #else 62 | # define CUB_DEPRECATED 63 | # define CUB_DEPRECATED_BECAUSE(MSG) 64 | #endif 65 | 66 | #define CUB_DETAIL_RUNTIME_DEBUG_SYNC_IS_NOT_SUPPORTED \ 67 | CUB_DEPRECATED_BECAUSE( \ 68 | "CUB no longer accepts `debug_synchronous` parameter. 
" \ 69 | "Define CUB_DEBUG_SYNC instead, or silence this message with " \ 70 | "CUB_IGNORE_DEPRECATED_API.") 71 | 72 | #define CUB_DETAIL_RUNTIME_DEBUG_SYNC_USAGE_LOG \ 73 | if (debug_synchronous) \ 74 | { \ 75 | _CubLog("%s\n", \ 76 | "CUB no longer accepts `debug_synchronous` parameter. " \ 77 | "Define CUB_DEBUG_SYNC instead."); \ 78 | } 79 | 80 | -------------------------------------------------------------------------------- /cub/util_macro.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Common C/C++ macro utilities 31 | ******************************************************************************/ 32 | 33 | #pragma once 34 | 35 | #include 36 | 37 | #include "util_namespace.cuh" 38 | 39 | CUB_NAMESPACE_BEGIN 40 | 41 | 42 | /** 43 | * \addtogroup UtilModule 44 | * @{ 45 | */ 46 | 47 | #ifndef CUB_ALIGN 48 | #if defined(_WIN32) || defined(_WIN64) 49 | /// Align struct 50 | #define CUB_ALIGN(bytes) __declspec(align(32)) 51 | #else 52 | /// Align struct 53 | #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) 54 | #endif 55 | #endif 56 | 57 | #define CUB_PREVENT_MACRO_SUBSTITUTION 58 | 59 | template 60 | constexpr __host__ __device__ auto min CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, 61 | U &&u) 62 | -> decltype(t < u ? ::cuda::std::forward(t) : ::cuda::std::forward(u)) 63 | { 64 | return t < u ? ::cuda::std::forward(t) : ::cuda::std::forward(u); 65 | } 66 | 67 | template 68 | constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, 69 | U &&u) 70 | -> decltype(t < u ? ::cuda::std::forward(u) : ::cuda::std::forward(t)) 71 | { 72 | return t < u ? ::cuda::std::forward(u) : ::cuda::std::forward(t); 73 | } 74 | 75 | #ifndef CUB_MAX 76 | /// Select maximum(a, b) 77 | #define CUB_MAX(a, b) (((b) > (a)) ? 
(b) : (a)) 78 | #endif 79 | 80 | #ifndef CUB_MIN 81 | /// Select minimum(a, b) 82 | #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a)) 83 | #endif 84 | 85 | #ifndef CUB_QUOTIENT_FLOOR 86 | /// Quotient of x/y rounded down to nearest integer 87 | #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) 88 | #endif 89 | 90 | #ifndef CUB_QUOTIENT_CEILING 91 | /// Quotient of x/y rounded up to nearest integer 92 | #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) 93 | #endif 94 | 95 | #ifndef CUB_ROUND_UP_NEAREST 96 | /// x rounded up to the nearest multiple of y 97 | #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) 98 | #endif 99 | 100 | #ifndef CUB_ROUND_DOWN_NEAREST 101 | /// x rounded down to the nearest multiple of y 102 | #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) 103 | #endif 104 | 105 | 106 | #ifndef CUB_STATIC_ASSERT 107 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 108 | #define CUB_CAT_(a, b) a ## b 109 | #define CUB_CAT(a, b) CUB_CAT_(a, b) 110 | #endif // DOXYGEN_SHOULD_SKIP_THIS 111 | 112 | /// Static assert 113 | #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] 114 | #endif 115 | 116 | /** @} */ // end group UtilModule 117 | 118 | CUB_NAMESPACE_END 119 | -------------------------------------------------------------------------------- /cub/util_math.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /** 29 | * \file 30 | * Define helper math functions. 31 | */ 32 | 33 | #pragma once 34 | 35 | #include 36 | 37 | #include "util_namespace.cuh" 38 | #include "util_macro.cuh" 39 | 40 | CUB_NAMESPACE_BEGIN 41 | 42 | namespace detail 43 | { 44 | 45 | template 46 | using is_integral_or_enum = 47 | std::integral_constant::value || std::is_enum::value>; 49 | 50 | __host__ __device__ __forceinline__ constexpr std::size_t 51 | VshmemSize(std::size_t max_shmem, 52 | std::size_t shmem_per_block, 53 | std::size_t num_blocks) 54 | { 55 | return shmem_per_block > max_shmem ? 
shmem_per_block * num_blocks : 0; 56 | } 57 | 58 | } 59 | 60 | /** 61 | * Divide n by d, round up if any remainder, and return the result. 62 | * 63 | * Effectively performs `(n + d - 1) / d`, but is robust against the case where 64 | * `(n + d - 1)` would overflow. 65 | */ 66 | template 67 | __host__ __device__ __forceinline__ constexpr NumeratorT 68 | DivideAndRoundUp(NumeratorT n, DenominatorT d) 69 | { 70 | static_assert(cub::detail::is_integral_or_enum::value && 71 | cub::detail::is_integral_or_enum::value, 72 | "DivideAndRoundUp is only intended for integral types."); 73 | 74 | // Static cast to undo integral promotion. 75 | return static_cast(n / d + (n % d != 0 ? 1 : 0)); 76 | } 77 | 78 | constexpr __device__ __host__ int 79 | Nominal4BItemsToItemsCombined(int nominal_4b_items_per_thread, int combined_bytes) 80 | { 81 | return (cub::min)(nominal_4b_items_per_thread, 82 | (cub::max)(1, 83 | nominal_4b_items_per_thread * 8 / 84 | combined_bytes)); 85 | } 86 | 87 | template 88 | constexpr __device__ __host__ int 89 | Nominal4BItemsToItems(int nominal_4b_items_per_thread) 90 | { 91 | return (cub::min)(nominal_4b_items_per_thread, 92 | (cub::max)(1, 93 | nominal_4b_items_per_thread * 4 / 94 | static_cast(sizeof(T)))); 95 | } 96 | 97 | template 98 | constexpr __device__ __host__ int 99 | Nominal8BItemsToItems(int nominal_8b_items_per_thread) 100 | { 101 | return sizeof(ItemT) <= 8u 102 | ? nominal_8b_items_per_thread 103 | : (cub::min)(nominal_8b_items_per_thread, 104 | (cub::max)(1, 105 | ((nominal_8b_items_per_thread * 8) + 106 | static_cast(sizeof(ItemT)) - 1) / 107 | static_cast(sizeof(ItemT)))); 108 | } 109 | 110 | /** 111 | * \brief Computes the midpoint of the integers 112 | * 113 | * Extra operation is performed in order to prevent overflow. 
114 | * 115 | * \return Half the sum of \p begin and \p end 116 | */ 117 | template 118 | constexpr __device__ __host__ T MidPoint(T begin, T end) 119 | { 120 | return begin + (end - begin) / 2; 121 | } 122 | 123 | CUB_NAMESPACE_END 124 | -------------------------------------------------------------------------------- /cub/version.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /*! \file version.cuh 29 | * \brief Compile-time macros encoding CUB release version 30 | * 31 | * is the only CUB header that is guaranteed to 32 | * change with every CUB release. 33 | * 34 | */ 35 | 36 | #pragma once 37 | 38 | /*! \def CUB_VERSION 39 | * \brief The preprocessor macro \p CUB_VERSION encodes the version 40 | * number of the CUB library. 41 | * 42 | * CUB_VERSION % 100 is the sub-minor version. 43 | * CUB_VERSION / 100 % 1000 is the minor version. 44 | * CUB_VERSION / 100000 is the major version. 45 | */ 46 | #define CUB_VERSION 200200 47 | 48 | /*! \def CUB_MAJOR_VERSION 49 | * \brief The preprocessor macro \p CUB_MAJOR_VERSION encodes the 50 | * major version number of the CUB library. 51 | */ 52 | #define CUB_MAJOR_VERSION (CUB_VERSION / 100000) 53 | 54 | /*! \def CUB_MINOR_VERSION 55 | * \brief The preprocessor macro \p CUB_MINOR_VERSION encodes the 56 | * minor version number of the CUB library. 57 | */ 58 | #define CUB_MINOR_VERSION (CUB_VERSION / 100 % 1000) 59 | 60 | /*! \def CUB_SUBMINOR_VERSION 61 | * \brief The preprocessor macro \p CUB_SUBMINOR_VERSION encodes the 62 | * sub-minor version number of the CUB library. 63 | */ 64 | #define CUB_SUBMINOR_VERSION (CUB_VERSION % 100) 65 | 66 | /*! 
\def CUB_PATCH_NUMBER 67 | * \brief The preprocessor macro \p CUB_PATCH_NUMBER encodes the 68 | * patch number of the CUB library. 69 | */ 70 | #define CUB_PATCH_NUMBER 0 71 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | _repo 3 | api 4 | *png 5 | -------------------------------------------------------------------------------- /docs/VERSION.md: -------------------------------------------------------------------------------- 1 | 104.0 -------------------------------------------------------------------------------- /docs/deps/repo-deps.packman.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /docs/gen_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/env bash 2 | 3 | 4 | mkdir -p img 5 | 6 | if [ ! 
-n "$(find img -name '*.png')" ]; then 7 | wget -q https://docs.nvidia.com/cuda/_static/Logo_and_CUDA.png -O img/logo.png 8 | 9 | # Parse files and collects unique names ending with .png 10 | imgs=$(grep -R -o -h '[[:alpha:][:digit:]_]*.png' ../cub) 11 | imgs="${imgs}\ncub_overview.png\nnested_composition.png\ntile.png\nblocked.png\nstriped.png" 12 | 13 | for img in $(echo -e ${imgs} | sort | uniq) 14 | do 15 | echo ${img} 16 | wget -q https://nvlabs.github.io/cub/${img} -O img/${img} 17 | done 18 | fi 19 | 20 | ./repo.sh docs 21 | -------------------------------------------------------------------------------- /docs/repo.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | call "%~dp0tools\packman\python.bat" %~dp0tools\repoman\repoman.py %* 4 | if %errorlevel% neq 0 ( goto Error ) 5 | 6 | :Success 7 | exit /b 0 8 | 9 | :Error 10 | exit /b %errorlevel% 11 | -------------------------------------------------------------------------------- /docs/repo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SCRIPT_DIR=$(dirname ${BASH_SOURCE}) 6 | cd "$SCRIPT_DIR" 7 | 8 | exec "tools/packman/python.sh" tools/repoman/repoman.py $@ 9 | -------------------------------------------------------------------------------- /docs/tools/packman/bootstrap/download_file_from_url.ps1: -------------------------------------------------------------------------------- 1 | <# 2 | Copyright 2019 NVIDIA CORPORATION 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | #> 16 | 17 | param( 18 | [Parameter(Mandatory=$true)][string]$source=$null, 19 | [string]$output="out.exe" 20 | ) 21 | $filename = $output 22 | 23 | $triesLeft = 4 24 | $delay = 2 25 | do 26 | { 27 | $triesLeft -= 1 28 | 29 | try 30 | { 31 | Write-Host "Downloading from bootstrap.packman.nvidia.com ..." 32 | $wc = New-Object net.webclient 33 | $wc.Downloadfile($source, $fileName) 34 | exit 0 35 | } 36 | catch 37 | { 38 | Write-Host "Error downloading $source!" 39 | Write-Host $_.Exception|format-list -force 40 | if ($triesLeft) 41 | { 42 | Write-Host "Retrying in $delay seconds ..." 43 | Start-Sleep -seconds $delay 44 | } 45 | $delay = $delay * $delay 46 | } 47 | } while ($triesLeft -gt 0) 48 | # We only get here if the retries have been exhausted, remove any left-overs: 49 | if (Test-Path $fileName) 50 | { 51 | Remove-Item $fileName 52 | } 53 | exit 1 -------------------------------------------------------------------------------- /docs/tools/packman/bootstrap/fetch_file_from_packman_bootstrap.cmd: -------------------------------------------------------------------------------- 1 | :: Copyright 2019 NVIDIA CORPORATION 2 | :: 3 | :: Licensed under the Apache License, Version 2.0 (the "License"); 4 | :: you may not use this file except in compliance with the License. 5 | :: You may obtain a copy of the License at 6 | :: 7 | :: http://www.apache.org/licenses/LICENSE-2.0 8 | :: 9 | :: Unless required by applicable law or agreed to in writing, software 10 | :: distributed under the License is distributed on an "AS IS" BASIS, 11 | :: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | :: See the License for the specific language governing permissions and 13 | :: limitations under the License. 
14 | 15 | :: You need to specify as input to this command 16 | @setlocal 17 | @set PACKAGE_NAME=%1 18 | @set TARGET_PATH=%2 19 | 20 | @echo Fetching %PACKAGE_NAME% ... 21 | 22 | @powershell -ExecutionPolicy ByPass -NoLogo -NoProfile -File "%~dp0download_file_from_url.ps1" ^ 23 | -source "http://bootstrap.packman.nvidia.com/%PACKAGE_NAME%" -output %TARGET_PATH% 24 | :: A bug in powershell prevents the errorlevel code from being set when using the -File execution option 25 | :: We must therefore do our own failure analysis, basically make sure the file exists: 26 | @if not exist %TARGET_PATH% goto ERROR_DOWNLOAD_FAILED 27 | 28 | @endlocal 29 | @exit /b 0 30 | 31 | :ERROR_DOWNLOAD_FAILED 32 | @echo Failed to download file from S3 33 | @echo Most likely because endpoint cannot be reached or file %PACKAGE_NAME% doesn't exist 34 | @endlocal 35 | @exit /b 1 -------------------------------------------------------------------------------- /docs/tools/packman/config.packman.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /docs/tools/packman/packman.cmd: -------------------------------------------------------------------------------- 1 | :: Reset errorlevel status (don't inherit from caller) [xxxxxxxxxxx] 2 | @call :ECHO_AND_RESET_ERROR 3 | :: You can remove the call below if you do your own manual configuration of the dev machines 4 | call "%~dp0\bootstrap\configure.bat" 5 | 6 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 7 | :: Everything below is mandatory 8 | if not defined PM_PYTHON goto :PYTHON_ENV_ERROR 9 | if not defined PM_MODULE goto :MODULE_ENV_ERROR 10 | 11 | :: Generate temporary path for variable file 12 | for /f "delims=" %%a in ('powershell -ExecutionPolicy ByPass -NoLogo -NoProfile ^ 13 | -File "%~dp0bootstrap\generate_temp_file_name.ps1"') do set PM_VAR_PATH=%%a 14 | 15 | if %1.==. 
( 16 | set PM_VAR_PATH_ARG= 17 | ) else ( 18 | set PM_VAR_PATH_ARG=--var-path="%PM_VAR_PATH%" 19 | ) 20 | 21 | "%PM_PYTHON%" -S -s -u -E "%PM_MODULE%" %* %PM_VAR_PATH_ARG% 22 | if %errorlevel% neq 0 ( exit /b %errorlevel% ) 23 | 24 | :: Marshall environment variables into the current environment if they have been generated and remove temporary file 25 | if exist "%PM_VAR_PATH%" ( 26 | for /F "usebackq tokens=*" %%A in ("%PM_VAR_PATH%") do set "%%A" 27 | ) 28 | if %errorlevel% neq 0 ( goto :VAR_ERROR ) 29 | 30 | if exist "%PM_VAR_PATH%" ( 31 | del /F "%PM_VAR_PATH%" 32 | ) 33 | if %errorlevel% neq 0 ( goto :VAR_ERROR ) 34 | 35 | set PM_VAR_PATH= 36 | goto :eof 37 | 38 | :: Subroutines below 39 | :PYTHON_ENV_ERROR 40 | @echo User environment variable PM_PYTHON is not set! Please configure machine for packman or call configure.bat. 41 | exit /b 1 42 | 43 | :MODULE_ENV_ERROR 44 | @echo User environment variable PM_MODULE is not set! Please configure machine for packman or call configure.bat. 45 | exit /b 1 46 | 47 | :VAR_ERROR 48 | @echo Error while processing and setting environment variables! 49 | exit /b 1 50 | 51 | :ECHO_AND_RESET_ERROR 52 | @echo off 53 | if /I "%PM_VERBOSITY%"=="debug" ( 54 | @echo on 55 | ) 56 | exit /b 0 57 | -------------------------------------------------------------------------------- /docs/tools/packman/packmanconf.py: -------------------------------------------------------------------------------- 1 | # Use this file to bootstrap packman into your Python environment (3.7.x). 
Simply 2 | # add the path by doing sys.insert to where packmanconf.py is located and then execute: 3 | # 4 | # >>> import packmanconf 5 | # >>> packmanconf.init() 6 | # 7 | # It will use the configured remote(s) and the version of packman in the same folder, 8 | # giving you full access to the packman API via the following module 9 | # 10 | # >> import packmanapi 11 | # >> dir(packmanapi) 12 | 13 | import os 14 | import platform 15 | import sys 16 | 17 | 18 | def init(): 19 | """Call this function to initialize the packman configuration. 20 | 21 | Calls to the packman API will work after successfully calling this function. 22 | 23 | Note: 24 | This function only needs to be called once during the execution of your 25 | program. Calling it repeatedly is harmless but wasteful. 26 | Compatibility with your Python interpreter is checked and upon failure 27 | the function will report what is required. 28 | 29 | Example: 30 | >>> import packmanconf 31 | >>> packmanconf.init() 32 | >>> import packmanapi 33 | >>> packmanapi.set_verbosity_level(packmanapi.VERBOSITY_HIGH) 34 | """ 35 | major = sys.version_info[0] 36 | minor = sys.version_info[1] 37 | if major != 3 or minor != 7: 38 | raise RuntimeError( 39 | f"This version of packman requires Python 3.7.x, but {major}.{minor} was provided" 40 | ) 41 | conf_dir = os.path.dirname(os.path.abspath(__file__)) 42 | os.environ["PM_INSTALL_PATH"] = conf_dir 43 | packages_root = get_packages_root(conf_dir) 44 | version = get_version(conf_dir) 45 | module_dir = get_module_dir(conf_dir, packages_root, version) 46 | sys.path.insert(1, module_dir) 47 | 48 | 49 | def get_packages_root(conf_dir: str) -> str: 50 | root = os.getenv("PM_PACKAGES_ROOT") 51 | if not root: 52 | platform_name = platform.system() 53 | if platform_name == "Windows": 54 | drive, _ = os.path.splitdrive(conf_dir) 55 | root = os.path.join(drive, "packman-repo") 56 | elif platform_name == "Darwin": 57 | # macOS 58 | root = "/Library/Caches/packman" 59 | elif 
platform_name == "Linux": 60 | root = "/var/tmp/packman" 61 | else: 62 | raise RuntimeError(f"Unsupported platform '{platform_name}'") 63 | # make sure the path exists: 64 | os.makedirs(root, exist_ok=True) 65 | return root 66 | 67 | 68 | def get_module_dir(conf_dir, packages_root: str, version: str) -> str: 69 | module_dir = os.path.join(packages_root, "packman-common", version) 70 | if not os.path.exists(module_dir): 71 | import tempfile 72 | 73 | tf = tempfile.NamedTemporaryFile(delete=False) 74 | target_name = tf.name 75 | tf.close() 76 | url = f"http://bootstrap.packman.nvidia.com/packman-common@{version}.zip" 77 | print(f"Downloading '{url}' ...") 78 | import urllib.request 79 | 80 | urllib.request.urlretrieve(url, target_name) 81 | from importlib.machinery import SourceFileLoader 82 | 83 | # import module from path provided 84 | script_path = os.path.join(conf_dir, "bootstrap", "install_package.py") 85 | ip = SourceFileLoader("install_package", script_path).load_module() 86 | print("Unpacking ...") 87 | ip.install_package(target_name, module_dir) 88 | os.unlink(tf.name) 89 | return module_dir 90 | 91 | 92 | def get_version(conf_dir: str): 93 | path = os.path.join(conf_dir, "packman") 94 | if not os.path.exists(path): # in dev repo fallback 95 | path += ".sh" 96 | with open(path, "rt", encoding="utf8") as launch_file: 97 | for line in launch_file.readlines(): 98 | if line.startswith("PM_PACKMAN_VERSION"): 99 | _, value = line.split("=") 100 | return value.strip() 101 | raise RuntimeError(f"Unable to find 'PM_PACKMAN_VERSION' in '{path}'") 102 | -------------------------------------------------------------------------------- /docs/tools/packman/python.bat: -------------------------------------------------------------------------------- 1 | :: Copyright 2019-2020 NVIDIA CORPORATION 2 | :: 3 | :: Licensed under the Apache License, Version 2.0 (the "License"); 4 | :: you may not use this file except in compliance with the License. 
5 | :: You may obtain a copy of the License at 6 | :: 7 | :: http://www.apache.org/licenses/LICENSE-2.0 8 | :: 9 | :: Unless required by applicable law or agreed to in writing, software 10 | :: distributed under the License is distributed on an "AS IS" BASIS, 11 | :: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | :: See the License for the specific language governing permissions and 13 | :: limitations under the License. 14 | 15 | @echo off 16 | setlocal 17 | 18 | call "%~dp0\packman" init 19 | set "PYTHONPATH=%PM_MODULE_DIR%;%PYTHONPATH%" 20 | set PYTHONNOUSERSITE=1 21 | "%PM_PYTHON%" -u %* 22 | -------------------------------------------------------------------------------- /docs/tools/packman/python.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019-2020 NVIDIA CORPORATION 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -e 18 | 19 | PACKMAN_CMD="$(dirname "${BASH_SOURCE}")/packman" 20 | if [ ! 
-f "$PACKMAN_CMD" ]; then 21 | PACKMAN_CMD="${PACKMAN_CMD}.sh" 22 | fi 23 | source "$PACKMAN_CMD" init 24 | export PYTHONPATH="${PM_MODULE_DIR}:${PYTHONPATH}" 25 | export PYTHONNOUSERSITE=1 26 | 27 | # workaround for our python not shipping with certs 28 | if [[ -z ${SSL_CERT_DIR:-} ]]; then 29 | export SSL_CERT_DIR=/etc/ssl/certs/ 30 | fi 31 | 32 | "${PM_PYTHON}" -u "$@" 33 | -------------------------------------------------------------------------------- /docs/tools/repoman/omni/repo/format/.gitignore: -------------------------------------------------------------------------------- 1 | # Dummy omni.repo.format Python module so we don't have to pull down the format package. 2 | 3 | # Ignore everything in this directory, except this file to ensure the folder is created. 4 | * 5 | !.gitignore -------------------------------------------------------------------------------- /docs/tools/repoman/repoman.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import io 4 | import contextlib 5 | import packmanapi 6 | 7 | REPO_ROOT = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../..") 8 | REPO_DEPS_FILE = os.path.join(REPO_ROOT, "deps/repo-deps.packman.xml") 9 | 10 | 11 | def bootstrap(): 12 | """ 13 | Bootstrap all omni.repo modules. 14 | 15 | Pull with packman from repo.packman.xml and add them all to python sys.path to enable importing. 
16 | """ 17 | #with contextlib.redirect_stdout(io.StringIO()): 18 | deps = packmanapi.pull(REPO_DEPS_FILE) 19 | for dep_path in deps.values(): 20 | if dep_path not in sys.path: 21 | sys.path.append(dep_path) 22 | 23 | 24 | if __name__ == "__main__": 25 | bootstrap() 26 | import omni.repo.man 27 | 28 | omni.repo.man.main(REPO_ROOT) 29 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Create meta targets that build all examples for a single configuration: 2 | foreach(cub_target IN LISTS CUB_TARGETS) 3 | cub_get_target_property(config_prefix ${cub_target} PREFIX) 4 | set(config_meta_target ${config_prefix}.examples) 5 | add_custom_target(${config_meta_target}) 6 | add_dependencies(${config_prefix}.all ${config_meta_target}) 7 | endforeach() 8 | 9 | # Update flags to reflect RDC options. See note in CubCudaConfig.cmake -- 10 | # these flag variables behave unintuitively: 11 | if (CUB_ENABLE_EXAMPLES_WITH_RDC) 12 | set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_RDC}") 13 | else() 14 | set(CMAKE_CUDA_FLAGS "${CUB_CUDA_FLAGS_BASE} ${CUB_CUDA_FLAGS_NO_RDC}") 15 | endif() 16 | 17 | ## cub_add_example 18 | # 19 | # Add an example executable and register it with ctest. 20 | # 21 | # target_name_var: Variable name to overwrite with the name of the example 22 | # target. Useful for post-processing target information per-backend. 23 | # example_name: The name of the example minus ".example." For 24 | # instance, examples/vector.cu will be "vector", and examples/cuda/copy.cu 25 | # would be "cuda.copy". 26 | # example_src: The source file that implements the example. 27 | # cub_target: The reference cub target with configuration information. 
28 | # 29 | function(cub_add_example target_name_var example_name example_src cub_target) 30 | cub_get_target_property(config_prefix ${cub_target} PREFIX) 31 | 32 | # The actual name of the test's target: 33 | set(example_target ${config_prefix}.example.${example_name}) 34 | set(${target_name_var} ${example_target} PARENT_SCOPE) 35 | 36 | # Related target names: 37 | set(config_meta_target ${config_prefix}.examples) 38 | set(example_meta_target cub.all.example.${example_name}) 39 | 40 | add_executable(${example_target} "${example_src}") 41 | target_link_libraries(${example_target} ${cub_target}) 42 | cub_clone_target_properties(${example_target} ${cub_target}) 43 | target_include_directories(${example_target} PRIVATE "${CUB_SOURCE_DIR}/examples") 44 | 45 | if (CUB_IN_THRUST) 46 | thrust_fix_clang_nvcc_build_for(${example_target}) 47 | endif() 48 | 49 | # Add to the active configuration's meta target 50 | add_dependencies(${config_meta_target} ${example_target}) 51 | 52 | # Meta target that builds examples with this name for all configurations: 53 | if (NOT TARGET ${example_meta_target}) 54 | add_custom_target(${example_meta_target}) 55 | endif() 56 | add_dependencies(${example_meta_target} ${example_target}) 57 | 58 | if (CUB_ENABLE_EXAMPLES_WITH_RDC) 59 | cub_enable_rdc_for_cuda_target(${example_target}) 60 | endif() 61 | 62 | add_test(NAME ${example_target} 63 | COMMAND "$" 64 | ) 65 | endfunction() 66 | 67 | add_subdirectory(cmake) 68 | add_subdirectory(block) 69 | add_subdirectory(device) 70 | -------------------------------------------------------------------------------- /examples/block/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | /Debug 3 | /Release 4 | /cuda55.sdf 5 | /cuda55.suo 6 | /cuda60.sdf 7 | /cuda60.suo 8 | -------------------------------------------------------------------------------- /examples/block/CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | file(GLOB_RECURSE example_srcs 2 | RELATIVE "${CMAKE_CURRENT_LIST_DIR}" 3 | CONFIGURE_DEPENDS 4 | example_*.cu 5 | ) 6 | 7 | foreach (cub_target IN LISTS CUB_TARGETS) 8 | foreach (example_src IN LISTS example_srcs) 9 | get_filename_component(example_name "${example_src}" NAME_WE) 10 | string(REGEX REPLACE 11 | "^example_block_" "block." 12 | example_name "${example_name}" 13 | ) 14 | cub_add_example(target_name ${example_name} "${example_src}" ${cub_target}) 15 | endforeach() 16 | endforeach() 17 | -------------------------------------------------------------------------------- /examples/cmake/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_test( 2 | NAME cub.example.cmake.add_subdir 3 | COMMAND "${CMAKE_COMMAND}" 4 | --log-level=VERBOSE 5 | -G "${CMAKE_GENERATOR}" 6 | -S "${CMAKE_CURRENT_SOURCE_DIR}/add_subdir" 7 | -B "${CMAKE_CURRENT_BINARY_DIR}/add_subdir" 8 | -D "CUB_ROOT=${CUB_SOURCE_DIR}" 9 | -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}" 10 | -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" 11 | ) 12 | -------------------------------------------------------------------------------- /examples/cmake/add_subdir/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This example demonstrates / tests adding CUB via a CMake add_subdirectory 2 | # call from a parent project. 3 | 4 | cmake_minimum_required(VERSION 3.15) 5 | 6 | # Silence warnings about empty CUDA_ARCHITECTURES properties on example targets: 7 | if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) 8 | cmake_policy(SET CMP0104 OLD) 9 | endif() 10 | 11 | project(CubAddSubDirExample CUDA) 12 | 13 | # Use your project's checkout of CUB here, for most cases 14 | # `add_subdirectory(cub)` will be sufficient. 
15 | add_subdirectory("${CUB_ROOT}" cub) 16 | 17 | # Link the CUB::CUB target to your project's targets 18 | add_executable(HelloCUB dummy.cu) 19 | target_link_libraries(HelloCUB CUB::CUB) 20 | 21 | # 22 | # Validation 23 | # 24 | 25 | function(assert_target target_name) 26 | if (NOT TARGET "${target_name}") 27 | message(FATAL_ERROR "Target '${target_name}' not defined.") 28 | endif() 29 | endfunction() 30 | 31 | assert_target(CUB::CUB) 32 | assert_target(HelloCUB) 33 | -------------------------------------------------------------------------------- /examples/cmake/add_subdir/dummy.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | int main() 6 | { 7 | std::cout << "Hello from CUB version " << CUB_VERSION << ":\n"; 8 | } 9 | -------------------------------------------------------------------------------- /examples/device/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | /Debug 3 | /ipch 4 | /Release 5 | /cuda55.sdf 6 | /cuda55.suo 7 | /cuda60.sdf 8 | /cuda60.suo 9 | -------------------------------------------------------------------------------- /examples/device/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | file(GLOB_RECURSE example_srcs 2 | RELATIVE "${CMAKE_CURRENT_LIST_DIR}" 3 | CONFIGURE_DEPENDS 4 | example_*.cu 5 | ) 6 | 7 | foreach (cub_target IN LISTS CUB_TARGETS) 8 | foreach (example_src IN LISTS example_srcs) 9 | get_filename_component(example_name "${example_src}" NAME_WE) 10 | string(REGEX REPLACE 11 | "^example_device_" "device." 
12 | example_name "${example_name}" 13 | ) 14 | cub_add_example(target_name ${example_name} "${example_src}" ${cub_target}) 15 | endforeach() 16 | endforeach() 17 | -------------------------------------------------------------------------------- /examples/device/example_device_decoupled_look_back.cu: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2023, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | #include 29 | 30 | #include 31 | 32 | #include 33 | 34 | template 35 | __global__ void init_kernel(ScanTileStateT tile_state, int blocks_in_grid) 36 | { 37 | tile_state.InitializeStatus(blocks_in_grid); 38 | } 39 | 40 | template 41 | __global__ void decoupled_look_back_kernel(cub::ScanTileState tile_state) 42 | { 43 | using scan_op_t = cub::Sum; 44 | using scan_tile_state_t = cub::ScanTileState; 45 | using tile_prefix_op = cub::TilePrefixCallbackOp; 46 | using temp_storage_t = typename tile_prefix_op::TempStorage; 47 | 48 | // Allocate temp storage in shared memory 49 | __shared__ temp_storage_t temp_storage; 50 | 51 | scan_op_t scan_op{}; 52 | const unsigned int threads_in_warp = 32; 53 | const unsigned int tid = threadIdx.x; 54 | 55 | // Construct prefix op 56 | tile_prefix_op prefix(tile_state, temp_storage, scan_op); 57 | const unsigned int tile_idx = prefix.GetTileIdx(); 58 | 59 | // Compute block aggregate 60 | MessageT block_aggregate = blockIdx.x; 61 | 62 | if (tile_idx == 0) 63 | { 64 | // There are no blocks to look back to, immediately set the inclusive state 65 | if (tid == 0) 66 | { 67 | tile_state.SetInclusive(tile_idx, block_aggregate); 68 | printf("tile %d: inclusive = %d\n", tile_idx, block_aggregate); 69 | } 70 | } 71 | else 72 | { 73 | // Only the first warp in the block can perform the look back 74 | const unsigned 
int warp_id = tid / threads_in_warp; 75 | 76 | if (warp_id == 0) 77 | { 78 | // Perform the decoupled look-back 79 | // Invocation of the prefix will block until the look-back is complete. 80 | MessageT exclusive_prefix = prefix(block_aggregate); 81 | 82 | if (tid == 0) 83 | { 84 | MessageT inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); 85 | printf("tile %d: exclusive = %d inclusive = %d\n", 86 | tile_idx, 87 | exclusive_prefix, 88 | inclusive_prefix); 89 | } 90 | } 91 | } 92 | } 93 | 94 | template 95 | void decoupled_look_back_example(int blocks_in_grid) 96 | { 97 | using scan_tile_state_t = cub::ScanTileState; 98 | 99 | // Query temporary storage requirements 100 | std::size_t temp_storage_bytes{}; 101 | scan_tile_state_t::AllocationSize(blocks_in_grid, temp_storage_bytes); 102 | 103 | // Allocate temporary storage 104 | thrust::device_vector temp_storage(temp_storage_bytes); 105 | std::uint8_t *d_temp_storage = thrust::raw_pointer_cast(temp_storage.data()); 106 | 107 | // Initialize temporary storage 108 | scan_tile_state_t tile_status; 109 | tile_status.Init(blocks_in_grid, d_temp_storage, temp_storage_bytes); 110 | const unsigned int threads_in_init_block = 256; 111 | const unsigned int blocks_in_init_grid = cub::DivideAndRoundUp(blocks_in_grid, 112 | threads_in_init_block); 113 | init_kernel<<>>(tile_status, blocks_in_grid); 114 | 115 | // Launch decoupled look-back 116 | const unsigned int threads_in_block = 256; 117 | decoupled_look_back_kernel<<>>(tile_status); 118 | 119 | // Wait for kernel to finish 120 | cudaDeviceSynchronize(); 121 | } 122 | 123 | int main() { decoupled_look_back_example(14); } 124 | -------------------------------------------------------------------------------- /test/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | /link_main.obj 3 | /dummy/ 4 | -------------------------------------------------------------------------------- /test/README.md: 
-------------------------------------------------------------------------------- 1 | # Test Parametrization 2 | 3 | Some of CUB's tests are very slow to build and are capable of exhausting RAM 4 | during compilation/linking. To avoid such issues, large tests are split into 5 | multiple executables to take advantage of parallel computation and reduce memory 6 | usage. 7 | 8 | CUB facilitates this by checking for special `%PARAM%` comments in each test's 9 | source code, and then uses this information to generate multiple executables 10 | with different configurations. 11 | 12 | ## Using `%PARAM%` 13 | 14 | The `%PARAM%` hint provides an automated method of generating multiple test 15 | executables from a single source file. To use it, add one or more special 16 | comments to the test source file: 17 | 18 | ```cpp 19 | // %PARAM% [definition] [label] [values] 20 | ``` 21 | 22 | CMake will parse the source file and extract these comments, using them to 23 | generate multiple test executables for the full cartesian product of values. 24 | 25 | - `definition` will be used as a preprocessor definition name. By convention, 26 | these begin with `TEST_`. 27 | - `label` is a short, human-readable label that will be used in the test 28 | executable's name to identify the test variant. 29 | - `values` is a colon-separated list of values used during test generation. Only 30 | numeric values have been tested. 31 | 32 | ## Special Labels 33 | 34 | ### CDP / RDC Testing 35 | 36 | If a `label` is `cdp`, it is assumed that the parameter is used to explicitly 37 | test variants built with and without CDP support. The `values` for such a 38 | parameter must be `0:1`, with `0` indicating CDP disabled (RDC off) and `1` 39 | indicating CDP enabled (RDC on). 40 | 41 | Tests that do not contain a variant labeled `cdp` will only enable RDC if 42 | the CMake variable `CUB_ENABLE_TESTS_WITH_RDC` is true. 
43 | 44 | ## Example 45 | 46 | For example, if `test_baz.cu` contains the following lines: 47 | 48 | ```cpp 49 | // %PARAM% TEST_FOO foo 0:1:2 50 | // %PARAM% TEST_CDP cdp 0:1 51 | ``` 52 | 53 | Six executables and CTest targets will be generated with unique definitions 54 | (only c++17 targets shown): 55 | 56 | | Executable Name | Preprocessor Definitions | RDC State | 57 | |----------------------------------|-----------------------------|-----------| 58 | | `cub.cpp17.test.baz.foo_0.cdp_0` | `-DTEST_FOO=0 -DTEST_CDP=0` | Disabled | 59 | | `cub.cpp17.test.baz.foo_0.cdp_1` | `-DTEST_FOO=0 -DTEST_CDP=1` | Enabled | 60 | | `cub.cpp17.test.baz.foo_1.cdp_0` | `-DTEST_FOO=1 -DTEST_CDP=0` | Disabled | 61 | | `cub.cpp17.test.baz.foo_1.cdp_1` | `-DTEST_FOO=1 -DTEST_CDP=1` | Enabled | 62 | | `cub.cpp17.test.baz.foo_2.cdp_0` | `-DTEST_FOO=2 -DTEST_CDP=0` | Disabled | 63 | | `cub.cpp17.test.baz.foo_2.cdp_1` | `-DTEST_FOO=2 -DTEST_CDP=1` | Enabled | 64 | 65 | ## Changing `%PARAM%` Hints 66 | 67 | Since CMake does not automatically reconfigure the build when source files are 68 | modified, CMake will need to be rerun manually whenever the `%PARAM%` comments 69 | change. 70 | 71 | ## Building and Running Split Tests 72 | 73 | CMake will generate individual build and test targets for each test variant, and 74 | also provides build "metatargets" that compile all variants of a given test. 75 | 76 | The variants follow the usual naming convention for CUB's tests, but include a 77 | suffix that differentiates them (e.g. `.foo_X.bar_Y` in the example above). 
78 | 79 | ### Individual Test Variants 80 | 81 | Continuing with the `test_baz.cu` example, the test variant that uses 82 | `-DTEST_FOO=1 -DTEST_BAR=4` can be built and run alone: 83 | 84 | ```bash 85 | # Build a single variant: 86 | make cub.cpp17.test.baz.foo_1.bar_4 87 | 88 | # Run a single variant 89 | bin/cub.cpp17.test.baz.foo_1.bar_4 90 | 91 | # Run a single variant using CTest regex: 92 | ctest -R cub\.cpp17\.test\.baz\.foo_1\.bar_4 93 | ``` 94 | 95 | ### All Variants of a Test 96 | 97 | Using a metatarget and the proper regex, all variants of a test can be built and 98 | executed without listing all variants explicitly: 99 | 100 | ```bash 101 | # Build all variants using the `.all` metatarget 102 | make cub.cpp17.test.baz.all 103 | 104 | # Run all variants: 105 | ctest -R cub\.cpp17\.test\.baz\. 106 | ``` 107 | 108 | ## Debugging 109 | 110 | Running CMake with `--log-level=VERBOSE` will print out extra information about 111 | all detected test variants. 112 | 113 | ## Additional Info 114 | 115 | Ideally, only parameters that directly influence kernel template instantiations 116 | should be split out in this way. If changing a parameter doesn't change the 117 | kernel template type, the same kernel will be compiled into multiple 118 | executables. This defeats the purpose of splitting up the test since the 119 | compiler will generate redundant code across the new split executables. 120 | 121 | The best candidate parameters for splitting are input value types, rather than 122 | integral parameters like BLOCK_THREADS, etc. Splitting by value type allows more 123 | infrastructure (data generation, validation) to be reused. Splitting other 124 | parameters can cause build times to increase since type-related infrastructure 125 | has to be rebuilt for each test variant. 
126 | -------------------------------------------------------------------------------- /test/c2h/generators.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011-2022, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | #pragma once 29 | 30 | #include 31 | 32 | #include 33 | 34 | #include 35 | 36 | namespace c2h 37 | { 38 | 39 | namespace detail 40 | { 41 | 42 | template 43 | class value_wrapper_t 44 | { 45 | T m_val{}; 46 | 47 | public: 48 | explicit value_wrapper_t(T val) : m_val(val) {} 49 | explicit value_wrapper_t(int val) : m_val(static_cast(val)) {} 50 | T get() const { return m_val; } 51 | }; 52 | 53 | } 54 | 55 | class seed_t : public detail::value_wrapper_t 56 | { 57 | using value_wrapper_t::value_wrapper_t; 58 | }; 59 | 60 | class modulo_t : public detail::value_wrapper_t 61 | { 62 | using value_wrapper_t::value_wrapper_t; 63 | }; 64 | 65 | namespace detail 66 | { 67 | 68 | void gen(seed_t seed, 69 | char* data, 70 | c2h::custom_type_state_t min, 71 | c2h::custom_type_state_t max, 72 | std::size_t elements, 73 | std::size_t element_size); 74 | 75 | } 76 | 77 | template