├── .cproject ├── .gitignore ├── .project ├── .settings ├── .gitignore └── language.settings.xml ├── LICENSE.TXT ├── Makefile ├── README.md ├── cpu_spmv ├── cpu_spmv.cpp ├── cub ├── agent │ ├── agent_histogram.cuh │ ├── agent_radix_sort_downsweep.cuh │ ├── agent_radix_sort_upsweep.cuh │ ├── agent_reduce.cuh │ ├── agent_reduce_by_key.cuh │ ├── agent_rle.cuh │ ├── agent_scan.cuh │ ├── agent_segment_fixup.cuh │ ├── agent_select_if.cuh │ ├── agent_spmv_csrt.cuh │ ├── agent_spmv_orig.cuh │ ├── agent_spmv_row_based.cuh │ └── single_pass_scan_operators.cuh ├── block │ ├── block_discontinuity.cuh │ ├── block_exchange.cuh │ ├── block_histogram.cuh │ ├── block_load.cuh │ ├── block_radix_rank.cuh │ ├── block_radix_sort.cuh │ ├── block_raking_layout.cuh │ ├── block_reduce.cuh │ ├── block_reduce_by_key.cuh │ ├── block_scan.cuh │ ├── block_shuffle.cuh │ ├── block_store.cuh │ └── specializations │ │ ├── block_histogram_atomic.cuh │ │ ├── block_histogram_sort.cuh │ │ ├── block_reduce_raking.cuh │ │ ├── block_reduce_raking_commutative_only.cuh │ │ ├── block_reduce_warp_reductions.cuh │ │ ├── block_scan_raking.cuh │ │ └── block_scan_warp_scans.cuh ├── cub.cuh ├── device │ ├── device_histogram.cuh │ ├── device_partition.cuh │ ├── device_radix_sort.cuh │ ├── device_reduce.cuh │ ├── device_run_length_encode.cuh │ ├── device_scan.cuh │ ├── device_segmented_radix_sort.cuh │ ├── device_segmented_reduce.cuh │ ├── device_select.cuh │ ├── device_spmv.cuh │ └── dispatch │ │ ├── dispatch_histogram.cuh │ │ ├── dispatch_radix_sort.cuh │ │ ├── dispatch_reduce.cuh │ │ ├── dispatch_reduce_by_key.cuh │ │ ├── dispatch_rle.cuh │ │ ├── dispatch_scan.cuh │ │ ├── dispatch_select_if.cuh │ │ ├── dispatch_spmv_csrt.cuh │ │ ├── dispatch_spmv_orig.cuh │ │ └── dispatch_spmv_row_based.cuh ├── grid │ ├── grid_barrier.cuh │ ├── grid_even_share.cuh │ ├── grid_mapping.cuh │ └── grid_queue.cuh ├── host │ └── mutex.cuh ├── iterator │ ├── arg_index_input_iterator.cuh │ ├── cache_modified_input_iterator.cuh │ ├── cache_modified_output_iterator.cuh │ ├── constant_input_iterator.cuh │ ├── counting_input_iterator.cuh │ ├── tex_obj_input_iterator.cuh │ ├── tex_ref_input_iterator.cuh │ └── transform_input_iterator.cuh ├── thread │ ├── thread_load.cuh │ ├── thread_operators.cuh │ ├── thread_reduce.cuh │ ├── thread_scan.cuh │ ├── thread_search.cuh │ └── thread_store.cuh ├── util_allocator.cuh ├── util_arch.cuh ├── util_debug.cuh ├── util_device.cuh ├── util_macro.cuh ├── util_namespace.cuh ├── util_ptx.cuh ├── util_type.cuh └── warp │ ├── specializations │ ├── warp_reduce_shfl.cuh │ ├── warp_reduce_smem.cuh │ ├── warp_scan_shfl.cuh │ └── warp_scan_smem.cuh │ ├── warp_reduce.cuh │ └── warp_scan.cuh ├── eval_csrmv.sh ├── get_uf_datasets.sh ├── gpu_spmv ├── gpu_spmv.cu ├── merge-based-spmv-sc16-preprint.pdf ├── merge_decomposition.png ├── merge_spmv.png ├── sparse_matrix.h ├── ufl_matrices.txt ├── ufl_urls.txt └── utils.h /.cproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _cpu_spmv_driver 2 | _gpu_spmv_driver 3 | mtx 4 | -------------------------------------------------------------------------------- /.project: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | merge-spmv 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.cdt.autotools.core.genmakebuilderV2 10 | 11 | 12 | 13 | 14 | org.eclipse.cdt.managedbuilder.core.genmakebuilder 15 | clean,full,incremental, 16 | 17 | 18 | 19 | 20 | org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder 21 | full,incremental, 22 | 23 | 24 | 25 | 26 | 27 | org.eclipse.cdt.core.cnature 28 | org.eclipse.cdt.core.ccnature 29 | org.eclipse.cdt.managedbuilder.core.managedBuildNature 30 | org.eclipse.cdt.managedbuilder.core.ScannerConfigNature 31 | org.eclipse.cdt.autotools.core.autotoolsNatureV2 32 | 33 | 34 | -------------------------------------------------------------------------------- /.settings/.gitignore: -------------------------------------------------------------------------------- 1 | /language.settings.xml 2 | -------------------------------------------------------------------------------- /.settings/language.settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /LICENSE.TXT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | * Redistributions of source code must retain the above copyright 6 | notice, this list of conditions and the following disclaimer. 7 | * Redistributions in binary form must reproduce the above copyright 8 | notice, this list of conditions and the following disclaimer in the 9 | documentation and/or other materials provided with the distribution. 10 | * Neither the name of the NVIDIA CORPORATION nor the 11 | names of its contributors may be used to endorse or promote products 12 | derived from this software without specific prior written permission. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 18 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #/****************************************************************************** 2 | # * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | # * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 
4 | # * 5 | # * Redistribution and use in source and binary forms, with or without 6 | # * modification, are permitted provided that the following conditions are met: 7 | # * * Redistributions of source code must retain the above copyright 8 | # * notice, this list of conditions and the following disclaimer. 9 | # * * Redistributions in binary form must reproduce the above copyright 10 | # * notice, this list of conditions and the following disclaimer in the 11 | # * documentation and/or other materials provided with the distribution. 12 | # * * Neither the name of the NVIDIA CORPORATION nor the 13 | # * names of its contributors may be used to endorse or promote products 14 | # * derived from this software without specific prior written permission. 15 | # * 16 | # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | # * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | # * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | # * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | # * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | # * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | # * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | # * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | # * 27 | #******************************************************************************/ 28 | 29 | #------------------------------------------------------------------------------- 30 | # 31 | # Makefile usage 32 | # 33 | # CPU: 34 | # make cpu_spmv 35 | # 36 | # GPU: 37 | # make gpu_spmv [sm=] [verbose=<0|1>] 38 | # 39 | #------------------------------------------------------------------------------- 40 | 41 | #------------------------------------------------------------------------------- 42 | # Commandline Options 43 | #------------------------------------------------------------------------------- 44 | 45 | 46 | # [sm=] Compute-capability to compile for, e.g., "sm=200,300,350" (SM20 by default). 
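# Illustrative invocations (a usage sketch; assumes nvcc is on PATH and, for the
# CPU build, icpc/MKL are installed):
#
#   make gpu_spmv sm=350                 # builds _gpu_spmv_driver for SM 3.5
#   make gpu_spmv sm=350,520 verbose=1   # two architectures, verbose nvcc output
#   make cpu_spmv                        # builds _cpu_spmv_driver with icpc + MKL
#
# When sm= is omitted, SM_ARCH below defaults to 350.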
47 | 48 | COMMA = , 49 | ifdef sm 50 | SM_ARCH = $(subst $(COMMA),-,$(sm)) 51 | else 52 | SM_ARCH = 350 53 | endif 54 | 55 | ifeq (520, $(findstring 520, $(SM_ARCH))) 56 | SM_TARGETS += -gencode=arch=compute_52,code=\"sm_52,compute_52\" 57 | endif 58 | ifeq (370, $(findstring 370, $(SM_ARCH))) 59 | SM_TARGETS += -gencode=arch=compute_37,code=\"sm_37,compute_37\" 60 | endif 61 | ifeq (350, $(findstring 350, $(SM_ARCH))) 62 | SM_TARGETS += -gencode=arch=compute_35,code=\"sm_35,compute_35\" 63 | endif 64 | ifeq (300, $(findstring 300, $(SM_ARCH))) 65 | SM_TARGETS += -gencode=arch=compute_30,code=\"sm_30,compute_30\" 66 | endif 67 | 68 | 69 | # [verbose=<0|1>] Verbose toolchain output from nvcc option 70 | 71 | ifeq ($(verbose), 1) 72 | NVCCFLAGS += -v 73 | endif 74 | 75 | 76 | 77 | #------------------------------------------------------------------------------- 78 | # Compiler and compilation platform 79 | #------------------------------------------------------------------------------- 80 | 81 | CUB_DIR = $(dir $(lastword $(MAKEFILE_LIST))) 82 | 83 | NVCC = "$(shell which nvcc)" 84 | ifdef nvccver 85 | NVCC_VERSION = $(nvccver) 86 | else 87 | NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//')) 88 | endif 89 | 90 | # detect OS 91 | OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) 92 | 93 | # Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases 94 | NVCCFLAGS += $(SM_DEF) -Xptxas -v -Xcudafe -\# 95 | 96 | ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER))) 97 | # For MSVC 98 | # Disable excess x86 floating point precision that can lead to results being labeled incorrectly 99 | NVCCFLAGS += -Xcompiler /fp:strict 100 | # Help the compiler/linker work with huge numbers of kernels on Windows 101 | NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500 102 | CC = cl 103 | NPPI = -lnppi 104 | 105 | # Multithreaded runtime 106 | NVCCFLAGS += -Xcompiler /MT 107 | 108 | ifneq ($(force32), 1) 109 | CUDART_CYG = "$(shell dirname $(NVCC))/../lib/Win32/cudart.lib" 110 | else 111 | CUDART_CYG = "$(shell dirname $(NVCC))/../lib/x64/cudart.lib" 112 | endif 113 | CUDART = "$(shell cygpath -w $(CUDART_CYG))" 114 | else 115 | # For g++ 116 | # Disable excess x86 floating point precision that can lead to results being labeled incorrectly 117 | NVCCFLAGS += -Xcompiler -ffloat-store 118 | CC = g++ 119 | ifneq ($(force32), 1) 120 | CUDART = "$(shell dirname $(NVCC))/../lib/libcudart_static.a" 121 | else 122 | CUDART = "$(shell dirname $(NVCC))/../lib64/libcudart_static.a" 123 | endif 124 | endif 125 | 126 | 127 | #------------------------------------------------------------------------------- 128 | # Compiler and compilation platform 129 | #------------------------------------------------------------------------------- 130 | 131 | # OMP compiler 132 | OMPCC=icpc 133 | OMPCC_FLAGS=-openmp -O3 -lrt -fno-alias -xHost -lnuma -O3 -mkl 134 | 135 | # Includes 136 | INC += -I$(CUB_DIR) -I$(CUB_DIR)test 137 | 138 | # detect OS 139 | OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) 140 | 141 | #------------------------------------------------------------------------------- 142 | # Dependency Lists 143 | #------------------------------------------------------------------------------- 144 | 145 | rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d)) 146 | 147 | DEPS = $(call rwildcard, $(CUB_DIR),*.cuh) \ 148 | $(call rwildcard, $(CUB_DIR),*.h) \ 149 | Makefile 150 | 151 | 
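# Illustrative note (a sketch of the expansion above): rwildcard recursively globs
# a directory, so with CUB_DIR equal to this Makefile's directory,
#   $(call rwildcard, $(CUB_DIR),*.cuh)
# picks up every CUB header (cub/cub.cuh, cub/block/block_scan.cuh, ...). DEPS thus
# forces gpu_spmv and cpu_spmv to rebuild whenever a CUB header, a top-level .h
# file such as sparse_matrix.h or utils.h, or this Makefile changes.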
#------------------------------------------------------------------------------- 152 | # make clean 153 | #------------------------------------------------------------------------------- 154 | 155 | clean : 156 | rm -f _gpu_spmv_driver _cpu_spmv_driver 157 | 158 | 159 | #------------------------------------------------------------------------------- 160 | # make gpu_spmv 161 | #------------------------------------------------------------------------------- 162 | 163 | gpu_spmv : gpu_spmv.cu $(DEPS) 164 | $(NVCC) $(DEFINES) $(SM_TARGETS) -o _gpu_spmv_driver gpu_spmv.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -lcusparse -O3 165 | 166 | 167 | #------------------------------------------------------------------------------- 168 | # make cpu_spmv 169 | #------------------------------------------------------------------------------- 170 | 171 | cpu_spmv : cpu_spmv.cpp $(DEPS) 172 | $(OMPCC) $(DEFINES) -DCUB_MKL -o _cpu_spmv_driver cpu_spmv.cpp $(OMPCC_FLAGS) 173 | 174 | -------------------------------------------------------------------------------- /cpu_spmv: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | KMP_AFFINITY=granularity=core,scatter 4 | ./_cpu_spmv_driver $@ 5 | -------------------------------------------------------------------------------- /cub/block/block_raking_layout.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. 
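 *
 * (Worked example of the constants defined below, assuming 32-thread warps:
 * for BLOCK_THREADS = 128, MAX_RAKING_THREADS = 32, SEGMENT_LENGTH = 4, and
 * RAKING_THREADS = 32, so each raking thread serially reduces four consecutive
 * shared partials, with SEGMENT_PADDING optionally added to avoid bank conflicts.)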
32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #include "../util_macro.cuh" 38 | #include "../util_arch.cuh" 39 | #include "../util_type.cuh" 40 | #include "../util_namespace.cuh" 41 | 42 | /// Optional outer namespace(s) 43 | CUB_NS_PREFIX 44 | 45 | /// CUB namespace 46 | namespace cub { 47 | 48 | /** 49 | * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) 50 | * \ingroup BlockModule 51 | * 52 | * \par Overview 53 | * This type facilitates a shared memory usage pattern where a block of CUDA 54 | * threads places elements into shared memory and then reduces the active 55 | * parallelism to one "raking" warp of threads for serially aggregating consecutive 56 | * sequences of shared items. Padding is inserted to eliminate bank conflicts 57 | * (for most data types). 58 | * 59 | * \tparam T The data type to be exchanged. 60 | * \tparam BLOCK_THREADS The thread block size in threads. 61 | * \tparam PTX_ARCH [optional] \ptxversion 62 | */ 63 | template < 64 | typename T, 65 | int BLOCK_THREADS, 66 | int PTX_ARCH = CUB_PTX_ARCH> 67 | struct BlockRakingLayout 68 | { 69 | //--------------------------------------------------------------------- 70 | // Constants and type definitions 71 | //--------------------------------------------------------------------- 72 | 73 | enum 74 | { 75 | /// The total number of elements that need to be cooperatively reduced 76 | SHARED_ELEMENTS = BLOCK_THREADS, 77 | 78 | /// Maximum number of warp-synchronous raking threads 79 | MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), 80 | 81 | /// Number of raking elements per warp-synchronous raking thread (rounded up) 82 | SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, 83 | 84 | /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) 85 | RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, 86 | 87 | /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) 88 | HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), 89 | 90 | /// Degree of bank conflicts (e.g., 4-way) 91 | CONFLICT_DEGREE = (HAS_CONFLICTS) ? 92 | (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : 93 | 1, 94 | 95 | /// Pad each segment length with one element if degree of bank conflicts is greater than 4-way (heuristic) 96 | SEGMENT_PADDING = (CONFLICT_DEGREE > CUB_PREFER_CONFLICT_OVER_PADDING(PTX_ARCH)) ? 1 : 0, 97 | // SEGMENT_PADDING = (HAS_CONFLICTS) ? 
1 : 0, 98 | 99 | /// Total number of elements in the raking grid 100 | GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + SEGMENT_PADDING), 101 | 102 | /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) 103 | UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), 104 | }; 105 | 106 | 107 | /** 108 | * \brief Shared memory storage type 109 | */ 110 | typedef T _TempStorage[BlockRakingLayout::GRID_ELEMENTS]; 111 | 112 | /// Alias wrapper allowing storage to be unioned 113 | struct TempStorage : Uninitialized<_TempStorage> {}; 114 | 115 | 116 | /** 117 | * \brief Returns the location for the calling thread to place data into the grid 118 | */ 119 | static __device__ __forceinline__ T* PlacementPtr( 120 | TempStorage &temp_storage, 121 | int linear_tid) 122 | { 123 | // Offset for partial 124 | unsigned int offset = linear_tid; 125 | 126 | // Add in one padding element for every segment 127 | if (SEGMENT_PADDING > 0) 128 | { 129 | offset += offset / SEGMENT_LENGTH; 130 | } 131 | 132 | // Incorporating a block of padding partials every shared memory segment 133 | return temp_storage.Alias() + offset; 134 | } 135 | 136 | 137 | /** 138 | * \brief Returns the location for the calling thread to begin sequential raking 139 | */ 140 | static __device__ __forceinline__ T* RakingPtr( 141 | TempStorage &temp_storage, 142 | int linear_tid) 143 | { 144 | return temp_storage.Alias() + (linear_tid * (SEGMENT_LENGTH + SEGMENT_PADDING)); 145 | } 146 | }; 147 | 148 | } // CUB namespace 149 | CUB_NS_POSTFIX // Optional outer namespace(s) 150 | 151 | -------------------------------------------------------------------------------- /cub/block/specializations/block_histogram_atomic.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 47 | */ 48 | template 49 | struct BlockHistogramAtomic 50 | { 51 | /// Shared memory storage layout type 52 | struct TempStorage {}; 53 | 54 | 55 | /// Constructor 56 | __device__ __forceinline__ BlockHistogramAtomic( 57 | TempStorage &temp_storage) 58 | {} 59 | 60 | 61 | /// Composite data onto an existing histogram 62 | template < 63 | typename T, 64 | typename CounterT, 65 | int ITEMS_PER_THREAD> 66 | __device__ __forceinline__ void Composite( 67 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 68 | CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram 69 | { 70 | // Update histogram 71 | #pragma unroll 72 | for (int i = 0; i < ITEMS_PER_THREAD; ++i) 73 | { 74 | atomicAdd(histogram + items[i], 1); 75 | } 76 | } 77 | 78 | }; 79 | 80 | } // CUB namespace 81 | CUB_NS_POSTFIX // Optional outer namespace(s) 82 | 83 | -------------------------------------------------------------------------------- /cub/block/specializations/block_histogram_sort.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../block/block_radix_sort.cuh" 37 | #include "../../block/block_discontinuity.cuh" 38 | #include "../../util_ptx.cuh" 39 | #include "../../util_namespace.cuh" 40 | 41 | /// Optional outer namespace(s) 42 | CUB_NS_PREFIX 43 | 44 | /// CUB namespace 45 | namespace cub { 46 | 47 | 48 | 49 | /** 50 | * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 51 | */ 52 | template < 53 | typename T, ///< Sample type 54 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension 55 | int ITEMS_PER_THREAD, ///< The number of samples per thread 56 | int BINS, ///< The number of bins into which histogram samples may fall 57 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension 58 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension 59 | int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective 60 | struct BlockHistogramSort 61 | { 62 | /// Constants 63 | enum 64 | { 65 | /// The thread block size in threads 66 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, 67 | }; 68 | 69 | // Parameterize BlockRadixSort type for our thread block 70 | typedef BlockRadixSort< 71 | T, 72 | BLOCK_DIM_X, 73 | ITEMS_PER_THREAD, 74 | NullType, 75 | 4, 76 | (PTX_ARCH >= 350) ? 
true : false, 77 | BLOCK_SCAN_WARP_SCANS, 78 | cudaSharedMemBankSizeFourByte, 79 | BLOCK_DIM_Y, 80 | BLOCK_DIM_Z, 81 | PTX_ARCH> 82 | BlockRadixSortT; 83 | 84 | // Parameterize BlockDiscontinuity type for our thread block 85 | typedef BlockDiscontinuity< 86 | T, 87 | BLOCK_DIM_X, 88 | BLOCK_DIM_Y, 89 | BLOCK_DIM_Z, 90 | PTX_ARCH> 91 | BlockDiscontinuityT; 92 | 93 | /// Shared memory 94 | union _TempStorage 95 | { 96 | // Storage for sorting bin values 97 | typename BlockRadixSortT::TempStorage sort; 98 | 99 | struct 100 | { 101 | // Storage for detecting discontinuities in the tile of sorted bin values 102 | typename BlockDiscontinuityT::TempStorage flag; 103 | 104 | // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values 105 | unsigned int run_begin[BINS]; 106 | unsigned int run_end[BINS]; 107 | }; 108 | }; 109 | 110 | 111 | /// Alias wrapper allowing storage to be unioned 112 | struct TempStorage : Uninitialized<_TempStorage> {}; 113 | 114 | 115 | // Thread fields 116 | _TempStorage &temp_storage; 117 | int linear_tid; 118 | 119 | 120 | /// Constructor 121 | __device__ __forceinline__ BlockHistogramSort( 122 | TempStorage &temp_storage) 123 | : 124 | temp_storage(temp_storage.Alias()), 125 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) 126 | {} 127 | 128 | 129 | // Discontinuity functor 130 | struct DiscontinuityOp 131 | { 132 | // Reference to temp_storage 133 | _TempStorage &temp_storage; 134 | 135 | // Constructor 136 | __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : 137 | temp_storage(temp_storage) 138 | {} 139 | 140 | // Discontinuity predicate 141 | __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index) 142 | { 143 | if (a != b) 144 | { 145 | // Note the begin/end offsets in shared storage 146 | temp_storage.run_begin[b] = b_index; 147 | temp_storage.run_end[a] = b_index; 148 | 149 | return true; 150 | } 151 | else 152 | { 153 | return false; 154 | } 155 | } 156 | }; 157 | 158 | 159 | // Composite data onto an existing histogram 160 | template < 161 | typename CounterT > 162 | __device__ __forceinline__ void Composite( 163 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 164 | CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram 165 | { 166 | enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; 167 | 168 | // Sort bytes in blocked arrangement 169 | BlockRadixSortT(temp_storage.sort).Sort(items); 170 | 171 | __syncthreads(); 172 | 173 | // Initialize the shared memory's run_begin and run_end for each bin 174 | int histo_offset = 0; 175 | 176 | #pragma unroll 177 | for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) 178 | { 179 | temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; 180 | temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; 181 | } 182 | // Finish up with guarded initialization if necessary 183 | if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) 184 | { 185 | temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; 186 | temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; 187 | } 188 | 189 | __syncthreads(); 190 | 191 | int flags[ITEMS_PER_THREAD]; // unused 192 | 193 | // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile 194 | DiscontinuityOp flag_op(temp_storage); 195 | BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); 196 | 197 | // Update begin for first item 198 
| if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0; 199 | 200 | __syncthreads(); 201 | 202 | // Composite into histogram 203 | histo_offset = 0; 204 | 205 | #pragma unroll 206 | for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) 207 | { 208 | int thread_offset = histo_offset + linear_tid; 209 | CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; 210 | histogram[thread_offset] += count; 211 | } 212 | 213 | // Finish up with guarded composition if necessary 214 | if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) 215 | { 216 | int thread_offset = histo_offset + linear_tid; 217 | CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; 218 | histogram[thread_offset] += count; 219 | } 220 | } 221 | 222 | }; 223 | 224 | } // CUB namespace 225 | CUB_NS_POSTFIX // Optional outer namespace(s) 226 | 227 | -------------------------------------------------------------------------------- /cub/block/specializations/block_reduce_raking.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. 
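 *
 * (Sketch of the strategy implemented below: each thread deposits its partial into
 * a padded shared-memory raking grid, the block synchronizes, a single warp rakes
 * SEGMENT_LENGTH consecutive partials per lane, and a final warp-synchronous
 * WarpReduce yields the block-wide aggregate, which is valid only in thread0.)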
32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../block/block_raking_layout.cuh" 37 | #include "../../warp/warp_reduce.cuh" 38 | #include "../../thread/thread_reduce.cuh" 39 | #include "../../util_ptx.cuh" 40 | #include "../../util_namespace.cuh" 41 | 42 | /// Optional outer namespace(s) 43 | CUB_NS_PREFIX 44 | 45 | /// CUB namespace 46 | namespace cub { 47 | 48 | 49 | /** 50 | * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. 51 | * 52 | * Supports non-commutative binary reduction operators. Unlike commutative 53 | * reduction operators (e.g., addition), the application of a non-commutative 54 | * reduction operator (e.g, string concatenation) across a sequence of inputs must 55 | * honor the relative ordering of items and partial reductions when applying the 56 | * reduction operator. 57 | * 58 | * Compared to the implementation of BlockReduceRaking (which does not support 59 | * non-commutative operators), this implementation requires a few extra 60 | * rounds of inter-thread communication. 61 | */ 62 | template < 63 | typename T, ///< Data type being reduced 64 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension 65 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension 66 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension 67 | int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective 68 | struct BlockReduceRaking 69 | { 70 | /// Constants 71 | enum 72 | { 73 | /// The thread block size in threads 74 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, 75 | }; 76 | 77 | /// Layout type for padded thread block raking grid 78 | typedef BlockRakingLayout BlockRakingLayout; 79 | 80 | /// WarpReduce utility type 81 | typedef typename WarpReduce::InternalWarpReduce WarpReduce; 82 | 83 | /// Constants 84 | enum 85 | { 86 | /// Number of raking threads 87 | RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, 88 | 89 | /// Number of raking elements per warp synchronous raking thread 90 | SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, 91 | 92 | /// Cooperative work can be entirely warp synchronous 93 | WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS), 94 | 95 | /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two 96 | WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo::VALUE, 97 | 98 | /// Whether or not accesses into smem are unguarded 99 | RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED, 100 | 101 | }; 102 | 103 | 104 | /// Shared memory storage layout type 105 | union _TempStorage 106 | { 107 | typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction 108 | typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid 109 | }; 110 | 111 | 112 | /// Alias wrapper allowing storage to be unioned 113 | struct TempStorage : Uninitialized<_TempStorage> {}; 114 | 115 | 116 | // Thread fields 117 | _TempStorage &temp_storage; 118 | int linear_tid; 119 | 120 | 121 | /// Constructor 122 | __device__ __forceinline__ BlockReduceRaking( 123 | TempStorage &temp_storage) 124 | : 125 | temp_storage(temp_storage.Alias()), 126 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) 127 | {} 128 | 129 | 130 | template 131 | __device__ __forceinline__ T RakingReduction( 132 | ReductionOp reduction_op, ///< [in] Binary scan operator 133 | T 
*raking_segment, 134 | T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items 135 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 136 | Int2Type iteration) 137 | { 138 | // Update partial if addend is in range 139 | if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) 140 | { 141 | T addend = raking_segment[ITERATION]; 142 | partial = reduction_op(partial, addend); 143 | } 144 | return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); 145 | } 146 | 147 | template 148 | __device__ __forceinline__ T RakingReduction( 149 | ReductionOp reduction_op, ///< [in] Binary scan operator 150 | T *raking_segment, 151 | T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items 152 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 153 | Int2Type iteration) 154 | { 155 | return partial; 156 | } 157 | 158 | 159 | 160 | /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 161 | template < 162 | bool IS_FULL_TILE, 163 | typename ReductionOp> 164 | __device__ __forceinline__ T Reduce( 165 | T partial, ///< [in] Calling thread's input partial reductions 166 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 167 | ReductionOp reduction_op) ///< [in] Binary reduction operator 168 | { 169 | if (WARP_SYNCHRONOUS) 170 | { 171 | // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) 172 | partial = WarpReduce(temp_storage.warp_storage).template Reduce( 173 | partial, 174 | num_valid, 175 | reduction_op); 176 | } 177 | else 178 | { 179 | // Place partial into shared memory grid. 180 | *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; 181 | 182 | __syncthreads(); 183 | 184 | // Reduce parallelism to one warp 185 | if (linear_tid < RAKING_THREADS) 186 | { 187 | // Raking reduction in grid 188 | T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); 189 | partial = raking_segment[0]; 190 | 191 | partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); 192 | 193 | partial = WarpReduce(temp_storage.warp_storage).template Reduce( 194 | partial, 195 | num_valid, 196 | reduction_op); 197 | 198 | } 199 | } 200 | 201 | return partial; 202 | } 203 | 204 | 205 | /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
206 | template 207 | __device__ __forceinline__ T Sum( 208 | T partial, ///< [in] Calling thread's input partial reductions 209 | int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 210 | { 211 | cub::Sum reduction_op; 212 | 213 | return Reduce(partial, num_valid, reduction_op); 214 | } 215 | 216 | 217 | 218 | }; 219 | 220 | } // CUB namespace 221 | CUB_NS_POSTFIX // Optional outer namespace(s) 222 | 223 | -------------------------------------------------------------------------------- /cub/block/specializations/block_reduce_raking_commutative_only.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "block_reduce_raking.cuh" 37 | #include "../../warp/warp_reduce.cuh" 38 | #include "../../thread/thread_reduce.cuh" 39 | #include "../../util_ptx.cuh" 40 | #include "../../util_namespace.cuh" 41 | 42 | /// Optional outer namespace(s) 43 | CUB_NS_PREFIX 44 | 45 | /// CUB namespace 46 | namespace cub { 47 | 48 | 49 | /** 50 | * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size. 
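 *
 * (Sketch of the strategy implemented below: threads beyond the single raking warp
 * deposit their partials into the shared raking grid, each raking lane folds its
 * own partial into its raked segment via ThreadReduce, and a warp-synchronous
 * WarpReduce finishes the block aggregate; when BLOCK_THREADS is not a multiple of
 * the warp size, or the tile is not full, the BlockReduceRaking fall-back is used.)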
51 | */ 52 | template < 53 | typename T, ///< Data type being reduced 54 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension 55 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension 56 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension 57 | int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective 58 | struct BlockReduceRakingCommutativeOnly 59 | { 60 | /// Constants 61 | enum 62 | { 63 | /// The thread block size in threads 64 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, 65 | }; 66 | 67 | // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values 68 | typedef BlockReduceRaking FallBack; 69 | 70 | /// Constants 71 | enum 72 | { 73 | /// Number of warp threads 74 | WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), 75 | 76 | /// Whether or not to use fall-back 77 | USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), 78 | 79 | /// Number of raking threads 80 | RAKING_THREADS = WARP_THREADS, 81 | 82 | /// Number of threads actually sharing items with the raking threads 83 | SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS), 84 | 85 | /// Number of raking elements per warp synchronous raking thread 86 | SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS, 87 | }; 88 | 89 | /// WarpReduce utility type 90 | typedef WarpReduce WarpReduce; 91 | 92 | /// Layout type for padded thread block raking grid 93 | typedef BlockRakingLayout BlockRakingLayout; 94 | 95 | /// Shared memory storage layout type 96 | struct _TempStorage 97 | { 98 | union 99 | { 100 | struct 101 | { 102 | typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction 103 | typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid 104 | }; 105 | typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan 106 | }; 107 | }; 108 | 109 | 110 | /// Alias wrapper allowing storage to be unioned 111 | struct TempStorage : Uninitialized<_TempStorage> {}; 112 | 113 | 114 | // Thread fields 115 | _TempStorage &temp_storage; 116 | int linear_tid; 117 | 118 | 119 | /// Constructor 120 | __device__ __forceinline__ BlockReduceRakingCommutativeOnly( 121 | TempStorage &temp_storage) 122 | : 123 | temp_storage(temp_storage.Alias()), 124 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) 125 | {} 126 | 127 | 128 | /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
129 | template 130 | __device__ __forceinline__ T Sum( 131 | T partial, ///< [in] Calling thread's input partial reductions 132 | int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 133 | { 134 | if (USE_FALLBACK || !FULL_TILE) 135 | { 136 | return FallBack(temp_storage.fallback_storage).template Sum(partial, num_valid); 137 | } 138 | else 139 | { 140 | // Place partial into shared memory grid 141 | if (linear_tid >= RAKING_THREADS) 142 | *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; 143 | 144 | __syncthreads(); 145 | 146 | // Reduce parallelism to one warp 147 | if (linear_tid < RAKING_THREADS) 148 | { 149 | // Raking reduction in grid 150 | T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); 151 | partial = ThreadReduce(raking_segment, cub::Sum(), partial); 152 | 153 | // Warpscan 154 | partial = WarpReduce(temp_storage.warp_storage).Sum(partial); 155 | } 156 | } 157 | 158 | return partial; 159 | } 160 | 161 | 162 | /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 163 | template < 164 | bool FULL_TILE, 165 | typename ReductionOp> 166 | __device__ __forceinline__ T Reduce( 167 | T partial, ///< [in] Calling thread's input partial reductions 168 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 169 | ReductionOp reduction_op) ///< [in] Binary reduction operator 170 | { 171 | if (USE_FALLBACK || !FULL_TILE) 172 | { 173 | return FallBack(temp_storage.fallback_storage).template Reduce(partial, num_valid, reduction_op); 174 | } 175 | else 176 | { 177 | // Place partial into shared memory grid 178 | if (linear_tid >= RAKING_THREADS) 179 | *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; 180 | 181 | __syncthreads(); 182 | 183 | // Reduce parallelism to one warp 184 | if (linear_tid < RAKING_THREADS) 185 | { 186 | // Raking reduction in grid 187 | T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); 188 | partial = ThreadReduce(raking_segment, reduction_op, partial); 189 | 190 | // Warpscan 191 | partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op); 192 | } 193 | } 194 | 195 | return partial; 196 | } 197 | 198 | }; 199 | 200 | } // CUB namespace 201 | CUB_NS_POSTFIX // Optional outer namespace(s) 202 | 203 | -------------------------------------------------------------------------------- /cub/block/specializations/block_reduce_warp_reductions.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock. Supports non-commutative reduction operators. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../warp/warp_reduce.cuh" 37 | #include "../../util_ptx.cuh" 38 | #include "../../util_arch.cuh" 39 | #include "../../util_namespace.cuh" 40 | 41 | /// Optional outer namespace(s) 42 | CUB_NS_PREFIX 43 | 44 | /// CUB namespace 45 | namespace cub { 46 | 47 | 48 | /** 49 | * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock. Supports non-commutative reduction operators. 
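 *
 * (Sketch of the strategy implemented below: every warp first reduces its own
 * partials with WarpReduce, lane 0 of each warp publishes its warp aggregate to
 * shared memory, and after a barrier thread0 combines the per-warp aggregates in
 * warp order, which preserves correctness for non-commutative operators.)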
50 | */ 51 | template < 52 | typename T, ///< Data type being reduced 53 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension 54 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension 55 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension 56 | int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective 57 | struct BlockReduceWarpReductions 58 | { 59 | /// Constants 60 | enum 61 | { 62 | /// The thread block size in threads 63 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, 64 | 65 | /// Number of warp threads 66 | WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), 67 | 68 | /// Number of active warps 69 | WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, 70 | 71 | /// The logical warp size for warp reductions 72 | LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS), 73 | 74 | /// Whether or not the logical warp size evenly divides the threadblock size 75 | EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) 76 | }; 77 | 78 | 79 | /// WarpReduce utility type 80 | typedef typename WarpReduce::InternalWarpReduce WarpReduce; 81 | 82 | 83 | /// Shared memory storage layout type 84 | struct _TempStorage 85 | { 86 | typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan 87 | T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan 88 | T block_prefix; ///< Shared prefix for the entire threadblock 89 | }; 90 | 91 | /// Alias wrapper allowing storage to be unioned 92 | struct TempStorage : Uninitialized<_TempStorage> {}; 93 | 94 | 95 | // Thread fields 96 | _TempStorage &temp_storage; 97 | int linear_tid; 98 | int warp_id; 99 | int lane_id; 100 | 101 | 102 | /// Constructor 103 | __device__ __forceinline__ BlockReduceWarpReductions( 104 | TempStorage &temp_storage) 105 | : 106 | temp_storage(temp_storage.Alias()), 107 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), 108 | warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), 109 | lane_id(LaneId()) 110 | {} 111 | 112 | 113 | template 114 | __device__ __forceinline__ T ApplyWarpAggregates( 115 | ReductionOp reduction_op, ///< [in] Binary scan operator 116 | T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items 117 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 118 | Int2Type successor_warp) 119 | { 120 | if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) 121 | { 122 | T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP]; 123 | warp_aggregate = reduction_op(warp_aggregate, addend); 124 | } 125 | return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type()); 126 | } 127 | 128 | template 129 | __device__ __forceinline__ T ApplyWarpAggregates( 130 | ReductionOp reduction_op, ///< [in] Binary scan operator 131 | T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items 132 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 133 | Int2Type successor_warp) 134 | { 135 | return warp_aggregate; 136 | } 137 | 138 | 139 | /// Returns block-wide aggregate in thread0. 
140 | template < 141 | bool FULL_TILE, 142 | typename ReductionOp> 143 | __device__ __forceinline__ T ApplyWarpAggregates( 144 | ReductionOp reduction_op, ///< [in] Binary scan operator 145 | T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items 146 | int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 147 | { 148 | // Share lane aggregates 149 | if (lane_id == 0) 150 | { 151 | temp_storage.warp_aggregates[warp_id] = warp_aggregate; 152 | } 153 | 154 | __syncthreads(); 155 | 156 | // Update total aggregate in warp 0, lane 0 157 | if (linear_tid == 0) 158 | { 159 | warp_aggregate = ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type<1>()); 160 | } 161 | 162 | return warp_aggregate; 163 | } 164 | 165 | 166 | /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 167 | template 168 | __device__ __forceinline__ T Sum( 169 | T input, ///< [in] Calling thread's input partial reductions 170 | int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 171 | { 172 | cub::Sum reduction_op; 173 | unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; 174 | unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? 175 | LOGICAL_WARP_SIZE : 176 | (warp_offset < num_valid) ? 177 | num_valid - warp_offset : 178 | 0; 179 | 180 | // Warp reduction in every warp 181 | T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( 182 | input, 183 | warp_num_valid, 184 | cub::Sum()); 185 | 186 | // Update outputs and block_aggregate with warp-wide aggregates from lane-0s 187 | return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); 188 | } 189 | 190 | 191 | /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 192 | template < 193 | bool FULL_TILE, 194 | typename ReductionOp> 195 | __device__ __forceinline__ T Reduce( 196 | T input, ///< [in] Calling thread's input partial reductions 197 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 198 | ReductionOp reduction_op) ///< [in] Binary reduction operator 199 | { 200 | unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; 201 | unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? 202 | LOGICAL_WARP_SIZE : 203 | (warp_offset < num_valid) ? 204 | num_valid - warp_offset : 205 | 0; 206 | 207 | // Warp reduction in every warp 208 | T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( 209 | input, 210 | warp_num_valid, 211 | reduction_op); 212 | 213 | // Update outputs and block_aggregate with warp-wide aggregates from lane-0s 214 | return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); 215 | } 216 | 217 | }; 218 | 219 | 220 | } // CUB namespace 221 | CUB_NS_POSTFIX // Optional outer namespace(s) 222 | 223 | -------------------------------------------------------------------------------- /cub/cub.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. 
All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * CUB umbrella include file 32 | */ 33 | 34 | #pragma once 35 | 36 | 37 | // Block 38 | #include "block/block_histogram.cuh" 39 | #include "block/block_discontinuity.cuh" 40 | #include "block/block_exchange.cuh" 41 | #include "block/block_load.cuh" 42 | #include "block/block_radix_rank.cuh" 43 | #include "block/block_radix_sort.cuh" 44 | #include "block/block_reduce.cuh" 45 | #include "block/block_scan.cuh" 46 | #include "block/block_store.cuh" 47 | //#include "block/block_shift.cuh" 48 | 49 | // Device 50 | #include "device/device_histogram.cuh" 51 | #include "device/device_partition.cuh" 52 | #include "device/device_radix_sort.cuh" 53 | #include "device/device_reduce.cuh" 54 | #include "device/device_run_length_encode.cuh" 55 | #include "device/device_scan.cuh" 56 | #include "device/device_segmented_radix_sort.cuh" 57 | #include "device/device_segmented_reduce.cuh" 58 | #include "device/device_select.cuh" 59 | #include "device/device_spmv.cuh" 60 | 61 | // Grid 62 | //#include "grid/grid_barrier.cuh" 63 | #include "grid/grid_even_share.cuh" 64 | #include "grid/grid_mapping.cuh" 65 | #include "grid/grid_queue.cuh" 66 | 67 | // Thread 68 | #include "thread/thread_load.cuh" 69 | #include "thread/thread_operators.cuh" 70 | #include "thread/thread_reduce.cuh" 71 | #include "thread/thread_scan.cuh" 72 | #include "thread/thread_store.cuh" 73 | 74 | // Warp 75 | #include "warp/warp_reduce.cuh" 76 | #include "warp/warp_scan.cuh" 77 | 78 | // Iterator 79 | #include "iterator/arg_index_input_iterator.cuh" 80 | #include "iterator/cache_modified_input_iterator.cuh" 81 | #include "iterator/cache_modified_output_iterator.cuh" 82 | #include "iterator/constant_input_iterator.cuh" 83 | #include "iterator/counting_input_iterator.cuh" 84 | #include "iterator/tex_obj_input_iterator.cuh" 85 | #include "iterator/tex_ref_input_iterator.cuh" 86 | 
#include "iterator/transform_input_iterator.cuh" 87 | 88 | // Util 89 | #include "util_allocator.cuh" 90 | #include "util_arch.cuh" 91 | #include "util_debug.cuh" 92 | #include "util_device.cuh" 93 | #include "util_macro.cuh" 94 | #include "util_ptx.cuh" 95 | #include "util_type.cuh" 96 | 97 | -------------------------------------------------------------------------------- /cub/device/device_spmv.cuh: -------------------------------------------------------------------------------- 1 | 2 | /****************************************************************************** 3 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 4 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * * Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * * Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * * Neither the name of the NVIDIA CORPORATION nor the 14 | * names of its contributors may be used to endorse or promote products 15 | * derived from this software without specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 21 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | * 28 | ******************************************************************************/ 29 | 30 | /** 31 | * \file 32 | * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). 33 | */ 34 | 35 | #pragma once 36 | 37 | #include 38 | #include 39 | #include 40 | 41 | #include "dispatch/dispatch_spmv_orig.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | /// Optional outer namespace(s) 45 | CUB_NS_PREFIX 46 | 47 | /// CUB namespace 48 | namespace cub { 49 | 50 | 51 | /** 52 | * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV). 
53 | * \ingroup SingleModule 54 | * 55 | * \par Overview 56 | * The [SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication) 57 | * performs the matrix-vector operation 58 | * y = alpha*A*x + beta*y, 59 | * where: 60 | * - A is an mxn sparse matrix whose non-zero structure is specified in 61 | * [compressed-storage-row (CSR) format](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29) 62 | * (i.e., three arrays: values, row_offsets, and column_indices) 63 | * - x and y are dense vectors 64 | * - alpha and beta are scalar multiplicands 65 | * 66 | * \par Usage Considerations 67 | * \cdp_class{DeviceSpmv} 68 | * 69 | */ 70 | struct DeviceSpmv 71 | { 72 | /******************************************************************//** 73 | * \name CSR matrix operations 74 | *********************************************************************/ 75 | //@{ 76 | 77 | /** 78 | * \brief This function performs the matrix-vector operation y = A*x. 79 | * 80 | * \par Snippet 81 | * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A 82 | * representing a 3x3 lattice (24 non-zeros). 83 | * 84 | * \par 85 | * \code 86 | * #include // or equivalently 87 | * 88 | * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x, 89 | * // and output vector y 90 | * int num_rows = 9; 91 | * int num_cols = 9; 92 | * int num_nonzeros = 24; 93 | * 94 | * float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 95 | * // 1, 1, 1, 1, 1, 1, 1, 1, 96 | * // 1, 1, 1, 1, 1, 1, 1, 1] 97 | * 98 | * int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, 99 | * // 4, 6, 1, 3, 5, 7, 2, 4, 100 | * // 8, 3, 7, 4, 6, 8, 5, 7] 101 | * 102 | * int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] 103 | * 104 | * float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] 105 | * float* d_vector_y; // e.g., [ , , , , , , , , ] 106 | * ... 107 | * 108 | * // Determine temporary device storage requirements 109 | * void* d_temp_storage = NULL; 110 | * size_t temp_storage_bytes = 0; 111 | * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, 112 | * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, 113 | * num_rows, num_cols, num_nonzeros, alpha, beta); 114 | * 115 | * // Allocate temporary storage 116 | * cudaMalloc(&d_temp_storage, temp_storage_bytes); 117 | * 118 | * // Run SpMV 119 | * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, 120 | * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, 121 | * num_rows, num_cols, num_nonzeros, alpha, beta); 122 | * 123 | * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] 124 | * 125 | * \endcode 126 | * 127 | * \tparam ValueT [inferred] Matrix and vector value type (e.g., /p float, /p double, etc.) 128 | */ 129 | template < 130 | typename ValueT> 131 | CUB_RUNTIME_FUNCTION 132 | static cudaError_t CsrMV( 133 | void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 134 | size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 135 | ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. 
136 | int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros) 137 | int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) 138 | ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x 139 | ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y 140 | int num_rows, ///< [in] number of rows of matrix A. 141 | int num_cols, ///< [in] number of columns of matrix A. 142 | int num_nonzeros, ///< [in] number of nonzero elements of matrix A. 143 | cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 144 | bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 145 | { 146 | SpmvParams spmv_params; 147 | spmv_params.d_values = d_values; 148 | spmv_params.d_row_end_offsets = d_row_offsets + 1; 149 | spmv_params.d_column_indices = d_column_indices; 150 | spmv_params.d_vector_x = d_vector_x; 151 | spmv_params.d_vector_y = d_vector_y; 152 | spmv_params.num_rows = num_rows; 153 | spmv_params.num_cols = num_cols; 154 | spmv_params.num_nonzeros = num_nonzeros; 155 | spmv_params.alpha = 1.0; 156 | spmv_params.beta = 0.0; 157 | 158 | return DispatchSpmv::Dispatch( 159 | d_temp_storage, 160 | temp_storage_bytes, 161 | spmv_params, 162 | stream, 163 | debug_synchronous); 164 | } 165 | 166 | //@} end member group 167 | }; 168 | 169 | 170 | 171 | } // CUB namespace 172 | CUB_NS_POSTFIX // Optional outer namespace(s) 173 | 174 | 175 | -------------------------------------------------------------------------------- /cub/grid/grid_barrier.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_debug.cuh" 37 | #include "../util_namespace.cuh" 38 | #include "../thread/thread_load.cuh" 39 | 40 | /// Optional outer namespace(s) 41 | CUB_NS_PREFIX 42 | 43 | /// CUB namespace 44 | namespace cub { 45 | 46 | 47 | /** 48 | * \addtogroup GridModule 49 | * @{ 50 | */ 51 | 52 | 53 | /** 54 | * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid 55 | */ 56 | class GridBarrier 57 | { 58 | protected : 59 | 60 | typedef unsigned int SyncFlag; 61 | 62 | // Counters in global device memory 63 | SyncFlag* d_sync; 64 | 65 | public: 66 | 67 | /** 68 | * Constructor 69 | */ 70 | GridBarrier() : d_sync(NULL) {} 71 | 72 | 73 | /** 74 | * Synchronize 75 | */ 76 | __device__ __forceinline__ void Sync() const 77 | { 78 | volatile SyncFlag *d_vol_sync = d_sync; 79 | 80 | // Threadfence and syncthreads to make sure global writes are visible before 81 | // thread-0 reports in with its sync counter 82 | __threadfence(); 83 | __syncthreads(); 84 | 85 | if (blockIdx.x == 0) 86 | { 87 | // Report in ourselves 88 | if (threadIdx.x == 0) 89 | { 90 | d_vol_sync[blockIdx.x] = 1; 91 | } 92 | 93 | __syncthreads(); 94 | 95 | // Wait for everyone else to report in 96 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 97 | { 98 | while (ThreadLoad(d_sync + peer_block) == 0) 99 | { 100 | __threadfence_block(); 101 | } 102 | } 103 | 104 | __syncthreads(); 105 | 106 | // Let everyone know it's safe to proceed 107 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 108 | { 109 | d_vol_sync[peer_block] = 0; 110 | } 111 | } 112 | else 113 | { 114 | if (threadIdx.x == 0) 115 | { 116 | // Report in 117 | d_vol_sync[blockIdx.x] = 1; 118 | 119 | // Wait for acknowledgment 120 | while (ThreadLoad(d_sync + blockIdx.x) == 1) 121 | { 122 | __threadfence_block(); 123 | } 124 | } 125 | 126 | __syncthreads(); 127 | } 128 | } 129 | }; 130 | 131 | 132 | /** 133 | * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. 134 | * 135 | * Uses RAII for lifetime, i.e., device resources are reclaimed when 136 | * the destructor is called. 
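 *
 * \par
 * (Illustrative usage sketch added for clarity; it is not part of the original
 * documentation.  The kernel, launch configuration, and device allocations are
 * hypothetical, and the software barrier is only valid when all \p grid_size
 * thread blocks can be co-resident on the device.)
 * \code
 * // Device side: two phases separated by a grid-wide software barrier
 * __global__ void TwoPhaseKernel(cub::GridBarrier barrier, int *d_in, int *d_out)
 * {
 *     if (threadIdx.x == 0)
 *         d_in[blockIdx.x] = blockIdx.x;                            // phase 1: each block publishes a value
 *
 *     barrier.Sync();                                               // every block (and every thread) rendezvous here
 *
 *     if (threadIdx.x == 0)
 *         d_out[blockIdx.x] = d_in[(blockIdx.x + 1) % gridDim.x];   // phase 2: safely read a peer's value
 * }
 *
 * // Host side: GridBarrierLifetime owns (and lazily allocates) the sync counters
 * int block_size = 128;
 * int grid_size  = 32;                     // must not exceed the co-resident block count
 * int *d_in, *d_out;                       // device allocations of grid_size ints (allocation omitted)
 * cub::GridBarrierLifetime barrier;
 * barrier.Setup(grid_size);                // lazily allocates and zero-initializes the counters
 * TwoPhaseKernel<<<grid_size, block_size>>>(barrier, d_in, d_out);
 * \endcode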
137 | */ 138 | class GridBarrierLifetime : public GridBarrier 139 | { 140 | protected: 141 | 142 | // Number of bytes backed by d_sync 143 | size_t sync_bytes; 144 | 145 | public: 146 | 147 | /** 148 | * Constructor 149 | */ 150 | GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} 151 | 152 | 153 | /** 154 | * DeviceFrees and resets the progress counters 155 | */ 156 | cudaError_t HostReset() 157 | { 158 | cudaError_t retval = cudaSuccess; 159 | if (d_sync) 160 | { 161 | CubDebug(retval = cudaFree(d_sync)); 162 | d_sync = NULL; 163 | } 164 | sync_bytes = 0; 165 | return retval; 166 | } 167 | 168 | 169 | /** 170 | * Destructor 171 | */ 172 | virtual ~GridBarrierLifetime() 173 | { 174 | HostReset(); 175 | } 176 | 177 | 178 | /** 179 | * Sets up the progress counters for the next kernel launch (lazily 180 | * allocating and initializing them if necessary) 181 | */ 182 | cudaError_t Setup(int sweep_grid_size) 183 | { 184 | cudaError_t retval = cudaSuccess; 185 | do { 186 | size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); 187 | if (new_sync_bytes > sync_bytes) 188 | { 189 | if (d_sync) 190 | { 191 | if (CubDebug(retval = cudaFree(d_sync))) break; 192 | } 193 | 194 | sync_bytes = new_sync_bytes; 195 | 196 | // Allocate and initialize to zero 197 | if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; 198 | if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; 199 | } 200 | } while (0); 201 | 202 | return retval; 203 | } 204 | }; 205 | 206 | 207 | /** @} */ // end group GridModule 208 | 209 | } // CUB namespace 210 | CUB_NS_POSTFIX // Optional outer namespace(s) 211 | 212 | -------------------------------------------------------------------------------- /cub/grid/grid_even_share.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #include "../util_namespace.cuh" 38 | #include "../util_macro.cuh" 39 | 40 | /// Optional outer namespace(s) 41 | CUB_NS_PREFIX 42 | 43 | /// CUB namespace 44 | namespace cub { 45 | 46 | 47 | /** 48 | * \addtogroup GridModule 49 | * @{ 50 | */ 51 | 52 | 53 | /** 54 | * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). 55 | * 56 | * \par Overview 57 | * GridEvenShare indicates which sections of input are to be mapped onto which threadblocks. 58 | * Threadblocks may receive one of three different amounts of work: "big", "normal", 59 | * and "last". The "big" workloads are one scheduling grain larger than "normal". The "last" work unit 60 | * for the last threadblock may be partially-full if the input is not an even multiple of 61 | * the scheduling grain size. 62 | * 63 | * \par 64 | * Before invoking a child grid, a parent thread will typically construct an instance of 65 | * GridEvenShare. The instance can be passed to child threadblocks which can 66 | * initialize their per-threadblock offsets using \p BlockInit(). 67 | * 68 | * \tparam OffsetT Signed integer type for global offsets 69 | */ 70 | template 71 | struct GridEvenShare 72 | { 73 | OffsetT total_grains; 74 | int big_blocks; 75 | OffsetT big_share; 76 | OffsetT normal_share; 77 | OffsetT normal_base_offset; 78 | 79 | /// Total number of input items 80 | OffsetT num_items; 81 | 82 | /// Grid size in threadblocks 83 | int grid_size; 84 | 85 | /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles 86 | OffsetT block_offset; 87 | 88 | /// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles 89 | OffsetT block_end; 90 | 91 | /** 92 | * \brief Default constructor. Zero-initializes block-specific fields. 93 | */ 94 | __host__ __device__ __forceinline__ GridEvenShare() : 95 | num_items(0), 96 | grid_size(0), 97 | block_offset(0), 98 | block_end(0) {} 99 | 100 | /** 101 | * \brief Constructor. Initializes the grid-specific members \p num_items and \p grid_size. To be called prior prior to kernel launch) 102 | */ 103 | __host__ __device__ __forceinline__ GridEvenShare( 104 | OffsetT num_items, ///< Total number of input items 105 | int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) 106 | int schedule_granularity) ///< Granularity by which the input can be parcelled into and distributed among threablocks. 
Usually the thread block's native tile size (or a multiple thereof. 107 | { 108 | this->num_items = num_items; 109 | this->block_offset = num_items; 110 | this->block_end = num_items; 111 | this->total_grains = (num_items + schedule_granularity - 1) / schedule_granularity; 112 | this->grid_size = CUB_MIN(total_grains, max_grid_size); 113 | OffsetT grains_per_block = total_grains / grid_size; 114 | this->big_blocks = total_grains - (grains_per_block * grid_size); // leftover grains go to big blocks 115 | this->normal_share = grains_per_block * schedule_granularity; 116 | this->normal_base_offset = big_blocks * schedule_granularity; 117 | this->big_share = normal_share + schedule_granularity; 118 | } 119 | 120 | 121 | 122 | /** 123 | * \brief Initializes ranges for the specified partition index 124 | */ 125 | __device__ __forceinline__ void Init(int partition_id) 126 | { 127 | if (partition_id < big_blocks) 128 | { 129 | // This threadblock gets a big share of grains (grains_per_block + 1) 130 | block_offset = (partition_id * big_share); 131 | block_end = block_offset + big_share; 132 | } 133 | else if (partition_id < total_grains) 134 | { 135 | // This threadblock gets a normal share of grains (grains_per_block) 136 | block_offset = normal_base_offset + (partition_id * normal_share); 137 | block_end = CUB_MIN(num_items, block_offset + normal_share); 138 | } 139 | } 140 | 141 | 142 | /** 143 | * \brief Initializes ranges for the current thread block (e.g., to be called by each threadblock after startup) 144 | */ 145 | __device__ __forceinline__ void BlockInit() 146 | { 147 | Init(blockIdx.x); 148 | } 149 | 150 | 151 | /** 152 | * Print to stdout 153 | */ 154 | __host__ __device__ __forceinline__ void Print() 155 | { 156 | printf( 157 | #if (CUB_PTX_ARCH > 0) 158 | "\tthreadblock(%d) " 159 | "block_offset(%lu) " 160 | "block_end(%lu) " 161 | #endif 162 | "num_items(%lu) " 163 | "total_grains(%lu) " 164 | "big_blocks(%lu) " 165 | "big_share(%lu) " 166 | "normal_share(%lu)\n", 167 | #if (CUB_PTX_ARCH > 0) 168 | blockIdx.x, 169 | (unsigned long) block_offset, 170 | (unsigned long) block_end, 171 | #endif 172 | (unsigned long) num_items, 173 | (unsigned long) total_grains, 174 | (unsigned long) big_blocks, 175 | (unsigned long) big_share, 176 | (unsigned long) normal_share); 177 | } 178 | }; 179 | 180 | 181 | 182 | /** @} */ // end group GridModule 183 | 184 | } // CUB namespace 185 | CUB_NS_POSTFIX // Optional outer namespace(s) 186 | -------------------------------------------------------------------------------- /cub/grid/grid_mapping.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \addtogroup GridModule 47 | * @{ 48 | */ 49 | 50 | 51 | /****************************************************************************** 52 | * Mapping policies 53 | *****************************************************************************/ 54 | 55 | 56 | /** 57 | * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 58 | */ 59 | enum GridMappingStrategy 60 | { 61 | /** 62 | * \brief An "even-share" strategy for assigning input tiles to thread blocks. 63 | * 64 | * \par Overview 65 | * The input is evenly partitioned into \p p segments, where \p p is 66 | * constant and corresponds loosely to the number of thread blocks that may 67 | * actively reside on the target device. Each segment is comprised of 68 | * consecutive tiles, where a tile is a small, constant-sized unit of input 69 | * to be processed to completion before the thread block terminates or 70 | * obtains more work. The kernel invokes \p p thread blocks, each 71 | * of which iteratively consumes a segment of n/p elements 72 | * in tile-size increments. 73 | */ 74 | GRID_MAPPING_EVEN_SHARE, 75 | 76 | /** 77 | * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. 78 | * 79 | * \par Overview 80 | * The input is treated as a queue to be dynamically consumed by a grid of 81 | * thread blocks. Work is atomically dequeued in tiles, where a tile is a 82 | * unit of input to be processed to completion before the thread block 83 | * terminates or obtains more work. The grid size \p p is constant, 84 | * loosely corresponding to the number of thread blocks that may actively 85 | * reside on the target device. 
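     *
     * \par
     * (Illustrative sketch added for clarity, not part of the original
     * documentation.  TILE_ITEMS, ConsumeTile, and the enclosing kernel are
     * hypothetical; \p queue is a cub::GridQueue and \p num_items is the
     * queue's fill-size.)
     * \code
     * __shared__ int tile_offset;
     * while (true)
     * {
     *     if (threadIdx.x == 0)
     *         tile_offset = queue.Drain(TILE_ITEMS);    // atomically claim the next tile
     *     __syncthreads();
     *
     *     if (tile_offset >= num_items)
     *         break;                                    // queue exhausted
     *
     *     ConsumeTile(tile_offset);                     // process items [tile_offset, tile_offset + TILE_ITEMS)
     *     __syncthreads();                              // keep tile_offset stable until all threads are done
     * }
     * \endcode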
86 | */ 87 | GRID_MAPPING_DYNAMIC, 88 | }; 89 | 90 | 91 | /** @} */ // end group GridModule 92 | 93 | } // CUB namespace 94 | CUB_NS_POSTFIX // Optional outer namespace(s) 95 | 96 | -------------------------------------------------------------------------------- /cub/grid/grid_queue.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridQueue is a descriptor utility for dynamic queue management. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | #include "../util_debug.cuh" 38 | 39 | /// Optional outer namespace(s) 40 | CUB_NS_PREFIX 41 | 42 | /// CUB namespace 43 | namespace cub { 44 | 45 | 46 | /** 47 | * \addtogroup GridModule 48 | * @{ 49 | */ 50 | 51 | 52 | /** 53 | * \brief GridQueue is a descriptor utility for dynamic queue management. 54 | * 55 | * \par Overview 56 | * GridQueue descriptors provides abstractions for "filling" or 57 | * "draining" globally-shared vectors. 58 | * 59 | * \par 60 | * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, 61 | * returning a unique offset for the calling thread to write its items. 62 | * The GridQueue maintains the total "fill-size". The fill counter must be reset 63 | * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that 64 | * will be filling. 65 | * 66 | * \par 67 | * Similarly, a "draining" GridQueue works by works by atomically-incrementing a 68 | * zero-initialized counter, returning a unique offset for the calling thread to 69 | * read its items. Threads can safely drain until the array's logical fill-size is 70 | * exceeded. 
The drain counter must be reset using GridQueue::ResetDrain or 71 | * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that 72 | * will be filling. (For dynamic work distribution of existing data, the corresponding fill-size 73 | * is simply the number of elements in the array.) 74 | * 75 | * \par 76 | * Iterative work management can be implemented simply with a pair of flip-flopping 77 | * work buffers, each with an associated set of fill and drain GridQueue descriptors. 78 | * 79 | * \tparam OffsetT Signed integer type for global offsets 80 | */ 81 | template 82 | class GridQueue 83 | { 84 | private: 85 | 86 | /// Counter indices 87 | enum 88 | { 89 | FILL = 0, 90 | DRAIN = 1, 91 | }; 92 | 93 | /// Pair of counters 94 | OffsetT *d_counters; 95 | 96 | public: 97 | 98 | /// Returns the device allocation size in bytes needed to construct a GridQueue instance 99 | __host__ __device__ __forceinline__ 100 | static size_t AllocationSize() 101 | { 102 | return sizeof(OffsetT) * 2; 103 | } 104 | 105 | 106 | /// Constructs an invalid GridQueue descriptor 107 | __host__ __device__ __forceinline__ GridQueue() 108 | : 109 | d_counters(NULL) 110 | {} 111 | 112 | 113 | /// Constructs a GridQueue descriptor around the device storage allocation 114 | __host__ __device__ __forceinline__ GridQueue( 115 | void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). 116 | : 117 | d_counters((OffsetT*) d_storage) 118 | {} 119 | 120 | 121 | /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. 122 | __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain( 123 | OffsetT fill_size, 124 | cudaStream_t stream = 0) 125 | { 126 | #if (CUB_PTX_ARCH > 0) 127 | d_counters[FILL] = fill_size; 128 | d_counters[DRAIN] = 0; 129 | return cudaSuccess; 130 | #else 131 | OffsetT counters[2]; 132 | counters[FILL] = fill_size; 133 | counters[DRAIN] = 0; 134 | return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream)); 135 | #endif 136 | } 137 | 138 | 139 | /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining. 140 | __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) 141 | { 142 | #if (CUB_PTX_ARCH > 0) 143 | d_counters[DRAIN] = 0; 144 | return cudaSuccess; 145 | #else 146 | return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); 147 | #endif 148 | } 149 | 150 | 151 | /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. 152 | __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0) 153 | { 154 | #if (CUB_PTX_ARCH > 0) 155 | d_counters[FILL] = 0; 156 | return cudaSuccess; 157 | #else 158 | return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); 159 | #endif 160 | } 161 | 162 | 163 | /// Returns the fill-size established by the parent or by the previous kernel. 
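    // (Editorial sketch, not part of the original source.)  Typical host-side
    // preparation for a kernel that drains num_items pre-existing elements;
    // d_queue_storage, ConsumeKernel, grid_size, and block_size are hypothetical:
    //
    //     cub::GridQueue<int> queue(d_queue_storage);   // at least GridQueue<int>::AllocationSize() bytes
    //     queue.FillAndResetDrain(num_items, stream);   // establish the fill-size, zero the drain counter
    //     ConsumeKernel<<<grid_size, block_size, 0, stream>>>(queue, num_items);
    //
    // Device code then claims work via queue.Drain(tile_items) until the
    // returned offset reaches num_items.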
164 | __host__ __device__ __forceinline__ cudaError_t FillSize( 165 | OffsetT &fill_size, 166 | cudaStream_t stream = 0) 167 | { 168 | #if (CUB_PTX_ARCH > 0) 169 | fill_size = d_counters[FILL]; 170 | return cudaSuccess; 171 | #else 172 | return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream)); 173 | #endif 174 | } 175 | 176 | 177 | /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from CUDA kernel. 178 | __device__ __forceinline__ OffsetT Drain(OffsetT num_items) 179 | { 180 | return atomicAdd(d_counters + DRAIN, num_items); 181 | } 182 | 183 | 184 | /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from CUDA kernel. 185 | __device__ __forceinline__ OffsetT Fill(OffsetT num_items) 186 | { 187 | return atomicAdd(d_counters + FILL, num_items); 188 | } 189 | }; 190 | 191 | 192 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 193 | 194 | 195 | /** 196 | * Reset grid queue (call with 1 block of 1 thread) 197 | */ 198 | template 199 | __global__ void FillAndResetDrainKernel( 200 | GridQueue grid_queue, 201 | OffsetT num_items) 202 | { 203 | grid_queue.FillAndResetDrain(num_items); 204 | } 205 | 206 | 207 | 208 | #endif // DOXYGEN_SHOULD_SKIP_THIS 209 | 210 | 211 | /** @} */ // end group GridModule 212 | 213 | } // CUB namespace 214 | CUB_NS_POSTFIX // Optional outer namespace(s) 215 | 216 | 217 | -------------------------------------------------------------------------------- /cub/host/mutex.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple portable mutex 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #if __cplusplus > 199711L 38 | #include 39 | #else 40 | #if defined(_WIN32) || defined(_WIN64) 41 | #include 42 | #include 43 | #undef small // Windows is terrible for polluting macro namespace 44 | 45 | /** 46 | * Compiler read/write barrier 47 | */ 48 | #pragma intrinsic(_ReadWriteBarrier) 49 | 50 | #endif 51 | #endif 52 | 53 | #include "../util_namespace.cuh" 54 | 55 | 56 | /// Optional outer namespace(s) 57 | CUB_NS_PREFIX 58 | 59 | /// CUB namespace 60 | namespace cub { 61 | 62 | 63 | /** 64 | * Simple portable mutex 65 | * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) 66 | * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) 67 | */ 68 | struct Mutex 69 | { 70 | #if __cplusplus > 199711L 71 | 72 | std::mutex mtx; 73 | 74 | void Lock() 75 | { 76 | mtx.lock(); 77 | } 78 | 79 | void Unlock() 80 | { 81 | mtx.unlock(); 82 | } 83 | 84 | void TryLock() 85 | { 86 | mtx.try_lock(); 87 | } 88 | 89 | #else //__cplusplus > 199711L 90 | 91 | #if defined(_MSC_VER) 92 | 93 | // Microsoft VC++ 94 | typedef long Spinlock; 95 | 96 | #else 97 | 98 | // GNU g++ 99 | typedef int Spinlock; 100 | 101 | /** 102 | * Compiler read/write barrier 103 | */ 104 | __forceinline__ void _ReadWriteBarrier() 105 | { 106 | __sync_synchronize(); 107 | } 108 | 109 | /** 110 | * Atomic exchange 111 | */ 112 | __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) 113 | { 114 | // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier 115 | _ReadWriteBarrier(); 116 | return __sync_lock_test_and_set(Target, Value); 117 | } 118 | 119 | /** 120 | * Pause instruction to prevent excess processor bus usage 121 | */ 122 | __forceinline__ void YieldProcessor() 123 | { 124 | #ifndef __arm__ 125 | asm volatile("pause\n": : :"memory"); 126 | #endif // __arm__ 127 | } 128 | 129 | #endif // defined(_MSC_VER) 130 | 131 | /// Lock member 132 | volatile Spinlock lock; 133 | 134 | /** 135 | * Constructor 136 | */ 137 | Mutex() : lock(0) {} 138 | 139 | /** 140 | * Return when the specified spinlock has been acquired 141 | */ 142 | __forceinline__ void Lock() 143 | { 144 | while (1) 145 | { 146 | if (!_InterlockedExchange(&lock, 1)) return; 147 | while (lock) YieldProcessor(); 148 | } 149 | } 150 | 151 | 152 | /** 153 | * Release the specified spinlock 154 | */ 155 | __forceinline__ void Unlock() 156 | { 157 | _ReadWriteBarrier(); 158 | lock = 0; 159 | } 160 | 161 | #endif // __cplusplus > 199711L 162 | 163 | }; 164 | 165 | 166 | 167 | 168 | } // CUB namespace 169 | CUB_NS_POSTFIX // Optional outer namespace(s) 170 | 171 | -------------------------------------------------------------------------------- /cub/iterator/arg_index_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #include 45 | 46 | #if (THRUST_VERSION >= 100700) 47 | // This iterator is compatible with Thrust API 1.7 and newer 48 | #include 49 | #include 50 | #endif // THRUST_VERSION 51 | 52 | /// Optional outer namespace(s) 53 | CUB_NS_PREFIX 54 | 55 | /// CUB namespace 56 | namespace cub { 57 | 58 | /** 59 | * \addtogroup UtilIterator 60 | * @{ 61 | */ 62 | 63 | 64 | /** 65 | * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples). 66 | * 67 | * \par Overview 68 | * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT. 69 | * Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose 70 | * \p key field is \p i and whose \p value field is itr[i]. 71 | * - Can be used with any data type. 72 | * - Can be constructed, manipulated, and exchanged within and between host and device 73 | * functions. Wrapped host memory can only be dereferenced on the host, and wrapped 74 | * device memory can only be dereferenced on the device. 75 | * - Compatible with Thrust API v1.7 or newer. 
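 * - The wrapper is parameterized by the wrapped iterator type (plus an optional
 *   \p OffsetT, defaulting to \p ptrdiff_t); e.g., a \p double* array is wrapped
 *   as cub::ArgIndexInputIterator<double*>.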
76 | * 77 | * \par Snippet 78 | * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto 79 | * dereference an array of doubles 80 | * \par 81 | * \code 82 | * #include // or equivalently 83 | * 84 | * // Declare, allocate, and initialize a device array 85 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] 86 | * 87 | * // Create an iterator wrapper 88 | * cub::ArgIndexInputIterator itr(d_in); 89 | * 90 | * // Within device code: 91 | * typedef typename cub::ArgIndexInputIterator::value_type Tuple; 92 | * Tuple item_offset_pair.key = *itr; 93 | * printf("%f @ %d\n", 94 | * item_offset_pair.value, 95 | * item_offset_pair.key); // 8.0 @ 0 96 | * 97 | * itr = itr + 6; 98 | * item_offset_pair.key = *itr; 99 | * printf("%f @ %d\n", 100 | * item_offset_pair.value, 101 | * item_offset_pair.key); // 9.0 @ 6 102 | * 103 | * \endcode 104 | * 105 | * \tparam InputIteratorT The type of the wrapped input iterator 106 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 107 | */ 108 | template < 109 | typename InputIteratorT, 110 | typename OffsetT = ptrdiff_t> 111 | class ArgIndexInputIterator 112 | { 113 | private: 114 | 115 | // Data type of input iterator 116 | typedef typename std::iterator_traits::value_type T; 117 | 118 | public: 119 | 120 | 121 | // Required iterator traits 122 | typedef ArgIndexInputIterator self_type; ///< My own type 123 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 124 | typedef KeyValuePair value_type; ///< The type of the element the iterator can point to 125 | typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to 126 | typedef value_type reference; ///< The type of a reference to an element the iterator can point to 127 | 128 | #if (THRUST_VERSION >= 100700) 129 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 130 | typedef typename thrust::detail::iterator_facade_category< 131 | thrust::any_system_tag, 132 | thrust::random_access_traversal_tag, 133 | value_type, 134 | reference 135 | >::type iterator_category; ///< The iterator category 136 | #else 137 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 138 | #endif // THRUST_VERSION 139 | 140 | private: 141 | 142 | InputIteratorT itr; 143 | difference_type offset; 144 | 145 | public: 146 | 147 | /// Constructor 148 | __host__ __device__ __forceinline__ ArgIndexInputIterator( 149 | InputIteratorT itr, ///< Input iterator to wrap 150 | difference_type offset = 0) ///< OffsetT (in items) from \p itr denoting the position of the iterator 151 | : 152 | itr(itr), 153 | offset(offset) 154 | {} 155 | 156 | /// Postfix increment 157 | __host__ __device__ __forceinline__ self_type operator++(int) 158 | { 159 | self_type retval = *this; 160 | offset++; 161 | return retval; 162 | } 163 | 164 | /// Prefix increment 165 | __host__ __device__ __forceinline__ self_type operator++() 166 | { 167 | offset++; 168 | return *this; 169 | } 170 | 171 | /// Indirection 172 | __host__ __device__ __forceinline__ reference operator*() const 173 | { 174 | value_type retval; 175 | retval.value = itr[offset]; 176 | retval.key = offset; 177 | return retval; 178 | } 179 | 180 | /// Addition 181 | template 182 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 183 | { 184 | self_type retval(itr, offset + n); 185 | return retval; 186 | } 187 | 188 | /// Addition assignment 189 | 
template 190 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 191 | { 192 | offset += n; 193 | return *this; 194 | } 195 | 196 | /// Subtraction 197 | template 198 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 199 | { 200 | self_type retval(itr, offset - n); 201 | return retval; 202 | } 203 | 204 | /// Subtraction assignment 205 | template 206 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 207 | { 208 | offset -= n; 209 | return *this; 210 | } 211 | 212 | /// Distance 213 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 214 | { 215 | return offset - other.offset; 216 | } 217 | 218 | /// Array subscript 219 | template 220 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 221 | { 222 | self_type offset = (*this) + n; 223 | return *offset; 224 | } 225 | 226 | /// Structure dereference 227 | __host__ __device__ __forceinline__ pointer operator->() 228 | { 229 | return &(*(*this)); 230 | } 231 | 232 | /// Equal to 233 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 234 | { 235 | return ((itr == rhs.itr) && (offset == rhs.offset)); 236 | } 237 | 238 | /// Not equal to 239 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 240 | { 241 | return ((itr != rhs.itr) || (offset != rhs.offset)); 242 | } 243 | 244 | /// Normalize 245 | __host__ __device__ __forceinline__ void normalize() 246 | { 247 | itr += offset; 248 | offset = 0; 249 | } 250 | 251 | /// ostream operator 252 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 253 | { 254 | return os; 255 | } 256 | }; 257 | 258 | 259 | 260 | /** @} */ // end group UtilIterator 261 | 262 | } // CUB namespace 263 | CUB_NS_POSTFIX // Optional outer namespace(s) 264 | -------------------------------------------------------------------------------- /cub/iterator/cache_modified_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | 58 | 59 | /** 60 | * \addtogroup UtilIterator 61 | * @{ 62 | */ 63 | 64 | 65 | /** 66 | * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. 67 | * 68 | * \par Overview 69 | * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native 70 | * device pointer of type ValueType*. \p ValueType references are 71 | * made by reading \p ValueType values through loads modified by \p MODIFIER. 72 | * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", 73 | * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). 74 | * - Can be constructed, manipulated, and exchanged within and between host and device 75 | * functions, but can only be dereferenced within device functions. 76 | * - Compatible with Thrust API v1.7 or newer. 77 | * 78 | * \par Snippet 79 | * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto 80 | * dereference a device array of double using the "ldg" PTX load modifier 81 | * (i.e., load values through texture cache). 
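 * For the "ldg" modifier and \p double values the instantiation is
 * cub::CacheModifiedInputIterator<cub::LOAD_LDG, double>, constructed directly
 * from the device pointer.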
82 | * \par 83 | * \code 84 | * #include // or equivalently 85 | * 86 | * // Declare, allocate, and initialize a device array 87 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] 88 | * 89 | * // Create an iterator wrapper 90 | * cub::CacheModifiedInputIterator itr(d_in); 91 | * 92 | * // Within device code: 93 | * printf("%f\n", itr[0]); // 8.0 94 | * printf("%f\n", itr[1]); // 6.0 95 | * printf("%f\n", itr[6]); // 9.0 96 | * 97 | * \endcode 98 | * 99 | * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data 100 | * \tparam ValueType The value type of this iterator 101 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 102 | */ 103 | template < 104 | CacheLoadModifier MODIFIER, 105 | typename ValueType, 106 | typename OffsetT = ptrdiff_t> 107 | class CacheModifiedInputIterator 108 | { 109 | public: 110 | 111 | // Required iterator traits 112 | typedef CacheModifiedInputIterator self_type; ///< My own type 113 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 114 | typedef ValueType value_type; ///< The type of the element the iterator can point to 115 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 116 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 117 | 118 | #if (THRUST_VERSION >= 100700) 119 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 120 | typedef typename thrust::detail::iterator_facade_category< 121 | thrust::device_system_tag, 122 | thrust::random_access_traversal_tag, 123 | value_type, 124 | reference 125 | >::type iterator_category; ///< The iterator category 126 | #else 127 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 128 | #endif // THRUST_VERSION 129 | 130 | 131 | public: 132 | 133 | /// Wrapped native pointer 134 | ValueType* ptr; 135 | 136 | /// Constructor 137 | template 138 | __host__ __device__ __forceinline__ CacheModifiedInputIterator( 139 | QualifiedValueType* ptr) ///< Native pointer to wrap 140 | : 141 | ptr(const_cast::Type *>(ptr)) 142 | {} 143 | 144 | /// Postfix increment 145 | __host__ __device__ __forceinline__ self_type operator++(int) 146 | { 147 | self_type retval = *this; 148 | ptr++; 149 | return retval; 150 | } 151 | 152 | /// Prefix increment 153 | __host__ __device__ __forceinline__ self_type operator++() 154 | { 155 | ptr++; 156 | return *this; 157 | } 158 | 159 | /// Indirection 160 | __device__ __forceinline__ reference operator*() const 161 | { 162 | return ThreadLoad(ptr); 163 | } 164 | 165 | /// Addition 166 | template 167 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 168 | { 169 | self_type retval(ptr + n); 170 | return retval; 171 | } 172 | 173 | /// Addition assignment 174 | template 175 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 176 | { 177 | ptr += n; 178 | return *this; 179 | } 180 | 181 | /// Subtraction 182 | template 183 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 184 | { 185 | self_type retval(ptr - n); 186 | return retval; 187 | } 188 | 189 | /// Subtraction assignment 190 | template 191 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 192 | { 193 | ptr -= n; 194 | return *this; 195 | } 196 | 197 | /// Distance 198 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 
199 | { 200 | return ptr - other.ptr; 201 | } 202 | 203 | /// Array subscript 204 | template 205 | __device__ __forceinline__ reference operator[](Distance n) const 206 | { 207 | return ThreadLoad(ptr + n); 208 | } 209 | 210 | /// Structure dereference 211 | __device__ __forceinline__ pointer operator->() 212 | { 213 | return &ThreadLoad(ptr); 214 | } 215 | 216 | /// Equal to 217 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 218 | { 219 | return (ptr == rhs.ptr); 220 | } 221 | 222 | /// Not equal to 223 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 224 | { 225 | return (ptr != rhs.ptr); 226 | } 227 | 228 | /// ostream operator 229 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 230 | { 231 | return os; 232 | } 233 | }; 234 | 235 | 236 | 237 | /** @} */ // end group UtilIterator 238 | 239 | } // CUB namespace 240 | CUB_NS_POSTFIX // Optional outer namespace(s) 241 | -------------------------------------------------------------------------------- /cub/iterator/cache_modified_output_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
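A minimal kernel-level sketch of CacheModifiedInputIterator, complementing the doxygen snippet above: it wraps a device pointer with LOAD_LDG so each dereference goes through a read-only (texture-path) load where the hardware supports it. The kernel name, the scaling factor, and the assumption that the repository root is on the include path are illustrative, not part of the library.

#include <cuda_runtime.h>
#include <cub/iterator/cache_modified_input_iterator.cuh>

// Scale a device array, reading the input through LOAD_LDG loads.
// Each dereference in[i] lowers to ThreadLoad<LOAD_LDG>(d_in + i).
__global__ void ScaleLdg(const double *d_in, double *d_out, int n)
{
    cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> in(d_in);
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d_out[i] = in[i] * 2.0;
}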
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | 58 | /** 59 | * \addtogroup UtilIterator 60 | * @{ 61 | */ 62 | 63 | 64 | /** 65 | * \brief A random-access output wrapper for storing array values using a PTX cache-modifier. 66 | * 67 | * \par Overview 68 | * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native 69 | * device pointer of type ValueType*. \p ValueType references are 70 | * made by writing \p ValueType values through stores modified by \p MODIFIER. 71 | * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB", 72 | * "STORE_CG", "STORE_CS", "STORE_WT", etc.). 73 | * - Can be constructed, manipulated, and exchanged within and between host and device 74 | * functions, but can only be dereferenced within device functions. 75 | * - Compatible with Thrust API v1.7 or newer. 76 | * 77 | * \par Snippet 78 | * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to 79 | * dereference a device array of doubles using the "wt" PTX load modifier 80 | * (i.e., write-through to system memory). 81 | * \par 82 | * \code 83 | * #include // or equivalently 84 | * 85 | * // Declare, allocate, and initialize a device array 86 | * double *d_out; // e.g., [, , , , , , ] 87 | * 88 | * // Create an iterator wrapper 89 | * cub::CacheModifiedOutputIterator itr(d_out); 90 | * 91 | * // Within device code: 92 | * itr[0] = 8.0; 93 | * itr[1] = 66.0; 94 | * itr[55] = 24.0; 95 | * 96 | * \endcode 97 | * 98 | * \par Usage Considerations 99 | * - Can only be dereferenced within device code 100 | * 101 | * \tparam CacheStoreModifier The cub::CacheStoreModifier to use when accessing data 102 | * \tparam ValueType The value type of this iterator 103 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 104 | */ 105 | template < 106 | CacheStoreModifier MODIFIER, 107 | typename ValueType, 108 | typename OffsetT = ptrdiff_t> 109 | class CacheModifiedOutputIterator 110 | { 111 | private: 112 | 113 | // Proxy object 114 | struct Reference 115 | { 116 | ValueType* ptr; 117 | 118 | /// Constructor 119 | __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {} 120 | 121 | /// Assignment 122 | __device__ __forceinline__ ValueType operator =(ValueType val) 123 | { 124 | ThreadStore(ptr, val); 125 | return val; 126 | } 127 | }; 128 | 129 | public: 130 | 131 | // Required iterator traits 132 | typedef CacheModifiedOutputIterator self_type; ///< My own type 133 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 134 | typedef ValueType value_type; ///< The type of the element the iterator can point to 135 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 136 | typedef Reference reference; ///< The type of a reference to an element the iterator can point to 137 | 138 | 
#if (THRUST_VERSION >= 100700) 139 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 140 | typedef typename thrust::detail::iterator_facade_category< 141 | thrust::device_system_tag, 142 | thrust::random_access_traversal_tag, 143 | value_type, 144 | reference 145 | >::type iterator_category; ///< The iterator category 146 | #else 147 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 148 | #endif // THRUST_VERSION 149 | 150 | private: 151 | 152 | ValueType* ptr; 153 | 154 | public: 155 | 156 | /// Constructor 157 | template 158 | __host__ __device__ __forceinline__ CacheModifiedOutputIterator( 159 | QualifiedValueType* ptr) ///< Native pointer to wrap 160 | : 161 | ptr(const_cast::Type *>(ptr)) 162 | {} 163 | 164 | /// Postfix increment 165 | __host__ __device__ __forceinline__ self_type operator++(int) 166 | { 167 | self_type retval = *this; 168 | ptr++; 169 | return retval; 170 | } 171 | 172 | 173 | /// Prefix increment 174 | __host__ __device__ __forceinline__ self_type operator++() 175 | { 176 | ptr++; 177 | return *this; 178 | } 179 | 180 | /// Indirection 181 | __host__ __device__ __forceinline__ reference operator*() const 182 | { 183 | return Reference(ptr); 184 | } 185 | 186 | /// Addition 187 | template 188 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 189 | { 190 | self_type retval(ptr + n); 191 | return retval; 192 | } 193 | 194 | /// Addition assignment 195 | template 196 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 197 | { 198 | ptr += n; 199 | return *this; 200 | } 201 | 202 | /// Subtraction 203 | template 204 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 205 | { 206 | self_type retval(ptr - n); 207 | return retval; 208 | } 209 | 210 | /// Subtraction assignment 211 | template 212 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 213 | { 214 | ptr -= n; 215 | return *this; 216 | } 217 | 218 | /// Distance 219 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 220 | { 221 | return ptr - other.ptr; 222 | } 223 | 224 | /// Array subscript 225 | template 226 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 227 | { 228 | return Reference(ptr + n); 229 | } 230 | 231 | /// Equal to 232 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 233 | { 234 | return (ptr == rhs.ptr); 235 | } 236 | 237 | /// Not equal to 238 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 239 | { 240 | return (ptr != rhs.ptr); 241 | } 242 | 243 | /// ostream operator 244 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 245 | { 246 | return os; 247 | } 248 | }; 249 | 250 | 251 | /** @} */ // end group UtilIterator 252 | 253 | } // CUB namespace 254 | CUB_NS_POSTFIX // Optional outer namespace(s) 255 | -------------------------------------------------------------------------------- /cub/iterator/constant_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 
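As a companion to the output-iterator overview above, a hedged streaming-copy sketch: reads are issued with LOAD_CS (evict-first) and writes with STORE_CG (bypass L1), a common pattern for data touched exactly once. The kernel name and the particular modifier choices are illustrative; the assignment to out[i] goes through the Reference proxy shown above and compiles to a ThreadStore<STORE_CG>.

#include <cuda_runtime.h>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/iterator/cache_modified_output_iterator.cuh>

// Streaming copy with cache-modified loads and stores.
__global__ void StreamCopy(const double *d_in, double *d_out, int n)
{
    cub::CacheModifiedInputIterator<cub::LOAD_CS, double>   in(d_in);
    cub::CacheModifiedOutputIterator<cub::STORE_CG, double> out(d_out);
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = in[i];   // Reference proxy assignment -> ThreadStore<STORE_CG>(d_out + i, ...)
}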
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_namespace.cuh" 42 | 43 | #if (THRUST_VERSION >= 100700) 44 | // This iterator is compatible with Thrust API 1.7 and newer 45 | #include 46 | #include 47 | #endif // THRUST_VERSION 48 | 49 | 50 | /// Optional outer namespace(s) 51 | CUB_NS_PREFIX 52 | 53 | /// CUB namespace 54 | namespace cub { 55 | 56 | 57 | /** 58 | * \addtogroup UtilIterator 59 | * @{ 60 | */ 61 | 62 | 63 | /** 64 | * \brief A random-access input generator for dereferencing a sequence of homogeneous values 65 | * 66 | * \par Overview 67 | * - Read references to a ConstantInputIteratorTiterator always return the supplied constant 68 | * of type \p ValueType. 69 | * - Can be used with any data type. 70 | * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device 71 | * functions. 72 | * - Compatible with Thrust API v1.7 or newer. 73 | * 74 | * \par Snippet 75 | * The code snippet below illustrates the use of \p ConstantInputIteratorTto 76 | * dereference a sequence of homogeneous doubles. 
77 | * \par 78 | * \code 79 | * #include // or equivalently 80 | * 81 | * cub::ConstantInputIterator itr(5.0); 82 | * 83 | * printf("%f\n", itr[0]); // 5.0 84 | * printf("%f\n", itr[1]); // 5.0 85 | * printf("%f\n", itr[2]); // 5.0 86 | * printf("%f\n", itr[50]); // 5.0 87 | * 88 | * \endcode 89 | * 90 | * \tparam ValueType The value type of this iterator 91 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 92 | */ 93 | template < 94 | typename ValueType, 95 | typename OffsetT = ptrdiff_t> 96 | class ConstantInputIterator 97 | { 98 | public: 99 | 100 | // Required iterator traits 101 | typedef ConstantInputIterator self_type; ///< My own type 102 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 103 | typedef ValueType value_type; ///< The type of the element the iterator can point to 104 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 105 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 106 | 107 | #if (THRUST_VERSION >= 100700) 108 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 109 | typedef typename thrust::detail::iterator_facade_category< 110 | thrust::any_system_tag, 111 | thrust::random_access_traversal_tag, 112 | value_type, 113 | reference 114 | >::type iterator_category; ///< The iterator category 115 | #else 116 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 117 | #endif // THRUST_VERSION 118 | 119 | private: 120 | 121 | ValueType val; 122 | OffsetT offset; 123 | #ifdef _WIN32 124 | OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) 125 | #endif 126 | 127 | public: 128 | 129 | /// Constructor 130 | __host__ __device__ __forceinline__ ConstantInputIterator( 131 | ValueType val, ///< Starting value for the iterator instance to report 132 | OffsetT offset = 0) ///< Base offset 133 | : 134 | val(val), 135 | offset(offset) 136 | {} 137 | 138 | /// Postfix increment 139 | __host__ __device__ __forceinline__ self_type operator++(int) 140 | { 141 | self_type retval = *this; 142 | offset++; 143 | return retval; 144 | } 145 | 146 | /// Prefix increment 147 | __host__ __device__ __forceinline__ self_type operator++() 148 | { 149 | offset++; 150 | return *this; 151 | } 152 | 153 | /// Indirection 154 | __host__ __device__ __forceinline__ reference operator*() const 155 | { 156 | return val; 157 | } 158 | 159 | /// Addition 160 | template 161 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 162 | { 163 | self_type retval(val, offset + n); 164 | return retval; 165 | } 166 | 167 | /// Addition assignment 168 | template 169 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 170 | { 171 | offset += n; 172 | return *this; 173 | } 174 | 175 | /// Subtraction 176 | template 177 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 178 | { 179 | self_type retval(val, offset - n); 180 | return retval; 181 | } 182 | 183 | /// Subtraction assignment 184 | template 185 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 186 | { 187 | offset -= n; 188 | return *this; 189 | } 190 | 191 | /// Distance 192 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 193 | { 194 | return offset - other.offset; 195 | } 196 | 197 | /// 
Array subscript 198 | template 199 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 200 | { 201 | return val; 202 | } 203 | 204 | /// Structure dereference 205 | __host__ __device__ __forceinline__ pointer operator->() 206 | { 207 | return &val; 208 | } 209 | 210 | /// Equal to 211 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 212 | { 213 | return (offset == rhs.offset) && ((val == rhs.val)); 214 | } 215 | 216 | /// Not equal to 217 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 218 | { 219 | return (offset != rhs.offset) || (val!= rhs.val); 220 | } 221 | 222 | /// ostream operator 223 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 224 | { 225 | os << "[" << itr.val << "," << itr.offset << "]"; 226 | return os; 227 | } 228 | 229 | }; 230 | 231 | 232 | /** @} */ // end group UtilIterator 233 | 234 | } // CUB namespace 235 | CUB_NS_POSTFIX // Optional outer namespace(s) 236 | -------------------------------------------------------------------------------- /cub/iterator/counting_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
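A small host-side sketch of ConstantInputIterator (it can be dereferenced on both host and device): every read yields the supplied constant regardless of the offset, which makes it handy, for example, as a stand-in for an all-ones vector. The concrete values below are illustrative.

#include <cstdio>
#include <cub/iterator/constant_input_iterator.cuh>

// Every read of the iterator returns the constant it was constructed with.
int main()
{
    cub::ConstantInputIterator<double> ones(1.0);
    printf("%f %f %f\n", ones[0], ones[100], *(ones + 12345));   // 1.0 1.0 1.0
    return 0;
}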
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | /** 58 | * \addtogroup UtilIterator 59 | * @{ 60 | */ 61 | 62 | /** 63 | * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. 64 | * 65 | * \par Overview 66 | * - After initializing a CountingInputIteratorTto a certain integer \p base, read references 67 | * at \p offset will return the value \p base + \p offset. 68 | * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device 69 | * functions. 70 | * - Compatible with Thrust API v1.7 or newer. 71 | * 72 | * \par Snippet 73 | * The code snippet below illustrates the use of \p CountingInputIteratorTto 74 | * dereference a sequence of incrementing integers. 75 | * \par 76 | * \code 77 | * #include // or equivalently 78 | * 79 | * cub::CountingInputIterator itr(5); 80 | * 81 | * printf("%d\n", itr[0]); // 5 82 | * printf("%d\n", itr[1]); // 6 83 | * printf("%d\n", itr[2]); // 7 84 | * printf("%d\n", itr[50]); // 55 85 | * 86 | * \endcode 87 | * 88 | * \tparam ValueType The value type of this iterator 89 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 90 | */ 91 | template < 92 | typename ValueType, 93 | typename OffsetT = ptrdiff_t> 94 | class CountingInputIterator 95 | { 96 | public: 97 | 98 | // Required iterator traits 99 | typedef CountingInputIterator self_type; ///< My own type 100 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 101 | typedef ValueType value_type; ///< The type of the element the iterator can point to 102 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 103 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 104 | 105 | #if (THRUST_VERSION >= 100700) 106 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 107 | typedef typename thrust::detail::iterator_facade_category< 108 | thrust::any_system_tag, 109 | thrust::random_access_traversal_tag, 110 | value_type, 111 | reference 112 | >::type iterator_category; ///< The iterator category 113 | #else 114 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 115 | #endif // THRUST_VERSION 116 | 117 | private: 118 | 119 | ValueType val; 120 | 121 | public: 122 | 123 | /// Constructor 124 | __host__ __device__ __forceinline__ CountingInputIterator( 125 | const ValueType &val) ///< Starting value for the iterator instance to report 126 | : 127 | val(val) 128 | {} 129 | 130 | /// Postfix increment 131 | __host__ __device__ __forceinline__ self_type operator++(int) 132 | { 133 | self_type retval = *this; 134 | val++; 135 | return retval; 136 | } 137 | 138 | /// Prefix increment 139 | __host__ __device__ __forceinline__ self_type operator++() 140 | { 141 | val++; 142 | 
return *this; 143 | } 144 | 145 | /// Indirection 146 | __host__ __device__ __forceinline__ reference operator*() const 147 | { 148 | return val; 149 | } 150 | 151 | /// Addition 152 | template 153 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 154 | { 155 | self_type retval(val + n); 156 | return retval; 157 | } 158 | 159 | /// Addition assignment 160 | template 161 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 162 | { 163 | val += n; 164 | return *this; 165 | } 166 | 167 | /// Subtraction 168 | template 169 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 170 | { 171 | self_type retval(val - n); 172 | return retval; 173 | } 174 | 175 | /// Subtraction assignment 176 | template 177 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 178 | { 179 | val -= n; 180 | return *this; 181 | } 182 | 183 | /// Distance 184 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 185 | { 186 | return val - other.val; 187 | } 188 | 189 | /// Array subscript 190 | template 191 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 192 | { 193 | return val + n; 194 | } 195 | 196 | /// Structure dereference 197 | __host__ __device__ __forceinline__ pointer operator->() 198 | { 199 | return &val; 200 | } 201 | 202 | /// Equal to 203 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 204 | { 205 | return (val == rhs.val); 206 | } 207 | 208 | /// Not equal to 209 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 210 | { 211 | return (val != rhs.val); 212 | } 213 | 214 | /// ostream operator 215 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 216 | { 217 | os << "[" << itr.val << "]"; 218 | return os; 219 | } 220 | 221 | }; 222 | 223 | 224 | 225 | /** @} */ // end group UtilIterator 226 | 227 | } // CUB namespace 228 | CUB_NS_POSTFIX // Optional outer namespace(s) 229 | -------------------------------------------------------------------------------- /cub/iterator/tex_obj_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
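A kernel-level sketch of CountingInputIterator: it generates the sequence base, base+1, base+2, ... on the fly, so an index sequence never has to be materialized in memory. The kernel name and launch shape are illustrative.

#include <cuda_runtime.h>
#include <cub/iterator/counting_input_iterator.cuh>

// Fill d_out[i] = base + i without storing the index sequence anywhere.
__global__ void Iota(int base, int *d_out, int n)
{
    cub::CountingInputIterator<int> counting(base);
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d_out[i] = counting[i];   // reads return base + i
}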
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_debug.cuh" 43 | #include "../util_namespace.cuh" 44 | 45 | #if (THRUST_VERSION >= 100700) 46 | // This iterator is compatible with Thrust API 1.7 and newer 47 | #include 48 | #include 49 | #endif // THRUST_VERSION 50 | 51 | 52 | /// Optional outer namespace(s) 53 | CUB_NS_PREFIX 54 | 55 | /// CUB namespace 56 | namespace cub { 57 | 58 | /** 59 | * \addtogroup UtilIterator 60 | * @{ 61 | */ 62 | 63 | 64 | 65 | /** 66 | * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses newer Kepler-style texture objects. 67 | * 68 | * \par Overview 69 | * - TexObjInputIteratorTwraps a native device pointer of type ValueType*. References 70 | * to elements are to be loaded through texture cache. 71 | * - Can be used to load any data type from memory through texture cache. 72 | * - Can be manipulated and exchanged within and between host and device 73 | * functions, can only be constructed within host functions, and can only be 74 | * dereferenced within device functions. 75 | * - With regard to nested/dynamic parallelism, TexObjInputIteratorTiterators may only be 76 | * created by the host thread, but can be used by any descendant kernel. 77 | * - Compatible with Thrust API v1.7 or newer. 78 | * 79 | * \par Snippet 80 | * The code snippet below illustrates the use of \p TexRefInputIteratorTto 81 | * dereference a device array of doubles through texture cache. 82 | * \par 83 | * \code 84 | * #include // or equivalently 85 | * 86 | * // Declare, allocate, and initialize a device array 87 | * int num_items; // e.g., 7 88 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] 89 | * 90 | * // Create an iterator wrapper 91 | * cub::TexObjInputIterator itr; 92 | * itr.BindTexture(d_in, sizeof(double) * num_items); 93 | * ... 94 | * 95 | * // Within device code: 96 | * printf("%f\n", itr[0]); // 8.0 97 | * printf("%f\n", itr[1]); // 6.0 98 | * printf("%f\n", itr[6]); // 9.0 99 | * 100 | * ... 
101 | * itr.UnbindTexture(); 102 | * 103 | * \endcode 104 | * 105 | * \tparam T The value type of this iterator 106 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 107 | */ 108 | template < 109 | typename T, 110 | typename OffsetT = ptrdiff_t> 111 | class TexObjInputIterator 112 | { 113 | public: 114 | 115 | // Required iterator traits 116 | typedef TexObjInputIterator self_type; ///< My own type 117 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 118 | typedef T value_type; ///< The type of the element the iterator can point to 119 | typedef T* pointer; ///< The type of a pointer to an element the iterator can point to 120 | typedef T reference; ///< The type of a reference to an element the iterator can point to 121 | 122 | #if (THRUST_VERSION >= 100700) 123 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 124 | typedef typename thrust::detail::iterator_facade_category< 125 | thrust::device_system_tag, 126 | thrust::random_access_traversal_tag, 127 | value_type, 128 | reference 129 | >::type iterator_category; ///< The iterator category 130 | #else 131 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 132 | #endif // THRUST_VERSION 133 | 134 | private: 135 | 136 | // Largest texture word we can use in device 137 | typedef typename UnitWord::TextureWord TextureWord; 138 | 139 | // Number of texture words per T 140 | enum { 141 | TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) 142 | }; 143 | 144 | private: 145 | 146 | T* ptr; 147 | difference_type tex_offset; 148 | cudaTextureObject_t tex_obj; 149 | 150 | public: 151 | 152 | /// Constructor 153 | __host__ __device__ __forceinline__ TexObjInputIterator() 154 | : 155 | ptr(NULL), 156 | tex_offset(0), 157 | tex_obj(0) 158 | {} 159 | 160 | /// Use this iterator to bind \p ptr with a texture reference 161 | template 162 | cudaError_t BindTexture( 163 | QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment 164 | size_t bytes = size_t(-1), ///< Number of bytes in the range 165 | size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator 166 | { 167 | this->ptr = const_cast::Type *>(ptr); 168 | this->tex_offset = tex_offset; 169 | 170 | cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); 171 | cudaResourceDesc res_desc; 172 | cudaTextureDesc tex_desc; 173 | memset(&res_desc, 0, sizeof(cudaResourceDesc)); 174 | memset(&tex_desc, 0, sizeof(cudaTextureDesc)); 175 | res_desc.resType = cudaResourceTypeLinear; 176 | res_desc.res.linear.devPtr = this->ptr; 177 | res_desc.res.linear.desc = channel_desc; 178 | res_desc.res.linear.sizeInBytes = bytes; 179 | tex_desc.readMode = cudaReadModeElementType; 180 | return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL); 181 | } 182 | 183 | /// Unbind this iterator from its texture reference 184 | cudaError_t UnbindTexture() 185 | { 186 | return cudaDestroyTextureObject(tex_obj); 187 | } 188 | 189 | /// Postfix increment 190 | __host__ __device__ __forceinline__ self_type operator++(int) 191 | { 192 | self_type retval = *this; 193 | tex_offset++; 194 | return retval; 195 | } 196 | 197 | /// Prefix increment 198 | __host__ __device__ __forceinline__ self_type operator++() 199 | { 200 | tex_offset++; 201 | return *this; 202 | } 203 | 204 | /// Indirection 205 | __host__ __device__ __forceinline__ reference operator*() const 206 | { 
207 | #if (CUB_PTX_ARCH == 0) 208 | // Simply dereference the pointer on the host 209 | return ptr[tex_offset]; 210 | #else 211 | // Move array of uninitialized words, then alias and assign to return value 212 | TextureWord words[TEXTURE_MULTIPLE]; 213 | 214 | #pragma unroll 215 | for (int i = 0; i < TEXTURE_MULTIPLE; ++i) 216 | { 217 | words[i] = tex1Dfetch( 218 | tex_obj, 219 | (tex_offset * TEXTURE_MULTIPLE) + i); 220 | } 221 | 222 | // Load from words 223 | return *reinterpret_cast(words); 224 | #endif 225 | } 226 | 227 | /// Addition 228 | template 229 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 230 | { 231 | self_type retval; 232 | retval.ptr = ptr; 233 | retval.tex_obj = tex_obj; 234 | retval.tex_offset = tex_offset + n; 235 | return retval; 236 | } 237 | 238 | /// Addition assignment 239 | template 240 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 241 | { 242 | tex_offset += n; 243 | return *this; 244 | } 245 | 246 | /// Subtraction 247 | template 248 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 249 | { 250 | self_type retval; 251 | retval.ptr = ptr; 252 | retval.tex_obj = tex_obj; 253 | retval.tex_offset = tex_offset - n; 254 | return retval; 255 | } 256 | 257 | /// Subtraction assignment 258 | template 259 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 260 | { 261 | tex_offset -= n; 262 | return *this; 263 | } 264 | 265 | /// Distance 266 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 267 | { 268 | return tex_offset - other.tex_offset; 269 | } 270 | 271 | /// Array subscript 272 | template 273 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 274 | { 275 | self_type offset = (*this) + n; 276 | return *offset; 277 | } 278 | 279 | /// Structure dereference 280 | __host__ __device__ __forceinline__ pointer operator->() 281 | { 282 | return &(*(*this)); 283 | } 284 | 285 | /// Equal to 286 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 287 | { 288 | return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj)); 289 | } 290 | 291 | /// Not equal to 292 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 293 | { 294 | return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj)); 295 | } 296 | 297 | /// ostream operator 298 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 299 | { 300 | return os; 301 | } 302 | 303 | }; 304 | 305 | 306 | 307 | /** @} */ // end group UtilIterator 308 | 309 | } // CUB namespace 310 | CUB_NS_POSTFIX // Optional outer namespace(s) 311 | -------------------------------------------------------------------------------- /cub/iterator/transform_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 
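A hedged host-plus-kernel sketch of the bind/launch/unbind flow described above: the iterator is default-constructed and bound on the host, passed to the kernel by value, dereferenced on the device (via tex1Dfetch), and unbound after the kernel has finished. The kernel name, launch configuration, and error-handling style are illustrative.

#include <cuda_runtime.h>
#include <cub/iterator/tex_obj_input_iterator.cuh>

// Device side: the bound iterator is used like an ordinary pointer.
__global__ void CopyThroughTex(cub::TexObjInputIterator<double> in, double *d_out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d_out[i] = in[i];   // fetched through the texture object
}

// Host side: construct, bind, launch, synchronize, unbind.
cudaError_t RunCopy(double *d_in, double *d_out, int n)
{
    cub::TexObjInputIterator<double> in;
    cudaError_t error = in.BindTexture(d_in, sizeof(double) * n);
    if (error != cudaSuccess) return error;

    CopyThroughTex<<<(n + 255) / 256, 256>>>(in, d_out, n);

    error = cudaDeviceSynchronize();
    if (error != cudaSuccess) return error;
    return in.UnbindTexture();
}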
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | /** 58 | * \addtogroup UtilIterator 59 | * @{ 60 | */ 61 | 62 | 63 | /** 64 | * \brief A random-access input wrapper for transforming dereferenced values. 65 | * 66 | * \par Overview 67 | * - TransformInputIteratorTwraps a unary conversion functor of type \p 68 | * ConversionOp and a random-access input iterator of type InputIteratorT, 69 | * using the former to produce references of type \p ValueType from the latter. 70 | * - Can be used with any data type. 71 | * - Can be constructed, manipulated, and exchanged within and between host and device 72 | * functions. Wrapped host memory can only be dereferenced on the host, and wrapped 73 | * device memory can only be dereferenced on the device. 74 | * - Compatible with Thrust API v1.7 or newer. 75 | * 76 | * \par Snippet 77 | * The code snippet below illustrates the use of \p TransformInputIteratorTto 78 | * dereference an array of integers, tripling the values and converting them to doubles. 
79 | * \par 80 | * \code 81 | * #include // or equivalently 82 | * 83 | * // Functor for tripling integer values and converting to doubles 84 | * struct TripleDoubler 85 | * { 86 | * __host__ __device__ __forceinline__ 87 | * double operator()(const int &a) const { 88 | * return double(a * 2); 89 | * } 90 | * }; 91 | * 92 | * // Declare, allocate, and initialize a device array 93 | * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] 94 | * TripleDoubler conversion_op; 95 | * 96 | * // Create an iterator wrapper 97 | * cub::TransformInputIterator itr(d_in, conversion_op); 98 | * 99 | * // Within device code: 100 | * printf("%f\n", itr[0]); // 24.0 101 | * printf("%f\n", itr[1]); // 18.0 102 | * printf("%f\n", itr[6]); // 27.0 103 | * 104 | * \endcode 105 | * 106 | * \tparam ValueType The value type of this iterator 107 | * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p ValueType. Must have member ValueType operator()(const InputType &datum). 108 | * \tparam InputIteratorT The type of the wrapped input iterator 109 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 110 | * 111 | */ 112 | template < 113 | typename ValueType, 114 | typename ConversionOp, 115 | typename InputIteratorT, 116 | typename OffsetT = ptrdiff_t> 117 | class TransformInputIterator 118 | { 119 | public: 120 | 121 | // Required iterator traits 122 | typedef TransformInputIterator self_type; ///< My own type 123 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 124 | typedef ValueType value_type; ///< The type of the element the iterator can point to 125 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 126 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 127 | 128 | #if (THRUST_VERSION >= 100700) 129 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 130 | typedef typename thrust::detail::iterator_facade_category< 131 | thrust::any_system_tag, 132 | thrust::random_access_traversal_tag, 133 | value_type, 134 | reference 135 | >::type iterator_category; ///< The iterator category 136 | #else 137 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 138 | #endif // THRUST_VERSION 139 | 140 | private: 141 | 142 | ConversionOp conversion_op; 143 | InputIteratorT input_itr; 144 | 145 | public: 146 | 147 | /// Constructor 148 | __host__ __device__ __forceinline__ TransformInputIterator( 149 | InputIteratorT input_itr, ///< Input iterator to wrap 150 | ConversionOp conversion_op) ///< Conversion functor to wrap 151 | : 152 | conversion_op(conversion_op), 153 | input_itr(input_itr) 154 | {} 155 | 156 | /// Postfix increment 157 | __host__ __device__ __forceinline__ self_type operator++(int) 158 | { 159 | self_type retval = *this; 160 | input_itr++; 161 | return retval; 162 | } 163 | 164 | /// Prefix increment 165 | __host__ __device__ __forceinline__ self_type operator++() 166 | { 167 | input_itr++; 168 | return *this; 169 | } 170 | 171 | /// Indirection 172 | __host__ __device__ __forceinline__ reference operator*() const 173 | { 174 | return conversion_op(*input_itr); 175 | } 176 | 177 | /// Addition 178 | template 179 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 180 | { 181 | self_type retval(input_itr + n, conversion_op); 182 | return retval; 183 | } 184 | 185 | /// Addition assignment 186 | 
template 187 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 188 | { 189 | input_itr += n; 190 | return *this; 191 | } 192 | 193 | /// Subtraction 194 | template 195 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 196 | { 197 | self_type retval(input_itr - n, conversion_op); 198 | return retval; 199 | } 200 | 201 | /// Subtraction assignment 202 | template 203 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 204 | { 205 | input_itr -= n; 206 | return *this; 207 | } 208 | 209 | /// Distance 210 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 211 | { 212 | return input_itr - other.input_itr; 213 | } 214 | 215 | /// Array subscript 216 | template 217 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 218 | { 219 | return conversion_op(input_itr[n]); 220 | } 221 | 222 | /// Structure dereference 223 | __host__ __device__ __forceinline__ pointer operator->() 224 | { 225 | return &conversion_op(*input_itr); 226 | } 227 | 228 | /// Equal to 229 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 230 | { 231 | return (input_itr == rhs.input_itr); 232 | } 233 | 234 | /// Not equal to 235 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 236 | { 237 | return (input_itr != rhs.input_itr); 238 | } 239 | 240 | /// ostream operator 241 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 242 | { 243 | return os; 244 | } 245 | }; 246 | 247 | 248 | 249 | /** @} */ // end group UtilIterator 250 | 251 | } // CUB namespace 252 | CUB_NS_POSTFIX // Optional outer namespace(s) 253 | -------------------------------------------------------------------------------- /cub/thread/thread_operators.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
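The snippet above is slightly inconsistent: its TripleDoubler functor multiplies by 2 while the printed results (24.0, 18.0, 27.0) assume tripling. Below is a hedged, self-consistent sketch whose functor actually triples and converts to double; the functor name, kernel name, and launch shape are illustrative.

#include <cuda_runtime.h>
#include <cub/iterator/transform_input_iterator.cuh>

// Unary conversion functor: map an int to three times its value, as a double.
struct TripleToDouble
{
    __host__ __device__ __forceinline__ double operator()(const int &a) const
    {
        return 3.0 * a;
    }
};

// d_out[i] = 3.0 * d_in[i], with the conversion fused into the load.
__global__ void TripleKernel(const int *d_in, double *d_out, int n)
{
    cub::TransformInputIterator<double, TripleToDouble, const int*> in(d_in, TripleToDouble());
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d_out[i] = in[i];   // conversion_op(d_in[i])
}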
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple binary operator functor types 32 | */ 33 | 34 | /****************************************************************************** 35 | * Simple functor operators 36 | ******************************************************************************/ 37 | 38 | #pragma once 39 | 40 | #include "../util_macro.cuh" 41 | #include "../util_type.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | /// Optional outer namespace(s) 45 | CUB_NS_PREFIX 46 | 47 | /// CUB namespace 48 | namespace cub { 49 | 50 | 51 | /** 52 | * \addtogroup UtilModule 53 | * @{ 54 | */ 55 | 56 | /** 57 | * \brief Default equality functor 58 | */ 59 | struct Equality 60 | { 61 | /// Boolean equality operator, returns (a == b) 62 | template 63 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const 64 | { 65 | return a == b; 66 | } 67 | }; 68 | 69 | 70 | /** 71 | * \brief Default inequality functor 72 | */ 73 | struct Inequality 74 | { 75 | /// Boolean inequality operator, returns (a != b) 76 | template 77 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const 78 | { 79 | return a != b; 80 | } 81 | }; 82 | 83 | 84 | /** 85 | * \brief Inequality functor (wraps equality functor) 86 | */ 87 | template 88 | struct InequalityWrapper 89 | { 90 | /// Wrapped equality operator 91 | EqualityOp op; 92 | 93 | /// Constructor 94 | __host__ __device__ __forceinline__ 95 | InequalityWrapper(EqualityOp op) : op(op) {} 96 | 97 | /// Boolean inequality operator, returns (a != b) 98 | template 99 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const 100 | { 101 | return !op(a, b); 102 | } 103 | }; 104 | 105 | 106 | /** 107 | * \brief Default sum functor 108 | */ 109 | struct Sum 110 | { 111 | /// Boolean sum operator, returns a + b 112 | template 113 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const 114 | { 115 | return a + b; 116 | } 117 | }; 118 | 119 | 120 | /** 121 | * \brief Default max functor 122 | */ 123 | struct Max 124 | { 125 | /// Boolean max operator, returns (a > b) ? a : b 126 | template 127 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const 128 | { 129 | return CUB_MAX(a, b); 130 | } 131 | }; 132 | 133 | 134 | /** 135 | * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item) 136 | */ 137 | struct ArgMax 138 | { 139 | /// Boolean max operator, preferring the item having the smaller offset in case of ties 140 | template 141 | __host__ __device__ __forceinline__ KeyValuePair operator()( 142 | const KeyValuePair &a, 143 | const KeyValuePair &b) const 144 | { 145 | // Mooch BUG (device reduce argmax gk110 3.2 million random fp32) 146 | // return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? 
b : a; 147 | 148 | if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) 149 | return b; 150 | return a; 151 | } 152 | }; 153 | 154 | 155 | /** 156 | * \brief Default min functor 157 | */ 158 | struct Min 159 | { 160 | /// Boolean min operator, returns (a < b) ? a : b 161 | template 162 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const 163 | { 164 | return CUB_MIN(a, b); 165 | } 166 | }; 167 | 168 | 169 | /** 170 | * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item) 171 | */ 172 | struct ArgMin 173 | { 174 | /// Boolean min operator, preferring the item having the smaller offset in case of ties 175 | template 176 | __host__ __device__ __forceinline__ KeyValuePair operator()( 177 | const KeyValuePair &a, 178 | const KeyValuePair &b) const 179 | { 180 | // Mooch BUG (device reduce argmax gk110 3.2 million random fp32) 181 | // return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; 182 | 183 | if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) 184 | return b; 185 | return a; 186 | } 187 | }; 188 | 189 | 190 | /** 191 | * \brief Default cast functor 192 | */ 193 | template 194 | struct Cast 195 | { 196 | /// Cast operator, returns (B) a 197 | template 198 | __host__ __device__ __forceinline__ B operator()(const A &a) const 199 | { 200 | return (B) a; 201 | } 202 | }; 203 | 204 | 205 | /** 206 | * \brief Binary operator wrapper for switching non-commutative scan arguments 207 | */ 208 | template 209 | class SwizzleScanOp 210 | { 211 | private: 212 | 213 | /// Wrapped scan operator 214 | ScanOp scan_op; 215 | 216 | public: 217 | 218 | /// Constructor 219 | __host__ __device__ __forceinline__ 220 | SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {} 221 | 222 | /// Switch the scan arguments 223 | template 224 | __host__ __device__ __forceinline__ 225 | T operator()(const T &a, const T &b) 226 | { 227 | return scan_op(b, a); 228 | } 229 | }; 230 | 231 | 232 | /** 233 | * \brief Reduce-by-segment functor. 234 | * 235 | * Given two cub::KeyValuePair inputs \p a and \p b and a 236 | * binary associative combining operator \p f(const T &x, const T &y), 237 | * an instance of this functor returns a cub::KeyValuePair whose \p key 238 | * field is a.key + a.key, and whose \p value field 239 | * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise. 240 | * 241 | * ReduceBySegmentOp is an associative, non-commutative binary combining operator 242 | * for input sequences of cub::KeyValuePair pairings. Such 243 | * sequences are typically used to represent a segmented set of values to be reduced 244 | * and a corresponding set of {0,1}-valued integer "head flags" demarcating the 245 | * first value of each segment. 
246 | * 247 | */ 248 | template ///< Binary reduction operator to apply to values 249 | struct ReduceBySegmentOp 250 | { 251 | /// Wrapped reduction operator 252 | ReductionOpT op; 253 | 254 | /// Constructor 255 | __host__ __device__ __forceinline__ ReduceBySegmentOp() {} 256 | 257 | /// Constructor 258 | __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {} 259 | 260 | /// Scan operator 261 | template ///< KeyValuePair pairing of T (value) and OffsetT (head flag) 262 | __host__ __device__ __forceinline__ KeyValuePairT operator()( 263 | const KeyValuePairT &first, ///< First partial reduction 264 | const KeyValuePairT &second) ///< Second partial reduction 265 | { 266 | KeyValuePairT retval; 267 | retval.key = first.key + second.key; 268 | retval.value = (second.key) ? 269 | second.value : // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate 270 | op(first.value, second.value); // The second partial reduction does not span a reset, so accumulate both into the running aggregate 271 | return retval; 272 | } 273 | }; 274 | 275 | 276 | 277 | template ///< Binary reduction operator to apply to values 278 | struct ReduceByKeyOp 279 | { 280 | /// Wrapped reduction operator 281 | ReductionOpT op; 282 | 283 | /// Constructor 284 | __host__ __device__ __forceinline__ ReduceByKeyOp() {} 285 | 286 | /// Constructor 287 | __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {} 288 | 289 | /// Scan operator 290 | template 291 | __host__ __device__ __forceinline__ KeyValuePairT operator()( 292 | const KeyValuePairT &first, ///< First partial reduction 293 | const KeyValuePairT &second) ///< Second partial reduction 294 | { 295 | KeyValuePairT retval = second; 296 | 297 | if (first.key == second.key) 298 | retval.value = op(first.value, retval.value); 299 | 300 | return retval; 301 | } 302 | }; 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | /** @} */ // end group UtilModule 311 | 312 | 313 | } // CUB namespace 314 | CUB_NS_POSTFIX // Optional outer namespace(s) 315 | -------------------------------------------------------------------------------- /cub/thread/thread_reduce.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
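A host-side sketch illustrating ReduceBySegmentOp semantics (its operator() is __host__ __device__, so it can be exercised on the host). Note that the result's key is first.key + second.key, i.e. the head-flag counts accumulate; the overview's "a.key + a.key" reads like a typo. The concrete values below are illustrative.

#include <cstdio>
#include <cub/thread/thread_operators.cuh>   // also pulls in KeyValuePair via util_type.cuh

int main()
{
    typedef cub::KeyValuePair<int, double> PairT;
    cub::ReduceBySegmentOp<cub::Sum> op;

    PairT a, b, c;
    a.key = 0; a.value = 10.0;              // running aggregate, no head flag yet
    b.key = 0; b.value = 5.0;               // same segment: values accumulate
    c = op(a, b);
    printf("(%d, %f)\n", c.key, c.value);   // (0, 15.000000)

    b.key = 1; b.value = 7.0;               // b starts a new segment: value resets
    c = op(a, b);
    printf("(%d, %f)\n", c.key, c.value);   // (1, 7.000000)

    return 0;
}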
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential reduction over statically-sized array types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../thread/thread_operators.cuh" 37 | #include "../util_namespace.cuh" 38 | 39 | /// Optional outer namespace(s) 40 | CUB_NS_PREFIX 41 | 42 | /// CUB namespace 43 | namespace cub { 44 | 45 | /** 46 | * \addtogroup UtilModule 47 | * @{ 48 | */ 49 | 50 | /** 51 | * \name Sequential reduction over statically-sized array types 52 | * @{ 53 | */ 54 | 55 | 56 | template < 57 | int LENGTH, 58 | typename T, 59 | typename ReductionOp> 60 | __device__ __forceinline__ T ThreadReduce( 61 | T* input, ///< [in] Input array 62 | ReductionOp reduction_op, ///< [in] Binary reduction operator 63 | T prefix, ///< [in] Prefix to seed reduction with 64 | Int2Type length) 65 | { 66 | T addend = *input; 67 | prefix = reduction_op(prefix, addend); 68 | 69 | return ThreadReduce(input + 1, reduction_op, prefix, Int2Type()); 70 | } 71 | 72 | template < 73 | typename T, 74 | typename ReductionOp> 75 | __device__ __forceinline__ T ThreadReduce( 76 | T* input, ///< [in] Input array 77 | ReductionOp reduction_op, ///< [in] Binary reduction operator 78 | T prefix, ///< [in] Prefix to seed reduction with 79 | Int2Type<0> length) 80 | { 81 | return prefix; 82 | } 83 | 84 | 85 | /** 86 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. 87 | * 88 | * \tparam LENGTH LengthT of input array 89 | * \tparam T [inferred] The data type to be reduced. 90 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 91 | */ 92 | template < 93 | int LENGTH, 94 | typename T, 95 | typename ReductionOp> 96 | __device__ __forceinline__ T ThreadReduce( 97 | T* input, ///< [in] Input array 98 | ReductionOp reduction_op, ///< [in] Binary reduction operator 99 | T prefix) ///< [in] Prefix to seed reduction with 100 | { 101 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 102 | } 103 | 104 | 105 | /** 106 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. 107 | * 108 | * \tparam LENGTH LengthT of input array 109 | * \tparam T [inferred] The data type to be reduced. 
110 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 111 | */ 112 | template < 113 | int LENGTH, 114 | typename T, 115 | typename ReductionOp> 116 | __device__ __forceinline__ T ThreadReduce( 117 | T* input, ///< [in] Input array 118 | ReductionOp reduction_op) ///< [in] Binary reduction operator 119 | { 120 | T prefix = input[0]; 121 | return ThreadReduce(input + 1, reduction_op, prefix); 122 | } 123 | 124 | 125 | /** 126 | * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. 127 | * 128 | * \tparam LENGTH [inferred] LengthT of \p input array 129 | * \tparam T [inferred] The data type to be reduced. 130 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 131 | */ 132 | template < 133 | int LENGTH, 134 | typename T, 135 | typename ReductionOp> 136 | __device__ __forceinline__ T ThreadReduce( 137 | T (&input)[LENGTH], ///< [in] Input array 138 | ReductionOp reduction_op, ///< [in] Binary reduction operator 139 | T prefix) ///< [in] Prefix to seed reduction with 140 | { 141 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 142 | } 143 | 144 | 145 | /** 146 | * \brief Serial reduction with the specified operator 147 | * 148 | * \tparam LENGTH [inferred] LengthT of \p input array 149 | * \tparam T [inferred] The data type to be reduced. 150 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 151 | */ 152 | template < 153 | int LENGTH, 154 | typename T, 155 | typename ReductionOp> 156 | __device__ __forceinline__ T ThreadReduce( 157 | T (&input)[LENGTH], ///< [in] Input array 158 | ReductionOp reduction_op) ///< [in] Binary reduction operator 159 | { 160 | return ThreadReduce((T*) input, reduction_op); 161 | } 162 | 163 | 164 | //@} end member group 165 | 166 | /** @} */ // end group UtilModule 167 | 168 | } // CUB namespace 169 | CUB_NS_POSTFIX // Optional outer namespace(s) 170 | -------------------------------------------------------------------------------- /cub/thread/thread_search.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
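The ThreadReduce overloads in cub/thread/thread_reduce.cuh above unroll a sequential reduction over a statically-sized array via Int2Type recursion, peeling one element per step down to the Int2Type<0> terminator. A minimal device-side usage sketch of the array-reference form, assuming this repository's cub/ directory is on the include path:

#include <cstdio>
#include <cuda_runtime.h>
#include <cub/cub.cuh>            // assumes this repository's headers are on the include path

// Hypothetical kernel: each thread loads a private 4-item run and reduces it serially.
__global__ void ThreadReduceExample(const double *d_in, double *d_out)
{
    double items[4];
    for (int i = 0; i < 4; ++i)
        items[i] = d_in[threadIdx.x * 4 + i];

    // Statically-sized array form: LENGTH is inferred from the array reference.
    d_out[threadIdx.x] = cub::ThreadReduce(items, cub::Sum());
}

int main()
{
    const int threads = 32;
    double h_in[threads * 4], h_out[threads];
    for (int i = 0; i < threads * 4; ++i) h_in[i] = 1.0;

    double *d_in, *d_out;
    cudaMalloc(&d_in,  sizeof(h_in));
    cudaMalloc(&d_out, sizeof(h_out));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

    ThreadReduceExample<<<1, threads>>>(d_in, d_out);

    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    printf("thread 0 aggregate: %f\n", h_out[0]);   // 4.0 when every input item is 1.0

    cudaFree(d_in); cudaFree(d_out);
    return 0;
}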
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential search 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * Computes the begin offsets into A and B for the specific diagonal 47 | */ 48 | template < 49 | typename AIteratorT, 50 | typename BIteratorT, 51 | typename OffsetT, 52 | typename CoordinateT> 53 | __host__ __device__ __forceinline__ void MergePathSearch( 54 | OffsetT diagonal, 55 | AIteratorT a, 56 | BIteratorT b, 57 | OffsetT a_len, 58 | OffsetT b_len, 59 | CoordinateT& path_coordinate) 60 | { 61 | /// The value type of the input iterator 62 | typedef typename std::iterator_traits::value_type T; 63 | 64 | OffsetT split_min = CUB_MAX(diagonal - b_len, 0); 65 | OffsetT split_max = CUB_MIN(diagonal, a_len); 66 | 67 | while (split_min < split_max) 68 | { 69 | OffsetT split_pivot = (split_min + split_max) >> 1; 70 | if (a[split_pivot] <= b[diagonal - split_pivot - 1]) 71 | { 72 | // Move candidate split range up A, down B 73 | split_min = split_pivot + 1; 74 | } 75 | else 76 | { 77 | // Move candidate split range up B, down A 78 | split_max = split_pivot; 79 | } 80 | } 81 | 82 | path_coordinate.x = CUB_MIN(split_min, a_len); 83 | path_coordinate.y = diagonal - split_min; 84 | } 85 | 86 | 87 | 88 | /** 89 | * \brief Returns the offset of the first value within \p input which does not compare less than \p val 90 | */ 91 | template < 92 | typename InputIteratorT, 93 | typename OffsetT, 94 | typename T> 95 | __device__ __forceinline__ OffsetT LowerBound( 96 | InputIteratorT input, ///< [in] Input sequence 97 | OffsetT num_items, ///< [in] Input sequence length 98 | T val) ///< [in] Search key 99 | { 100 | OffsetT retval = 0; 101 | while (num_items > 0) 102 | { 103 | OffsetT half = num_items >> 1; 104 | if (input[retval + half] < val) 105 | { 106 | retval = retval + (half + 1); 107 | num_items = num_items - (half + 1); 108 | } 109 | else 110 | { 111 | num_items = half; 112 | } 113 | } 114 | 115 | return retval; 116 | } 117 | 118 | 119 | /** 120 | * \brief Returns the offset of the first value within \p input which compares greater than \p val 121 | */ 122 | template < 123 | typename InputIteratorT, 124 | typename OffsetT, 125 | typename T> 126 | __device__ __forceinline__ OffsetT UpperBound( 127 | InputIteratorT input, ///< [in] Input sequence 128 | OffsetT num_items, ///< [in] Input sequence length 129 | T val) ///< [in] Search key 130 | { 131 | OffsetT retval = 0; 132 | while (num_items > 0) 133 | { 134 | OffsetT half = num_items >> 1; 135 | if (val < input[retval + half]) 136 | { 137 | num_items = half; 138 | } 139 | else 140 | { 141 | retval = retval + (half + 1); 142 | num_items = num_items - (half + 1); 143 | } 144 | } 145 | 146 | return retval; 147 | } 148 | 149 | 150 | 151 | 152 | 153 | } // 
CUB namespace 154 | CUB_NS_POSTFIX // Optional outer namespace(s) 155 | -------------------------------------------------------------------------------- /cub/util_arch.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Static architectural properties by SM version. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 45 | 46 | 47 | /// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). 48 | #ifndef CUB_PTX_ARCH 49 | #ifndef __CUDA_ARCH__ 50 | #define CUB_PTX_ARCH 0 51 | #else 52 | #define CUB_PTX_ARCH __CUDA_ARCH__ 53 | #endif 54 | #endif 55 | 56 | 57 | /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. 
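Back in cub/thread/thread_search.cuh above, MergePathSearch is the routine the merge-based SpMV decomposition is built on: for a given diagonal d of the conceptual merge of A (the CSR row-end offsets) and B (the natural numbers indexing the nonzeros), it binary-searches for the split (x, y) with x + y = d, so each thread block can be assigned an equal-sized share of the combined row/nonzero workload. A host-only restatement with simplified types and no CUB dependencies, shown only to make that behavior concrete:

#include <cstdio>

struct Coord { int x, y; };

// Simplified host-only restatement of MergePathSearch from thread_search.cuh.
void MergePathSearchSketch(int diagonal, const int *a, const int *b,
                           int a_len, int b_len, Coord &coord)
{
    int split_min = (diagonal > b_len) ? diagonal - b_len : 0;
    int split_max = (diagonal < a_len) ? diagonal : a_len;

    while (split_min < split_max)
    {
        int pivot = (split_min + split_max) >> 1;
        if (a[pivot] <= b[diagonal - pivot - 1])
            split_min = pivot + 1;      // consume more of A
        else
            split_max = pivot;          // consume more of B
    }
    coord.x = (split_min < a_len) ? split_min : a_len;
    coord.y = diagonal - split_min;
}

int main()
{
    // A: CSR row-end offsets of a 4x? matrix with 8 nonzeros; B: nonzero indices 0..7
    int row_end_offsets[] = {2, 5, 5, 8};
    int nz_indices[]      = {0, 1, 2, 3, 4, 5, 6, 7};
    Coord c;
    // Split the 12-item merge at its midpoint (diagonal 6):
    MergePathSearchSketch(6, row_end_offsets, nz_indices, 4, 8, c);
    printf("(%d, %d)\n", c.x, c.y);   // prints (1, 5): 1 row-end offset + 5 nonzeros form the first half
    return 0;
}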
58 | #ifndef CUB_RUNTIME_FUNCTION 59 | #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) 60 | #define CUB_RUNTIME_ENABLED 61 | #define CUB_RUNTIME_FUNCTION __host__ __device__ 62 | #else 63 | #define CUB_RUNTIME_FUNCTION __host__ 64 | #endif 65 | #endif 66 | 67 | 68 | /// Number of threads per warp 69 | #ifndef CUB_LOG_WARP_THREADS 70 | #define CUB_LOG_WARP_THREADS(arch) \ 71 | (5) 72 | #define CUB_WARP_THREADS(arch) \ 73 | (1 << CUB_LOG_WARP_THREADS(arch)) 74 | 75 | #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) 76 | #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) 77 | #endif 78 | 79 | 80 | /// Number of smem banks 81 | #ifndef CUB_LOG_SMEM_BANKS 82 | #define CUB_LOG_SMEM_BANKS(arch) \ 83 | ((arch >= 200) ? \ 84 | (5) : \ 85 | (4)) 86 | #define CUB_SMEM_BANKS(arch) \ 87 | (1 << CUB_LOG_SMEM_BANKS(arch)) 88 | 89 | #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) 90 | #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) 91 | #endif 92 | 93 | 94 | /// Oversubscription factor 95 | #ifndef CUB_SUBSCRIPTION_FACTOR 96 | #define CUB_SUBSCRIPTION_FACTOR(arch) \ 97 | ((arch >= 300) ? \ 98 | (5) : \ 99 | ((arch >= 200) ? \ 100 | (3) : \ 101 | (10))) 102 | #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) 103 | #endif 104 | 105 | 106 | /// Prefer padding overhead vs X-way conflicts greater than this threshold 107 | #ifndef CUB_PREFER_CONFLICT_OVER_PADDING 108 | #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ 109 | ((arch >= 300) ? \ 110 | (1) : \ 111 | (4)) 112 | #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) 113 | #endif 114 | 115 | 116 | /// Scale the number of warps to keep same amount of "tile" storage as the nominal configuration for 4B data. Minimum of two warps. 117 | #define CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ 118 | (CUB_MIN(NOMINAL_4B_BLOCK_THREADS, CUB_MAX(3, ((NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4) / sizeof(T)) * CUB_WARP_THREADS(PTX_ARCH))) 119 | 120 | /// If necessary, scale down number of items per thread to keep the same amount of "tile" storage as the nominal configuration for 4B data. Minimum 1 item per thread 121 | #define CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ 122 | (CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4 / sizeof(T)) / CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)))) 123 | 124 | 125 | 126 | #endif // Do not document 127 | 128 | } // CUB namespace 129 | CUB_NS_POSTFIX // Optional outer namespace(s) 130 | -------------------------------------------------------------------------------- /cub/util_debug.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Error and event logging routines. 32 | * 33 | * The following macros definitions are supported: 34 | * - \p CUB_LOG. Simple event messages are printed to \p stdout. 35 | */ 36 | 37 | #pragma once 38 | 39 | #include 40 | #include "util_namespace.cuh" 41 | #include "util_arch.cuh" 42 | 43 | /// Optional outer namespace(s) 44 | CUB_NS_PREFIX 45 | 46 | /// CUB namespace 47 | namespace cub { 48 | 49 | 50 | /** 51 | * \addtogroup UtilMgmt 52 | * @{ 53 | */ 54 | 55 | 56 | /// CUB error reporting macro (prints error messages to stderr) 57 | #if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) 58 | #define CUB_STDERR 59 | #endif 60 | 61 | 62 | 63 | /** 64 | * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. 65 | * 66 | * \return The CUDA error. 67 | */ 68 | __host__ __device__ __forceinline__ cudaError_t Debug( 69 | cudaError_t error, 70 | const char* filename, 71 | int line) 72 | { 73 | #ifdef CUB_STDERR 74 | if (error) 75 | { 76 | #if (CUB_PTX_ARCH == 0) 77 | fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); 78 | fflush(stderr); 79 | #elif (CUB_PTX_ARCH >= 200) 80 | printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); 81 | #endif 82 | } 83 | #endif 84 | return error; 85 | } 86 | 87 | 88 | /** 89 | * \brief Debug macro 90 | */ 91 | #ifndef CubDebug 92 | #define CubDebug(e) cub::Debug((e), __FILE__, __LINE__) 93 | #endif 94 | 95 | 96 | /** 97 | * \brief Debug macro with exit 98 | */ 99 | #ifndef CubDebugExit 100 | #define CubDebugExit(e) if (cub::Debug((e), __FILE__, __LINE__)) { exit(1); } 101 | #endif 102 | 103 | 104 | /** 105 | * \brief Log macro for printf statements. 106 | */ 107 | #if !defined(_CubLog) 108 | #if (CUB_PTX_ARCH == 0) 109 | #define _CubLog(format, ...) printf(format,__VA_ARGS__); 110 | #elif (CUB_PTX_ARCH >= 200) 111 | #define _CubLog(format, ...) 
printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); 112 | #endif 113 | #endif 114 | 115 | 116 | 117 | 118 | /** @} */ // end group UtilMgmt 119 | 120 | } // CUB namespace 121 | CUB_NS_POSTFIX // Optional outer namespace(s) 122 | -------------------------------------------------------------------------------- /cub/util_device.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Properties of a given CUDA device and the corresponding PTX bundle 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "util_type.cuh" 37 | #include "util_arch.cuh" 38 | #include "util_debug.cuh" 39 | #include "util_namespace.cuh" 40 | #include "util_macro.cuh" 41 | 42 | /// Optional outer namespace(s) 43 | CUB_NS_PREFIX 44 | 45 | /// CUB namespace 46 | namespace cub { 47 | 48 | 49 | /** 50 | * \addtogroup UtilMgmt 51 | * @{ 52 | */ 53 | 54 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 55 | 56 | 57 | /** 58 | * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). 59 | */ 60 | template 61 | CUB_RUNTIME_FUNCTION __forceinline__ 62 | cudaError_t AliasTemporaries( 63 | void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
64 | size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation 65 | void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed 66 | size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed 67 | { 68 | const int ALIGN_BYTES = 256; 69 | const int ALIGN_MASK = ~(ALIGN_BYTES - 1); 70 | 71 | // Compute exclusive prefix sum over allocation requests 72 | size_t allocation_offsets[ALLOCATIONS]; 73 | size_t bytes_needed = 0; 74 | for (int i = 0; i < ALLOCATIONS; ++i) 75 | { 76 | size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; 77 | allocation_offsets[i] = bytes_needed; 78 | bytes_needed += allocation_bytes; 79 | } 80 | bytes_needed += ALIGN_BYTES - 1; 81 | 82 | // Check if the caller is simply requesting the size of the storage allocation 83 | if (!d_temp_storage) 84 | { 85 | temp_storage_bytes = bytes_needed; 86 | return cudaSuccess; 87 | } 88 | 89 | // Check if enough storage provided 90 | if (temp_storage_bytes < bytes_needed) 91 | { 92 | return CubDebug(cudaErrorInvalidValue); 93 | } 94 | 95 | // Alias 96 | d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK); 97 | for (int i = 0; i < ALLOCATIONS; ++i) 98 | { 99 | allocations[i] = static_cast(d_temp_storage) + allocation_offsets[i]; 100 | } 101 | 102 | return cudaSuccess; 103 | } 104 | 105 | 106 | /** 107 | * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device 108 | */ 109 | template 110 | __global__ void EmptyKernel(void) { } 111 | 112 | 113 | #endif // DOXYGEN_SHOULD_SKIP_THIS 114 | 115 | /** 116 | * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10) 117 | */ 118 | CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version) 119 | { 120 | struct Dummy 121 | { 122 | /// Type definition of the EmptyKernel kernel entry point 123 | typedef void (*EmptyKernelPtr)(); 124 | 125 | /// Force EmptyKernel to be generated if this class is used 126 | CUB_RUNTIME_FUNCTION __forceinline__ 127 | EmptyKernelPtr Empty() 128 | { 129 | return EmptyKernel; 130 | } 131 | }; 132 | 133 | 134 | #ifndef CUB_RUNTIME_ENABLED 135 | 136 | // CUDA API calls not supported from this device 137 | return cudaErrorInvalidConfiguration; 138 | 139 | #elif (CUB_PTX_ARCH > 0) 140 | 141 | ptx_version = CUB_PTX_ARCH; 142 | return cudaSuccess; 143 | 144 | #else 145 | 146 | cudaError_t error = cudaSuccess; 147 | do 148 | { 149 | cudaFuncAttributes empty_kernel_attrs; 150 | if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel))) break; 151 | ptx_version = empty_kernel_attrs.ptxVersion * 10; 152 | } 153 | while (0); 154 | 155 | return error; 156 | 157 | #endif 158 | } 159 | 160 | 161 | /** 162 | * \brief Retrieves the SM version (major * 100 + minor * 10) 163 | */ 164 | CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal) 165 | { 166 | #ifndef CUB_RUNTIME_ENABLED 167 | 168 | // CUDA API calls not supported from this device 169 | return cudaErrorInvalidConfiguration; 170 | 171 | #else 172 | 173 | cudaError_t error = cudaSuccess; 174 | do 175 | { 176 | // Fill in SM version 177 | int major, minor; 178 | if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break; 179 | if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break; 180 | sm_version = major * 100 + minor * 
10; 181 | } 182 | while (0); 183 | 184 | return error; 185 | 186 | #endif 187 | } 188 | 189 | 190 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 191 | 192 | /** 193 | * Synchronize the stream if specified 194 | */ 195 | CUB_RUNTIME_FUNCTION __forceinline__ 196 | static cudaError_t SyncStream(cudaStream_t stream) 197 | { 198 | #if (CUB_PTX_ARCH == 0) 199 | return cudaStreamSynchronize(stream); 200 | #else 201 | // Device can't yet sync on a specific stream 202 | return cudaDeviceSynchronize(); 203 | #endif 204 | } 205 | 206 | 207 | /** 208 | * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block. 209 | * 210 | * \par Snippet 211 | * The code snippet below illustrates the use of the MaxSmOccupancy function. 212 | * \par 213 | * \code 214 | * #include // or equivalently 215 | * 216 | * template 217 | * __global__ void ExampleKernel() 218 | * { 219 | * // Allocate shared memory for BlockScan 220 | * __shared__ volatile T buffer[4096]; 221 | * 222 | * ... 223 | * } 224 | * 225 | * ... 226 | * 227 | * // Determine SM occupancy for ExampleKernel specialized for unsigned char 228 | * int max_sm_occupancy; 229 | * MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 64); 230 | * 231 | * // max_sm_occupancy <-- 4 on SM10 232 | * // max_sm_occupancy <-- 8 on SM20 233 | * // max_sm_occupancy <-- 12 on SM35 234 | * 235 | * \endcode 236 | * 237 | */ 238 | template 239 | CUB_RUNTIME_FUNCTION __forceinline__ 240 | cudaError_t MaxSmOccupancy( 241 | int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM 242 | KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy 243 | int block_threads, ///< [in] Number of threads per thread block 244 | int dynamic_smem_bytes = 0) 245 | { 246 | #ifndef CUB_RUNTIME_ENABLED 247 | 248 | // CUDA API calls not supported from this device 249 | return CubDebug(cudaErrorInvalidConfiguration); 250 | 251 | #else 252 | 253 | return cudaOccupancyMaxActiveBlocksPerMultiprocessor ( 254 | &max_sm_occupancy, 255 | kernel_ptr, 256 | block_threads, 257 | dynamic_smem_bytes); 258 | 259 | #endif // CUB_RUNTIME_ENABLED 260 | } 261 | 262 | 263 | /****************************************************************************** 264 | * Policy management 265 | ******************************************************************************/ 266 | 267 | /** 268 | * Kernel dispatch configuration 269 | */ 270 | struct KernelConfig 271 | { 272 | int block_threads; 273 | int items_per_thread; 274 | int tile_size; 275 | int sm_occupancy; 276 | 277 | CUB_RUNTIME_FUNCTION __forceinline__ 278 | KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {} 279 | 280 | template 281 | CUB_RUNTIME_FUNCTION __forceinline__ 282 | cudaError_t Init(KernelPtrT kernel_ptr) 283 | { 284 | block_threads = AgentPolicyT::BLOCK_THREADS; 285 | items_per_thread = AgentPolicyT::ITEMS_PER_THREAD; 286 | tile_size = block_threads * items_per_thread; 287 | cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads); 288 | return retval; 289 | } 290 | }; 291 | 292 | 293 | 294 | /// Helper for dispatching into a policy chain 295 | template 296 | struct ChainedPolicy 297 | { 298 | /// The policy for the active compiler pass 299 | typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy; 300 | 301 | /// Specializes and dispatches op in accordance 
to the first policy in the chain of adequate PTX version 302 | template 303 | CUB_RUNTIME_FUNCTION __forceinline__ 304 | static cudaError_t Invoke(int ptx_version, FunctorT &op) 305 | { 306 | if (ptx_version < PTX_VERSION) { 307 | return PrevPolicyT::Invoke(ptx_version, op); 308 | } 309 | return op.template Invoke(); 310 | } 311 | }; 312 | 313 | /// Helper for dispatching into a policy chain (end-of-chain specialization) 314 | template 315 | struct ChainedPolicy 316 | { 317 | /// The policy for the active compiler pass 318 | typedef PolicyT ActivePolicy; 319 | 320 | /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version 321 | template 322 | CUB_RUNTIME_FUNCTION __forceinline__ 323 | static cudaError_t Invoke(int ptx_version, FunctorT &op) { 324 | return op.template Invoke(); 325 | } 326 | }; 327 | 328 | 329 | 330 | 331 | #endif // Do not document 332 | 333 | 334 | 335 | 336 | /** @} */ // end group UtilMgmt 337 | 338 | } // CUB namespace 339 | CUB_NS_POSTFIX // Optional outer namespace(s) 340 | -------------------------------------------------------------------------------- /cub/util_macro.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
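Stepping back to cub/util_device.cuh above: AliasTemporaries implements CUB's two-phase temporary-storage idiom. Called with a NULL d_temp_storage it only reports the total number of bytes needed (each request padded to 256-byte alignment); called again with a real allocation it carves that block into the individual sub-allocations. A host-side sketch of the pattern with two requested allocations, assuming this repository's headers are on the include path:

#include <cstdio>
#include <cuda_runtime.h>
#include <cub/util_device.cuh>    // AliasTemporaries (assumes this repository's headers)

int main()
{
    // Two sub-allocations to be carved out of one device buffer
    size_t allocation_sizes[2] = {1000 * sizeof(int), 500 * sizeof(double)};
    void  *allocations[2]      = {NULL, NULL};

    // Phase 1: query the required size (NULL d_temp_storage writes temp_storage_bytes only)
    void  *d_temp_storage      = NULL;
    size_t temp_storage_bytes  = 0;
    cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);

    // Phase 2: allocate once, then carve into aligned pieces
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);

    printf("total temporary storage: %zu bytes\n", temp_storage_bytes);
    // allocations[0] and allocations[1] now point into d_temp_storage

    cudaFree(d_temp_storage);
    return 0;
}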
26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Common C/C++ macro utilities 31 | ******************************************************************************/ 32 | 33 | #pragma once 34 | 35 | #include "util_namespace.cuh" 36 | 37 | /// Optional outer namespace(s) 38 | CUB_NS_PREFIX 39 | 40 | /// CUB namespace 41 | namespace cub { 42 | 43 | 44 | /** 45 | * \addtogroup UtilModule 46 | * @{ 47 | */ 48 | 49 | #ifndef CUB_ALIGN 50 | #if defined(_WIN32) || defined(_WIN64) 51 | /// Align struct 52 | #define CUB_ALIGN(bytes) __declspec(align(32)) 53 | #else 54 | /// Align struct 55 | #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) 56 | #endif 57 | #endif 58 | 59 | #ifndef CUB_MAX 60 | /// Select maximum(a, b) 61 | #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) 62 | #endif 63 | 64 | #ifndef CUB_MIN 65 | /// Select minimum(a, b) 66 | #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a)) 67 | #endif 68 | 69 | #ifndef CUB_QUOTIENT_FLOOR 70 | /// Quotient of x/y rounded down to nearest integer 71 | #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) 72 | #endif 73 | 74 | #ifndef CUB_QUOTIENT_CEILING 75 | /// Quotient of x/y rounded up to nearest integer 76 | #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) 77 | #endif 78 | 79 | #ifndef CUB_ROUND_UP_NEAREST 80 | /// x rounded up to the nearest multiple of y 81 | #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) 82 | #endif 83 | 84 | #ifndef CUB_ROUND_DOWN_NEAREST 85 | /// x rounded down to the nearest multiple of y 86 | #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) 87 | #endif 88 | 89 | 90 | #ifndef CUB_STATIC_ASSERT 91 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 92 | #define CUB_CAT_(a, b) a ## b 93 | #define CUB_CAT(a, b) CUB_CAT_(a, b) 94 | #endif // DOXYGEN_SHOULD_SKIP_THIS 95 | 96 | /// Static assert 97 | #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] 98 | #endif 99 | 100 | /** @} */ // end group UtilModule 101 | 102 | } // CUB namespace 103 | CUB_NS_POSTFIX // Optional outer namespace(s) 104 | -------------------------------------------------------------------------------- /cub/util_namespace.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
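The macros in cub/util_macro.cuh above carry the small integer arithmetic used elsewhere in the headers (for instance, CUB_MIN and CUB_MAX appear in the util_arch.cuh tile-scaling macros earlier). Two of them in a short worked example, restated locally so the snippet stands alone:

#include <cstdio>

// Same definitions as in cub/util_macro.cuh, restated so this snippet is self-contained.
#define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
#define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y)

int main()
{
    int num_items  = 1000;
    int tile_items = 128;

    int num_tiles  = CUB_QUOTIENT_CEILING(num_items, tile_items);   // 8 tiles cover 1000 items
    int padded     = CUB_ROUND_UP_NEAREST(num_items, 256);          // 1024: next multiple of 256

    printf("%d tiles, padded to %d bytes\n", num_tiles, padded);
    return 0;
}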
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Place-holder for prefixing the cub namespace 32 | */ 33 | 34 | #pragma once 35 | 36 | // For example: 37 | //#define CUB_NS_PREFIX namespace thrust{ namespace detail { 38 | //#define CUB_NS_POSTFIX } } 39 | 40 | #define CUB_NS_PREFIX 41 | #define CUB_NS_POSTFIX 42 | -------------------------------------------------------------------------------- /eval_csrmv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if (( $# != 2 )); then 4 | echo "$0 " 5 | exit 0 6 | fi 7 | 8 | echo "file, num_rows, num_cols, num_nonzeros, row_length_mean, row_length_std_dev, row_length_variation, row_length_skewness, method_name, setup_ms, avg_spmv_ms, gflops, effective_GBs" 9 | 10 | MTX_DIR=$1 11 | 12 | shift 13 | 14 | for i in `find $MTX_DIR -name *.mtx` 15 | do 16 | ./$@ --quiet --mtx=$i 17 | done 18 | -------------------------------------------------------------------------------- /get_uf_datasets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -eq 0 ]; then 4 | MTX_DIR = mtx 5 | else 6 | MTX_DIR = $1 7 | fi 8 | 9 | # Make temporary directory for download/unpack 10 | mkdir -p tgz 11 | cd tgz 12 | 13 | # Download 14 | for i in `cat ../ufl_urls.txt`; do echo $i; wget $i; done 15 | 16 | # Unpack 17 | for i in `cat ../ufl_matrices.txt`; do gunzip $i.tar.gz; tar -xvf $i.tar; rm $i.tar; done 18 | 19 | # Relocate 20 | mkdir -p ../$MTX_DIR 21 | for i in `find . -name *.mtx`; do echo $i; mv $i ../$MTX_DIR/; done 22 | 23 | # Cleanup 24 | cd .. 
25 | rm -rf tgz 26 | -------------------------------------------------------------------------------- /gpu_spmv: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./_gpu_spmv_driver $@ 4 | -------------------------------------------------------------------------------- /merge-based-spmv-sc16-preprint.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dumerrill/merge-spmv/18895571ce9af960ee207dae541b0ffc701ea4bb/merge-based-spmv-sc16-preprint.pdf -------------------------------------------------------------------------------- /merge_decomposition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dumerrill/merge-spmv/18895571ce9af960ee207dae541b0ffc701ea4bb/merge_decomposition.png -------------------------------------------------------------------------------- /merge_spmv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dumerrill/merge-spmv/18895571ce9af960ee207dae541b0ffc701ea4bb/merge_spmv.png --------------------------------------------------------------------------------
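Finally, the shell drivers tie the pieces together: get_uf_datasets.sh fetches and unpacks the UF/SuiteSparse matrices listed in ufl_urls.txt, eval_csrmv.sh prints a CSV header and then invokes a given driver once per .mtx file with --quiet --mtx=<file>, and gpu_spmv simply forwards its arguments to the compiled _gpu_spmv_driver binary. A hypothetical evaluation run (output filenames and the single matrix name are illustrative; the driver binaries are assumed to have been built first):

#!/bin/bash
# Sweep every MatrixMarket file under ./mtx, capturing one CSV row of timings per matrix.
./eval_csrmv.sh mtx gpu_spmv > gpu_results.csv
./eval_csrmv.sh mtx cpu_spmv > cpu_results.csv   # cpu_spmv is assumed to accept the same flags

# Or time a single matrix by hand (filename illustrative).
./gpu_spmv --mtx=mtx/example.mtx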