├── .cproject
├── .gitignore
├── .project
├── .settings
│   ├── .gitignore
│   └── language.settings.xml
├── LICENSE.TXT
├── Makefile
├── README.md
├── cpu_spmv
├── cpu_spmv.cpp
├── cub
│   ├── agent
│   │   ├── agent_histogram.cuh
│   │   ├── agent_radix_sort_downsweep.cuh
│   │   ├── agent_radix_sort_upsweep.cuh
│   │   ├── agent_reduce.cuh
│   │   ├── agent_reduce_by_key.cuh
│   │   ├── agent_rle.cuh
│   │   ├── agent_scan.cuh
│   │   ├── agent_segment_fixup.cuh
│   │   ├── agent_select_if.cuh
│   │   ├── agent_spmv_csrt.cuh
│   │   ├── agent_spmv_orig.cuh
│   │   ├── agent_spmv_row_based.cuh
│   │   └── single_pass_scan_operators.cuh
│   ├── block
│   │   ├── block_discontinuity.cuh
│   │   ├── block_exchange.cuh
│   │   ├── block_histogram.cuh
│   │   ├── block_load.cuh
│   │   ├── block_radix_rank.cuh
│   │   ├── block_radix_sort.cuh
│   │   ├── block_raking_layout.cuh
│   │   ├── block_reduce.cuh
│   │   ├── block_reduce_by_key.cuh
│   │   ├── block_scan.cuh
│   │   ├── block_shuffle.cuh
│   │   ├── block_store.cuh
│   │   └── specializations
│   │       ├── block_histogram_atomic.cuh
│   │       ├── block_histogram_sort.cuh
│   │       ├── block_reduce_raking.cuh
│   │       ├── block_reduce_raking_commutative_only.cuh
│   │       ├── block_reduce_warp_reductions.cuh
│   │       ├── block_scan_raking.cuh
│   │       └── block_scan_warp_scans.cuh
│   ├── cub.cuh
│   ├── device
│   │   ├── device_histogram.cuh
│   │   ├── device_partition.cuh
│   │   ├── device_radix_sort.cuh
│   │   ├── device_reduce.cuh
│   │   ├── device_run_length_encode.cuh
│   │   ├── device_scan.cuh
│   │   ├── device_segmented_radix_sort.cuh
│   │   ├── device_segmented_reduce.cuh
│   │   ├── device_select.cuh
│   │   ├── device_spmv.cuh
│   │   └── dispatch
│   │       ├── dispatch_histogram.cuh
│   │       ├── dispatch_radix_sort.cuh
│   │       ├── dispatch_reduce.cuh
│   │       ├── dispatch_reduce_by_key.cuh
│   │       ├── dispatch_rle.cuh
│   │       ├── dispatch_scan.cuh
│   │       ├── dispatch_select_if.cuh
│   │       ├── dispatch_spmv_csrt.cuh
│   │       ├── dispatch_spmv_orig.cuh
│   │       └── dispatch_spmv_row_based.cuh
│   ├── grid
│   │   ├── grid_barrier.cuh
│   │   ├── grid_even_share.cuh
│   │   ├── grid_mapping.cuh
│   │   └── grid_queue.cuh
│   ├── host
│   │   └── mutex.cuh
│   ├── iterator
│   │   ├── arg_index_input_iterator.cuh
│   │   ├── cache_modified_input_iterator.cuh
│   │   ├── cache_modified_output_iterator.cuh
│   │   ├── constant_input_iterator.cuh
│   │   ├── counting_input_iterator.cuh
│   │   ├── tex_obj_input_iterator.cuh
│   │   ├── tex_ref_input_iterator.cuh
│   │   └── transform_input_iterator.cuh
│   ├── thread
│   │   ├── thread_load.cuh
│   │   ├── thread_operators.cuh
│   │   ├── thread_reduce.cuh
│   │   ├── thread_scan.cuh
│   │   ├── thread_search.cuh
│   │   └── thread_store.cuh
│   ├── util_allocator.cuh
│   ├── util_arch.cuh
│   ├── util_debug.cuh
│   ├── util_device.cuh
│   ├── util_macro.cuh
│   ├── util_namespace.cuh
│   ├── util_ptx.cuh
│   ├── util_type.cuh
│   └── warp
│       ├── specializations
│       │   ├── warp_reduce_shfl.cuh
│       │   ├── warp_reduce_smem.cuh
│       │   ├── warp_scan_shfl.cuh
│       │   └── warp_scan_smem.cuh
│       ├── warp_reduce.cuh
│       └── warp_scan.cuh
├── eval_csrmv.sh
├── get_uf_datasets.sh
├── gpu_spmv
├── gpu_spmv.cu
├── merge-based-spmv-sc16-preprint.pdf
├── merge_decomposition.png
├── merge_spmv.png
├── sparse_matrix.h
├── ufl_matrices.txt
├── ufl_urls.txt
└── utils.h
/.cproject:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | _cpu_spmv_driver
2 | _gpu_spmv_driver
3 | mtx
4 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
3 | merge-spmv
9 | org.eclipse.cdt.autotools.core.genmakebuilderV2
14 | org.eclipse.cdt.managedbuilder.core.genmakebuilder
15 | clean,full,incremental,
20 | org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder
21 | full,incremental,
27 | org.eclipse.cdt.core.cnature
28 | org.eclipse.cdt.core.ccnature
29 | org.eclipse.cdt.managedbuilder.core.managedBuildNature
30 | org.eclipse.cdt.managedbuilder.core.ScannerConfigNature
31 | org.eclipse.cdt.autotools.core.autotoolsNatureV2
--------------------------------------------------------------------------------
/.settings/.gitignore:
--------------------------------------------------------------------------------
1 | /language.settings.xml
2 |
--------------------------------------------------------------------------------
/.settings/language.settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/LICENSE.TXT:
--------------------------------------------------------------------------------
1 | Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
2 |
3 | Redistribution and use in source and binary forms, with or without
4 | modification, are permitted provided that the following conditions are met:
5 | * Redistributions of source code must retain the above copyright
6 | notice, this list of conditions and the following disclaimer.
7 | * Redistributions in binary form must reproduce the above copyright
8 | notice, this list of conditions and the following disclaimer in the
9 | documentation and/or other materials provided with the distribution.
10 | * Neither the name of the NVIDIA CORPORATION nor the
11 | names of its contributors may be used to endorse or promote products
12 | derived from this software without specific prior written permission.
13 |
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
18 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | #/******************************************************************************
2 | # * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | # * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | # *
5 | # * Redistribution and use in source and binary forms, with or without
6 | # * modification, are permitted provided that the following conditions are met:
7 | # * * Redistributions of source code must retain the above copyright
8 | # * notice, this list of conditions and the following disclaimer.
9 | # * * Redistributions in binary form must reproduce the above copyright
10 | # * notice, this list of conditions and the following disclaimer in the
11 | # * documentation and/or other materials provided with the distribution.
12 | # * * Neither the name of the NVIDIA CORPORATION nor the
13 | # * names of its contributors may be used to endorse or promote products
14 | # * derived from this software without specific prior written permission.
15 | # *
16 | # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | # * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | # * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | # * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | # * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | # * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | # * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | # * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | # * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | # *
27 | #******************************************************************************/
28 |
29 | #-------------------------------------------------------------------------------
30 | #
31 | # Makefile usage
32 | #
33 | # CPU:
34 | # make cpu_spmv
35 | #
36 | # GPU:
37 | # make gpu_spmv [sm=<XXX,...>] [verbose=<0|1>]
38 | #
39 | #-------------------------------------------------------------------------------
40 |
41 | #-------------------------------------------------------------------------------
42 | # Commandline Options
43 | #-------------------------------------------------------------------------------
44 |
45 |
46 | # [sm=<XXX,...>] Compute-capability to compile for, e.g., "sm=200,300,350" (SM35 by default).
47 |
48 | COMMA = ,
49 | ifdef sm
50 | SM_ARCH = $(subst $(COMMA),-,$(sm))
51 | else
52 | SM_ARCH = 350
53 | endif
54 |
55 | ifeq (520, $(findstring 520, $(SM_ARCH)))
56 | SM_TARGETS += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
57 | endif
58 | ifeq (370, $(findstring 370, $(SM_ARCH)))
59 | SM_TARGETS += -gencode=arch=compute_37,code=\"sm_37,compute_37\"
60 | endif
61 | ifeq (350, $(findstring 350, $(SM_ARCH)))
62 | SM_TARGETS += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
63 | endif
64 | ifeq (300, $(findstring 300, $(SM_ARCH)))
65 | SM_TARGETS += -gencode=arch=compute_30,code=\"sm_30,compute_30\"
66 | endif
67 |
68 |
69 | # [verbose=<0|1>] Verbose toolchain output from nvcc option
70 |
71 | ifeq ($(verbose), 1)
72 | NVCCFLAGS += -v
73 | endif
74 |
75 |
76 |
77 | #-------------------------------------------------------------------------------
78 | # Compiler and compilation platform
79 | #-------------------------------------------------------------------------------
80 |
81 | CUB_DIR = $(dir $(lastword $(MAKEFILE_LIST)))
82 |
83 | NVCC = "$(shell which nvcc)"
84 | ifdef nvccver
85 | NVCC_VERSION = $(nvccver)
86 | else
87 | NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//'))
88 | endif
89 |
90 | # detect OS
91 | OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
92 |
93 | # Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases
94 | NVCCFLAGS += $(SM_DEF) -Xptxas -v -Xcudafe -\#
95 |
96 | ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER)))
97 | # For MSVC
98 | # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
99 | NVCCFLAGS += -Xcompiler /fp:strict
100 | # Help the compiler/linker work with huge numbers of kernels on Windows
101 | NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500
102 | CC = cl
103 | NPPI = -lnppi
104 |
105 | # Multithreaded runtime
106 | NVCCFLAGS += -Xcompiler /MT
107 |
108 | ifneq ($(force32), 1)
109 | CUDART_CYG = "$(shell dirname $(NVCC))/../lib/Win32/cudart.lib"
110 | else
111 | CUDART_CYG = "$(shell dirname $(NVCC))/../lib/x64/cudart.lib"
112 | endif
113 | CUDART = "$(shell cygpath -w $(CUDART_CYG))"
114 | else
115 | # For g++
116 | # Disable excess x86 floating point precision that can lead to results being labeled incorrectly
117 | NVCCFLAGS += -Xcompiler -ffloat-store
118 | CC = g++
119 | ifneq ($(force32), 1)
120 | CUDART = "$(shell dirname $(NVCC))/../lib/libcudart_static.a"
121 | else
122 | CUDART = "$(shell dirname $(NVCC))/../lib64/libcudart_static.a"
123 | endif
124 | endif
125 |
126 |
127 | #-------------------------------------------------------------------------------
128 | # CPU (OpenMP) compiler and includes
129 | #-------------------------------------------------------------------------------
130 |
131 | # OMP compiler
132 | OMPCC=icpc
133 | OMPCC_FLAGS=-openmp -O3 -lrt -fno-alias -xHost -lnuma -O3 -mkl
134 |
135 | # Includes
136 | INC += -I$(CUB_DIR) -I$(CUB_DIR)test
137 |
138 | # detect OS
139 | OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
140 |
141 | #-------------------------------------------------------------------------------
142 | # Dependency Lists
143 | #-------------------------------------------------------------------------------
144 |
145 | rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
146 |
147 | DEPS = $(call rwildcard, $(CUB_DIR),*.cuh) \
148 | $(call rwildcard, $(CUB_DIR),*.h) \
149 | Makefile
150 |
151 | #-------------------------------------------------------------------------------
152 | # make clean
153 | #-------------------------------------------------------------------------------
154 |
155 | clean :
156 | rm -f _gpu_spmv_driver _cpu_spmv_driver
157 |
158 |
159 | #-------------------------------------------------------------------------------
160 | # make gpu_spmv
161 | #-------------------------------------------------------------------------------
162 |
163 | gpu_spmv : gpu_spmv.cu $(DEPS)
164 | $(NVCC) $(DEFINES) $(SM_TARGETS) -o _gpu_spmv_driver gpu_spmv.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -lcusparse -O3
165 |
166 |
167 | #-------------------------------------------------------------------------------
168 | # make cpu_spmv
169 | #-------------------------------------------------------------------------------
170 |
171 | cpu_spmv : cpu_spmv.cpp $(DEPS)
172 | $(OMPCC) $(DEFINES) -DCUB_MKL -o _cpu_spmv_driver cpu_spmv.cpp $(OMPCC_FLAGS)
173 |
174 |
--------------------------------------------------------------------------------
/cpu_spmv:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | export KMP_AFFINITY=granularity=core,scatter
4 | ./_cpu_spmv_driver "$@"
5 |
--------------------------------------------------------------------------------
/cub/block/block_raking_layout.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
32 | */
33 |
34 |
35 | #pragma once
36 |
37 | #include "../util_macro.cuh"
38 | #include "../util_arch.cuh"
39 | #include "../util_type.cuh"
40 | #include "../util_namespace.cuh"
41 |
42 | /// Optional outer namespace(s)
43 | CUB_NS_PREFIX
44 |
45 | /// CUB namespace
46 | namespace cub {
47 |
48 | /**
49 | * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. 
50 | * \ingroup BlockModule
51 | *
52 | * \par Overview
53 | * This type facilitates a shared memory usage pattern where a block of CUDA
54 | * threads places elements into shared memory and then reduces the active
55 | * parallelism to one "raking" warp of threads for serially aggregating consecutive
56 | * sequences of shared items. Padding is inserted to eliminate bank conflicts
57 | * (for most data types).
58 | *
59 | * \tparam T The data type to be exchanged.
60 | * \tparam BLOCK_THREADS The thread block size in threads.
61 | * \tparam PTX_ARCH [optional] \ptxversion
62 | */
63 | template <
64 | typename T,
65 | int BLOCK_THREADS,
66 | int PTX_ARCH = CUB_PTX_ARCH>
67 | struct BlockRakingLayout
68 | {
69 | //---------------------------------------------------------------------
70 | // Constants and type definitions
71 | //---------------------------------------------------------------------
72 |
73 | enum
74 | {
75 | /// The total number of elements that need to be cooperatively reduced
76 | SHARED_ELEMENTS = BLOCK_THREADS,
77 |
78 | /// Maximum number of warp-synchronous raking threads
79 | MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),
80 |
81 | /// Number of raking elements per warp-synchronous raking thread (rounded up)
82 | SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
83 |
84 | /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
85 | RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
86 |
87 | /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
88 | HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),
89 |
90 | /// Degree of bank conflicts (e.g., 4-way)
91 | CONFLICT_DEGREE = (HAS_CONFLICTS) ?
92 | (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) :
93 | 1,
94 |
95 | /// Pad each segment length with one element if degree of bank conflicts is greater than 4-way (heuristic)
96 | SEGMENT_PADDING = (CONFLICT_DEGREE > CUB_PREFER_CONFLICT_OVER_PADDING(PTX_ARCH)) ? 1 : 0,
97 | // SEGMENT_PADDING = (HAS_CONFLICTS) ? 1 : 0,
98 |
99 | /// Total number of elements in the raking grid
100 | GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + SEGMENT_PADDING),
101 |
102 | /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads)
103 | UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
104 | };
105 |
106 |
107 | /**
108 | * \brief Shared memory storage type
109 | */
110 | typedef T _TempStorage[BlockRakingLayout::GRID_ELEMENTS];
111 |
112 | /// Alias wrapper allowing storage to be unioned
113 | struct TempStorage : Uninitialized<_TempStorage> {};
114 |
115 |
116 | /**
117 | * \brief Returns the location for the calling thread to place data into the grid
118 | */
119 | static __device__ __forceinline__ T* PlacementPtr(
120 | TempStorage &temp_storage,
121 | int linear_tid)
122 | {
123 | // Offset for partial
124 | unsigned int offset = linear_tid;
125 |
126 | // Add in one padding element for every segment
127 | if (SEGMENT_PADDING > 0)
128 | {
129 | offset += offset / SEGMENT_LENGTH;
130 | }
131 |
132 | // Incorporating a block of padding partials every shared memory segment
133 | return temp_storage.Alias() + offset;
134 | }
135 |
136 |
137 | /**
138 | * \brief Returns the location for the calling thread to begin sequential raking
139 | */
140 | static __device__ __forceinline__ T* RakingPtr(
141 | TempStorage &temp_storage,
142 | int linear_tid)
143 | {
144 | return temp_storage.Alias() + (linear_tid * (SEGMENT_LENGTH + SEGMENT_PADDING));
145 | }
146 | };
147 |
148 | } // CUB namespace
149 | CUB_NS_POSTFIX // Optional outer namespace(s)
150 |
151 |
--------------------------------------------------------------------------------
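
The enum above is where the raking grid's shape is decided: SEGMENT_LENGTH and RAKING_THREADS follow from the block size and warp width, and SEGMENT_PADDING is added only when the resulting bank-conflict degree crosses CUB's threshold. The following standalone host-side sketch (not part of the repository) mirrors that arithmetic for a hypothetical 96-thread block, using 32 warp threads, 32 shared-memory banks, and a conflict threshold of 8 as stand-ins for CUB_WARP_THREADS, CUB_SMEM_BANKS, and CUB_PREFER_CONFLICT_OVER_PADDING:

#include <cstdio>

int main()
{
    // Assumed configuration (stand-ins for the CUB_* macros used above)
    const int BLOCK_THREADS      = 96;
    const int WARP_THREADS       = 32;
    const int SMEM_BANKS         = 32;
    const int CONFLICT_THRESHOLD = 8;

    // Same arithmetic as the BlockRakingLayout enum
    const int  SHARED_ELEMENTS    = BLOCK_THREADS;
    const int  MAX_RAKING_THREADS = (BLOCK_THREADS < WARP_THREADS) ? BLOCK_THREADS : WARP_THREADS;
    const int  SEGMENT_LENGTH     = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS;
    const int  RAKING_THREADS     = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH;
    const bool HAS_CONFLICTS      = (SMEM_BANKS % SEGMENT_LENGTH == 0);
    const int  CONFLICT_DEGREE    = HAS_CONFLICTS ? (MAX_RAKING_THREADS * SEGMENT_LENGTH) / SMEM_BANKS : 1;
    const int  SEGMENT_PADDING    = (CONFLICT_DEGREE > CONFLICT_THRESHOLD) ? 1 : 0;
    const int  GRID_ELEMENTS      = RAKING_THREADS * (SEGMENT_LENGTH + SEGMENT_PADDING);

    // 96 threads -> 32 raking threads, segments of 3, no padding, 96 grid elements
    printf("segment_length=%d raking_threads=%d padding=%d grid_elements=%d\n",
           SEGMENT_LENGTH, RAKING_THREADS, SEGMENT_PADDING, GRID_ELEMENTS);
    return 0;
}
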
/cub/block/specializations/block_histogram_atomic.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
32 | */
33 |
34 | #pragma once
35 |
36 | #include "../../util_namespace.cuh"
37 |
38 | /// Optional outer namespace(s)
39 | CUB_NS_PREFIX
40 |
41 | /// CUB namespace
42 | namespace cub {
43 |
44 |
45 | /**
46 | * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
47 | */
48 | template <int BINS>
49 | struct BlockHistogramAtomic
50 | {
51 | /// Shared memory storage layout type
52 | struct TempStorage {};
53 |
54 |
55 | /// Constructor
56 | __device__ __forceinline__ BlockHistogramAtomic(
57 | TempStorage &temp_storage)
58 | {}
59 |
60 |
61 | /// Composite data onto an existing histogram
62 | template <
63 | typename T,
64 | typename CounterT,
65 | int ITEMS_PER_THREAD>
66 | __device__ __forceinline__ void Composite(
67 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
68 | CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram
69 | {
70 | // Update histogram
71 | #pragma unroll
72 | for (int i = 0; i < ITEMS_PER_THREAD; ++i)
73 | {
74 | atomicAdd(histogram + items[i], 1);
75 | }
76 | }
77 |
78 | };
79 |
80 | } // CUB namespace
81 | CUB_NS_POSTFIX // Optional outer namespace(s)
82 |
83 |
--------------------------------------------------------------------------------
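
Composite above is a straight atomic loop over the caller-supplied histogram. A self-contained CUDA sketch of the same idea, outside the CUB front-end; the kernel name, the 256-thread block, 4 items per thread, and 256 bins are all assumptions for illustration:

__global__ void BlockHistogramAtomicSketch(const unsigned char *d_in, unsigned int *d_histogram)
{
    const int BINS = 256, ITEMS_PER_THREAD = 4;     // assumed configuration
    __shared__ unsigned int block_histogram[BINS];

    // Zero the block-private histogram (one bin per thread for a 256-thread block)
    block_histogram[threadIdx.x] = 0;
    __syncthreads();

    // Composite this thread's samples with shared-memory atomics
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
        unsigned char sample = d_in[(blockIdx.x * blockDim.x + threadIdx.x) * ITEMS_PER_THREAD + i];
        atomicAdd(block_histogram + sample, 1u);
    }
    __syncthreads();

    // Fold the block's counts into the global histogram
    atomicAdd(d_histogram + threadIdx.x, block_histogram[threadIdx.x]);
}
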
/cub/block/specializations/block_histogram_sort.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
32 | */
33 |
34 | #pragma once
35 |
36 | #include "../../block/block_radix_sort.cuh"
37 | #include "../../block/block_discontinuity.cuh"
38 | #include "../../util_ptx.cuh"
39 | #include "../../util_namespace.cuh"
40 |
41 | /// Optional outer namespace(s)
42 | CUB_NS_PREFIX
43 |
44 | /// CUB namespace
45 | namespace cub {
46 |
47 |
48 |
49 | /**
50 | * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
51 | */
52 | template <
53 | typename T, ///< Sample type
54 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension
55 | int ITEMS_PER_THREAD, ///< The number of samples per thread
56 | int BINS, ///< The number of bins into which histogram samples may fall
57 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension
58 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension
59 | int PTX_ARCH> ///< The PTX compute capability for which to specialize this collective
60 | struct BlockHistogramSort
61 | {
62 | /// Constants
63 | enum
64 | {
65 | /// The thread block size in threads
66 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
67 | };
68 |
69 | // Parameterize BlockRadixSort type for our thread block
70 | typedef BlockRadixSort<
71 | T,
72 | BLOCK_DIM_X,
73 | ITEMS_PER_THREAD,
74 | NullType,
75 | 4,
76 | (PTX_ARCH >= 350) ? true : false,
77 | BLOCK_SCAN_WARP_SCANS,
78 | cudaSharedMemBankSizeFourByte,
79 | BLOCK_DIM_Y,
80 | BLOCK_DIM_Z,
81 | PTX_ARCH>
82 | BlockRadixSortT;
83 |
84 | // Parameterize BlockDiscontinuity type for our thread block
85 | typedef BlockDiscontinuity<
86 | T,
87 | BLOCK_DIM_X,
88 | BLOCK_DIM_Y,
89 | BLOCK_DIM_Z,
90 | PTX_ARCH>
91 | BlockDiscontinuityT;
92 |
93 | /// Shared memory
94 | union _TempStorage
95 | {
96 | // Storage for sorting bin values
97 | typename BlockRadixSortT::TempStorage sort;
98 |
99 | struct
100 | {
101 | // Storage for detecting discontinuities in the tile of sorted bin values
102 | typename BlockDiscontinuityT::TempStorage flag;
103 |
104 | // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
105 | unsigned int run_begin[BINS];
106 | unsigned int run_end[BINS];
107 | };
108 | };
109 |
110 |
111 | /// Alias wrapper allowing storage to be unioned
112 | struct TempStorage : Uninitialized<_TempStorage> {};
113 |
114 |
115 | // Thread fields
116 | _TempStorage &temp_storage;
117 | int linear_tid;
118 |
119 |
120 | /// Constructor
121 | __device__ __forceinline__ BlockHistogramSort(
122 | TempStorage &temp_storage)
123 | :
124 | temp_storage(temp_storage.Alias()),
125 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
126 | {}
127 |
128 |
129 | // Discontinuity functor
130 | struct DiscontinuityOp
131 | {
132 | // Reference to temp_storage
133 | _TempStorage &temp_storage;
134 |
135 | // Constructor
136 | __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
137 | temp_storage(temp_storage)
138 | {}
139 |
140 | // Discontinuity predicate
141 | __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index)
142 | {
143 | if (a != b)
144 | {
145 | // Note the begin/end offsets in shared storage
146 | temp_storage.run_begin[b] = b_index;
147 | temp_storage.run_end[a] = b_index;
148 |
149 | return true;
150 | }
151 | else
152 | {
153 | return false;
154 | }
155 | }
156 | };
157 |
158 |
159 | // Composite data onto an existing histogram
160 | template <
161 | typename CounterT >
162 | __device__ __forceinline__ void Composite(
163 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram
164 | CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram
165 | {
166 | enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
167 |
168 | // Sort bytes in blocked arrangement
169 | BlockRadixSortT(temp_storage.sort).Sort(items);
170 |
171 | __syncthreads();
172 |
173 | // Initialize the shared memory's run_begin and run_end for each bin
174 | int histo_offset = 0;
175 |
176 | #pragma unroll
177 | for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
178 | {
179 | temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
180 | temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
181 | }
182 | // Finish up with guarded initialization if necessary
183 | if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
184 | {
185 | temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
186 | temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
187 | }
188 |
189 | __syncthreads();
190 |
191 | int flags[ITEMS_PER_THREAD]; // unused
192 |
193 | // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile
194 | DiscontinuityOp flag_op(temp_storage);
195 | BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
196 |
197 | // Update begin for first item
198 | if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0;
199 |
200 | __syncthreads();
201 |
202 | // Composite into histogram
203 | histo_offset = 0;
204 |
205 | #pragma unroll
206 | for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
207 | {
208 | int thread_offset = histo_offset + linear_tid;
209 | CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
210 | histogram[thread_offset] += count;
211 | }
212 |
213 | // Finish up with guarded composition if necessary
214 | if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
215 | {
216 | int thread_offset = histo_offset + linear_tid;
217 | CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
218 | histogram[thread_offset] += count;
219 | }
220 | }
221 |
222 | };
223 |
224 | } // CUB namespace
225 | CUB_NS_POSTFIX // Optional outer namespace(s)
226 |
227 |
--------------------------------------------------------------------------------
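
The sorting-based Composite reduces histogramming to measuring runs in a sorted tile: each bin's count is run_end minus run_begin, with both offsets defaulting to TILE_SIZE so that bins that never appear contribute zero. A host-side sketch of just that counting step, on a hypothetical 8-item tile with 4 bins:

#include <cstdio>

int main()
{
    const int BINS = 4, TILE_SIZE = 8;
    int sorted[TILE_SIZE] = {0, 0, 1, 1, 1, 3, 3, 3};   // assumed, already-sorted tile

    // Initialize run offsets to TILE_SIZE so untouched (empty) bins count zero
    int run_begin[BINS], run_end[BINS];
    for (int b = 0; b < BINS; ++b)
        run_begin[b] = run_end[b] = TILE_SIZE;

    // Mark run boundaries at discontinuities (the last run's end stays TILE_SIZE)
    run_begin[sorted[0]] = 0;
    for (int i = 1; i < TILE_SIZE; ++i)
        if (sorted[i] != sorted[i - 1])
        {
            run_end[sorted[i - 1]] = i;
            run_begin[sorted[i]] = i;
        }

    // Each bin's count is the length of its run: 2, 3, 0, 3
    for (int b = 0; b < BINS; ++b)
        printf("bin %d: %d\n", b, run_end[b] - run_begin[b]);
    return 0;
}
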
/cub/block/specializations/block_reduce_raking.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators.
32 | */
33 |
34 | #pragma once
35 |
36 | #include "../../block/block_raking_layout.cuh"
37 | #include "../../warp/warp_reduce.cuh"
38 | #include "../../thread/thread_reduce.cuh"
39 | #include "../../util_ptx.cuh"
40 | #include "../../util_namespace.cuh"
41 |
42 | /// Optional outer namespace(s)
43 | CUB_NS_PREFIX
44 |
45 | /// CUB namespace
46 | namespace cub {
47 |
48 |
49 | /**
50 | * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators.
51 | *
52 | * Supports non-commutative binary reduction operators. Unlike commutative
53 | * reduction operators (e.g., addition), the application of a non-commutative
54 | * reduction operator (e.g., string concatenation) across a sequence of inputs must
55 | * honor the relative ordering of items and partial reductions when applying the
56 | * reduction operator.
57 | *
58 | * Compared to the implementation of BlockReduceRakingCommutativeOnly (which does not support
59 | * non-commutative operators), this implementation requires a few extra
60 | * rounds of inter-thread communication.
61 | */
62 | template <
63 | typename T, ///< Data type being reduced
64 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension
65 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension
66 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension
67 | int PTX_ARCH> ///< The PTX compute capability for which to specialize this collective
68 | struct BlockReduceRaking
69 | {
70 | /// Constants
71 | enum
72 | {
73 | /// The thread block size in threads
74 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
75 | };
76 |
77 | /// Layout type for padded thread block raking grid
78 | typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
79 |
80 | /// WarpReduce utility type
81 | typedef typename WarpReduce<T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH>::InternalWarpReduce WarpReduce;
82 |
83 | /// Constants
84 | enum
85 | {
86 | /// Number of raking threads
87 | RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
88 |
89 | /// Number of raking elements per warp synchronous raking thread
90 | SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
91 |
92 | /// Cooperative work can be entirely warp synchronous
93 | WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS),
94 |
95 | /// Whether or not warp-synchronous reduction should be unguarded (i.e., the number of warp-reduction elements is a power of two)
96 | WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo<RAKING_THREADS>::VALUE,
97 |
98 | /// Whether or not accesses into smem are unguarded
99 | RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED,
100 |
101 | };
102 |
103 |
104 | /// Shared memory storage layout type
105 | union _TempStorage
106 | {
107 | typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction
108 | typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid
109 | };
110 |
111 |
112 | /// Alias wrapper allowing storage to be unioned
113 | struct TempStorage : Uninitialized<_TempStorage> {};
114 |
115 |
116 | // Thread fields
117 | _TempStorage &temp_storage;
118 | int linear_tid;
119 |
120 |
121 | /// Constructor
122 | __device__ __forceinline__ BlockReduceRaking(
123 | TempStorage &temp_storage)
124 | :
125 | temp_storage(temp_storage.Alias()),
126 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
127 | {}
128 |
129 |
130 | template <bool IS_FULL_TILE, typename ReductionOp, int ITERATION>
131 | __device__ __forceinline__ T RakingReduction(
132 | ReductionOp reduction_op, ///< [in] Binary scan operator
133 | T *raking_segment,
134 | T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items
135 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
136 | Int2Type<ITERATION> iteration)
137 | {
138 | // Update partial if addend is in range
139 | if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid))
140 | {
141 | T addend = raking_segment[ITERATION];
142 | partial = reduction_op(partial, addend);
143 | }
144 | return RakingReduction<IS_FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<ITERATION + 1>());
145 | }
146 |
147 | template <bool IS_FULL_TILE, typename ReductionOp>
148 | __device__ __forceinline__ T RakingReduction(
149 | ReductionOp reduction_op, ///< [in] Binary scan operator
150 | T *raking_segment,
151 | T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items
152 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
153 | Int2Type<SEGMENT_LENGTH> iteration)
154 | {
155 | return partial;
156 | }
157 |
158 |
159 |
160 | /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
161 | template <
162 | bool IS_FULL_TILE,
163 | typename ReductionOp>
164 | __device__ __forceinline__ T Reduce(
165 | T partial, ///< [in] Calling thread's input partial reductions
166 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
167 | ReductionOp reduction_op) ///< [in] Binary reduction operator
168 | {
169 | if (WARP_SYNCHRONOUS)
170 | {
171 | // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
172 | partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE, SEGMENT_LENGTH>(
173 | partial,
174 | num_valid,
175 | reduction_op);
176 | }
177 | else
178 | {
179 | // Place partial into shared memory grid.
180 | *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
181 |
182 | __syncthreads();
183 |
184 | // Reduce parallelism to one warp
185 | if (linear_tid < RAKING_THREADS)
186 | {
187 | // Raking reduction in grid
188 | T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
189 | partial = raking_segment[0];
190 |
191 | partial = RakingReduction<IS_FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<1>());
192 |
193 | partial = WarpReduce(temp_storage.warp_storage).template Reduce<IS_FULL_TILE, SEGMENT_LENGTH>(
194 | partial,
195 | num_valid,
196 | reduction_op);
197 |
198 | }
199 | }
200 |
201 | return partial;
202 | }
203 |
204 |
205 | /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
206 | template <bool IS_FULL_TILE>
207 | __device__ __forceinline__ T Sum(
208 | T partial, ///< [in] Calling thread's input partial reductions
209 | int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
210 | {
211 | cub::Sum reduction_op;
212 |
213 | return Reduce<IS_FULL_TILE>(partial, num_valid, reduction_op);
214 | }
215 |
216 |
217 |
218 | };
219 |
220 | } // CUB namespace
221 | CUB_NS_POSTFIX // Optional outer namespace(s)
222 |
223 |
--------------------------------------------------------------------------------
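
The specializations in this directory sit behind the public cub::BlockReduce front-end, which selects one of them through its algorithm template parameter. A minimal usage sketch, assuming a 128-thread launch and that the repository root (which contains cub/) is on the include path; the kernel and buffer names are hypothetical:

#include <cub/cub.cuh>

__global__ void BlockSumKernel(const int *d_in, int *d_out)
{
    // Specialize BlockReduce for a 1D block of 128 threads operating on int
    typedef cub::BlockReduce<int, 128> BlockReduceT;

    // Shared memory for the collective
    __shared__ typename BlockReduceT::TempStorage temp_storage;

    // One input element per thread
    int thread_data = d_in[blockIdx.x * blockDim.x + threadIdx.x];

    // Block-wide sum; the result is only valid in thread 0
    int block_sum = BlockReduceT(temp_storage).Sum(thread_data);

    if (threadIdx.x == 0)
        d_out[blockIdx.x] = block_sum;
}
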
/cub/block/specializations/block_reduce_raking_commutative_only.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators.
32 | */
33 |
34 | #pragma once
35 |
36 | #include "block_reduce_raking.cuh"
37 | #include "../../warp/warp_reduce.cuh"
38 | #include "../../thread/thread_reduce.cuh"
39 | #include "../../util_ptx.cuh"
40 | #include "../../util_namespace.cuh"
41 |
42 | /// Optional outer namespace(s)
43 | CUB_NS_PREFIX
44 |
45 | /// CUB namespace
46 | namespace cub {
47 |
48 |
49 | /**
50 | * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size.
51 | */
52 | template <
53 | typename T, ///< Data type being reduced
54 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension
55 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension
56 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension
57 | int PTX_ARCH> ///< The PTX compute capability for which to specialize this collective
58 | struct BlockReduceRakingCommutativeOnly
59 | {
60 | /// Constants
61 | enum
62 | {
63 | /// The thread block size in threads
64 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
65 | };
66 |
67 | // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values
68 | typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> FallBack;
69 |
70 | /// Constants
71 | enum
72 | {
73 | /// Number of warp threads
74 | WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
75 |
76 | /// Whether or not to use fall-back
77 | USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)),
78 |
79 | /// Number of raking threads
80 | RAKING_THREADS = WARP_THREADS,
81 |
82 | /// Number of threads actually sharing items with the raking threads
83 | SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS),
84 |
85 | /// Number of raking elements per warp synchronous raking thread
86 | SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS,
87 | };
88 |
89 | /// WarpReduce utility type
90 | typedef WarpReduce<T, RAKING_THREADS, PTX_ARCH> WarpReduce;
91 |
92 | /// Layout type for padded thread block raking grid
93 | typedef BlockRakingLayout<T, SHARING_THREADS, PTX_ARCH> BlockRakingLayout;
94 |
95 | /// Shared memory storage layout type
96 | struct _TempStorage
97 | {
98 | union
99 | {
100 | struct
101 | {
102 | typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction
103 | typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid
104 | };
105 | typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan
106 | };
107 | };
108 |
109 |
110 | /// Alias wrapper allowing storage to be unioned
111 | struct TempStorage : Uninitialized<_TempStorage> {};
112 |
113 |
114 | // Thread fields
115 | _TempStorage &temp_storage;
116 | int linear_tid;
117 |
118 |
119 | /// Constructor
120 | __device__ __forceinline__ BlockReduceRakingCommutativeOnly(
121 | TempStorage &temp_storage)
122 | :
123 | temp_storage(temp_storage.Alias()),
124 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
125 | {}
126 |
127 |
128 | /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
129 | template <bool FULL_TILE>
130 | __device__ __forceinline__ T Sum(
131 | T partial, ///< [in] Calling thread's input partial reductions
132 | int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
133 | {
134 | if (USE_FALLBACK || !FULL_TILE)
135 | {
136 | return FallBack(temp_storage.fallback_storage).template Sum<FULL_TILE>(partial, num_valid);
137 | }
138 | else
139 | {
140 | // Place partial into shared memory grid
141 | if (linear_tid >= RAKING_THREADS)
142 | *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
143 |
144 | __syncthreads();
145 |
146 | // Reduce parallelism to one warp
147 | if (linear_tid < RAKING_THREADS)
148 | {
149 | // Raking reduction in grid
150 | T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
151 | partial = ThreadReduce<SEGMENT_LENGTH>(raking_segment, cub::Sum(), partial);
152 |
153 | // Warpscan
154 | partial = WarpReduce(temp_storage.warp_storage).Sum(partial);
155 | }
156 | }
157 |
158 | return partial;
159 | }
160 |
161 |
162 | /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
163 | template <
164 | bool FULL_TILE,
165 | typename ReductionOp>
166 | __device__ __forceinline__ T Reduce(
167 | T partial, ///< [in] Calling thread's input partial reductions
168 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
169 | ReductionOp reduction_op) ///< [in] Binary reduction operator
170 | {
171 | if (USE_FALLBACK || !FULL_TILE)
172 | {
173 | return FallBack(temp_storage.fallback_storage).template Reduce<FULL_TILE>(partial, num_valid, reduction_op);
174 | }
175 | else
176 | {
177 | // Place partial into shared memory grid
178 | if (linear_tid >= RAKING_THREADS)
179 | *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
180 |
181 | __syncthreads();
182 |
183 | // Reduce parallelism to one warp
184 | if (linear_tid < RAKING_THREADS)
185 | {
186 | // Raking reduction in grid
187 | T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
188 | partial = ThreadReduce<SEGMENT_LENGTH>(raking_segment, reduction_op, partial);
189 |
190 | // Warpscan
191 | partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op);
192 | }
193 | }
194 |
195 | return partial;
196 | }
197 |
198 | };
199 |
200 | } // CUB namespace
201 | CUB_NS_POSTFIX // Optional outer namespace(s)
202 |
203 |
--------------------------------------------------------------------------------
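
The constants above encode the commutative-only trick: a full warp of raking threads keeps its own partials in registers, and only the remaining SHARING_THREADS place partials into the grid, so each raking thread serially folds a SEGMENT_LENGTH-element segment before the final warp reduction. A host-side sketch of that arithmetic for a hypothetical 128-thread block with 32-thread warps:

#include <cstdio>

int main()
{
    const int BLOCK_THREADS = 128;  // assumed block size
    const int WARP_THREADS  = 32;   // stand-in for CUB_WARP_THREADS(PTX_ARCH)

    // Same arithmetic as the BlockReduceRakingCommutativeOnly enum
    const bool USE_FALLBACK    = (BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS);
    const int  RAKING_THREADS  = WARP_THREADS;
    const int  SHARING_THREADS = (BLOCK_THREADS - RAKING_THREADS > 1) ? (BLOCK_THREADS - RAKING_THREADS) : 1;
    const int  SEGMENT_LENGTH  = SHARING_THREADS / WARP_THREADS;

    // 128 threads -> no fallback, 96 sharing threads, 3-element segments per raking thread
    printf("fallback=%d sharing_threads=%d segment_length=%d\n",
           (int) USE_FALLBACK, SHARING_THREADS, SEGMENT_LENGTH);
    return 0;
}
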
/cub/block/specializations/block_reduce_warp_reductions.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock. Supports non-commutative reduction operators.
32 | */
33 |
34 | #pragma once
35 |
36 | #include "../../warp/warp_reduce.cuh"
37 | #include "../../util_ptx.cuh"
38 | #include "../../util_arch.cuh"
39 | #include "../../util_namespace.cuh"
40 |
41 | /// Optional outer namespace(s)
42 | CUB_NS_PREFIX
43 |
44 | /// CUB namespace
45 | namespace cub {
46 |
47 |
48 | /**
49 | * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock. Supports non-commutative reduction operators.
50 | */
51 | template <
52 | typename T, ///< Data type being reduced
53 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension
54 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension
55 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension
56 | int PTX_ARCH> ///< The PTX compute capability for which to specialize this collective
57 | struct BlockReduceWarpReductions
58 | {
59 | /// Constants
60 | enum
61 | {
62 | /// The thread block size in threads
63 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
64 |
65 | /// Number of warp threads
66 | WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
67 |
68 | /// Number of active warps
69 | WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
70 |
71 | /// The logical warp size for warp reductions
72 | LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
73 |
74 | /// Whether or not the logical warp size evenly divides the threadblock size
75 | EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0)
76 | };
77 |
78 |
79 | /// WarpReduce utility type
80 | typedef typename WarpReduce<T, LOGICAL_WARP_SIZE, PTX_ARCH>::InternalWarpReduce WarpReduce;
81 |
82 |
83 | /// Shared memory storage layout type
84 | struct _TempStorage
85 | {
86 | typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan
87 | T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan
88 | T block_prefix; ///< Shared prefix for the entire threadblock
89 | };
90 |
91 | /// Alias wrapper allowing storage to be unioned
92 | struct TempStorage : Uninitialized<_TempStorage> {};
93 |
94 |
95 | // Thread fields
96 | _TempStorage &temp_storage;
97 | int linear_tid;
98 | int warp_id;
99 | int lane_id;
100 |
101 |
102 | /// Constructor
103 | __device__ __forceinline__ BlockReduceWarpReductions(
104 | TempStorage &temp_storage)
105 | :
106 | temp_storage(temp_storage.Alias()),
107 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
108 | warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
109 | lane_id(LaneId())
110 | {}
111 |
112 |
113 | template <bool FULL_TILE, typename ReductionOp, int SUCCESSOR_WARP>
114 | __device__ __forceinline__ T ApplyWarpAggregates(
115 | ReductionOp reduction_op, ///< [in] Binary scan operator
116 | T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items
117 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
118 | Int2Type<SUCCESSOR_WARP> successor_warp)
119 | {
120 | if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid))
121 | {
122 | T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP];
123 | warp_aggregate = reduction_op(warp_aggregate, addend);
124 | }
125 | return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<SUCCESSOR_WARP + 1>());
126 | }
127 |
128 | template <bool FULL_TILE, typename ReductionOp>
129 | __device__ __forceinline__ T ApplyWarpAggregates(
130 | ReductionOp reduction_op, ///< [in] Binary scan operator
131 | T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items
132 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
133 | Int2Type<WARPS> successor_warp)
134 | {
135 | return warp_aggregate;
136 | }
137 |
138 |
139 | /// Returns block-wide aggregate in thread0.
140 | template <
141 | bool FULL_TILE,
142 | typename ReductionOp>
143 | __device__ __forceinline__ T ApplyWarpAggregates(
144 | ReductionOp reduction_op, ///< [in] Binary scan operator
145 | T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items
146 | int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
147 | {
148 | // Share lane aggregates
149 | if (lane_id == 0)
150 | {
151 | temp_storage.warp_aggregates[warp_id] = warp_aggregate;
152 | }
153 |
154 | __syncthreads();
155 |
156 | // Update total aggregate in warp 0, lane 0
157 | if (linear_tid == 0)
158 | {
159 | warp_aggregate = ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<1>());
160 | }
161 |
162 | return warp_aggregate;
163 | }
164 |
165 |
166 | /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
167 | template <bool FULL_TILE>
168 | __device__ __forceinline__ T Sum(
169 | T input, ///< [in] Calling thread's input partial reductions
170 | int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
171 | {
172 | cub::Sum reduction_op;
173 | unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE;
174 | unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
175 | LOGICAL_WARP_SIZE :
176 | (warp_offset < num_valid) ?
177 | num_valid - warp_offset :
178 | 0;
179 |
180 | // Warp reduction in every warp
181 | T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
182 | input,
183 | warp_num_valid,
184 | cub::Sum());
185 |
186 | // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
187 | return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid);
188 | }
189 |
190 |
191 | /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0.
192 | template <
193 | bool FULL_TILE,
194 | typename ReductionOp>
195 | __device__ __forceinline__ T Reduce(
196 | T input, ///< [in] Calling thread's input partial reductions
197 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
198 | ReductionOp reduction_op) ///< [in] Binary reduction operator
199 | {
200 | unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE;
201 | unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
202 | LOGICAL_WARP_SIZE :
203 | (warp_offset < num_valid) ?
204 | num_valid - warp_offset :
205 | 0;
206 |
207 | // Warp reduction in every warp
208 | T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
209 | input,
210 | warp_num_valid,
211 | reduction_op);
212 |
213 | // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
214 | return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid);
215 | }
216 |
217 | };
218 |
219 |
220 | } // CUB namespace
221 | CUB_NS_POSTFIX // Optional outer namespace(s)
222 |
223 |
--------------------------------------------------------------------------------
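
The warp-reductions specialization above is normally reached through the public cub::BlockReduce interface rather than instantiated directly. A minimal sketch of that usage, assuming a 128-thread block and the BLOCK_REDUCE_WARP_REDUCTIONS algorithm; the kernel and buffer names are illustrative, not part of the library:

    #include <cub/cub.cuh>

    // Each 128-thread block sums its 128 inputs; thread 0 writes the block total.
    __global__ void BlockSumKernel(const int *d_in, int *d_block_sums)
    {
        typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduce;
        __shared__ typename BlockReduce::TempStorage temp_storage;

        int thread_item = d_in[blockIdx.x * blockDim.x + threadIdx.x];

        // Block-wide sum; the result is only valid in thread 0 of each block
        int block_sum = BlockReduce(temp_storage).Sum(thread_item);

        if (threadIdx.x == 0)
            d_block_sums[blockIdx.x] = block_sum;
    }
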
/cub/cub.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * CUB umbrella include file
32 | */
33 |
34 | #pragma once
35 |
36 |
37 | // Block
38 | #include "block/block_histogram.cuh"
39 | #include "block/block_discontinuity.cuh"
40 | #include "block/block_exchange.cuh"
41 | #include "block/block_load.cuh"
42 | #include "block/block_radix_rank.cuh"
43 | #include "block/block_radix_sort.cuh"
44 | #include "block/block_reduce.cuh"
45 | #include "block/block_scan.cuh"
46 | #include "block/block_store.cuh"
47 | //#include "block/block_shift.cuh"
48 |
49 | // Device
50 | #include "device/device_histogram.cuh"
51 | #include "device/device_partition.cuh"
52 | #include "device/device_radix_sort.cuh"
53 | #include "device/device_reduce.cuh"
54 | #include "device/device_run_length_encode.cuh"
55 | #include "device/device_scan.cuh"
56 | #include "device/device_segmented_radix_sort.cuh"
57 | #include "device/device_segmented_reduce.cuh"
58 | #include "device/device_select.cuh"
59 | #include "device/device_spmv.cuh"
60 |
61 | // Grid
62 | //#include "grid/grid_barrier.cuh"
63 | #include "grid/grid_even_share.cuh"
64 | #include "grid/grid_mapping.cuh"
65 | #include "grid/grid_queue.cuh"
66 |
67 | // Thread
68 | #include "thread/thread_load.cuh"
69 | #include "thread/thread_operators.cuh"
70 | #include "thread/thread_reduce.cuh"
71 | #include "thread/thread_scan.cuh"
72 | #include "thread/thread_store.cuh"
73 |
74 | // Warp
75 | #include "warp/warp_reduce.cuh"
76 | #include "warp/warp_scan.cuh"
77 |
78 | // Iterator
79 | #include "iterator/arg_index_input_iterator.cuh"
80 | #include "iterator/cache_modified_input_iterator.cuh"
81 | #include "iterator/cache_modified_output_iterator.cuh"
82 | #include "iterator/constant_input_iterator.cuh"
83 | #include "iterator/counting_input_iterator.cuh"
84 | #include "iterator/tex_obj_input_iterator.cuh"
85 | #include "iterator/tex_ref_input_iterator.cuh"
86 | #include "iterator/transform_input_iterator.cuh"
87 |
88 | // Util
89 | #include "util_allocator.cuh"
90 | #include "util_arch.cuh"
91 | #include "util_debug.cuh"
92 | #include "util_device.cuh"
93 | #include "util_macro.cuh"
94 | #include "util_ptx.cuh"
95 | #include "util_type.cuh"
96 |
97 |
--------------------------------------------------------------------------------
/cub/device/device_spmv.cuh:
--------------------------------------------------------------------------------
1 |
2 | /******************************************************************************
3 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
4 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
5 | *
6 | * Redistribution and use in source and binary forms, with or without
7 | * modification, are permitted provided that the following conditions are met:
8 | * * Redistributions of source code must retain the above copyright
9 | * notice, this list of conditions and the following disclaimer.
10 | * * Redistributions in binary form must reproduce the above copyright
11 | * notice, this list of conditions and the following disclaimer in the
12 | * documentation and/or other materials provided with the distribution.
13 | * * Neither the name of the NVIDIA CORPORATION nor the
14 | * names of its contributors may be used to endorse or promote products
15 | * derived from this software without specific prior written permission.
16 | *
17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | *
28 | ******************************************************************************/
29 |
30 | /**
31 | * \file
32 | * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV).
33 | */
34 |
35 | #pragma once
36 |
37 | #include <stdio.h>
38 | #include <iterator>
39 | #include <limits>
40 |
41 | #include "dispatch/dispatch_spmv_orig.cuh"
42 | #include "../util_namespace.cuh"
43 |
44 | /// Optional outer namespace(s)
45 | CUB_NS_PREFIX
46 |
47 | /// CUB namespace
48 | namespace cub {
49 |
50 |
51 | /**
52 | * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV).
53 | * \ingroup SingleModule
54 | *
55 | * \par Overview
56 | * The [SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication)
57 | * performs the matrix-vector operation
58 | * y = alpha*A*x + beta*y,
59 | * where:
60 | * - A is an mxn sparse matrix whose non-zero structure is specified in
61 | * [compressed-storage-row (CSR) format](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29)
62 | * (i.e., three arrays: values, row_offsets, and column_indices)
63 | * - x and y are dense vectors
64 | * - alpha and beta are scalar multiplicands
65 | *
66 | * \par Usage Considerations
67 | * \cdp_class{DeviceSpmv}
68 | *
69 | */
70 | struct DeviceSpmv
71 | {
72 | /******************************************************************//**
73 | * \name CSR matrix operations
74 | *********************************************************************/
75 | //@{
76 |
77 | /**
78 | * \brief This function performs the matrix-vector operation y = A*x.
79 | *
80 | * \par Snippet
81 | * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A
82 | * representing a 3x3 lattice (24 non-zeros).
83 | *
84 | * \par
85 | * \code
86 | * #include <cub/cub.cuh> // or equivalently <cub/device/device_spmv.cuh>
87 | *
88 | * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x,
89 | * // and output vector y
90 | * int num_rows = 9;
91 | * int num_cols = 9;
92 | * int num_nonzeros = 24;
93 | *
94 | * float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1,
95 | * // 1, 1, 1, 1, 1, 1, 1, 1,
96 | * // 1, 1, 1, 1, 1, 1, 1, 1]
97 | *
98 | * int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0,
99 | * // 4, 6, 1, 3, 5, 7, 2, 4,
100 | * // 8, 3, 7, 4, 6, 8, 5, 7]
101 | *
102 | * int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24]
103 | *
104 | * float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1]
105 | * float* d_vector_y; // e.g., [ , , , , , , , , ]
106 | * ...
107 | *
108 | * // Determine temporary device storage requirements
109 | * void* d_temp_storage = NULL;
110 | * size_t temp_storage_bytes = 0;
111 | * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
112 | * d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
113 | * num_rows, num_cols, num_nonzeros);
114 | *
115 | * // Allocate temporary storage
116 | * cudaMalloc(&d_temp_storage, temp_storage_bytes);
117 | *
118 | * // Run SpMV
119 | * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
120 | * d_row_offsets, d_column_indices, d_vector_x, d_vector_y,
121 | * num_rows, num_cols, num_nonzeros);
122 | *
123 | * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2]
124 | *
125 | * \endcode
126 | *
127 | * \tparam ValueT [inferred] Matrix and vector value type (e.g., \p float, \p double, etc.)
128 | */
129 | template <
130 | typename ValueT>
131 | CUB_RUNTIME_FUNCTION
132 | static cudaError_t CsrMV(
133 | void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
134 | size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
135 | ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A.
136 | int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros)
137 | int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-based.)
138 | ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x
139 | ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y
140 | int num_rows, ///< [in] number of rows of matrix A.
141 | int num_cols, ///< [in] number of columns of matrix A.
142 | int num_nonzeros, ///< [in] number of nonzero elements of matrix A.
143 | cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0.
144 | bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
145 | {
146 | SpmvParams<ValueT, int> spmv_params;
147 | spmv_params.d_values = d_values;
148 | spmv_params.d_row_end_offsets = d_row_offsets + 1;
149 | spmv_params.d_column_indices = d_column_indices;
150 | spmv_params.d_vector_x = d_vector_x;
151 | spmv_params.d_vector_y = d_vector_y;
152 | spmv_params.num_rows = num_rows;
153 | spmv_params.num_cols = num_cols;
154 | spmv_params.num_nonzeros = num_nonzeros;
155 | spmv_params.alpha = 1.0;
156 | spmv_params.beta = 0.0;
157 |
158 | return DispatchSpmv<ValueT, int>::Dispatch(
159 | d_temp_storage,
160 | temp_storage_bytes,
161 | spmv_params,
162 | stream,
163 | debug_synchronous);
164 | }
165 |
166 | //@} end member group
167 | };
168 |
169 |
170 |
171 | } // CUB namespace
172 | CUB_NS_POSTFIX // Optional outer namespace(s)
173 |
174 |
175 |
--------------------------------------------------------------------------------
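
The snippet in the header comment above elides allocation and initialization, and note that this version hard-codes alpha = 1 and beta = 0 inside CsrMV. A minimal end-to-end host driver, assuming the same 9x9 lattice matrix as the documentation; the host array names and the omission of error checking are illustrative choices, not part of the library:

    #include <cub/cub.cuh>   // or equivalently <cub/device/device_spmv.cuh>

    int main()
    {
        const int num_rows = 9, num_cols = 9, num_nonzeros = 24;

        // Host-side CSR description of the 3x3 lattice used in the header's example
        float h_values[num_nonzeros];
        for (int i = 0; i < num_nonzeros; ++i) h_values[i] = 1.0f;
        int   h_row_offsets[]    = {0, 2, 5, 7, 10, 14, 17, 19, 22, 24};
        int   h_column_indices[] = {1, 3, 0, 2, 4, 1, 5, 0, 4, 6, 1, 3,
                                    5, 7, 2, 4, 8, 3, 7, 4, 6, 8, 5, 7};
        float h_x[num_cols];
        for (int i = 0; i < num_cols; ++i) h_x[i] = 1.0f;

        // Device copies
        float *d_values, *d_x, *d_y;
        int   *d_row_offsets, *d_column_indices;
        cudaMalloc(&d_values,         sizeof(h_values));
        cudaMalloc(&d_row_offsets,    sizeof(h_row_offsets));
        cudaMalloc(&d_column_indices, sizeof(h_column_indices));
        cudaMalloc(&d_x,              sizeof(h_x));
        cudaMalloc(&d_y,              num_rows * sizeof(float));
        cudaMemcpy(d_values,         h_values,         sizeof(h_values),         cudaMemcpyHostToDevice);
        cudaMemcpy(d_row_offsets,    h_row_offsets,    sizeof(h_row_offsets),    cudaMemcpyHostToDevice);
        cudaMemcpy(d_column_indices, h_column_indices, sizeof(h_column_indices), cudaMemcpyHostToDevice);
        cudaMemcpy(d_x,              h_x,              sizeof(h_x),              cudaMemcpyHostToDevice);

        // First call sizes the temporary storage, second call runs the SpMV
        void  *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
            d_row_offsets, d_column_indices, d_x, d_y,
            num_rows, num_cols, num_nonzeros);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
            d_row_offsets, d_column_indices, d_x, d_y,
            num_rows, num_cols, num_nonzeros);

        // d_y now holds [2, 3, 2, 3, 4, 3, 2, 3, 2]
        cudaDeviceSynchronize();
        return 0;
    }
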
/cub/grid/grid_barrier.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid
32 | */
33 |
34 | #pragma once
35 |
36 | #include "../util_debug.cuh"
37 | #include "../util_namespace.cuh"
38 | #include "../thread/thread_load.cuh"
39 |
40 | /// Optional outer namespace(s)
41 | CUB_NS_PREFIX
42 |
43 | /// CUB namespace
44 | namespace cub {
45 |
46 |
47 | /**
48 | * \addtogroup GridModule
49 | * @{
50 | */
51 |
52 |
53 | /**
54 | * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
55 | */
56 | class GridBarrier
57 | {
58 | protected :
59 |
60 | typedef unsigned int SyncFlag;
61 |
62 | // Counters in global device memory
63 | SyncFlag* d_sync;
64 |
65 | public:
66 |
67 | /**
68 | * Constructor
69 | */
70 | GridBarrier() : d_sync(NULL) {}
71 |
72 |
73 | /**
74 | * Synchronize
75 | */
76 | __device__ __forceinline__ void Sync() const
77 | {
78 | volatile SyncFlag *d_vol_sync = d_sync;
79 |
80 | // Threadfence and syncthreads to make sure global writes are visible before
81 | // thread-0 reports in with its sync counter
82 | __threadfence();
83 | __syncthreads();
84 |
85 | if (blockIdx.x == 0)
86 | {
87 | // Report in ourselves
88 | if (threadIdx.x == 0)
89 | {
90 | d_vol_sync[blockIdx.x] = 1;
91 | }
92 |
93 | __syncthreads();
94 |
95 | // Wait for everyone else to report in
96 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
97 | {
98 | while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
99 | {
100 | __threadfence_block();
101 | }
102 | }
103 |
104 | __syncthreads();
105 |
106 | // Let everyone know it's safe to proceed
107 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
108 | {
109 | d_vol_sync[peer_block] = 0;
110 | }
111 | }
112 | else
113 | {
114 | if (threadIdx.x == 0)
115 | {
116 | // Report in
117 | d_vol_sync[blockIdx.x] = 1;
118 |
119 | // Wait for acknowledgment
120 | while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
121 | {
122 | __threadfence_block();
123 | }
124 | }
125 |
126 | __syncthreads();
127 | }
128 | }
129 | };
130 |
131 |
132 | /**
133 | * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
134 | *
135 | * Uses RAII for lifetime, i.e., device resources are reclaimed when
136 | * the destructor is called.
137 | */
138 | class GridBarrierLifetime : public GridBarrier
139 | {
140 | protected:
141 |
142 | // Number of bytes backed by d_sync
143 | size_t sync_bytes;
144 |
145 | public:
146 |
147 | /**
148 | * Constructor
149 | */
150 | GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
151 |
152 |
153 | /**
154 | * DeviceFrees and resets the progress counters
155 | */
156 | cudaError_t HostReset()
157 | {
158 | cudaError_t retval = cudaSuccess;
159 | if (d_sync)
160 | {
161 | CubDebug(retval = cudaFree(d_sync));
162 | d_sync = NULL;
163 | }
164 | sync_bytes = 0;
165 | return retval;
166 | }
167 |
168 |
169 | /**
170 | * Destructor
171 | */
172 | virtual ~GridBarrierLifetime()
173 | {
174 | HostReset();
175 | }
176 |
177 |
178 | /**
179 | * Sets up the progress counters for the next kernel launch (lazily
180 | * allocating and initializing them if necessary)
181 | */
182 | cudaError_t Setup(int sweep_grid_size)
183 | {
184 | cudaError_t retval = cudaSuccess;
185 | do {
186 | size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
187 | if (new_sync_bytes > sync_bytes)
188 | {
189 | if (d_sync)
190 | {
191 | if (CubDebug(retval = cudaFree(d_sync))) break;
192 | }
193 |
194 | sync_bytes = new_sync_bytes;
195 |
196 | // Allocate and initialize to zero
197 | if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break;
198 | if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
199 | }
200 | } while (0);
201 |
202 | return retval;
203 | }
204 | };
205 |
206 |
207 | /** @} */ // end group GridModule
208 |
209 | } // CUB namespace
210 | CUB_NS_POSTFIX // Optional outer namespace(s)
211 |
212 |
--------------------------------------------------------------------------------
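
A sketch of how GridBarrierLifetime is typically wired up on the host. It assumes the grid is small enough that all thread blocks are co-resident on the device (a requirement for any software grid barrier); the kernel name, grid size, and per-phase work are illustrative:

    #include <cub/grid/grid_barrier.cuh>

    // Each block does a phase of work, then waits for every other block
    // before starting the next phase.
    __global__ void TwoPhaseKernel(cub::GridBarrier barrier, int *d_data)
    {
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        d_data[tid] += 1;          // phase 1

        barrier.Sync();            // software grid-wide barrier

        d_data[tid] *= 2;          // phase 2 sees every block's phase-1 writes
    }

    void Launch(int *d_data)
    {
        const int grid_size = 32, block_size = 128;   // must all fit concurrently on the device

        cub::GridBarrierLifetime barrier;
        barrier.Setup(grid_size);                     // lazily allocates and zeroes the sync counters

        TwoPhaseKernel<<<grid_size, block_size>>>(barrier, d_data);
        cudaDeviceSynchronize();
    }
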
/cub/grid/grid_even_share.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains).
32 | */
33 |
34 |
35 | #pragma once
36 |
37 | #include "../util_namespace.cuh"
38 | #include "../util_macro.cuh"
39 |
40 | /// Optional outer namespace(s)
41 | CUB_NS_PREFIX
42 |
43 | /// CUB namespace
44 | namespace cub {
45 |
46 |
47 | /**
48 | * \addtogroup GridModule
49 | * @{
50 | */
51 |
52 |
53 | /**
54 | * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains).
55 | *
56 | * \par Overview
57 | * GridEvenShare indicates which sections of input are to be mapped onto which threadblocks.
58 | * Threadblocks may receive one of three different amounts of work: "big", "normal",
59 | * and "last". The "big" workloads are one scheduling grain larger than "normal". The "last" work unit
60 | * for the last threadblock may be partially-full if the input is not an even multiple of
61 | * the scheduling grain size.
62 | *
63 | * \par
64 | * Before invoking a child grid, a parent thread will typically construct an instance of
65 | * GridEvenShare. The instance can be passed to child threadblocks which can
66 | * initialize their per-threadblock offsets using \p BlockInit().
67 | *
68 | * \tparam OffsetT Signed integer type for global offsets
69 | */
70 | template <typename OffsetT>
71 | struct GridEvenShare
72 | {
73 | OffsetT total_grains;
74 | int big_blocks;
75 | OffsetT big_share;
76 | OffsetT normal_share;
77 | OffsetT normal_base_offset;
78 |
79 | /// Total number of input items
80 | OffsetT num_items;
81 |
82 | /// Grid size in threadblocks
83 | int grid_size;
84 |
85 | /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles
86 | OffsetT block_offset;
87 |
88 | /// OffsetT into input marking the end (one-past) of the owning thread block's segment of input tiles
89 | OffsetT block_end;
90 |
91 | /**
92 | * \brief Default constructor. Zero-initializes block-specific fields.
93 | */
94 | __host__ __device__ __forceinline__ GridEvenShare() :
95 | num_items(0),
96 | grid_size(0),
97 | block_offset(0),
98 | block_end(0) {}
99 |
100 | /**
101 | * \brief Constructor. Initializes the grid-specific members \p num_items and \p grid_size. To be called prior to kernel launch.
102 | */
103 | __host__ __device__ __forceinline__ GridEvenShare(
104 | OffsetT num_items, ///< Total number of input items
105 | int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the number of input items)
106 | int schedule_granularity) ///< Granularity by which the input can be parcelled into and distributed among threadblocks. Usually the thread block's native tile size (or a multiple thereof).
107 | {
108 | this->num_items = num_items;
109 | this->block_offset = num_items;
110 | this->block_end = num_items;
111 | this->total_grains = (num_items + schedule_granularity - 1) / schedule_granularity;
112 | this->grid_size = CUB_MIN(total_grains, max_grid_size);
113 | OffsetT grains_per_block = total_grains / grid_size;
114 | this->big_blocks = total_grains - (grains_per_block * grid_size); // leftover grains go to big blocks
115 | this->normal_share = grains_per_block * schedule_granularity;
116 | this->normal_base_offset = big_blocks * schedule_granularity;
117 | this->big_share = normal_share + schedule_granularity;
118 | }
119 |
120 |
121 |
122 | /**
123 | * \brief Initializes ranges for the specified partition index
124 | */
125 | __device__ __forceinline__ void Init(int partition_id)
126 | {
127 | if (partition_id < big_blocks)
128 | {
129 | // This threadblock gets a big share of grains (grains_per_block + 1)
130 | block_offset = (partition_id * big_share);
131 | block_end = block_offset + big_share;
132 | }
133 | else if (partition_id < total_grains)
134 | {
135 | // This threadblock gets a normal share of grains (grains_per_block)
136 | block_offset = normal_base_offset + (partition_id * normal_share);
137 | block_end = CUB_MIN(num_items, block_offset + normal_share);
138 | }
139 | }
140 |
141 |
142 | /**
143 | * \brief Initializes ranges for the current thread block (e.g., to be called by each threadblock after startup)
144 | */
145 | __device__ __forceinline__ void BlockInit()
146 | {
147 | Init(blockIdx.x);
148 | }
149 |
150 |
151 | /**
152 | * Print to stdout
153 | */
154 | __host__ __device__ __forceinline__ void Print()
155 | {
156 | printf(
157 | #if (CUB_PTX_ARCH > 0)
158 | "\tthreadblock(%d) "
159 | "block_offset(%lu) "
160 | "block_end(%lu) "
161 | #endif
162 | "num_items(%lu) "
163 | "total_grains(%lu) "
164 | "big_blocks(%lu) "
165 | "big_share(%lu) "
166 | "normal_share(%lu)\n",
167 | #if (CUB_PTX_ARCH > 0)
168 | blockIdx.x,
169 | (unsigned long) block_offset,
170 | (unsigned long) block_end,
171 | #endif
172 | (unsigned long) num_items,
173 | (unsigned long) total_grains,
174 | (unsigned long) big_blocks,
175 | (unsigned long) big_share,
176 | (unsigned long) normal_share);
177 | }
178 | };
179 |
180 |
181 |
182 | /** @} */ // end group GridModule
183 |
184 | } // CUB namespace
185 | CUB_NS_POSTFIX // Optional outer namespace(s)
186 |
--------------------------------------------------------------------------------
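
The share arithmetic is easiest to follow with concrete numbers. The host-side sketch below (illustrative sizes, plain int offsets) mirrors the constructor above:

    #include <cstdio>
    #include <cub/grid/grid_even_share.cuh>

    int main()
    {
        // 1000 items, at most 4 blocks, distributed in grains of 128 items
        cub::GridEvenShare<int> even_share(1000, 4, 128);

        // total_grains = ceil(1000 / 128) = 8
        // grains_per_block = 8 / 4 = 2, so normal_share = 256 items
        // leftover grains = 8 - 2*4 = 0, so there are no "big" blocks here
        printf("grid_size = %d\n", even_share.grid_size);   // 4
        even_share.Print();                                  // host-side summary of the shares
        return 0;
    }
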
/cub/grid/grid_mapping.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
32 | */
33 |
34 | #pragma once
35 |
36 | #include "../util_namespace.cuh"
37 |
38 | /// Optional outer namespace(s)
39 | CUB_NS_PREFIX
40 |
41 | /// CUB namespace
42 | namespace cub {
43 |
44 |
45 | /**
46 | * \addtogroup GridModule
47 | * @{
48 | */
49 |
50 |
51 | /******************************************************************************
52 | * Mapping policies
53 | *****************************************************************************/
54 |
55 |
56 | /**
57 | * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
58 | */
59 | enum GridMappingStrategy
60 | {
61 | /**
62 | * \brief An "even-share" strategy for assigning input tiles to thread blocks.
63 | *
64 | * \par Overview
65 | * The input is evenly partitioned into \p p segments, where \p p is
66 | * constant and corresponds loosely to the number of thread blocks that may
67 | * actively reside on the target device. Each segment is comprised of
68 | * consecutive tiles, where a tile is a small, constant-sized unit of input
69 | * to be processed to completion before the thread block terminates or
70 | * obtains more work. The kernel invokes \p p thread blocks, each
71 | * of which iteratively consumes a segment of n/p elements
72 | * in tile-size increments.
73 | */
74 | GRID_MAPPING_EVEN_SHARE,
75 |
76 | /**
77 | * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
78 | *
79 | * \par Overview
80 | * The input is treated as a queue to be dynamically consumed by a grid of
81 | * thread blocks. Work is atomically dequeued in tiles, where a tile is a
82 | * unit of input to be processed to completion before the thread block
83 | * terminates or obtains more work. The grid size \p p is constant,
84 | * loosely corresponding to the number of thread blocks that may actively
85 | * reside on the target device.
86 | */
87 | GRID_MAPPING_DYNAMIC,
88 | };
89 |
90 |
91 | /** @} */ // end group GridModule
92 |
93 | } // CUB namespace
94 | CUB_NS_POSTFIX // Optional outer namespace(s)
95 |
96 |
--------------------------------------------------------------------------------
/cub/grid/grid_queue.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * cub::GridQueue is a descriptor utility for dynamic queue management.
32 | */
33 |
34 | #pragma once
35 |
36 | #include "../util_namespace.cuh"
37 | #include "../util_debug.cuh"
38 |
39 | /// Optional outer namespace(s)
40 | CUB_NS_PREFIX
41 |
42 | /// CUB namespace
43 | namespace cub {
44 |
45 |
46 | /**
47 | * \addtogroup GridModule
48 | * @{
49 | */
50 |
51 |
52 | /**
53 | * \brief GridQueue is a descriptor utility for dynamic queue management.
54 | *
55 | * \par Overview
56 | * GridQueue descriptors provide abstractions for "filling" or
57 | * "draining" globally-shared vectors.
58 | *
59 | * \par
60 | * A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
61 | * returning a unique offset for the calling thread to write its items.
62 | * The GridQueue maintains the total "fill-size". The fill counter must be reset
63 | * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
64 | * will be filling.
65 | *
66 | * \par
67 | * Similarly, a "draining" GridQueue works by atomically-incrementing a
68 | * zero-initialized counter, returning a unique offset for the calling thread to
69 | * read its items. Threads can safely drain until the array's logical fill-size is
70 | * exceeded. The drain counter must be reset using GridQueue::ResetDrain or
71 | * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that
72 | * will be draining. (For dynamic work distribution of existing data, the corresponding fill-size
73 | * is simply the number of elements in the array.)
74 | *
75 | * \par
76 | * Iterative work management can be implemented simply with a pair of flip-flopping
77 | * work buffers, each with an associated set of fill and drain GridQueue descriptors.
78 | *
79 | * \tparam OffsetT Signed integer type for global offsets
80 | */
81 | template <typename OffsetT>
82 | class GridQueue
83 | {
84 | private:
85 |
86 | /// Counter indices
87 | enum
88 | {
89 | FILL = 0,
90 | DRAIN = 1,
91 | };
92 |
93 | /// Pair of counters
94 | OffsetT *d_counters;
95 |
96 | public:
97 |
98 | /// Returns the device allocation size in bytes needed to construct a GridQueue instance
99 | __host__ __device__ __forceinline__
100 | static size_t AllocationSize()
101 | {
102 | return sizeof(OffsetT) * 2;
103 | }
104 |
105 |
106 | /// Constructs an invalid GridQueue descriptor
107 | __host__ __device__ __forceinline__ GridQueue()
108 | :
109 | d_counters(NULL)
110 | {}
111 |
112 |
113 | /// Constructs a GridQueue descriptor around the device storage allocation
114 | __host__ __device__ __forceinline__ GridQueue(
115 | void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize().
116 | :
117 | d_counters((OffsetT*) d_storage)
118 | {}
119 |
120 |
121 | /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining.
122 | __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain(
123 | OffsetT fill_size,
124 | cudaStream_t stream = 0)
125 | {
126 | #if (CUB_PTX_ARCH > 0)
127 | d_counters[FILL] = fill_size;
128 | d_counters[DRAIN] = 0;
129 | return cudaSuccess;
130 | #else
131 | OffsetT counters[2];
132 | counters[FILL] = fill_size;
133 | counters[DRAIN] = 0;
134 | return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream));
135 | #endif
136 | }
137 |
138 |
139 | /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining.
140 | __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0)
141 | {
142 | #if (CUB_PTX_ARCH > 0)
143 | d_counters[DRAIN] = 0;
144 | return cudaSuccess;
145 | #else
146 | return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream));
147 | #endif
148 | }
149 |
150 |
151 | /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling.
152 | __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0)
153 | {
154 | #if (CUB_PTX_ARCH > 0)
155 | d_counters[FILL] = 0;
156 | return cudaSuccess;
157 | #else
158 | return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream));
159 | #endif
160 | }
161 |
162 |
163 | /// Returns the fill-size established by the parent or by the previous kernel.
164 | __host__ __device__ __forceinline__ cudaError_t FillSize(
165 | OffsetT &fill_size,
166 | cudaStream_t stream = 0)
167 | {
168 | #if (CUB_PTX_ARCH > 0)
169 | fill_size = d_counters[FILL];
170 | return cudaSuccess;
171 | #else
172 | return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream));
173 | #endif
174 | }
175 |
176 |
177 | /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from CUDA kernel.
178 | __device__ __forceinline__ OffsetT Drain(OffsetT num_items)
179 | {
180 | return atomicAdd(d_counters + DRAIN, num_items);
181 | }
182 |
183 |
184 | /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from CUDA kernel.
185 | __device__ __forceinline__ OffsetT Fill(OffsetT num_items)
186 | {
187 | return atomicAdd(d_counters + FILL, num_items);
188 | }
189 | };
190 |
191 |
192 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
193 |
194 |
195 | /**
196 | * Reset grid queue (call with 1 block of 1 thread)
197 | */
198 | template <typename OffsetT>
199 | __global__ void FillAndResetDrainKernel(
200 | GridQueue<OffsetT> grid_queue,
201 | OffsetT num_items)
202 | {
203 | grid_queue.FillAndResetDrain(num_items);
204 | }
205 |
206 |
207 |
208 | #endif // DOXYGEN_SHOULD_SKIP_THIS
209 |
210 |
211 | /** @} */ // end group GridModule
212 |
213 | } // CUB namespace
214 | CUB_NS_POSTFIX // Optional outer namespace(s)
215 |
216 |
217 |
--------------------------------------------------------------------------------
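
A sketch of the drain pattern the comments above describe: the host establishes the fill-size, and each thread block then atomically dequeues fixed-size tiles until the queue is exhausted. The kernel name, tile size, and per-tile work are illustrative:

    #include <cub/grid/grid_queue.cuh>

    const int TILE_ITEMS = 256;

    // Each block repeatedly grabs a tile-sized range of the input until none remain.
    __global__ void DrainKernel(cub::GridQueue<int> queue, const float *d_in, float *d_out, int num_items)
    {
        __shared__ int tile_offset;

        while (true)
        {
            // One thread per block dequeues the next tile
            if (threadIdx.x == 0)
                tile_offset = queue.Drain(TILE_ITEMS);
            __syncthreads();

            if (tile_offset >= num_items)
                break;                                  // queue exhausted

            int idx = tile_offset + threadIdx.x;
            if (idx < num_items)
                d_out[idx] = d_in[idx] * 2.0f;          // per-tile work
            __syncthreads();                            // done with tile_offset before it is overwritten
        }
    }

    void Launch(const float *d_in, float *d_out, int num_items)
    {
        // Back the queue with its required device allocation
        void *d_queue_storage;
        cudaMalloc(&d_queue_storage, cub::GridQueue<int>::AllocationSize());
        cub::GridQueue<int> queue(d_queue_storage);

        // Establish the fill-size and zero the drain counter before draining
        queue.FillAndResetDrain(num_items);

        DrainKernel<<<32, TILE_ITEMS>>>(queue, d_in, d_out, num_items);
        cudaDeviceSynchronize();
        cudaFree(d_queue_storage);
    }
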
/cub/host/mutex.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Simple portable mutex
32 | */
33 |
34 |
35 | #pragma once
36 |
37 | #if __cplusplus > 199711L
38 | #include <mutex>
39 | #else
40 | #if defined(_WIN32) || defined(_WIN64)
41 | #include <intrin.h>
42 | #include <windows.h>
43 | #undef small // Windows is terrible for polluting macro namespace
44 |
45 | /**
46 | * Compiler read/write barrier
47 | */
48 | #pragma intrinsic(_ReadWriteBarrier)
49 |
50 | #endif
51 | #endif
52 |
53 | #include "../util_namespace.cuh"
54 |
55 |
56 | /// Optional outer namespace(s)
57 | CUB_NS_PREFIX
58 |
59 | /// CUB namespace
60 | namespace cub {
61 |
62 |
63 | /**
64 | * Simple portable mutex
65 | * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms)
66 | * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++)
67 | */
68 | struct Mutex
69 | {
70 | #if __cplusplus > 199711L
71 |
72 | std::mutex mtx;
73 |
74 | void Lock()
75 | {
76 | mtx.lock();
77 | }
78 |
79 | void Unlock()
80 | {
81 | mtx.unlock();
82 | }
83 |
84 | void TryLock()
85 | {
86 | mtx.try_lock();
87 | }
88 |
89 | #else //__cplusplus > 199711L
90 |
91 | #if defined(_MSC_VER)
92 |
93 | // Microsoft VC++
94 | typedef long Spinlock;
95 |
96 | #else
97 |
98 | // GNU g++
99 | typedef int Spinlock;
100 |
101 | /**
102 | * Compiler read/write barrier
103 | */
104 | __forceinline__ void _ReadWriteBarrier()
105 | {
106 | __sync_synchronize();
107 | }
108 |
109 | /**
110 | * Atomic exchange
111 | */
112 | __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value)
113 | {
114 | // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier
115 | _ReadWriteBarrier();
116 | return __sync_lock_test_and_set(Target, Value);
117 | }
118 |
119 | /**
120 | * Pause instruction to prevent excess processor bus usage
121 | */
122 | __forceinline__ void YieldProcessor()
123 | {
124 | #ifndef __arm__
125 | asm volatile("pause\n": : :"memory");
126 | #endif // __arm__
127 | }
128 |
129 | #endif // defined(_MSC_VER)
130 |
131 | /// Lock member
132 | volatile Spinlock lock;
133 |
134 | /**
135 | * Constructor
136 | */
137 | Mutex() : lock(0) {}
138 |
139 | /**
140 | * Return when the specified spinlock has been acquired
141 | */
142 | __forceinline__ void Lock()
143 | {
144 | while (1)
145 | {
146 | if (!_InterlockedExchange(&lock, 1)) return;
147 | while (lock) YieldProcessor();
148 | }
149 | }
150 |
151 |
152 | /**
153 | * Release the specified spinlock
154 | */
155 | __forceinline__ void Unlock()
156 | {
157 | _ReadWriteBarrier();
158 | lock = 0;
159 | }
160 |
161 | #endif // __cplusplus > 199711L
162 |
163 | };
164 |
165 |
166 |
167 |
168 | } // CUB namespace
169 | CUB_NS_POSTFIX // Optional outer namespace(s)
170 |
171 |
--------------------------------------------------------------------------------
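
CUB uses this mutex internally to serialize access to shared host-side bookkeeping (for example, in the caching allocator). The same pattern from user host code looks like the sketch below, with an illustrative shared counter:

    #include <cub/host/mutex.cuh>

    static cub::Mutex g_mutex;
    static int        g_counter = 0;

    // Called from multiple host threads; the std::mutex/spinlock wrapper
    // serializes the read-modify-write.
    void IncrementShared()
    {
        g_mutex.Lock();
        ++g_counter;
        g_mutex.Unlock();
    }
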
/cub/iterator/arg_index_input_iterator.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Random-access iterator types
32 | */
33 |
34 | #pragma once
35 |
36 | #include <iterator>
37 | #include <iostream>
38 |
39 | #include "../thread/thread_load.cuh"
40 | #include "../thread/thread_store.cuh"
41 | #include "../util_device.cuh"
42 | #include "../util_namespace.cuh"
43 |
44 | #include <thrust/version.h>
45 |
46 | #if (THRUST_VERSION >= 100700)
47 | // This iterator is compatible with Thrust API 1.7 and newer
48 | #include <thrust/iterator/iterator_facade.h>
49 | #include <thrust/iterator/iterator_traits.h>
50 | #endif // THRUST_VERSION
51 |
52 | /// Optional outer namespace(s)
53 | CUB_NS_PREFIX
54 |
55 | /// CUB namespace
56 | namespace cub {
57 |
58 | /**
59 | * \addtogroup UtilIterator
60 | * @{
61 | */
62 |
63 |
64 | /**
65 | * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples).
66 | *
67 | * \par Overview
68 | * - ArgIndexInputIteratorT wraps a random access input iterator \p itr of type \p InputIteratorT.
69 | * Dereferencing an ArgIndexInputIteratorT at offset \p i produces a \p KeyValuePair value whose
70 | * \p key field is \p i and whose \p value field is itr[i].
71 | * - Can be used with any data type.
72 | * - Can be constructed, manipulated, and exchanged within and between host and device
73 | * functions. Wrapped host memory can only be dereferenced on the host, and wrapped
74 | * device memory can only be dereferenced on the device.
75 | * - Compatible with Thrust API v1.7 or newer.
76 | *
77 | * \par Snippet
78 | * The code snippet below illustrates the use of \p ArgIndexInputIteratorT to
79 | * dereference an array of doubles
80 | * \par
81 | * \code
82 | * #include <cub/cub.cuh> // or equivalently <cub/iterator/arg_index_input_iterator.cuh>
83 | *
84 | * // Declare, allocate, and initialize a device array
85 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
86 | *
87 | * // Create an iterator wrapper
88 | * cub::ArgIndexInputIterator<double*> itr(d_in);
89 | *
90 | * // Within device code:
91 | * typedef typename cub::ArgIndexInputIterator<double*>::value_type Tuple;
92 | * Tuple item_offset_pair = *itr;
93 | * printf("%f @ %d\n",
94 | * item_offset_pair.value,
95 | * item_offset_pair.key); // 8.0 @ 0
96 | *
97 | * itr = itr + 6;
98 | * item_offset_pair = *itr;
99 | * printf("%f @ %d\n",
100 | * item_offset_pair.value,
101 | * item_offset_pair.key); // 9.0 @ 6
102 | *
103 | * \endcode
104 | *
105 | * \tparam InputIteratorT The type of the wrapped input iterator
106 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t)
107 | */
108 | template <
109 | typename InputIteratorT,
110 | typename OffsetT = ptrdiff_t>
111 | class ArgIndexInputIterator
112 | {
113 | private:
114 |
115 | // Data type of input iterator
116 | typedef typename std::iterator_traits<InputIteratorT>::value_type T;
117 |
118 | public:
119 |
120 |
121 | // Required iterator traits
122 | typedef ArgIndexInputIterator self_type; ///< My own type
123 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
124 | typedef KeyValuePair<difference_type, T> value_type; ///< The type of the element the iterator can point to
125 | typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to
126 | typedef value_type reference; ///< The type of a reference to an element the iterator can point to
127 |
128 | #if (THRUST_VERSION >= 100700)
129 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
130 | typedef typename thrust::detail::iterator_facade_category<
131 | thrust::any_system_tag,
132 | thrust::random_access_traversal_tag,
133 | value_type,
134 | reference
135 | >::type iterator_category; ///< The iterator category
136 | #else
137 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
138 | #endif // THRUST_VERSION
139 |
140 | private:
141 |
142 | InputIteratorT itr;
143 | difference_type offset;
144 |
145 | public:
146 |
147 | /// Constructor
148 | __host__ __device__ __forceinline__ ArgIndexInputIterator(
149 | InputIteratorT itr, ///< Input iterator to wrap
150 | difference_type offset = 0) ///< OffsetT (in items) from \p itr denoting the position of the iterator
151 | :
152 | itr(itr),
153 | offset(offset)
154 | {}
155 |
156 | /// Postfix increment
157 | __host__ __device__ __forceinline__ self_type operator++(int)
158 | {
159 | self_type retval = *this;
160 | offset++;
161 | return retval;
162 | }
163 |
164 | /// Prefix increment
165 | __host__ __device__ __forceinline__ self_type operator++()
166 | {
167 | offset++;
168 | return *this;
169 | }
170 |
171 | /// Indirection
172 | __host__ __device__ __forceinline__ reference operator*() const
173 | {
174 | value_type retval;
175 | retval.value = itr[offset];
176 | retval.key = offset;
177 | return retval;
178 | }
179 |
180 | /// Addition
181 | template
182 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const
183 | {
184 | self_type retval(itr, offset + n);
185 | return retval;
186 | }
187 |
188 | /// Addition assignment
189 | template
190 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
191 | {
192 | offset += n;
193 | return *this;
194 | }
195 |
196 | /// Subtraction
197 | template
198 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const
199 | {
200 | self_type retval(itr, offset - n);
201 | return retval;
202 | }
203 |
204 | /// Subtraction assignment
205 | template
206 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
207 | {
208 | offset -= n;
209 | return *this;
210 | }
211 |
212 | /// Distance
213 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
214 | {
215 | return offset - other.offset;
216 | }
217 |
218 | /// Array subscript
219 | template
220 | __host__ __device__ __forceinline__ reference operator[](Distance n) const
221 | {
222 | self_type offset = (*this) + n;
223 | return *offset;
224 | }
225 |
226 | /// Structure dereference
227 | __host__ __device__ __forceinline__ pointer operator->()
228 | {
229 | return &(*(*this));
230 | }
231 |
232 | /// Equal to
233 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
234 | {
235 | return ((itr == rhs.itr) && (offset == rhs.offset));
236 | }
237 |
238 | /// Not equal to
239 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
240 | {
241 | return ((itr != rhs.itr) || (offset != rhs.offset));
242 | }
243 |
244 | /// Normalize
245 | __host__ __device__ __forceinline__ void normalize()
246 | {
247 | itr += offset;
248 | offset = 0;
249 | }
250 |
251 | /// ostream operator
252 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
253 | {
254 | return os;
255 | }
256 | };
257 |
258 |
259 |
260 | /** @} */ // end group UtilIterator
261 |
262 | } // CUB namespace
263 | CUB_NS_POSTFIX // Optional outer namespace(s)
264 |
--------------------------------------------------------------------------------
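
Because dereferencing yields a KeyValuePair of (offset, value), this wrapper is handy whenever a computation needs to carry an element's position along with its value. A small sketch that tags every element with its index on the device; the kernel and buffer names are illustrative:

    #include <cub/cub.cuh>

    // Writes each input element out together with its original offset.
    __global__ void TagWithIndexKernel(
        cub::ArgIndexInputIterator<const double*> itr,
        cub::KeyValuePair<ptrdiff_t, double>      *d_pairs,
        int                                        num_items)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < num_items)
            d_pairs[i] = itr[i];    // .key == i, .value == d_in[i]
    }

    void Launch(const double *d_in, cub::KeyValuePair<ptrdiff_t, double> *d_pairs, int num_items)
    {
        cub::ArgIndexInputIterator<const double*> itr(d_in);
        TagWithIndexKernel<<<(num_items + 127) / 128, 128>>>(itr, d_pairs, num_items);
    }
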
/cub/iterator/cache_modified_input_iterator.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Random-access iterator types
32 | */
33 |
34 | #pragma once
35 |
36 | #include <iterator>
37 | #include <iostream>
38 |
39 | #include "../thread/thread_load.cuh"
40 | #include "../thread/thread_store.cuh"
41 | #include "../util_device.cuh"
42 | #include "../util_namespace.cuh"
43 |
44 | #if (THRUST_VERSION >= 100700)
45 | // This iterator is compatible with Thrust API 1.7 and newer
46 | #include <thrust/iterator/iterator_facade.h>
47 | #include <thrust/iterator/iterator_traits.h>
48 | #endif // THRUST_VERSION
49 |
50 |
51 | /// Optional outer namespace(s)
52 | CUB_NS_PREFIX
53 |
54 | /// CUB namespace
55 | namespace cub {
56 |
57 |
58 |
59 | /**
60 | * \addtogroup UtilIterator
61 | * @{
62 | */
63 |
64 |
65 | /**
66 | * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier.
67 | *
68 | * \par Overview
69 | * - CacheModifiedInputIteratorT is a random-access input iterator that wraps a native
70 | * device pointer of type ValueType*. \p ValueType references are
71 | * made by reading \p ValueType values through loads modified by \p MODIFIER.
72 | * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG",
73 | * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.).
74 | * - Can be constructed, manipulated, and exchanged within and between host and device
75 | * functions, but can only be dereferenced within device functions.
76 | * - Compatible with Thrust API v1.7 or newer.
77 | *
78 | * \par Snippet
79 | * The code snippet below illustrates the use of \p CacheModifiedInputIteratorT to
80 | * dereference a device array of double using the "ldg" PTX load modifier
81 | * (i.e., load values through texture cache).
82 | * \par
83 | * \code
84 | * #include <cub/cub.cuh>   // or equivalently <cub/iterator/cache_modified_input_iterator.cuh>
85 | *
86 | * // Declare, allocate, and initialize a device array
87 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
88 | *
89 | * // Create an iterator wrapper
90 | * cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> itr(d_in);
91 | *
92 | * // Within device code:
93 | * printf("%f\n", itr[0]); // 8.0
94 | * printf("%f\n", itr[1]); // 6.0
95 | * printf("%f\n", itr[6]); // 9.0
96 | *
97 | * \endcode
98 | *
99 | * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data
100 | * \tparam ValueType The value type of this iterator
101 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t)
102 | */
103 | template <
104 | CacheLoadModifier MODIFIER,
105 | typename ValueType,
106 | typename OffsetT = ptrdiff_t>
107 | class CacheModifiedInputIterator
108 | {
109 | public:
110 |
111 | // Required iterator traits
112 | typedef CacheModifiedInputIterator self_type; ///< My own type
113 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
114 | typedef ValueType value_type; ///< The type of the element the iterator can point to
115 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to
116 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to
117 |
118 | #if (THRUST_VERSION >= 100700)
119 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
120 | typedef typename thrust::detail::iterator_facade_category<
121 | thrust::device_system_tag,
122 | thrust::random_access_traversal_tag,
123 | value_type,
124 | reference
125 | >::type iterator_category; ///< The iterator category
126 | #else
127 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
128 | #endif // THRUST_VERSION
129 |
130 |
131 | public:
132 |
133 | /// Wrapped native pointer
134 | ValueType* ptr;
135 |
136 | /// Constructor
137 | template <typename QualifiedValueType>
138 | __host__ __device__ __forceinline__ CacheModifiedInputIterator(
139 | QualifiedValueType* ptr) ///< Native pointer to wrap
140 | :
141 | ptr(const_cast<typename RemoveQualifiers<QualifiedValueType>::Type *>(ptr))
142 | {}
143 |
144 | /// Postfix increment
145 | __host__ __device__ __forceinline__ self_type operator++(int)
146 | {
147 | self_type retval = *this;
148 | ptr++;
149 | return retval;
150 | }
151 |
152 | /// Prefix increment
153 | __host__ __device__ __forceinline__ self_type operator++()
154 | {
155 | ptr++;
156 | return *this;
157 | }
158 |
159 | /// Indirection
160 | __device__ __forceinline__ reference operator*() const
161 | {
162 | return ThreadLoad<MODIFIER>(ptr);
163 | }
164 |
165 | /// Addition
166 | template <typename Distance>
167 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const
168 | {
169 | self_type retval(ptr + n);
170 | return retval;
171 | }
172 |
173 | /// Addition assignment
174 | template <typename Distance>
175 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
176 | {
177 | ptr += n;
178 | return *this;
179 | }
180 |
181 | /// Subtraction
182 | template <typename Distance>
183 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const
184 | {
185 | self_type retval(ptr - n);
186 | return retval;
187 | }
188 |
189 | /// Subtraction assignment
190 | template <typename Distance>
191 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
192 | {
193 | ptr -= n;
194 | return *this;
195 | }
196 |
197 | /// Distance
198 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
199 | {
200 | return ptr - other.ptr;
201 | }
202 |
203 | /// Array subscript
204 | template <typename Distance>
205 | __device__ __forceinline__ reference operator[](Distance n) const
206 | {
207 | return ThreadLoad<MODIFIER>(ptr + n);
208 | }
209 |
210 | /// Structure dereference
211 | __device__ __forceinline__ pointer operator->()
212 | {
213 | return &ThreadLoad<MODIFIER>(ptr);
214 | }
215 |
216 | /// Equal to
217 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
218 | {
219 | return (ptr == rhs.ptr);
220 | }
221 |
222 | /// Not equal to
223 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
224 | {
225 | return (ptr != rhs.ptr);
226 | }
227 |
228 | /// ostream operator
229 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
230 | {
231 | return os;
232 | }
233 | };
234 |
235 |
236 |
237 | /** @} */ // end group UtilIterator
238 |
239 | } // CUB namespace
240 | CUB_NS_POSTFIX // Optional outer namespace(s)
241 |
--------------------------------------------------------------------------------
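Note: the snippet below is a minimal usage sketch, not part of this repository; the kernel name and launch shape are illustrative assumptions. It shows how CacheModifiedInputIterator is typically dereferenced from device code.

    #include <cub/iterator/cache_modified_input_iterator.cuh>

    // Each thread reads one element through a cache-modified (LDG) load.
    __global__ void ReadThroughLdg(const double *d_in, double *d_out, int num_items)
    {
        cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> itr(d_in);
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < num_items)
            d_out[i] = itr[i];   // operator[] issues ThreadLoad<LOAD_LDG>(ptr + i)
    }

    // Host-side launch, e.g.:
    // ReadThroughLdg<<<(num_items + 255) / 256, 256>>>(d_in, d_out, num_items);
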
/cub/iterator/cache_modified_output_iterator.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Random-access iterator types
32 | */
33 |
34 | #pragma once
35 |
36 | #include <iterator>
37 | #include <iostream>
38 |
39 | #include "../thread/thread_load.cuh"
40 | #include "../thread/thread_store.cuh"
41 | #include "../util_device.cuh"
42 | #include "../util_namespace.cuh"
43 |
44 | #if (THRUST_VERSION >= 100700)
45 | // This iterator is compatible with Thrust API 1.7 and newer
46 | #include <thrust/iterator/iterator_facade.h>
47 | #include <thrust/iterator/iterator_traits.h>
48 | #endif // THRUST_VERSION
49 |
50 |
51 | /// Optional outer namespace(s)
52 | CUB_NS_PREFIX
53 |
54 | /// CUB namespace
55 | namespace cub {
56 |
57 |
58 | /**
59 | * \addtogroup UtilIterator
60 | * @{
61 | */
62 |
63 |
64 | /**
65 | * \brief A random-access output wrapper for storing array values using a PTX cache-modifier.
66 | *
67 | * \par Overview
68 | * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native
69 | * device pointer of type ValueType*. \p ValueType references are
70 | * made by writing \p ValueType values through stores modified by \p MODIFIER.
71 | * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB",
72 | * "STORE_CG", "STORE_CS", "STORE_WT", etc.).
73 | * - Can be constructed, manipulated, and exchanged within and between host and device
74 | * functions, but can only be dereferenced within device functions.
75 | * - Compatible with Thrust API v1.7 or newer.
76 | *
77 | * \par Snippet
78 | * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to
79 | * dereference a device array of doubles using the "wt" PTX store modifier
80 | * (i.e., write-through to system memory).
81 | * \par
82 | * \code
83 | * #include <cub/cub.cuh>   // or equivalently <cub/iterator/cache_modified_output_iterator.cuh>
84 | *
85 | * // Declare, allocate, and initialize a device array
86 | * double *d_out; // e.g., [, , , , , , ]
87 | *
88 | * // Create an iterator wrapper
89 | * cub::CacheModifiedOutputIterator<cub::STORE_WT, double> itr(d_out);
90 | *
91 | * // Within device code:
92 | * itr[0] = 8.0;
93 | * itr[1] = 66.0;
94 | * itr[55] = 24.0;
95 | *
96 | * \endcode
97 | *
98 | * \par Usage Considerations
99 | * - Can only be dereferenced within device code
100 | *
101 | * \tparam CacheStoreModifier The cub::CacheStoreModifier to use when accessing data
102 | * \tparam ValueType The value type of this iterator
103 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t)
104 | */
105 | template <
106 | CacheStoreModifier MODIFIER,
107 | typename ValueType,
108 | typename OffsetT = ptrdiff_t>
109 | class CacheModifiedOutputIterator
110 | {
111 | private:
112 |
113 | // Proxy object
114 | struct Reference
115 | {
116 | ValueType* ptr;
117 |
118 | /// Constructor
119 | __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {}
120 |
121 | /// Assignment
122 | __device__ __forceinline__ ValueType operator =(ValueType val)
123 | {
124 | ThreadStore<MODIFIER>(ptr, val);
125 | return val;
126 | }
127 | };
128 |
129 | public:
130 |
131 | // Required iterator traits
132 | typedef CacheModifiedOutputIterator self_type; ///< My own type
133 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
134 | typedef ValueType value_type; ///< The type of the element the iterator can point to
135 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to
136 | typedef Reference reference; ///< The type of a reference to an element the iterator can point to
137 |
138 | #if (THRUST_VERSION >= 100700)
139 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
140 | typedef typename thrust::detail::iterator_facade_category<
141 | thrust::device_system_tag,
142 | thrust::random_access_traversal_tag,
143 | value_type,
144 | reference
145 | >::type iterator_category; ///< The iterator category
146 | #else
147 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
148 | #endif // THRUST_VERSION
149 |
150 | private:
151 |
152 | ValueType* ptr;
153 |
154 | public:
155 |
156 | /// Constructor
157 | template <typename QualifiedValueType>
158 | __host__ __device__ __forceinline__ CacheModifiedOutputIterator(
159 | QualifiedValueType* ptr) ///< Native pointer to wrap
160 | :
161 | ptr(const_cast<typename RemoveQualifiers<QualifiedValueType>::Type *>(ptr))
162 | {}
163 |
164 | /// Postfix increment
165 | __host__ __device__ __forceinline__ self_type operator++(int)
166 | {
167 | self_type retval = *this;
168 | ptr++;
169 | return retval;
170 | }
171 |
172 |
173 | /// Prefix increment
174 | __host__ __device__ __forceinline__ self_type operator++()
175 | {
176 | ptr++;
177 | return *this;
178 | }
179 |
180 | /// Indirection
181 | __host__ __device__ __forceinline__ reference operator*() const
182 | {
183 | return Reference(ptr);
184 | }
185 |
186 | /// Addition
187 | template <typename Distance>
188 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const
189 | {
190 | self_type retval(ptr + n);
191 | return retval;
192 | }
193 |
194 | /// Addition assignment
195 | template <typename Distance>
196 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
197 | {
198 | ptr += n;
199 | return *this;
200 | }
201 |
202 | /// Subtraction
203 | template <typename Distance>
204 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const
205 | {
206 | self_type retval(ptr - n);
207 | return retval;
208 | }
209 |
210 | /// Subtraction assignment
211 | template <typename Distance>
212 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
213 | {
214 | ptr -= n;
215 | return *this;
216 | }
217 |
218 | /// Distance
219 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
220 | {
221 | return ptr - other.ptr;
222 | }
223 |
224 | /// Array subscript
225 | template <typename Distance>
226 | __host__ __device__ __forceinline__ reference operator[](Distance n) const
227 | {
228 | return Reference(ptr + n);
229 | }
230 |
231 | /// Equal to
232 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
233 | {
234 | return (ptr == rhs.ptr);
235 | }
236 |
237 | /// Not equal to
238 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
239 | {
240 | return (ptr != rhs.ptr);
241 | }
242 |
243 | /// ostream operator
244 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
245 | {
246 | return os;
247 | }
248 | };
249 |
250 |
251 | /** @} */ // end group UtilIterator
252 |
253 | } // CUB namespace
254 | CUB_NS_POSTFIX // Optional outer namespace(s)
255 |
--------------------------------------------------------------------------------
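Note: a corresponding sketch for the output side (illustrative, not part of this repository; kernel name assumed). Each assignment through the iterator routes through ThreadStore with the chosen cache modifier via the proxy Reference.

    #include <cub/iterator/cache_modified_output_iterator.cuh>

    // Each thread writes one element through a write-through (WT) store.
    __global__ void WriteThroughWt(const double *d_in, double *d_out, int num_items)
    {
        cub::CacheModifiedOutputIterator<cub::STORE_WT, double> out(d_out);
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < num_items)
            out[i] = d_in[i];    // Reference::operator= calls ThreadStore<STORE_WT>(ptr + i, val)
    }
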
/cub/iterator/constant_input_iterator.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Random-access iterator types
32 | */
33 |
34 | #pragma once
35 |
36 | #include <iterator>
37 | #include <iostream>
38 |
39 | #include "../thread/thread_load.cuh"
40 | #include "../thread/thread_store.cuh"
41 | #include "../util_namespace.cuh"
42 |
43 | #if (THRUST_VERSION >= 100700)
44 | // This iterator is compatible with Thrust API 1.7 and newer
45 | #include <thrust/iterator/iterator_facade.h>
46 | #include <thrust/iterator/iterator_traits.h>
47 | #endif // THRUST_VERSION
48 |
49 |
50 | /// Optional outer namespace(s)
51 | CUB_NS_PREFIX
52 |
53 | /// CUB namespace
54 | namespace cub {
55 |
56 |
57 | /**
58 | * \addtogroup UtilIterator
59 | * @{
60 | */
61 |
62 |
63 | /**
64 | * \brief A random-access input generator for dereferencing a sequence of homogeneous values
65 | *
66 | * \par Overview
67 | * - Read references to a ConstantInputIteratorT iterator always return the supplied constant
68 | * of type \p ValueType.
69 | * - Can be used with any data type.
70 | * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
71 | * functions.
72 | * - Compatible with Thrust API v1.7 or newer.
73 | *
74 | * \par Snippet
75 | * The code snippet below illustrates the use of \p ConstantInputIteratorT to
76 | * dereference a sequence of homogeneous doubles.
77 | * \par
78 | * \code
79 | * #include <cub/cub.cuh>   // or equivalently <cub/iterator/constant_input_iterator.cuh>
80 | *
81 | * cub::ConstantInputIterator<double> itr(5.0);
82 | *
83 | * printf("%f\n", itr[0]); // 5.0
84 | * printf("%f\n", itr[1]); // 5.0
85 | * printf("%f\n", itr[2]); // 5.0
86 | * printf("%f\n", itr[50]); // 5.0
87 | *
88 | * \endcode
89 | *
90 | * \tparam ValueType The value type of this iterator
91 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t)
92 | */
93 | template <
94 | typename ValueType,
95 | typename OffsetT = ptrdiff_t>
96 | class ConstantInputIterator
97 | {
98 | public:
99 |
100 | // Required iterator traits
101 | typedef ConstantInputIterator self_type; ///< My own type
102 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
103 | typedef ValueType value_type; ///< The type of the element the iterator can point to
104 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to
105 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to
106 |
107 | #if (THRUST_VERSION >= 100700)
108 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
109 | typedef typename thrust::detail::iterator_facade_category<
110 | thrust::any_system_tag,
111 | thrust::random_access_traversal_tag,
112 | value_type,
113 | reference
114 | >::type iterator_category; ///< The iterator category
115 | #else
116 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
117 | #endif // THRUST_VERSION
118 |
119 | private:
120 |
121 | ValueType val;
122 | OffsetT offset;
123 | #ifdef _WIN32
124 | OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce)
125 | #endif
126 |
127 | public:
128 |
129 | /// Constructor
130 | __host__ __device__ __forceinline__ ConstantInputIterator(
131 | ValueType val, ///< Starting value for the iterator instance to report
132 | OffsetT offset = 0) ///< Base offset
133 | :
134 | val(val),
135 | offset(offset)
136 | {}
137 |
138 | /// Postfix increment
139 | __host__ __device__ __forceinline__ self_type operator++(int)
140 | {
141 | self_type retval = *this;
142 | offset++;
143 | return retval;
144 | }
145 |
146 | /// Prefix increment
147 | __host__ __device__ __forceinline__ self_type operator++()
148 | {
149 | offset++;
150 | return *this;
151 | }
152 |
153 | /// Indirection
154 | __host__ __device__ __forceinline__ reference operator*() const
155 | {
156 | return val;
157 | }
158 |
159 | /// Addition
160 | template <typename Distance>
161 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const
162 | {
163 | self_type retval(val, offset + n);
164 | return retval;
165 | }
166 |
167 | /// Addition assignment
168 | template <typename Distance>
169 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
170 | {
171 | offset += n;
172 | return *this;
173 | }
174 |
175 | /// Subtraction
176 | template <typename Distance>
177 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const
178 | {
179 | self_type retval(val, offset - n);
180 | return retval;
181 | }
182 |
183 | /// Subtraction assignment
184 | template <typename Distance>
185 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
186 | {
187 | offset -= n;
188 | return *this;
189 | }
190 |
191 | /// Distance
192 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
193 | {
194 | return offset - other.offset;
195 | }
196 |
197 | /// Array subscript
198 | template <typename Distance>
199 | __host__ __device__ __forceinline__ reference operator[](Distance n) const
200 | {
201 | return val;
202 | }
203 |
204 | /// Structure dereference
205 | __host__ __device__ __forceinline__ pointer operator->()
206 | {
207 | return &val;
208 | }
209 |
210 | /// Equal to
211 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
212 | {
213 | return (offset == rhs.offset) && ((val == rhs.val));
214 | }
215 |
216 | /// Not equal to
217 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
218 | {
219 | return (offset != rhs.offset) || (val != rhs.val);
220 | }
221 |
222 | /// ostream operator
223 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
224 | {
225 | os << "[" << itr.val << "," << itr.offset << "]";
226 | return os;
227 | }
228 |
229 | };
230 |
231 |
232 | /** @} */ // end group UtilIterator
233 |
234 | } // CUB namespace
235 | CUB_NS_POSTFIX // Optional outer namespace(s)
236 |
--------------------------------------------------------------------------------
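Note: because ConstantInputIterator is dereferenceable on both host and device, its behavior can be checked with a tiny host program. The sketch below is illustrative only (compile with nvcc since CUB headers pull in CUDA definitions) and is not part of this repository.

    #include <cstdio>
    #include <cub/iterator/constant_input_iterator.cuh>

    int main()
    {
        cub::ConstantInputIterator<double> itr(5.0);
        printf("%f %f %f\n", itr[0], itr[1], itr[100]);   // 5.000000 5.000000 5.000000
        printf("%d\n", (int) ((itr + 7) - itr));          // 7: distance is tracked via the internal offset
        return 0;
    }
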
/cub/iterator/counting_input_iterator.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Random-access iterator types
32 | */
33 |
34 | #pragma once
35 |
36 | #include <iterator>
37 | #include <iostream>
38 |
39 | #include "../thread/thread_load.cuh"
40 | #include "../thread/thread_store.cuh"
41 | #include "../util_device.cuh"
42 | #include "../util_namespace.cuh"
43 |
44 | #if (THRUST_VERSION >= 100700)
45 | // This iterator is compatible with Thrust API 1.7 and newer
46 | #include <thrust/iterator/iterator_facade.h>
47 | #include <thrust/iterator/iterator_traits.h>
48 | #endif // THRUST_VERSION
49 |
50 |
51 | /// Optional outer namespace(s)
52 | CUB_NS_PREFIX
53 |
54 | /// CUB namespace
55 | namespace cub {
56 |
57 | /**
58 | * \addtogroup UtilIterator
59 | * @{
60 | */
61 |
62 | /**
63 | * \brief A random-access input generator for dereferencing a sequence of incrementing integer values.
64 | *
65 | * \par Overview
66 | * - After initializing a CountingInputIteratorT to a certain integer \p base, read references
67 | * at \p offset will return the value \p base + \p offset.
68 | * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
69 | * functions.
70 | * - Compatible with Thrust API v1.7 or newer.
71 | *
72 | * \par Snippet
73 | * The code snippet below illustrates the use of \p CountingInputIteratorT to
74 | * dereference a sequence of incrementing integers.
75 | * \par
76 | * \code
77 | * #include <cub/cub.cuh>   // or equivalently <cub/iterator/counting_input_iterator.cuh>
78 | *
79 | * cub::CountingInputIterator<int> itr(5);
80 | *
81 | * printf("%d\n", itr[0]); // 5
82 | * printf("%d\n", itr[1]); // 6
83 | * printf("%d\n", itr[2]); // 7
84 | * printf("%d\n", itr[50]); // 55
85 | *
86 | * \endcode
87 | *
88 | * \tparam ValueType The value type of this iterator
89 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t)
90 | */
91 | template <
92 | typename ValueType,
93 | typename OffsetT = ptrdiff_t>
94 | class CountingInputIterator
95 | {
96 | public:
97 |
98 | // Required iterator traits
99 | typedef CountingInputIterator self_type; ///< My own type
100 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
101 | typedef ValueType value_type; ///< The type of the element the iterator can point to
102 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to
103 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to
104 |
105 | #if (THRUST_VERSION >= 100700)
106 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
107 | typedef typename thrust::detail::iterator_facade_category<
108 | thrust::any_system_tag,
109 | thrust::random_access_traversal_tag,
110 | value_type,
111 | reference
112 | >::type iterator_category; ///< The iterator category
113 | #else
114 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
115 | #endif // THRUST_VERSION
116 |
117 | private:
118 |
119 | ValueType val;
120 |
121 | public:
122 |
123 | /// Constructor
124 | __host__ __device__ __forceinline__ CountingInputIterator(
125 | const ValueType &val) ///< Starting value for the iterator instance to report
126 | :
127 | val(val)
128 | {}
129 |
130 | /// Postfix increment
131 | __host__ __device__ __forceinline__ self_type operator++(int)
132 | {
133 | self_type retval = *this;
134 | val++;
135 | return retval;
136 | }
137 |
138 | /// Prefix increment
139 | __host__ __device__ __forceinline__ self_type operator++()
140 | {
141 | val++;
142 | return *this;
143 | }
144 |
145 | /// Indirection
146 | __host__ __device__ __forceinline__ reference operator*() const
147 | {
148 | return val;
149 | }
150 |
151 | /// Addition
152 | template <typename Distance>
153 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const
154 | {
155 | self_type retval(val + n);
156 | return retval;
157 | }
158 |
159 | /// Addition assignment
160 | template <typename Distance>
161 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
162 | {
163 | val += n;
164 | return *this;
165 | }
166 |
167 | /// Subtraction
168 | template <typename Distance>
169 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const
170 | {
171 | self_type retval(val - n);
172 | return retval;
173 | }
174 |
175 | /// Subtraction assignment
176 | template <typename Distance>
177 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
178 | {
179 | val -= n;
180 | return *this;
181 | }
182 |
183 | /// Distance
184 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
185 | {
186 | return val - other.val;
187 | }
188 |
189 | /// Array subscript
190 | template <typename Distance>
191 | __host__ __device__ __forceinline__ reference operator[](Distance n) const
192 | {
193 | return val + n;
194 | }
195 |
196 | /// Structure dereference
197 | __host__ __device__ __forceinline__ pointer operator->()
198 | {
199 | return &val;
200 | }
201 |
202 | /// Equal to
203 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
204 | {
205 | return (val == rhs.val);
206 | }
207 |
208 | /// Not equal to
209 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
210 | {
211 | return (val != rhs.val);
212 | }
213 |
214 | /// ostream operator
215 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
216 | {
217 | os << "[" << itr.val << "]";
218 | return os;
219 | }
220 |
221 | };
222 |
223 |
224 |
225 | /** @} */ // end group UtilIterator
226 |
227 | } // CUB namespace
228 | CUB_NS_POSTFIX // Optional outer namespace(s)
229 |
--------------------------------------------------------------------------------
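Note: a similarly small host-side sketch for CountingInputIterator (illustrative, not part of this repository); a read at offset n simply returns base + n.

    #include <cstdio>
    #include <cub/iterator/counting_input_iterator.cuh>

    int main()
    {
        cub::CountingInputIterator<int> itr(5);            // sequence 5, 6, 7, ...
        printf("%d %d %d\n", itr[0], itr[2], itr[50]);     // 5 7 55
        return 0;
    }
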
/cub/iterator/tex_obj_input_iterator.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Random-access iterator types
32 | */
33 |
34 | #pragma once
35 |
36 | #include <iterator>
37 | #include <iostream>
38 |
39 | #include "../thread/thread_load.cuh"
40 | #include "../thread/thread_store.cuh"
41 | #include "../util_device.cuh"
42 | #include "../util_debug.cuh"
43 | #include "../util_namespace.cuh"
44 |
45 | #if (THRUST_VERSION >= 100700)
46 | // This iterator is compatible with Thrust API 1.7 and newer
47 | #include <thrust/iterator/iterator_facade.h>
48 | #include <thrust/iterator/iterator_traits.h>
49 | #endif // THRUST_VERSION
50 |
51 |
52 | /// Optional outer namespace(s)
53 | CUB_NS_PREFIX
54 |
55 | /// CUB namespace
56 | namespace cub {
57 |
58 | /**
59 | * \addtogroup UtilIterator
60 | * @{
61 | */
62 |
63 |
64 |
65 | /**
66 | * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses newer Kepler-style texture objects.
67 | *
68 | * \par Overview
69 | * - TexObjInputIteratorT wraps a native device pointer of type ValueType*. References
70 | * to elements are to be loaded through texture cache.
71 | * - Can be used to load any data type from memory through texture cache.
72 | * - Can be manipulated and exchanged within and between host and device
73 | * functions, can only be constructed within host functions, and can only be
74 | * dereferenced within device functions.
75 | * - With regard to nested/dynamic parallelism, TexObjInputIteratorT iterators may only be
76 | * created by the host thread, but can be used by any descendant kernel.
77 | * - Compatible with Thrust API v1.7 or newer.
78 | *
79 | * \par Snippet
80 | * The code snippet below illustrates the use of \p TexObjInputIteratorT to
81 | * dereference a device array of doubles through texture cache.
82 | * \par
83 | * \code
84 | * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_obj_input_iterator.cuh>
85 | *
86 | * // Declare, allocate, and initialize a device array
87 | * int num_items; // e.g., 7
88 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
89 | *
90 | * // Create an iterator wrapper
91 | * cub::TexObjInputIterator<double> itr;
92 | * itr.BindTexture(d_in, sizeof(double) * num_items);
93 | * ...
94 | *
95 | * // Within device code:
96 | * printf("%f\n", itr[0]); // 8.0
97 | * printf("%f\n", itr[1]); // 6.0
98 | * printf("%f\n", itr[6]); // 9.0
99 | *
100 | * ...
101 | * itr.UnbindTexture();
102 | *
103 | * \endcode
104 | *
105 | * \tparam T The value type of this iterator
106 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t)
107 | */
108 | template <
109 | typename T,
110 | typename OffsetT = ptrdiff_t>
111 | class TexObjInputIterator
112 | {
113 | public:
114 |
115 | // Required iterator traits
116 | typedef TexObjInputIterator self_type; ///< My own type
117 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
118 | typedef T value_type; ///< The type of the element the iterator can point to
119 | typedef T* pointer; ///< The type of a pointer to an element the iterator can point to
120 | typedef T reference; ///< The type of a reference to an element the iterator can point to
121 |
122 | #if (THRUST_VERSION >= 100700)
123 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
124 | typedef typename thrust::detail::iterator_facade_category<
125 | thrust::device_system_tag,
126 | thrust::random_access_traversal_tag,
127 | value_type,
128 | reference
129 | >::type iterator_category; ///< The iterator category
130 | #else
131 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
132 | #endif // THRUST_VERSION
133 |
134 | private:
135 |
136 | // Largest texture word we can use in device
137 | typedef typename UnitWord<T>::TextureWord TextureWord;
138 |
139 | // Number of texture words per T
140 | enum {
141 | TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
142 | };
143 |
144 | private:
145 |
146 | T* ptr;
147 | difference_type tex_offset;
148 | cudaTextureObject_t tex_obj;
149 |
150 | public:
151 |
152 | /// Constructor
153 | __host__ __device__ __forceinline__ TexObjInputIterator()
154 | :
155 | ptr(NULL),
156 | tex_offset(0),
157 | tex_obj(0)
158 | {}
159 |
160 | /// Use this iterator to bind \p ptr with a texture reference
161 | template <typename QualifiedT>
162 | cudaError_t BindTexture(
163 | QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
164 | size_t bytes = size_t(-1), ///< Number of bytes in the range
165 | size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator
166 | {
167 | this->ptr = const_cast<typename RemoveQualifiers<QualifiedT>::Type *>(ptr);
168 | this->tex_offset = tex_offset;
169 |
170 | cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<TextureWord>();
171 | cudaResourceDesc res_desc;
172 | cudaTextureDesc tex_desc;
173 | memset(&res_desc, 0, sizeof(cudaResourceDesc));
174 | memset(&tex_desc, 0, sizeof(cudaTextureDesc));
175 | res_desc.resType = cudaResourceTypeLinear;
176 | res_desc.res.linear.devPtr = this->ptr;
177 | res_desc.res.linear.desc = channel_desc;
178 | res_desc.res.linear.sizeInBytes = bytes;
179 | tex_desc.readMode = cudaReadModeElementType;
180 | return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
181 | }
182 |
183 | /// Unbind this iterator from its texture reference
184 | cudaError_t UnbindTexture()
185 | {
186 | return cudaDestroyTextureObject(tex_obj);
187 | }
188 |
189 | /// Postfix increment
190 | __host__ __device__ __forceinline__ self_type operator++(int)
191 | {
192 | self_type retval = *this;
193 | tex_offset++;
194 | return retval;
195 | }
196 |
197 | /// Prefix increment
198 | __host__ __device__ __forceinline__ self_type operator++()
199 | {
200 | tex_offset++;
201 | return *this;
202 | }
203 |
204 | /// Indirection
205 | __host__ __device__ __forceinline__ reference operator*() const
206 | {
207 | #if (CUB_PTX_ARCH == 0)
208 | // Simply dereference the pointer on the host
209 | return ptr[tex_offset];
210 | #else
211 | // Move array of uninitialized words, then alias and assign to return value
212 | TextureWord words[TEXTURE_MULTIPLE];
213 |
214 | #pragma unroll
215 | for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
216 | {
217 | words[i] = tex1Dfetch<TextureWord>(
218 | tex_obj,
219 | (tex_offset * TEXTURE_MULTIPLE) + i);
220 | }
221 |
222 | // Load from words
223 | return *reinterpret_cast<T*>(words);
224 | #endif
225 | }
226 |
227 | /// Addition
228 | template <typename Distance>
229 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const
230 | {
231 | self_type retval;
232 | retval.ptr = ptr;
233 | retval.tex_obj = tex_obj;
234 | retval.tex_offset = tex_offset + n;
235 | return retval;
236 | }
237 |
238 | /// Addition assignment
239 | template <typename Distance>
240 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
241 | {
242 | tex_offset += n;
243 | return *this;
244 | }
245 |
246 | /// Subtraction
247 | template <typename Distance>
248 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const
249 | {
250 | self_type retval;
251 | retval.ptr = ptr;
252 | retval.tex_obj = tex_obj;
253 | retval.tex_offset = tex_offset - n;
254 | return retval;
255 | }
256 |
257 | /// Subtraction assignment
258 | template <typename Distance>
259 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
260 | {
261 | tex_offset -= n;
262 | return *this;
263 | }
264 |
265 | /// Distance
266 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
267 | {
268 | return tex_offset - other.tex_offset;
269 | }
270 |
271 | /// Array subscript
272 | template <typename Distance>
273 | __host__ __device__ __forceinline__ reference operator[](Distance n) const
274 | {
275 | self_type offset = (*this) + n;
276 | return *offset;
277 | }
278 |
279 | /// Structure dereference
280 | __host__ __device__ __forceinline__ pointer operator->()
281 | {
282 | return &(*(*this));
283 | }
284 |
285 | /// Equal to
286 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
287 | {
288 | return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj));
289 | }
290 |
291 | /// Not equal to
292 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
293 | {
294 | return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj));
295 | }
296 |
297 | /// ostream operator
298 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
299 | {
300 | return os;
301 | }
302 |
303 | };
304 |
305 |
306 |
307 | /** @} */ // end group UtilIterator
308 |
309 | } // CUB namespace
310 | CUB_NS_POSTFIX // Optional outer namespace(s)
311 |
--------------------------------------------------------------------------------
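Note: unlike the other iterators, TexObjInputIterator must be bound and unbound on the host while dereferencing happens on the device. The sketch below (illustrative; function names and launch shape are assumptions, not part of this repository) shows the full bind/launch/unbind sequence.

    #include <cub/iterator/tex_obj_input_iterator.cuh>

    __global__ void ReadThroughTex(cub::TexObjInputIterator<double> itr, double *d_out, int num_items)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < num_items)
            d_out[i] = itr[i];                             // device path fetches via tex1Dfetch
    }

    void Example(int num_items)
    {
        double *d_in, *d_out;
        cudaMalloc(&d_in,  num_items * sizeof(double));
        cudaMalloc(&d_out, num_items * sizeof(double));

        cub::TexObjInputIterator<double> itr;
        itr.BindTexture(d_in, num_items * sizeof(double)); // host-only: creates the texture object

        ReadThroughTex<<<(num_items + 255) / 256, 256>>>(itr, d_out, num_items);
        cudaDeviceSynchronize();

        itr.UnbindTexture();                               // host-only: destroys the texture object
        cudaFree(d_in);
        cudaFree(d_out);
    }
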
/cub/iterator/transform_input_iterator.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Random-access iterator types
32 | */
33 |
34 | #pragma once
35 |
36 | #include <iterator>
37 | #include <iostream>
38 |
39 | #include "../thread/thread_load.cuh"
40 | #include "../thread/thread_store.cuh"
41 | #include "../util_device.cuh"
42 | #include "../util_namespace.cuh"
43 |
44 | #if (THRUST_VERSION >= 100700)
45 | // This iterator is compatible with Thrust API 1.7 and newer
46 | #include <thrust/iterator/iterator_facade.h>
47 | #include <thrust/iterator/iterator_traits.h>
48 | #endif // THRUST_VERSION
49 |
50 |
51 | /// Optional outer namespace(s)
52 | CUB_NS_PREFIX
53 |
54 | /// CUB namespace
55 | namespace cub {
56 |
57 | /**
58 | * \addtogroup UtilIterator
59 | * @{
60 | */
61 |
62 |
63 | /**
64 | * \brief A random-access input wrapper for transforming dereferenced values.
65 | *
66 | * \par Overview
67 | * - TransformInputIteratorT wraps a unary conversion functor of type \p
68 | * ConversionOp and a random-access input iterator of type InputIteratorT,
69 | * using the former to produce references of type \p ValueType from the latter.
70 | * - Can be used with any data type.
71 | * - Can be constructed, manipulated, and exchanged within and between host and device
72 | * functions. Wrapped host memory can only be dereferenced on the host, and wrapped
73 | * device memory can only be dereferenced on the device.
74 | * - Compatible with Thrust API v1.7 or newer.
75 | *
76 | * \par Snippet
77 | * The code snippet below illustrates the use of \p TransformInputIteratorT to
78 | * dereference an array of integers, tripling the values and converting them to doubles.
79 | * \par
80 | * \code
81 | * #include <cub/cub.cuh>   // or equivalently <cub/iterator/transform_input_iterator.cuh>
82 | *
83 | * // Functor for tripling integer values and converting to doubles
84 | * struct TripleDoubler
85 | * {
86 | * __host__ __device__ __forceinline__
87 | * double operator()(const int &a) const {
88 | * return double(a * 3);
89 | * }
90 | * };
91 | *
92 | * // Declare, allocate, and initialize a device array
93 | * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9]
94 | * TripleDoubler conversion_op;
95 | *
96 | * // Create an iterator wrapper
97 | * cub::TransformInputIterator<double, TripleDoubler, int*> itr(d_in, conversion_op);
98 | *
99 | * // Within device code:
100 | * printf("%f\n", itr[0]); // 24.0
101 | * printf("%f\n", itr[1]); // 18.0
102 | * printf("%f\n", itr[6]); // 27.0
103 | *
104 | * \endcode
105 | *
106 | * \tparam ValueType The value type of this iterator
107 | * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p ValueType. Must have member ValueType operator()(const InputType &datum).
108 | * \tparam InputIteratorT The type of the wrapped input iterator
109 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t)
110 | *
111 | */
112 | template <
113 | typename ValueType,
114 | typename ConversionOp,
115 | typename InputIteratorT,
116 | typename OffsetT = ptrdiff_t>
117 | class TransformInputIterator
118 | {
119 | public:
120 |
121 | // Required iterator traits
122 | typedef TransformInputIterator self_type; ///< My own type
123 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another
124 | typedef ValueType value_type; ///< The type of the element the iterator can point to
125 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to
126 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to
127 |
128 | #if (THRUST_VERSION >= 100700)
129 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
130 | typedef typename thrust::detail::iterator_facade_category<
131 | thrust::any_system_tag,
132 | thrust::random_access_traversal_tag,
133 | value_type,
134 | reference
135 | >::type iterator_category; ///< The iterator category
136 | #else
137 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category
138 | #endif // THRUST_VERSION
139 |
140 | private:
141 |
142 | ConversionOp conversion_op;
143 | InputIteratorT input_itr;
144 |
145 | public:
146 |
147 | /// Constructor
148 | __host__ __device__ __forceinline__ TransformInputIterator(
149 | InputIteratorT input_itr, ///< Input iterator to wrap
150 | ConversionOp conversion_op) ///< Conversion functor to wrap
151 | :
152 | conversion_op(conversion_op),
153 | input_itr(input_itr)
154 | {}
155 |
156 | /// Postfix increment
157 | __host__ __device__ __forceinline__ self_type operator++(int)
158 | {
159 | self_type retval = *this;
160 | input_itr++;
161 | return retval;
162 | }
163 |
164 | /// Prefix increment
165 | __host__ __device__ __forceinline__ self_type operator++()
166 | {
167 | input_itr++;
168 | return *this;
169 | }
170 |
171 | /// Indirection
172 | __host__ __device__ __forceinline__ reference operator*() const
173 | {
174 | return conversion_op(*input_itr);
175 | }
176 |
177 | /// Addition
178 | template <typename Distance>
179 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const
180 | {
181 | self_type retval(input_itr + n, conversion_op);
182 | return retval;
183 | }
184 |
185 | /// Addition assignment
186 | template <typename Distance>
187 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
188 | {
189 | input_itr += n;
190 | return *this;
191 | }
192 |
193 | /// Subtraction
194 | template <typename Distance>
195 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const
196 | {
197 | self_type retval(input_itr - n, conversion_op);
198 | return retval;
199 | }
200 |
201 | /// Subtraction assignment
202 | template <typename Distance>
203 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
204 | {
205 | input_itr -= n;
206 | return *this;
207 | }
208 |
209 | /// Distance
210 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
211 | {
212 | return input_itr - other.input_itr;
213 | }
214 |
215 | /// Array subscript
216 | template <typename Distance>
217 | __host__ __device__ __forceinline__ reference operator[](Distance n) const
218 | {
219 | return conversion_op(input_itr[n]);
220 | }
221 |
222 | /// Structure dereference
223 | __host__ __device__ __forceinline__ pointer operator->()
224 | {
225 | return &conversion_op(*input_itr);
226 | }
227 |
228 | /// Equal to
229 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
230 | {
231 | return (input_itr == rhs.input_itr);
232 | }
233 |
234 | /// Not equal to
235 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
236 | {
237 | return (input_itr != rhs.input_itr);
238 | }
239 |
240 | /// ostream operator
241 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
242 | {
243 | return os;
244 | }
245 | };
246 |
247 |
248 |
249 | /** @} */ // end group UtilIterator
250 |
251 | } // CUB namespace
252 | CUB_NS_POSTFIX // Optional outer namespace(s)
253 |
--------------------------------------------------------------------------------
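Note: a minimal sketch of TransformInputIterator wrapping a raw device pointer with a user functor (illustrative; the functor and kernel names are assumptions, not part of this repository).

    #include <cstdio>
    #include <cub/iterator/transform_input_iterator.cuh>

    // Maps int -> double, tripling the value.
    struct TripleToDouble
    {
        __host__ __device__ __forceinline__
        double operator()(const int &a) const { return 3.0 * a; }
    };

    __global__ void PrintTripled(const int *d_in)
    {
        cub::TransformInputIterator<double, TripleToDouble, const int*> itr(d_in, TripleToDouble());
        printf("%f %f\n", itr[0], itr[1]);                 // conversion_op(d_in[0]), conversion_op(d_in[1])
    }
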
/cub/thread/thread_operators.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Simple binary operator functor types
32 | */
33 |
34 | /******************************************************************************
35 | * Simple functor operators
36 | ******************************************************************************/
37 |
38 | #pragma once
39 |
40 | #include "../util_macro.cuh"
41 | #include "../util_type.cuh"
42 | #include "../util_namespace.cuh"
43 |
44 | /// Optional outer namespace(s)
45 | CUB_NS_PREFIX
46 |
47 | /// CUB namespace
48 | namespace cub {
49 |
50 |
51 | /**
52 | * \addtogroup UtilModule
53 | * @{
54 | */
55 |
56 | /**
57 | * \brief Default equality functor
58 | */
59 | struct Equality
60 | {
61 | /// Boolean equality operator, returns (a == b)
62 | template <typename T>
63 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
64 | {
65 | return a == b;
66 | }
67 | };
68 |
69 |
70 | /**
71 | * \brief Default inequality functor
72 | */
73 | struct Inequality
74 | {
75 | /// Boolean inequality operator, returns (a != b)
76 | template <typename T>
77 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
78 | {
79 | return a != b;
80 | }
81 | };
82 |
83 |
84 | /**
85 | * \brief Inequality functor (wraps equality functor)
86 | */
87 | template <typename EqualityOp>
88 | struct InequalityWrapper
89 | {
90 | /// Wrapped equality operator
91 | EqualityOp op;
92 |
93 | /// Constructor
94 | __host__ __device__ __forceinline__
95 | InequalityWrapper(EqualityOp op) : op(op) {}
96 |
97 | /// Boolean inequality operator, returns (a != b)
98 | template <typename T>
99 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
100 | {
101 | return !op(a, b);
102 | }
103 | };
104 |
105 |
106 | /**
107 | * \brief Default sum functor
108 | */
109 | struct Sum
110 | {
111 | /// Boolean sum operator, returns a + b
112 | template <typename T>
113 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
114 | {
115 | return a + b;
116 | }
117 | };
118 |
119 |
120 | /**
121 | * \brief Default max functor
122 | */
123 | struct Max
124 | {
125 | /// Boolean max operator, returns (a > b) ? a : b
126 | template <typename T>
127 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
128 | {
129 | return CUB_MAX(a, b);
130 | }
131 | };
132 |
133 |
134 | /**
135 | * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item)
136 | */
137 | struct ArgMax
138 | {
139 | /// Boolean max operator, preferring the item having the smaller offset in case of ties
140 | template <typename T, typename OffsetT>
141 | __host__ __device__ __forceinline__ KeyValuePair<OffsetT, T> operator()(
142 | const KeyValuePair<OffsetT, T> &a,
143 | const KeyValuePair<OffsetT, T> &b) const
144 | {
145 | // Mooch BUG (device reduce argmax gk110 3.2 million random fp32)
146 | // return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a;
147 |
148 | if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key)))
149 | return b;
150 | return a;
151 | }
152 | };
153 |
154 |
155 | /**
156 | * \brief Default min functor
157 | */
158 | struct Min
159 | {
160 | /// Boolean min operator, returns (a < b) ? a : b
161 | template <typename T>
162 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
163 | {
164 | return CUB_MIN(a, b);
165 | }
166 | };
167 |
168 |
169 | /**
170 | * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
171 | */
172 | struct ArgMin
173 | {
174 | /// Boolean min operator, preferring the item having the smaller offset in case of ties
175 | template <typename T, typename OffsetT>
176 | __host__ __device__ __forceinline__ KeyValuePair<OffsetT, T> operator()(
177 | const KeyValuePair<OffsetT, T> &a,
178 | const KeyValuePair<OffsetT, T> &b) const
179 | {
180 | // Mooch BUG (device reduce argmax gk110 3.2 million random fp32)
181 | // return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a;
182 |
183 | if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key)))
184 | return b;
185 | return a;
186 | }
187 | };
188 |
189 |
190 | /**
191 | * \brief Default cast functor
192 | */
193 | template <typename B>
194 | struct Cast
195 | {
196 | /// Cast operator, returns (B) a
197 | template <typename A>
198 | __host__ __device__ __forceinline__ B operator()(const A &a) const
199 | {
200 | return (B) a;
201 | }
202 | };
203 |
204 |
205 | /**
206 | * \brief Binary operator wrapper for switching non-commutative scan arguments
207 | */
208 | template <typename ScanOp>
209 | class SwizzleScanOp
210 | {
211 | private:
212 |
213 | /// Wrapped scan operator
214 | ScanOp scan_op;
215 |
216 | public:
217 |
218 | /// Constructor
219 | __host__ __device__ __forceinline__
220 | SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {}
221 |
222 | /// Switch the scan arguments
223 | template <typename T>
224 | __host__ __device__ __forceinline__
225 | T operator()(const T &a, const T &b)
226 | {
227 | return scan_op(b, a);
228 | }
229 | };
230 |
231 |
232 | /**
233 | * \brief Reduce-by-segment functor.
234 | *
235 | * Given two cub::KeyValuePair inputs \p a and \p b and a
236 | * binary associative combining operator \p f(const T &x, const T &y),
237 | * an instance of this functor returns a cub::KeyValuePair whose \p key
238 | * field is a.key + b.key, and whose \p value field
239 | * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise.
240 | *
241 | * ReduceBySegmentOp is an associative, non-commutative binary combining operator
242 | * for input sequences of cub::KeyValuePair pairings. Such
243 | * sequences are typically used to represent a segmented set of values to be reduced
244 | * and a corresponding set of {0,1}-valued integer "head flags" demarcating the
245 | * first value of each segment.
246 | *
247 | */
248 | template <typename ReductionOpT>    ///< Binary reduction operator to apply to values
249 | struct ReduceBySegmentOp
250 | {
251 | /// Wrapped reduction operator
252 | ReductionOpT op;
253 |
254 | /// Constructor
255 | __host__ __device__ __forceinline__ ReduceBySegmentOp() {}
256 |
257 | /// Constructor
258 | __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {}
259 |
260 | /// Scan operator
261 | template <typename KeyValuePairT>   ///< KeyValuePair pairing of T (value) and OffsetT (head flag)
262 | __host__ __device__ __forceinline__ KeyValuePairT operator()(
263 | const KeyValuePairT &first, ///< First partial reduction
264 | const KeyValuePairT &second) ///< Second partial reduction
265 | {
266 | KeyValuePairT retval;
267 | retval.key = first.key + second.key;
268 | retval.value = (second.key) ?
269 | second.value : // The second partial reduction spans a segment reset, so its value aggregate becomes the running aggregate
270 | op(first.value, second.value); // The second partial reduction does not span a reset, so accumulate both into the running aggregate
271 | return retval;
272 | }
273 | };
274 |
275 |
276 |
277 | template <typename ReductionOpT> ///< Binary reduction operator to apply to values
278 | struct ReduceByKeyOp
279 | {
280 | /// Wrapped reduction operator
281 | ReductionOpT op;
282 |
283 | /// Constructor
284 | __host__ __device__ __forceinline__ ReduceByKeyOp() {}
285 |
286 | /// Constructor
287 | __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {}
288 |
289 | /// Scan operator
290 | template <typename KeyValuePairT>
291 | __host__ __device__ __forceinline__ KeyValuePairT operator()(
292 | const KeyValuePairT &first, ///< First partial reduction
293 | const KeyValuePairT &second) ///< Second partial reduction
294 | {
295 | KeyValuePairT retval = second;
296 |
297 | if (first.key == second.key)
298 | retval.value = op(first.value, retval.value);
299 |
300 | return retval;
301 | }
302 | };
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 | /** @} */ // end group UtilModule
311 |
312 |
313 | } // CUB namespace
314 | CUB_NS_POSTFIX // Optional outer namespace(s)
315 |
--------------------------------------------------------------------------------
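
A minimal host-side sketch of how ReduceBySegmentOp above combines two segmented partials (a sketch assuming compilation with nvcc from the repository root so that cub/cub.cuh resolves; the example itself is not part of the library):

    // Sketch: combining segmented partials with ReduceBySegmentOp<cub::Sum>.
    // key counts segment head flags seen so far; value is the running aggregate.
    #include <cstdio>
    #include "cub/cub.cuh"

    int main()
    {
        typedef cub::KeyValuePair<int, double> PartialT;

        cub::Sum sum_op;
        cub::ReduceBySegmentOp<cub::Sum> scan_op(sum_op);

        PartialT a; a.key = 0; a.value = 3.0;   // no segment reset yet
        PartialT b; b.key = 1; b.value = 5.0;   // b begins a new segment (head flag set)
        PartialT c; c.key = 0; c.value = 2.0;   // same segment as its predecessor

        PartialT ab = scan_op(a, b);            // ab.key == 1, ab.value == 5.0 (reset)
        PartialT ac = scan_op(a, c);            // ac.key == 0, ac.value == 5.0 (3.0 + 2.0)

        printf("ab = (%d, %.1f), ac = (%d, %.1f)\n", ab.key, ab.value, ac.key, ac.value);
        return 0;
    }
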
/cub/thread/thread_reduce.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Thread utilities for sequential reduction over statically-sized array types
32 | */
33 |
34 | #pragma once
35 |
36 | #include "../thread/thread_operators.cuh"
37 | #include "../util_namespace.cuh"
38 |
39 | /// Optional outer namespace(s)
40 | CUB_NS_PREFIX
41 |
42 | /// CUB namespace
43 | namespace cub {
44 |
45 | /**
46 | * \addtogroup UtilModule
47 | * @{
48 | */
49 |
50 | /**
51 | * \name Sequential reduction over statically-sized array types
52 | * @{
53 | */
54 |
55 |
56 | template <
57 | int LENGTH,
58 | typename T,
59 | typename ReductionOp>
60 | __device__ __forceinline__ T ThreadReduce(
61 | T* input, ///< [in] Input array
62 | ReductionOp reduction_op, ///< [in] Binary reduction operator
63 | T prefix, ///< [in] Prefix to seed reduction with
64 | Int2Type<LENGTH> length)
65 | {
66 | T addend = *input;
67 | prefix = reduction_op(prefix, addend);
68 |
69 | return ThreadReduce(input + 1, reduction_op, prefix, Int2Type<LENGTH - 1>());
70 | }
71 |
72 | template <
73 | typename T,
74 | typename ReductionOp>
75 | __device__ __forceinline__ T ThreadReduce(
76 | T* input, ///< [in] Input array
77 | ReductionOp reduction_op, ///< [in] Binary reduction operator
78 | T prefix, ///< [in] Prefix to seed reduction with
79 | Int2Type<0> length)
80 | {
81 | return prefix;
82 | }
83 |
84 |
85 | /**
86 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned.
87 | *
88 | * \tparam LENGTH Length of input array
89 | * \tparam T [inferred] The data type to be reduced.
90 | * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
91 | */
92 | template <
93 | int LENGTH,
94 | typename T,
95 | typename ReductionOp>
96 | __device__ __forceinline__ T ThreadReduce(
97 | T* input, ///< [in] Input array
98 | ReductionOp reduction_op, ///< [in] Binary reduction operator
99 | T prefix) ///< [in] Prefix to seed reduction with
100 | {
101 | return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
102 | }
103 |
104 |
105 | /**
106 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned.
107 | *
108 | * \tparam LENGTH Length of input array
109 | * \tparam T [inferred] The data type to be reduced.
110 | * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
111 | */
112 | template <
113 | int LENGTH,
114 | typename T,
115 | typename ReductionOp>
116 | __device__ __forceinline__ T ThreadReduce(
117 | T* input, ///< [in] Input array
118 | ReductionOp reduction_op) ///< [in] Binary reduction operator
119 | {
120 | T prefix = input[0];
121 | return ThreadReduce<LENGTH - 1>(input + 1, reduction_op, prefix);
122 | }
123 |
124 |
125 | /**
126 | * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned.
127 | *
128 | * \tparam LENGTH [inferred] Length of \p input array
129 | * \tparam T [inferred] The data type to be reduced.
130 | * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
131 | */
132 | template <
133 | int LENGTH,
134 | typename T,
135 | typename ReductionOp>
136 | __device__ __forceinline__ T ThreadReduce(
137 | T (&input)[LENGTH], ///< [in] Input array
138 | ReductionOp reduction_op, ///< [in] Binary reduction operator
139 | T prefix) ///< [in] Prefix to seed reduction with
140 | {
141 | return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
142 | }
143 |
144 |
145 | /**
146 | * \brief Serial reduction with the specified operator
147 | *
148 | * \tparam LENGTH [inferred] Length of \p input array
149 | * \tparam T [inferred] The data type to be reduced.
150 | * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b)
151 | */
152 | template <
153 | int LENGTH,
154 | typename T,
155 | typename ReductionOp>
156 | __device__ __forceinline__ T ThreadReduce(
157 | T (&input)[LENGTH], ///< [in] Input array
158 | ReductionOp reduction_op) ///< [in] Binary reduction operator
159 | {
160 | return ThreadReduce<LENGTH>((T*) input, reduction_op);
161 | }
162 |
163 |
164 | //@} end member group
165 |
166 | /** @} */ // end group UtilModule
167 |
168 | } // CUB namespace
169 | CUB_NS_POSTFIX // Optional outer namespace(s)
170 |
--------------------------------------------------------------------------------
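
A minimal device-side sketch of the ThreadReduce overloads above, reducing a statically-sized per-thread array (a sketch assuming nvcc and the repository's cub/cub.cuh; not part of the library):

    // Sketch: each thread reduces its own 4-element array with cub::ThreadReduce.
    #include <cstdio>
    #include "cub/cub.cuh"

    __global__ void ThreadReduceExample()
    {
        double items[4] = {1.0, 2.0, 3.0, 4.0};

        double sum    = cub::ThreadReduce(items, cub::Sum());        // 10.0
        double seeded = cub::ThreadReduce(items, cub::Sum(), 10.0);  // 20.0 (prefix-seeded)

        if (threadIdx.x == 0)
            printf("sum = %.1f, seeded = %.1f\n", sum, seeded);
    }

    int main()
    {
        ThreadReduceExample<<<1, 32>>>();
        cudaDeviceSynchronize();
        return 0;
    }
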
/cub/thread/thread_search.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Thread utilities for sequential search
32 | */
33 |
34 | #pragma once
35 | #include <iterator>
36 | #include "../util_namespace.cuh"
37 |
38 | /// Optional outer namespace(s)
39 | CUB_NS_PREFIX
40 |
41 | /// CUB namespace
42 | namespace cub {
43 |
44 |
45 | /**
46 | * Computes the begin offsets into A and B for the specific diagonal
47 | */
48 | template <
49 | typename AIteratorT,
50 | typename BIteratorT,
51 | typename OffsetT,
52 | typename CoordinateT>
53 | __host__ __device__ __forceinline__ void MergePathSearch(
54 | OffsetT diagonal,
55 | AIteratorT a,
56 | BIteratorT b,
57 | OffsetT a_len,
58 | OffsetT b_len,
59 | CoordinateT& path_coordinate)
60 | {
61 | /// The value type of the input iterator
62 | typedef typename std::iterator_traits<AIteratorT>::value_type T;
63 |
64 | OffsetT split_min = CUB_MAX(diagonal - b_len, 0);
65 | OffsetT split_max = CUB_MIN(diagonal, a_len);
66 |
67 | while (split_min < split_max)
68 | {
69 | OffsetT split_pivot = (split_min + split_max) >> 1;
70 | if (a[split_pivot] <= b[diagonal - split_pivot - 1])
71 | {
72 | // Move candidate split range up A, down B
73 | split_min = split_pivot + 1;
74 | }
75 | else
76 | {
77 | // Move candidate split range up B, down A
78 | split_max = split_pivot;
79 | }
80 | }
81 |
82 | path_coordinate.x = CUB_MIN(split_min, a_len);
83 | path_coordinate.y = diagonal - split_min;
84 | }
85 |
86 |
87 |
88 | /**
89 | * \brief Returns the offset of the first value within \p input which does not compare less than \p val
90 | */
91 | template <
92 | typename InputIteratorT,
93 | typename OffsetT,
94 | typename T>
95 | __device__ __forceinline__ OffsetT LowerBound(
96 | InputIteratorT input, ///< [in] Input sequence
97 | OffsetT num_items, ///< [in] Input sequence length
98 | T val) ///< [in] Search key
99 | {
100 | OffsetT retval = 0;
101 | while (num_items > 0)
102 | {
103 | OffsetT half = num_items >> 1;
104 | if (input[retval + half] < val)
105 | {
106 | retval = retval + (half + 1);
107 | num_items = num_items - (half + 1);
108 | }
109 | else
110 | {
111 | num_items = half;
112 | }
113 | }
114 |
115 | return retval;
116 | }
117 |
118 |
119 | /**
120 | * \brief Returns the offset of the first value within \p input which compares greater than \p val
121 | */
122 | template <
123 | typename InputIteratorT,
124 | typename OffsetT,
125 | typename T>
126 | __device__ __forceinline__ OffsetT UpperBound(
127 | InputIteratorT input, ///< [in] Input sequence
128 | OffsetT num_items, ///< [in] Input sequence length
129 | T val) ///< [in] Search key
130 | {
131 | OffsetT retval = 0;
132 | while (num_items > 0)
133 | {
134 | OffsetT half = num_items >> 1;
135 | if (val < input[retval + half])
136 | {
137 | num_items = half;
138 | }
139 | else
140 | {
141 | retval = retval + (half + 1);
142 | num_items = num_items - (half + 1);
143 | }
144 | }
145 |
146 | return retval;
147 | }
148 |
149 |
150 |
151 |
152 |
153 | } // CUB namespace
154 | CUB_NS_POSTFIX // Optional outer namespace(s)
155 |
--------------------------------------------------------------------------------
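
A minimal host-side sketch of MergePathSearch splitting the merge of CSR row end-offsets with the sequence of nonzero indices, in the spirit of the merge-based SpMV decomposition (a sketch assuming nvcc and the repository's cub/cub.cuh; int2 comes from the CUDA vector types; the matrix shape is illustrative only):

    // Sketch: find the (row, nonzero) coordinate where the first half of the merge ends.
    #include <cstdio>
    #include "cub/cub.cuh"

    int main()
    {
        // 4 rows with 2, 0, 3, and 1 nonzeros -> row end-offsets over 6 nonzeros
        int row_end_offsets[4] = {2, 2, 5, 6};
        cub::CountingInputIterator<int> nonzero_indices(0);

        int num_rows        = 4;
        int num_nonzeros    = 6;
        int num_merge_items = num_rows + num_nonzeros;

        int2 split;
        cub::MergePathSearch(num_merge_items / 2, row_end_offsets, nonzero_indices,
                             num_rows, num_nonzeros, split);

        // For this input: split.x == 2, split.y == 3
        printf("first half covers rows [0,%d) and nonzeros [0,%d)\n", split.x, split.y);
        return 0;
    }
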
/cub/util_arch.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Static architectural properties by SM version.
32 | */
33 |
34 | #pragma once
35 |
36 | #include "util_namespace.cuh"
37 |
38 | /// Optional outer namespace(s)
39 | CUB_NS_PREFIX
40 |
41 | /// CUB namespace
42 | namespace cub {
43 |
44 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
45 |
46 |
47 | /// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
48 | #ifndef CUB_PTX_ARCH
49 | #ifndef __CUDA_ARCH__
50 | #define CUB_PTX_ARCH 0
51 | #else
52 | #define CUB_PTX_ARCH __CUDA_ARCH__
53 | #endif
54 | #endif
55 |
56 |
57 | /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API.
58 | #ifndef CUB_RUNTIME_FUNCTION
59 | #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__))
60 | #define CUB_RUNTIME_ENABLED
61 | #define CUB_RUNTIME_FUNCTION __host__ __device__
62 | #else
63 | #define CUB_RUNTIME_FUNCTION __host__
64 | #endif
65 | #endif
66 |
67 |
68 | /// Number of threads per warp
69 | #ifndef CUB_LOG_WARP_THREADS
70 | #define CUB_LOG_WARP_THREADS(arch) \
71 | (5)
72 | #define CUB_WARP_THREADS(arch) \
73 | (1 << CUB_LOG_WARP_THREADS(arch))
74 |
75 | #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH)
76 | #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH)
77 | #endif
78 |
79 |
80 | /// Number of smem banks
81 | #ifndef CUB_LOG_SMEM_BANKS
82 | #define CUB_LOG_SMEM_BANKS(arch) \
83 | ((arch >= 200) ? \
84 | (5) : \
85 | (4))
86 | #define CUB_SMEM_BANKS(arch) \
87 | (1 << CUB_LOG_SMEM_BANKS(arch))
88 |
89 | #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH)
90 | #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH)
91 | #endif
92 |
93 |
94 | /// Oversubscription factor
95 | #ifndef CUB_SUBSCRIPTION_FACTOR
96 | #define CUB_SUBSCRIPTION_FACTOR(arch) \
97 | ((arch >= 300) ? \
98 | (5) : \
99 | ((arch >= 200) ? \
100 | (3) : \
101 | (10)))
102 | #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH)
103 | #endif
104 |
105 |
106 | /// Prefer padding overhead vs X-way conflicts greater than this threshold
107 | #ifndef CUB_PREFER_CONFLICT_OVER_PADDING
108 | #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \
109 | ((arch >= 300) ? \
110 | (1) : \
111 | (4))
112 | #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH)
113 | #endif
114 |
115 |
116 | /// Scale the number of warps to keep same amount of "tile" storage as the nominal configuration for 4B data. Minimum of two warps.
117 | #define CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \
118 | (CUB_MIN(NOMINAL_4B_BLOCK_THREADS, CUB_MAX(3, ((NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4) / sizeof(T)) * CUB_WARP_THREADS(PTX_ARCH)))
119 |
120 | /// If necessary, scale down number of items per thread to keep the same amount of "tile" storage as the nominal configuration for 4B data. Minimum 1 item per thread
121 | #define CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \
122 | (CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4 / sizeof(T)) / CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH))))
123 |
124 |
125 |
126 | #endif // Do not document
127 |
128 | } // CUB namespace
129 | CUB_NS_POSTFIX // Optional outer namespace(s)
130 |
--------------------------------------------------------------------------------
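
A minimal sketch of how the tile-scaling macros above rescale a nominal 4-byte configuration when the value type is 8-byte double (a sketch assuming nvcc and the repository's cub/cub.cuh; the nominal 128x7 configuration is illustrative only):

    // Sketch: tile-scaling arithmetic for double on an SM35-class target.
    #include <cstdio>
    #include "cub/cub.cuh"

    int main()
    {
        // Nominal configuration tuned for 4-byte items: 128 threads x 7 items
        const int BLOCK_THREADS    = CUB_BLOCK_THREADS(128, double, 350);
        const int ITEMS_PER_THREAD = CUB_ITEMS_PER_THREAD(7, 128, double, 350);

        // For double: min(128, max(3, (128/32)*4/8) * 32) == 96 threads,
        // and min(7, max(1, (7*128*4/8) / 96)) == 4 items per thread.
        printf("%d threads, %d items per thread\n", BLOCK_THREADS, ITEMS_PER_THREAD);
        return 0;
    }
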
/cub/util_debug.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Error and event logging routines.
32 | *
33 | * The following macros definitions are supported:
34 | * - \p CUB_LOG. Simple event messages are printed to \p stdout.
35 | */
36 |
37 | #pragma once
38 |
39 | #include <stdio.h>
40 | #include "util_namespace.cuh"
41 | #include "util_arch.cuh"
42 |
43 | /// Optional outer namespace(s)
44 | CUB_NS_PREFIX
45 |
46 | /// CUB namespace
47 | namespace cub {
48 |
49 |
50 | /**
51 | * \addtogroup UtilMgmt
52 | * @{
53 | */
54 |
55 |
56 | /// CUB error reporting macro (prints error messages to stderr)
57 | #if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR)
58 | #define CUB_STDERR
59 | #endif
60 |
61 |
62 |
63 | /**
64 | * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context.
65 | *
66 | * \return The CUDA error.
67 | */
68 | __host__ __device__ __forceinline__ cudaError_t Debug(
69 | cudaError_t error,
70 | const char* filename,
71 | int line)
72 | {
73 | #ifdef CUB_STDERR
74 | if (error)
75 | {
76 | #if (CUB_PTX_ARCH == 0)
77 | fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
78 | fflush(stderr);
79 | #elif (CUB_PTX_ARCH >= 200)
80 | printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line);
81 | #endif
82 | }
83 | #endif
84 | return error;
85 | }
86 |
87 |
88 | /**
89 | * \brief Debug macro
90 | */
91 | #ifndef CubDebug
92 | #define CubDebug(e) cub::Debug((e), __FILE__, __LINE__)
93 | #endif
94 |
95 |
96 | /**
97 | * \brief Debug macro with exit
98 | */
99 | #ifndef CubDebugExit
100 | #define CubDebugExit(e) if (cub::Debug((e), __FILE__, __LINE__)) { exit(1); }
101 | #endif
102 |
103 |
104 | /**
105 | * \brief Log macro for printf statements.
106 | */
107 | #if !defined(_CubLog)
108 | #if (CUB_PTX_ARCH == 0)
109 | #define _CubLog(format, ...) printf(format,__VA_ARGS__);
110 | #elif (CUB_PTX_ARCH >= 200)
111 | #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__);
112 | #endif
113 | #endif
114 |
115 |
116 |
117 |
118 | /** @} */ // end group UtilMgmt
119 |
120 | } // CUB namespace
121 | CUB_NS_POSTFIX // Optional outer namespace(s)
122 |
--------------------------------------------------------------------------------
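
A minimal host-side sketch of the CubDebug and CubDebugExit macros above wrapping CUDA runtime calls (a sketch assuming nvcc and the repository's cub/cub.cuh; define CUB_STDERR, or build with DEBUG, to see the printed error messages):

    // Sketch: error-checked allocation, initialization, and cleanup.
    #include <cstdio>
    #include <cstdlib>
    #include "cub/cub.cuh"

    int main()
    {
        double *d_vec = NULL;

        // Report (and return) the error code without aborting
        cudaError_t error = CubDebug(cudaMalloc(&d_vec, 1024 * sizeof(double)));
        if (error) return 1;

        // Abort the process if the call fails
        CubDebugExit(cudaMemset(d_vec, 0, 1024 * sizeof(double)));
        CubDebugExit(cudaFree(d_vec));
        return 0;
    }
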
/cub/util_device.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Properties of a given CUDA device and the corresponding PTX bundle
32 | */
33 |
34 | #pragma once
35 |
36 | #include "util_type.cuh"
37 | #include "util_arch.cuh"
38 | #include "util_debug.cuh"
39 | #include "util_namespace.cuh"
40 | #include "util_macro.cuh"
41 |
42 | /// Optional outer namespace(s)
43 | CUB_NS_PREFIX
44 |
45 | /// CUB namespace
46 | namespace cub {
47 |
48 |
49 | /**
50 | * \addtogroup UtilMgmt
51 | * @{
52 | */
53 |
54 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
55 |
56 |
57 | /**
58 | * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
59 | */
60 | template
61 | CUB_RUNTIME_FUNCTION __forceinline__
62 | cudaError_t AliasTemporaries(
63 | void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
64 | size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \p d_temp_storage allocation
65 | void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed
66 | size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed
67 | {
68 | const int ALIGN_BYTES = 256;
69 | const int ALIGN_MASK = ~(ALIGN_BYTES - 1);
70 |
71 | // Compute exclusive prefix sum over allocation requests
72 | size_t allocation_offsets[ALLOCATIONS];
73 | size_t bytes_needed = 0;
74 | for (int i = 0; i < ALLOCATIONS; ++i)
75 | {
76 | size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
77 | allocation_offsets[i] = bytes_needed;
78 | bytes_needed += allocation_bytes;
79 | }
80 | bytes_needed += ALIGN_BYTES - 1;
81 |
82 | // Check if the caller is simply requesting the size of the storage allocation
83 | if (!d_temp_storage)
84 | {
85 | temp_storage_bytes = bytes_needed;
86 | return cudaSuccess;
87 | }
88 |
89 | // Check if enough storage provided
90 | if (temp_storage_bytes < bytes_needed)
91 | {
92 | return CubDebug(cudaErrorInvalidValue);
93 | }
94 |
95 | // Alias
96 | d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
97 | for (int i = 0; i < ALLOCATIONS; ++i)
98 | {
99 | allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
100 | }
101 |
102 | return cudaSuccess;
103 | }
104 |
105 |
106 | /**
107 | * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
108 | */
109 | template <typename T>
110 | __global__ void EmptyKernel(void) { }
111 |
112 |
113 | #endif // DOXYGEN_SHOULD_SKIP_THIS
114 |
115 | /**
116 | * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
117 | */
118 | CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
119 | {
120 | struct Dummy
121 | {
122 | /// Type definition of the EmptyKernel kernel entry point
123 | typedef void (*EmptyKernelPtr)();
124 |
125 | /// Force EmptyKernel to be generated if this class is used
126 | CUB_RUNTIME_FUNCTION __forceinline__
127 | EmptyKernelPtr Empty()
128 | {
129 | return EmptyKernel<void>;
130 | }
131 | };
132 |
133 |
134 | #ifndef CUB_RUNTIME_ENABLED
135 |
136 | // CUDA API calls not supported from this device
137 | return cudaErrorInvalidConfiguration;
138 |
139 | #elif (CUB_PTX_ARCH > 0)
140 |
141 | ptx_version = CUB_PTX_ARCH;
142 | return cudaSuccess;
143 |
144 | #else
145 |
146 | cudaError_t error = cudaSuccess;
147 | do
148 | {
149 | cudaFuncAttributes empty_kernel_attrs;
150 | if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
151 | ptx_version = empty_kernel_attrs.ptxVersion * 10;
152 | }
153 | while (0);
154 |
155 | return error;
156 |
157 | #endif
158 | }
159 |
160 |
161 | /**
162 | * \brief Retrieves the SM version (major * 100 + minor * 10)
163 | */
164 | CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
165 | {
166 | #ifndef CUB_RUNTIME_ENABLED
167 |
168 | // CUDA API calls not supported from this device
169 | return cudaErrorInvalidConfiguration;
170 |
171 | #else
172 |
173 | cudaError_t error = cudaSuccess;
174 | do
175 | {
176 | // Fill in SM version
177 | int major, minor;
178 | if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
179 | if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
180 | sm_version = major * 100 + minor * 10;
181 | }
182 | while (0);
183 |
184 | return error;
185 |
186 | #endif
187 | }
188 |
189 |
190 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
191 |
192 | /**
193 | * Synchronize the stream if specified
194 | */
195 | CUB_RUNTIME_FUNCTION __forceinline__
196 | static cudaError_t SyncStream(cudaStream_t stream)
197 | {
198 | #if (CUB_PTX_ARCH == 0)
199 | return cudaStreamSynchronize(stream);
200 | #else
201 | // Device can't yet sync on a specific stream
202 | return cudaDeviceSynchronize();
203 | #endif
204 | }
205 |
206 |
207 | /**
208 | * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
209 | *
210 | * \par Snippet
211 | * The code snippet below illustrates the use of the MaxSmOccupancy function.
212 | * \par
213 | * \code
214 | * #include <cub/cub.cuh> // or equivalently <cub/util_device.cuh>
215 | *
216 | * template <typename T>
217 | * __global__ void ExampleKernel()
218 | * {
219 | * // Allocate shared memory for BlockScan
220 | * __shared__ volatile T buffer[4096];
221 | *
222 | * ...
223 | * }
224 | *
225 | * ...
226 | *
227 | * // Determine SM occupancy for ExampleKernel specialized for unsigned char
228 | * int max_sm_occupancy;
229 | * MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
230 | *
231 | * // max_sm_occupancy <-- 4 on SM10
232 | * // max_sm_occupancy <-- 8 on SM20
233 | * // max_sm_occupancy <-- 12 on SM35
234 | *
235 | * \endcode
236 | *
237 | */
238 | template <typename KernelPtr>
239 | CUB_RUNTIME_FUNCTION __forceinline__
240 | cudaError_t MaxSmOccupancy(
241 | int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM
242 | KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy
243 | int block_threads, ///< [in] Number of threads per thread block
244 | int dynamic_smem_bytes = 0)
245 | {
246 | #ifndef CUB_RUNTIME_ENABLED
247 |
248 | // CUDA API calls not supported from this device
249 | return CubDebug(cudaErrorInvalidConfiguration);
250 |
251 | #else
252 |
253 | return cudaOccupancyMaxActiveBlocksPerMultiprocessor (
254 | &max_sm_occupancy,
255 | kernel_ptr,
256 | block_threads,
257 | dynamic_smem_bytes);
258 |
259 | #endif // CUB_RUNTIME_ENABLED
260 | }
261 |
262 |
263 | /******************************************************************************
264 | * Policy management
265 | ******************************************************************************/
266 |
267 | /**
268 | * Kernel dispatch configuration
269 | */
270 | struct KernelConfig
271 | {
272 | int block_threads;
273 | int items_per_thread;
274 | int tile_size;
275 | int sm_occupancy;
276 |
277 | CUB_RUNTIME_FUNCTION __forceinline__
278 | KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}
279 |
280 | template <typename AgentPolicyT, typename KernelPtrT>
281 | CUB_RUNTIME_FUNCTION __forceinline__
282 | cudaError_t Init(KernelPtrT kernel_ptr)
283 | {
284 | block_threads = AgentPolicyT::BLOCK_THREADS;
285 | items_per_thread = AgentPolicyT::ITEMS_PER_THREAD;
286 | tile_size = block_threads * items_per_thread;
287 | cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
288 | return retval;
289 | }
290 | };
291 |
292 |
293 |
294 | /// Helper for dispatching into a policy chain
295 | template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
296 | struct ChainedPolicy
297 | {
298 | /// The policy for the active compiler pass
299 | typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;
300 |
301 | /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
302 | template <typename FunctorT>
303 | CUB_RUNTIME_FUNCTION __forceinline__
304 | static cudaError_t Invoke(int ptx_version, FunctorT &op)
305 | {
306 | if (ptx_version < PTX_VERSION) {
307 | return PrevPolicyT::Invoke(ptx_version, op);
308 | }
309 | return op.template Invoke<PolicyT>();
310 | }
311 | };
312 |
313 | /// Helper for dispatching into a policy chain (end-of-chain specialization)
314 | template <int PTX_VERSION, typename PolicyT>
315 | struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
316 | {
317 | /// The policy for the active compiler pass
318 | typedef PolicyT ActivePolicy;
319 |
320 | /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
321 | template <typename FunctorT>
322 | CUB_RUNTIME_FUNCTION __forceinline__
323 | static cudaError_t Invoke(int ptx_version, FunctorT &op) {
324 | return op.template Invoke<PolicyT>();
325 | }
326 | };
327 |
328 |
329 |
330 |
331 | #endif // Do not document
332 |
333 |
334 |
335 |
336 | /** @} */ // end group UtilMgmt
337 |
338 | } // CUB namespace
339 | CUB_NS_POSTFIX // Optional outer namespace(s)
340 |
--------------------------------------------------------------------------------
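
A minimal host-side sketch of the two-phase temporary-storage pattern served by AliasTemporaries above: a first call with a NULL allocation reports the required size, and a second call carves one device allocation into the aliased buffers (a sketch assuming nvcc and the repository's cub/cub.cuh; the buffer sizes are illustrative only):

    // Sketch: query size, allocate once, then alias two sub-allocations.
    #include <cstdio>
    #include "cub/cub.cuh"

    int main()
    {
        void*  allocations[2]      = {NULL, NULL};
        size_t allocation_sizes[2] = {1000 * sizeof(int), 500 * sizeof(double)};

        // Phase 1: query how many bytes are needed (nothing is written)
        void*  d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;
        cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);

        // Phase 2: allocate once, then carve out the aliased sub-allocations
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);

        printf("%zu bytes total; buffers at %p and %p\n",
               temp_storage_bytes, allocations[0], allocations[1]);

        cudaFree(d_temp_storage);
        return 0;
    }
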
/cub/util_macro.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /******************************************************************************
30 | * Common C/C++ macro utilities
31 | ******************************************************************************/
32 |
33 | #pragma once
34 |
35 | #include "util_namespace.cuh"
36 |
37 | /// Optional outer namespace(s)
38 | CUB_NS_PREFIX
39 |
40 | /// CUB namespace
41 | namespace cub {
42 |
43 |
44 | /**
45 | * \addtogroup UtilModule
46 | * @{
47 | */
48 |
49 | #ifndef CUB_ALIGN
50 | #if defined(_WIN32) || defined(_WIN64)
51 | /// Align struct
52 | #define CUB_ALIGN(bytes) __declspec(align(32))
53 | #else
54 | /// Align struct
55 | #define CUB_ALIGN(bytes) __attribute__((aligned(bytes)))
56 | #endif
57 | #endif
58 |
59 | #ifndef CUB_MAX
60 | /// Select maximum(a, b)
61 | #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a))
62 | #endif
63 |
64 | #ifndef CUB_MIN
65 | /// Select minimum(a, b)
66 | #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a))
67 | #endif
68 |
69 | #ifndef CUB_QUOTIENT_FLOOR
70 | /// Quotient of x/y rounded down to nearest integer
71 | #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y))
72 | #endif
73 |
74 | #ifndef CUB_QUOTIENT_CEILING
75 | /// Quotient of x/y rounded up to nearest integer
76 | #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
77 | #endif
78 |
79 | #ifndef CUB_ROUND_UP_NEAREST
80 | /// x rounded up to the nearest multiple of y
81 | #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y)
82 | #endif
83 |
84 | #ifndef CUB_ROUND_DOWN_NEAREST
85 | /// x rounded down to the nearest multiple of y
86 | #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y)
87 | #endif
88 |
89 |
90 | #ifndef CUB_STATIC_ASSERT
91 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
92 | #define CUB_CAT_(a, b) a ## b
93 | #define CUB_CAT(a, b) CUB_CAT_(a, b)
94 | #endif // DOXYGEN_SHOULD_SKIP_THIS
95 |
96 | /// Static assert
97 | #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1]
98 | #endif
99 |
100 | /** @} */ // end group UtilModule
101 |
102 | } // CUB namespace
103 | CUB_NS_POSTFIX // Optional outer namespace(s)
104 |
--------------------------------------------------------------------------------
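
A minimal sketch of the quotient and rounding macros above used to size a tiled pass (a sketch assuming nvcc and the repository's cub/cub.cuh; the 128x7 tile is illustrative only):

    // Sketch: grid sizing with CUB_QUOTIENT_CEILING and CUB_ROUND_UP_NEAREST.
    #include <cstdio>
    #include "cub/cub.cuh"

    int main()
    {
        int num_items = 1000;
        int tile_size = 128 * 7;                                        // threads x items per thread

        int num_tiles   = CUB_QUOTIENT_CEILING(num_items, tile_size);   // 2
        int padded_size = CUB_ROUND_UP_NEAREST(num_items, tile_size);   // 1792

        printf("%d tiles, padded to %d items\n", num_tiles, padded_size);
        return 0;
    }
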
/cub/util_namespace.cuh:
--------------------------------------------------------------------------------
1 | /******************************************************************************
2 | * Copyright (c) 2011, Duane Merrill. All rights reserved.
3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved.
4 | *
5 | * Redistribution and use in source and binary forms, with or without
6 | * modification, are permitted provided that the following conditions are met:
7 | * * Redistributions of source code must retain the above copyright
8 | * notice, this list of conditions and the following disclaimer.
9 | * * Redistributions in binary form must reproduce the above copyright
10 | * notice, this list of conditions and the following disclaimer in the
11 | * documentation and/or other materials provided with the distribution.
12 | * * Neither the name of the NVIDIA CORPORATION nor the
13 | * names of its contributors may be used to endorse or promote products
14 | * derived from this software without specific prior written permission.
15 | *
16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | *
27 | ******************************************************************************/
28 |
29 | /**
30 | * \file
31 | * Place-holder for prefixing the cub namespace
32 | */
33 |
34 | #pragma once
35 |
36 | // For example:
37 | //#define CUB_NS_PREFIX namespace thrust{ namespace detail {
38 | //#define CUB_NS_POSTFIX } }
39 |
40 | #define CUB_NS_PREFIX
41 | #define CUB_NS_POSTFIX
42 |
--------------------------------------------------------------------------------
/eval_csrmv.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if (( $# != 2 )); then
4 | echo "$0 <mtx dir> <spmv program>"
5 | exit 0
6 | fi
7 |
8 | echo "file, num_rows, num_cols, num_nonzeros, row_length_mean, row_length_std_dev, row_length_variation, row_length_skewness, method_name, setup_ms, avg_spmv_ms, gflops, effective_GBs"
9 |
10 | MTX_DIR=$1
11 |
12 | shift
13 |
14 | for i in `find $MTX_DIR -name '*.mtx'`
15 | do
16 | ./$@ --quiet --mtx=$i
17 | done
18 |
--------------------------------------------------------------------------------
/get_uf_datasets.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ "$#" -eq 0 ]; then
4 | MTX_DIR=mtx
5 | else
6 | MTX_DIR=$1
7 | fi
8 |
9 | # Make temporary directory for download/unpack
10 | mkdir -p tgz
11 | cd tgz
12 |
13 | # Download
14 | for i in `cat ../ufl_urls.txt`; do echo $i; wget $i; done
15 |
16 | # Unpack
17 | for i in `cat ../ufl_matrices.txt`; do gunzip $i.tar.gz; tar -xvf $i.tar; rm $i.tar; done
18 |
19 | # Relocate
20 | mkdir -p ../$MTX_DIR
21 | for i in `find . -name '*.mtx'`; do echo $i; mv $i ../$MTX_DIR/; done
22 |
23 | # Cleanup
24 | cd ..
25 | rm -rf tgz
26 |
--------------------------------------------------------------------------------
/gpu_spmv:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ./_gpu_spmv_driver "$@"
4 |
--------------------------------------------------------------------------------
/merge-based-spmv-sc16-preprint.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dumerrill/merge-spmv/18895571ce9af960ee207dae541b0ffc701ea4bb/merge-based-spmv-sc16-preprint.pdf
--------------------------------------------------------------------------------
/merge_decomposition.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dumerrill/merge-spmv/18895571ce9af960ee207dae541b0ffc701ea4bb/merge_decomposition.png
--------------------------------------------------------------------------------
/merge_spmv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dumerrill/merge-spmv/18895571ce9af960ee207dae541b0ffc701ea4bb/merge_spmv.png
--------------------------------------------------------------------------------