├── .cproject ├── .gitignore ├── .project ├── .settings ├── .gitignore └── language.settings.xml ├── LICENSE.TXT ├── Makefile ├── README.md ├── cpu_spmv ├── cpu_spmv.cpp ├── cub ├── agent │ ├── agent_histogram.cuh │ ├── agent_radix_sort_downsweep.cuh │ ├── agent_radix_sort_upsweep.cuh │ ├── agent_reduce.cuh │ ├── agent_reduce_by_key.cuh │ ├── agent_rle.cuh │ ├── agent_scan.cuh │ ├── agent_segment_fixup.cuh │ ├── agent_select_if.cuh │ ├── agent_spmv_csrt.cuh │ ├── agent_spmv_orig.cuh │ ├── agent_spmv_row_based.cuh │ └── single_pass_scan_operators.cuh ├── block │ ├── block_discontinuity.cuh │ ├── block_exchange.cuh │ ├── block_histogram.cuh │ ├── block_load.cuh │ ├── block_radix_rank.cuh │ ├── block_radix_sort.cuh │ ├── block_raking_layout.cuh │ ├── block_reduce.cuh │ ├── block_reduce_by_key.cuh │ ├── block_scan.cuh │ ├── block_shuffle.cuh │ ├── block_store.cuh │ └── specializations │ │ ├── block_histogram_atomic.cuh │ │ ├── block_histogram_sort.cuh │ │ ├── block_reduce_raking.cuh │ │ ├── block_reduce_raking_commutative_only.cuh │ │ ├── block_reduce_warp_reductions.cuh │ │ ├── block_scan_raking.cuh │ │ └── block_scan_warp_scans.cuh ├── cub.cuh ├── device │ ├── device_histogram.cuh │ ├── device_partition.cuh │ ├── device_radix_sort.cuh │ ├── device_reduce.cuh │ ├── device_run_length_encode.cuh │ ├── device_scan.cuh │ ├── device_segmented_radix_sort.cuh │ ├── device_segmented_reduce.cuh │ ├── device_select.cuh │ ├── device_spmv.cuh │ └── dispatch │ │ ├── dispatch_histogram.cuh │ │ ├── dispatch_radix_sort.cuh │ │ ├── dispatch_reduce.cuh │ │ ├── dispatch_reduce_by_key.cuh │ │ ├── dispatch_rle.cuh │ │ ├── dispatch_scan.cuh │ │ ├── dispatch_select_if.cuh │ │ ├── dispatch_spmv_csrt.cuh │ │ ├── dispatch_spmv_orig.cuh │ │ └── dispatch_spmv_row_based.cuh ├── grid │ ├── grid_barrier.cuh │ ├── grid_even_share.cuh │ ├── grid_mapping.cuh │ └── grid_queue.cuh ├── host │ └── mutex.cuh ├── iterator │ ├── arg_index_input_iterator.cuh │ ├── cache_modified_input_iterator.cuh │ ├── cache_modified_output_iterator.cuh │ ├── constant_input_iterator.cuh │ ├── counting_input_iterator.cuh │ ├── tex_obj_input_iterator.cuh │ ├── tex_ref_input_iterator.cuh │ └── transform_input_iterator.cuh ├── thread │ ├── thread_load.cuh │ ├── thread_operators.cuh │ ├── thread_reduce.cuh │ ├── thread_scan.cuh │ ├── thread_search.cuh │ └── thread_store.cuh ├── util_allocator.cuh ├── util_arch.cuh ├── util_debug.cuh ├── util_device.cuh ├── util_macro.cuh ├── util_namespace.cuh ├── util_ptx.cuh ├── util_type.cuh └── warp │ ├── specializations │ ├── warp_reduce_shfl.cuh │ ├── warp_reduce_smem.cuh │ ├── warp_scan_shfl.cuh │ └── warp_scan_smem.cuh │ ├── warp_reduce.cuh │ └── warp_scan.cuh ├── eval_csrmv.sh ├── get_uf_datasets.sh ├── gpu_spmv ├── gpu_spmv.cu ├── merge-based-spmv-sc16-preprint.pdf ├── merge_decomposition.png ├── merge_spmv.png ├── sparse_matrix.h ├── ufl_matrices.txt ├── ufl_urls.txt └── utils.h /.cproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | _cpu_spmv_driver 2 | _gpu_spmv_driver 3 | mtx 4 | -------------------------------------------------------------------------------- /.project: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | merge-spmv 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.cdt.autotools.core.genmakebuilderV2 10 | 11 | 12 | 13 | 14 | org.eclipse.cdt.managedbuilder.core.genmakebuilder 15 | clean,full,incremental, 16 | 17 | 18 | 19 | 20 | org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder 21 | full,incremental, 22 | 23 | 24 | 25 | 26 | 27 | org.eclipse.cdt.core.cnature 28 | org.eclipse.cdt.core.ccnature 29 | org.eclipse.cdt.managedbuilder.core.managedBuildNature 30 | org.eclipse.cdt.managedbuilder.core.ScannerConfigNature 31 | org.eclipse.cdt.autotools.core.autotoolsNatureV2 32 | 33 | 34 | -------------------------------------------------------------------------------- /.settings/.gitignore: -------------------------------------------------------------------------------- 1 | /language.settings.xml 2 | -------------------------------------------------------------------------------- /.settings/language.settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /LICENSE.TXT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | * Redistributions of source code must retain the above copyright 6 | notice, this list of conditions and the following disclaimer. 7 | * Redistributions in binary form must reproduce the above copyright 8 | notice, this list of conditions and the following disclaimer in the 9 | documentation and/or other materials provided with the distribution. 10 | * Neither the name of the NVIDIA CORPORATION nor the 11 | names of its contributors may be used to endorse or promote products 12 | derived from this software without specific prior written permission. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 18 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #/****************************************************************************** 2 | # * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | # * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 
4 | # * 5 | # * Redistribution and use in source and binary forms, with or without 6 | # * modification, are permitted provided that the following conditions are met: 7 | # * * Redistributions of source code must retain the above copyright 8 | # * notice, this list of conditions and the following disclaimer. 9 | # * * Redistributions in binary form must reproduce the above copyright 10 | # * notice, this list of conditions and the following disclaimer in the 11 | # * documentation and/or other materials provided with the distribution. 12 | # * * Neither the name of the NVIDIA CORPORATION nor the 13 | # * names of its contributors may be used to endorse or promote products 14 | # * derived from this software without specific prior written permission. 15 | # * 16 | # * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | # * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | # * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | # * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | # * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | # * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | # * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | # * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | # * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | # * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | # * 27 | #******************************************************************************/ 28 | 29 | #------------------------------------------------------------------------------- 30 | # 31 | # Makefile usage 32 | # 33 | # CPU: 34 | # make cpu_spmv 35 | # 36 | # GPU: 37 | # make gpu_spmv [sm=] [verbose=<0|1>] 38 | # 39 | #------------------------------------------------------------------------------- 40 | 41 | #------------------------------------------------------------------------------- 42 | # Commandline Options 43 | #------------------------------------------------------------------------------- 44 | 45 | 46 | # [sm=] Compute-capability to compile for, e.g., "sm=200,300,350" (SM20 by default). 
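# Illustrative invocations (a usage sketch; assumes nvcc is on PATH and, for the
# CPU build, icpc/MKL are installed):
#
#   make gpu_spmv sm=350                 # builds _gpu_spmv_driver for SM 3.5
#   make gpu_spmv sm=350,520 verbose=1   # two architectures, verbose nvcc output
#   make cpu_spmv                        # builds _cpu_spmv_driver with icpc + MKL
#
# When sm= is omitted, SM_ARCH below defaults to 350.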
47 | 48 | COMMA = , 49 | ifdef sm 50 | SM_ARCH = $(subst $(COMMA),-,$(sm)) 51 | else 52 | SM_ARCH = 350 53 | endif 54 | 55 | ifeq (520, $(findstring 520, $(SM_ARCH))) 56 | SM_TARGETS += -gencode=arch=compute_52,code=\"sm_52,compute_52\" 57 | endif 58 | ifeq (370, $(findstring 370, $(SM_ARCH))) 59 | SM_TARGETS += -gencode=arch=compute_37,code=\"sm_37,compute_37\" 60 | endif 61 | ifeq (350, $(findstring 350, $(SM_ARCH))) 62 | SM_TARGETS += -gencode=arch=compute_35,code=\"sm_35,compute_35\" 63 | endif 64 | ifeq (300, $(findstring 300, $(SM_ARCH))) 65 | SM_TARGETS += -gencode=arch=compute_30,code=\"sm_30,compute_30\" 66 | endif 67 | 68 | 69 | # [verbose=<0|1>] Verbose toolchain output from nvcc option 70 | 71 | ifeq ($(verbose), 1) 72 | NVCCFLAGS += -v 73 | endif 74 | 75 | 76 | 77 | #------------------------------------------------------------------------------- 78 | # Compiler and compilation platform 79 | #------------------------------------------------------------------------------- 80 | 81 | CUB_DIR = $(dir $(lastword $(MAKEFILE_LIST))) 82 | 83 | NVCC = "$(shell which nvcc)" 84 | ifdef nvccver 85 | NVCC_VERSION = $(nvccver) 86 | else 87 | NVCC_VERSION = $(strip $(shell nvcc --version | grep release | sed 's/.*release //' | sed 's/,.*//')) 88 | endif 89 | 90 | # detect OS 91 | OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) 92 | 93 | # Default flags: verbose kernel properties (regs, smem, cmem, etc.); runtimes for compilation phases 94 | NVCCFLAGS += $(SM_DEF) -Xptxas -v -Xcudafe -\# 95 | 96 | ifeq (WIN_NT, $(findstring WIN_NT, $(OSUPPER))) 97 | # For MSVC 98 | # Disable excess x86 floating point precision that can lead to results being labeled incorrectly 99 | NVCCFLAGS += -Xcompiler /fp:strict 100 | # Help the compiler/linker work with huge numbers of kernels on Windows 101 | NVCCFLAGS += -Xcompiler /bigobj -Xcompiler /Zm500 102 | CC = cl 103 | NPPI = -lnppi 104 | 105 | # Multithreaded runtime 106 | NVCCFLAGS += -Xcompiler /MT 107 | 108 | ifneq ($(force32), 1) 109 | CUDART_CYG = "$(shell dirname $(NVCC))/../lib/Win32/cudart.lib" 110 | else 111 | CUDART_CYG = "$(shell dirname $(NVCC))/../lib/x64/cudart.lib" 112 | endif 113 | CUDART = "$(shell cygpath -w $(CUDART_CYG))" 114 | else 115 | # For g++ 116 | # Disable excess x86 floating point precision that can lead to results being labeled incorrectly 117 | NVCCFLAGS += -Xcompiler -ffloat-store 118 | CC = g++ 119 | ifneq ($(force32), 1) 120 | CUDART = "$(shell dirname $(NVCC))/../lib/libcudart_static.a" 121 | else 122 | CUDART = "$(shell dirname $(NVCC))/../lib64/libcudart_static.a" 123 | endif 124 | endif 125 | 126 | 127 | #------------------------------------------------------------------------------- 128 | # Compiler and compilation platform 129 | #------------------------------------------------------------------------------- 130 | 131 | # OMP compiler 132 | OMPCC=icpc 133 | OMPCC_FLAGS=-openmp -O3 -lrt -fno-alias -xHost -lnuma -O3 -mkl 134 | 135 | # Includes 136 | INC += -I$(CUB_DIR) -I$(CUB_DIR)test 137 | 138 | # detect OS 139 | OSUPPER = $(shell uname -s 2>/dev/null | tr [:lower:] [:upper:]) 140 | 141 | #------------------------------------------------------------------------------- 142 | # Dependency Lists 143 | #------------------------------------------------------------------------------- 144 | 145 | rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d)) 146 | 147 | DEPS = $(call rwildcard, $(CUB_DIR),*.cuh) \ 148 | $(call rwildcard, $(CUB_DIR),*.h) \ 149 | Makefile 150 | 151 | 
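# Illustrative note (a sketch of the expansion above): rwildcard recursively globs
# a directory, so with CUB_DIR equal to this Makefile's directory,
#   $(call rwildcard, $(CUB_DIR),*.cuh)
# picks up every CUB header (cub/cub.cuh, cub/block/block_scan.cuh, ...). DEPS thus
# forces gpu_spmv and cpu_spmv to rebuild whenever a CUB header, a top-level .h
# file such as sparse_matrix.h or utils.h, or this Makefile changes.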
#------------------------------------------------------------------------------- 152 | # make clean 153 | #------------------------------------------------------------------------------- 154 | 155 | clean : 156 | rm -f _gpu_spmv_driver _cpu_spmv_driver 157 | 158 | 159 | #------------------------------------------------------------------------------- 160 | # make gpu_spmv 161 | #------------------------------------------------------------------------------- 162 | 163 | gpu_spmv : gpu_spmv.cu $(DEPS) 164 | $(NVCC) $(DEFINES) $(SM_TARGETS) -o _gpu_spmv_driver gpu_spmv.cu $(NVCCFLAGS) $(CPU_ARCH) $(INC) $(LIBS) -lcusparse -O3 165 | 166 | 167 | #------------------------------------------------------------------------------- 168 | # make cpu_spmv 169 | #------------------------------------------------------------------------------- 170 | 171 | cpu_spmv : cpu_spmv.cpp $(DEPS) 172 | $(OMPCC) $(DEFINES) -DCUB_MKL -o _cpu_spmv_driver cpu_spmv.cpp $(OMPCC_FLAGS) 173 | 174 | -------------------------------------------------------------------------------- /cpu_spmv: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | KMP_AFFINITY=granularity=core,scatter 4 | ./_cpu_spmv_driver $@ 5 | -------------------------------------------------------------------------------- /cub/block/block_raking_layout.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. 
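 *
 * (Worked example of the constants defined below, assuming 32-thread warps:
 * for BLOCK_THREADS = 128, MAX_RAKING_THREADS = 32, SEGMENT_LENGTH = 4, and
 * RAKING_THREADS = 32, so each raking thread serially reduces four consecutive
 * shared partials, with SEGMENT_PADDING optionally added to avoid bank conflicts.)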
32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #include "../util_macro.cuh" 38 | #include "../util_arch.cuh" 39 | #include "../util_type.cuh" 40 | #include "../util_namespace.cuh" 41 | 42 | /// Optional outer namespace(s) 43 | CUB_NS_PREFIX 44 | 45 | /// CUB namespace 46 | namespace cub { 47 | 48 | /** 49 | * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) 50 | * \ingroup BlockModule 51 | * 52 | * \par Overview 53 | * This type facilitates a shared memory usage pattern where a block of CUDA 54 | * threads places elements into shared memory and then reduces the active 55 | * parallelism to one "raking" warp of threads for serially aggregating consecutive 56 | * sequences of shared items. Padding is inserted to eliminate bank conflicts 57 | * (for most data types). 58 | * 59 | * \tparam T The data type to be exchanged. 60 | * \tparam BLOCK_THREADS The thread block size in threads. 61 | * \tparam PTX_ARCH [optional] \ptxversion 62 | */ 63 | template < 64 | typename T, 65 | int BLOCK_THREADS, 66 | int PTX_ARCH = CUB_PTX_ARCH> 67 | struct BlockRakingLayout 68 | { 69 | //--------------------------------------------------------------------- 70 | // Constants and type definitions 71 | //--------------------------------------------------------------------- 72 | 73 | enum 74 | { 75 | /// The total number of elements that need to be cooperatively reduced 76 | SHARED_ELEMENTS = BLOCK_THREADS, 77 | 78 | /// Maximum number of warp-synchronous raking threads 79 | MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), 80 | 81 | /// Number of raking elements per warp-synchronous raking thread (rounded up) 82 | SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, 83 | 84 | /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) 85 | RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, 86 | 87 | /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) 88 | HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), 89 | 90 | /// Degree of bank conflicts (e.g., 4-way) 91 | CONFLICT_DEGREE = (HAS_CONFLICTS) ? 92 | (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : 93 | 1, 94 | 95 | /// Pad each segment length with one element if degree of bank conflicts is greater than 4-way (heuristic) 96 | SEGMENT_PADDING = (CONFLICT_DEGREE > CUB_PREFER_CONFLICT_OVER_PADDING(PTX_ARCH)) ? 1 : 0, 97 | // SEGMENT_PADDING = (HAS_CONFLICTS) ? 
1 : 0, 98 | 99 | /// Total number of elements in the raking grid 100 | GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + SEGMENT_PADDING), 101 | 102 | /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) 103 | UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), 104 | }; 105 | 106 | 107 | /** 108 | * \brief Shared memory storage type 109 | */ 110 | typedef T _TempStorage[BlockRakingLayout::GRID_ELEMENTS]; 111 | 112 | /// Alias wrapper allowing storage to be unioned 113 | struct TempStorage : Uninitialized<_TempStorage> {}; 114 | 115 | 116 | /** 117 | * \brief Returns the location for the calling thread to place data into the grid 118 | */ 119 | static __device__ __forceinline__ T* PlacementPtr( 120 | TempStorage &temp_storage, 121 | int linear_tid) 122 | { 123 | // Offset for partial 124 | unsigned int offset = linear_tid; 125 | 126 | // Add in one padding element for every segment 127 | if (SEGMENT_PADDING > 0) 128 | { 129 | offset += offset / SEGMENT_LENGTH; 130 | } 131 | 132 | // Incorporating a block of padding partials every shared memory segment 133 | return temp_storage.Alias() + offset; 134 | } 135 | 136 | 137 | /** 138 | * \brief Returns the location for the calling thread to begin sequential raking 139 | */ 140 | static __device__ __forceinline__ T* RakingPtr( 141 | TempStorage &temp_storage, 142 | int linear_tid) 143 | { 144 | return temp_storage.Alias() + (linear_tid * (SEGMENT_LENGTH + SEGMENT_PADDING)); 145 | } 146 | }; 147 | 148 | } // CUB namespace 149 | CUB_NS_POSTFIX // Optional outer namespace(s) 150 | 151 | -------------------------------------------------------------------------------- /cub/block/specializations/block_histogram_atomic.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 47 | */ 48 | template 49 | struct BlockHistogramAtomic 50 | { 51 | /// Shared memory storage layout type 52 | struct TempStorage {}; 53 | 54 | 55 | /// Constructor 56 | __device__ __forceinline__ BlockHistogramAtomic( 57 | TempStorage &temp_storage) 58 | {} 59 | 60 | 61 | /// Composite data onto an existing histogram 62 | template < 63 | typename T, 64 | typename CounterT, 65 | int ITEMS_PER_THREAD> 66 | __device__ __forceinline__ void Composite( 67 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 68 | CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram 69 | { 70 | // Update histogram 71 | #pragma unroll 72 | for (int i = 0; i < ITEMS_PER_THREAD; ++i) 73 | { 74 | atomicAdd(histogram + items[i], 1); 75 | } 76 | } 77 | 78 | }; 79 | 80 | } // CUB namespace 81 | CUB_NS_POSTFIX // Optional outer namespace(s) 82 | 83 | -------------------------------------------------------------------------------- /cub/block/specializations/block_histogram_sort.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../block/block_radix_sort.cuh" 37 | #include "../../block/block_discontinuity.cuh" 38 | #include "../../util_ptx.cuh" 39 | #include "../../util_namespace.cuh" 40 | 41 | /// Optional outer namespace(s) 42 | CUB_NS_PREFIX 43 | 44 | /// CUB namespace 45 | namespace cub { 46 | 47 | 48 | 49 | /** 50 | * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 51 | */ 52 | template < 53 | typename T, ///< Sample type 54 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension 55 | int ITEMS_PER_THREAD, ///< The number of samples per thread 56 | int BINS, ///< The number of bins into which histogram samples may fall 57 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension 58 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension 59 | int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective 60 | struct BlockHistogramSort 61 | { 62 | /// Constants 63 | enum 64 | { 65 | /// The thread block size in threads 66 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, 67 | }; 68 | 69 | // Parameterize BlockRadixSort type for our thread block 70 | typedef BlockRadixSort< 71 | T, 72 | BLOCK_DIM_X, 73 | ITEMS_PER_THREAD, 74 | NullType, 75 | 4, 76 | (PTX_ARCH >= 350) ? 
true : false, 77 | BLOCK_SCAN_WARP_SCANS, 78 | cudaSharedMemBankSizeFourByte, 79 | BLOCK_DIM_Y, 80 | BLOCK_DIM_Z, 81 | PTX_ARCH> 82 | BlockRadixSortT; 83 | 84 | // Parameterize BlockDiscontinuity type for our thread block 85 | typedef BlockDiscontinuity< 86 | T, 87 | BLOCK_DIM_X, 88 | BLOCK_DIM_Y, 89 | BLOCK_DIM_Z, 90 | PTX_ARCH> 91 | BlockDiscontinuityT; 92 | 93 | /// Shared memory 94 | union _TempStorage 95 | { 96 | // Storage for sorting bin values 97 | typename BlockRadixSortT::TempStorage sort; 98 | 99 | struct 100 | { 101 | // Storage for detecting discontinuities in the tile of sorted bin values 102 | typename BlockDiscontinuityT::TempStorage flag; 103 | 104 | // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values 105 | unsigned int run_begin[BINS]; 106 | unsigned int run_end[BINS]; 107 | }; 108 | }; 109 | 110 | 111 | /// Alias wrapper allowing storage to be unioned 112 | struct TempStorage : Uninitialized<_TempStorage> {}; 113 | 114 | 115 | // Thread fields 116 | _TempStorage &temp_storage; 117 | int linear_tid; 118 | 119 | 120 | /// Constructor 121 | __device__ __forceinline__ BlockHistogramSort( 122 | TempStorage &temp_storage) 123 | : 124 | temp_storage(temp_storage.Alias()), 125 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) 126 | {} 127 | 128 | 129 | // Discontinuity functor 130 | struct DiscontinuityOp 131 | { 132 | // Reference to temp_storage 133 | _TempStorage &temp_storage; 134 | 135 | // Constructor 136 | __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : 137 | temp_storage(temp_storage) 138 | {} 139 | 140 | // Discontinuity predicate 141 | __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index) 142 | { 143 | if (a != b) 144 | { 145 | // Note the begin/end offsets in shared storage 146 | temp_storage.run_begin[b] = b_index; 147 | temp_storage.run_end[a] = b_index; 148 | 149 | return true; 150 | } 151 | else 152 | { 153 | return false; 154 | } 155 | } 156 | }; 157 | 158 | 159 | // Composite data onto an existing histogram 160 | template < 161 | typename CounterT > 162 | __device__ __forceinline__ void Composite( 163 | T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram 164 | CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram 165 | { 166 | enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; 167 | 168 | // Sort bytes in blocked arrangement 169 | BlockRadixSortT(temp_storage.sort).Sort(items); 170 | 171 | __syncthreads(); 172 | 173 | // Initialize the shared memory's run_begin and run_end for each bin 174 | int histo_offset = 0; 175 | 176 | #pragma unroll 177 | for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) 178 | { 179 | temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; 180 | temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; 181 | } 182 | // Finish up with guarded initialization if necessary 183 | if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) 184 | { 185 | temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; 186 | temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; 187 | } 188 | 189 | __syncthreads(); 190 | 191 | int flags[ITEMS_PER_THREAD]; // unused 192 | 193 | // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile 194 | DiscontinuityOp flag_op(temp_storage); 195 | BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); 196 | 197 | // Update begin for first item 198 
| if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0; 199 | 200 | __syncthreads(); 201 | 202 | // Composite into histogram 203 | histo_offset = 0; 204 | 205 | #pragma unroll 206 | for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) 207 | { 208 | int thread_offset = histo_offset + linear_tid; 209 | CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; 210 | histogram[thread_offset] += count; 211 | } 212 | 213 | // Finish up with guarded composition if necessary 214 | if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) 215 | { 216 | int thread_offset = histo_offset + linear_tid; 217 | CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; 218 | histogram[thread_offset] += count; 219 | } 220 | } 221 | 222 | }; 223 | 224 | } // CUB namespace 225 | CUB_NS_POSTFIX // Optional outer namespace(s) 226 | 227 | -------------------------------------------------------------------------------- /cub/block/specializations/block_reduce_raking.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. 
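 *
 * (Sketch of the strategy implemented below: each thread deposits its partial into
 * a padded shared-memory raking grid, the block synchronizes, a single warp rakes
 * SEGMENT_LENGTH consecutive partials per lane, and a final warp-synchronous
 * WarpReduce yields the block-wide aggregate, which is valid only in thread0.)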
32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../block/block_raking_layout.cuh" 37 | #include "../../warp/warp_reduce.cuh" 38 | #include "../../thread/thread_reduce.cuh" 39 | #include "../../util_ptx.cuh" 40 | #include "../../util_namespace.cuh" 41 | 42 | /// Optional outer namespace(s) 43 | CUB_NS_PREFIX 44 | 45 | /// CUB namespace 46 | namespace cub { 47 | 48 | 49 | /** 50 | * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. 51 | * 52 | * Supports non-commutative binary reduction operators. Unlike commutative 53 | * reduction operators (e.g., addition), the application of a non-commutative 54 | * reduction operator (e.g, string concatenation) across a sequence of inputs must 55 | * honor the relative ordering of items and partial reductions when applying the 56 | * reduction operator. 57 | * 58 | * Compared to the implementation of BlockReduceRaking (which does not support 59 | * non-commutative operators), this implementation requires a few extra 60 | * rounds of inter-thread communication. 61 | */ 62 | template < 63 | typename T, ///< Data type being reduced 64 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension 65 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension 66 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension 67 | int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective 68 | struct BlockReduceRaking 69 | { 70 | /// Constants 71 | enum 72 | { 73 | /// The thread block size in threads 74 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, 75 | }; 76 | 77 | /// Layout type for padded thread block raking grid 78 | typedef BlockRakingLayout BlockRakingLayout; 79 | 80 | /// WarpReduce utility type 81 | typedef typename WarpReduce::InternalWarpReduce WarpReduce; 82 | 83 | /// Constants 84 | enum 85 | { 86 | /// Number of raking threads 87 | RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, 88 | 89 | /// Number of raking elements per warp synchronous raking thread 90 | SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, 91 | 92 | /// Cooperative work can be entirely warp synchronous 93 | WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS), 94 | 95 | /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two 96 | WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo::VALUE, 97 | 98 | /// Whether or not accesses into smem are unguarded 99 | RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED, 100 | 101 | }; 102 | 103 | 104 | /// Shared memory storage layout type 105 | union _TempStorage 106 | { 107 | typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction 108 | typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid 109 | }; 110 | 111 | 112 | /// Alias wrapper allowing storage to be unioned 113 | struct TempStorage : Uninitialized<_TempStorage> {}; 114 | 115 | 116 | // Thread fields 117 | _TempStorage &temp_storage; 118 | int linear_tid; 119 | 120 | 121 | /// Constructor 122 | __device__ __forceinline__ BlockReduceRaking( 123 | TempStorage &temp_storage) 124 | : 125 | temp_storage(temp_storage.Alias()), 126 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) 127 | {} 128 | 129 | 130 | template 131 | __device__ __forceinline__ T RakingReduction( 132 | ReductionOp reduction_op, ///< [in] Binary scan operator 133 | T 
*raking_segment, 134 | T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items 135 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 136 | Int2Type iteration) 137 | { 138 | // Update partial if addend is in range 139 | if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) 140 | { 141 | T addend = raking_segment[ITERATION]; 142 | partial = reduction_op(partial, addend); 143 | } 144 | return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); 145 | } 146 | 147 | template 148 | __device__ __forceinline__ T RakingReduction( 149 | ReductionOp reduction_op, ///< [in] Binary scan operator 150 | T *raking_segment, 151 | T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items 152 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 153 | Int2Type iteration) 154 | { 155 | return partial; 156 | } 157 | 158 | 159 | 160 | /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 161 | template < 162 | bool IS_FULL_TILE, 163 | typename ReductionOp> 164 | __device__ __forceinline__ T Reduce( 165 | T partial, ///< [in] Calling thread's input partial reductions 166 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 167 | ReductionOp reduction_op) ///< [in] Binary reduction operator 168 | { 169 | if (WARP_SYNCHRONOUS) 170 | { 171 | // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) 172 | partial = WarpReduce(temp_storage.warp_storage).template Reduce( 173 | partial, 174 | num_valid, 175 | reduction_op); 176 | } 177 | else 178 | { 179 | // Place partial into shared memory grid. 180 | *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; 181 | 182 | __syncthreads(); 183 | 184 | // Reduce parallelism to one warp 185 | if (linear_tid < RAKING_THREADS) 186 | { 187 | // Raking reduction in grid 188 | T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); 189 | partial = raking_segment[0]; 190 | 191 | partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); 192 | 193 | partial = WarpReduce(temp_storage.warp_storage).template Reduce( 194 | partial, 195 | num_valid, 196 | reduction_op); 197 | 198 | } 199 | } 200 | 201 | return partial; 202 | } 203 | 204 | 205 | /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
206 | template 207 | __device__ __forceinline__ T Sum( 208 | T partial, ///< [in] Calling thread's input partial reductions 209 | int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 210 | { 211 | cub::Sum reduction_op; 212 | 213 | return Reduce(partial, num_valid, reduction_op); 214 | } 215 | 216 | 217 | 218 | }; 219 | 220 | } // CUB namespace 221 | CUB_NS_POSTFIX // Optional outer namespace(s) 222 | 223 | -------------------------------------------------------------------------------- /cub/block/specializations/block_reduce_raking_commutative_only.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "block_reduce_raking.cuh" 37 | #include "../../warp/warp_reduce.cuh" 38 | #include "../../thread/thread_reduce.cuh" 39 | #include "../../util_ptx.cuh" 40 | #include "../../util_namespace.cuh" 41 | 42 | /// Optional outer namespace(s) 43 | CUB_NS_PREFIX 44 | 45 | /// CUB namespace 46 | namespace cub { 47 | 48 | 49 | /** 50 | * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size. 
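 *
 * (Sketch of the strategy implemented below: threads beyond the single raking warp
 * deposit their partials into the shared raking grid, each raking lane folds its
 * own partial into its raked segment via ThreadReduce, and a warp-synchronous
 * WarpReduce finishes the block aggregate; when BLOCK_THREADS is not a multiple of
 * the warp size, or the tile is not full, the BlockReduceRaking fall-back is used.)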
51 | */ 52 | template < 53 | typename T, ///< Data type being reduced 54 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension 55 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension 56 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension 57 | int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective 58 | struct BlockReduceRakingCommutativeOnly 59 | { 60 | /// Constants 61 | enum 62 | { 63 | /// The thread block size in threads 64 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, 65 | }; 66 | 67 | // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values 68 | typedef BlockReduceRaking FallBack; 69 | 70 | /// Constants 71 | enum 72 | { 73 | /// Number of warp threads 74 | WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), 75 | 76 | /// Whether or not to use fall-back 77 | USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), 78 | 79 | /// Number of raking threads 80 | RAKING_THREADS = WARP_THREADS, 81 | 82 | /// Number of threads actually sharing items with the raking threads 83 | SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS), 84 | 85 | /// Number of raking elements per warp synchronous raking thread 86 | SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS, 87 | }; 88 | 89 | /// WarpReduce utility type 90 | typedef WarpReduce WarpReduce; 91 | 92 | /// Layout type for padded thread block raking grid 93 | typedef BlockRakingLayout BlockRakingLayout; 94 | 95 | /// Shared memory storage layout type 96 | struct _TempStorage 97 | { 98 | union 99 | { 100 | struct 101 | { 102 | typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction 103 | typename BlockRakingLayout::TempStorage raking_grid; ///< Padded threadblock raking grid 104 | }; 105 | typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan 106 | }; 107 | }; 108 | 109 | 110 | /// Alias wrapper allowing storage to be unioned 111 | struct TempStorage : Uninitialized<_TempStorage> {}; 112 | 113 | 114 | // Thread fields 115 | _TempStorage &temp_storage; 116 | int linear_tid; 117 | 118 | 119 | /// Constructor 120 | __device__ __forceinline__ BlockReduceRakingCommutativeOnly( 121 | TempStorage &temp_storage) 122 | : 123 | temp_storage(temp_storage.Alias()), 124 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) 125 | {} 126 | 127 | 128 | /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
129 | template 130 | __device__ __forceinline__ T Sum( 131 | T partial, ///< [in] Calling thread's input partial reductions 132 | int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 133 | { 134 | if (USE_FALLBACK || !FULL_TILE) 135 | { 136 | return FallBack(temp_storage.fallback_storage).template Sum(partial, num_valid); 137 | } 138 | else 139 | { 140 | // Place partial into shared memory grid 141 | if (linear_tid >= RAKING_THREADS) 142 | *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; 143 | 144 | __syncthreads(); 145 | 146 | // Reduce parallelism to one warp 147 | if (linear_tid < RAKING_THREADS) 148 | { 149 | // Raking reduction in grid 150 | T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); 151 | partial = ThreadReduce(raking_segment, cub::Sum(), partial); 152 | 153 | // Warpscan 154 | partial = WarpReduce(temp_storage.warp_storage).Sum(partial); 155 | } 156 | } 157 | 158 | return partial; 159 | } 160 | 161 | 162 | /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 163 | template < 164 | bool FULL_TILE, 165 | typename ReductionOp> 166 | __device__ __forceinline__ T Reduce( 167 | T partial, ///< [in] Calling thread's input partial reductions 168 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 169 | ReductionOp reduction_op) ///< [in] Binary reduction operator 170 | { 171 | if (USE_FALLBACK || !FULL_TILE) 172 | { 173 | return FallBack(temp_storage.fallback_storage).template Reduce(partial, num_valid, reduction_op); 174 | } 175 | else 176 | { 177 | // Place partial into shared memory grid 178 | if (linear_tid >= RAKING_THREADS) 179 | *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; 180 | 181 | __syncthreads(); 182 | 183 | // Reduce parallelism to one warp 184 | if (linear_tid < RAKING_THREADS) 185 | { 186 | // Raking reduction in grid 187 | T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); 188 | partial = ThreadReduce(raking_segment, reduction_op, partial); 189 | 190 | // Warpscan 191 | partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op); 192 | } 193 | } 194 | 195 | return partial; 196 | } 197 | 198 | }; 199 | 200 | } // CUB namespace 201 | CUB_NS_POSTFIX // Optional outer namespace(s) 202 | 203 | -------------------------------------------------------------------------------- /cub/block/specializations/block_reduce_warp_reductions.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock. Supports non-commutative reduction operators. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../../warp/warp_reduce.cuh" 37 | #include "../../util_ptx.cuh" 38 | #include "../../util_arch.cuh" 39 | #include "../../util_namespace.cuh" 40 | 41 | /// Optional outer namespace(s) 42 | CUB_NS_PREFIX 43 | 44 | /// CUB namespace 45 | namespace cub { 46 | 47 | 48 | /** 49 | * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock. Supports non-commutative reduction operators. 
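 *
 * (Sketch of the strategy implemented below: every warp first reduces its own
 * partials with WarpReduce, lane 0 of each warp publishes its warp aggregate to
 * shared memory, and after a barrier thread0 combines the per-warp aggregates in
 * warp order, which preserves correctness for non-commutative operators.)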
50 | */ 51 | template < 52 | typename T, ///< Data type being reduced 53 | int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension 54 | int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension 55 | int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension 56 | int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective 57 | struct BlockReduceWarpReductions 58 | { 59 | /// Constants 60 | enum 61 | { 62 | /// The thread block size in threads 63 | BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, 64 | 65 | /// Number of warp threads 66 | WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), 67 | 68 | /// Number of active warps 69 | WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, 70 | 71 | /// The logical warp size for warp reductions 72 | LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS), 73 | 74 | /// Whether or not the logical warp size evenly divides the threadblock size 75 | EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) 76 | }; 77 | 78 | 79 | /// WarpReduce utility type 80 | typedef typename WarpReduce::InternalWarpReduce WarpReduce; 81 | 82 | 83 | /// Shared memory storage layout type 84 | struct _TempStorage 85 | { 86 | typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan 87 | T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan 88 | T block_prefix; ///< Shared prefix for the entire threadblock 89 | }; 90 | 91 | /// Alias wrapper allowing storage to be unioned 92 | struct TempStorage : Uninitialized<_TempStorage> {}; 93 | 94 | 95 | // Thread fields 96 | _TempStorage &temp_storage; 97 | int linear_tid; 98 | int warp_id; 99 | int lane_id; 100 | 101 | 102 | /// Constructor 103 | __device__ __forceinline__ BlockReduceWarpReductions( 104 | TempStorage &temp_storage) 105 | : 106 | temp_storage(temp_storage.Alias()), 107 | linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), 108 | warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), 109 | lane_id(LaneId()) 110 | {} 111 | 112 | 113 | template 114 | __device__ __forceinline__ T ApplyWarpAggregates( 115 | ReductionOp reduction_op, ///< [in] Binary scan operator 116 | T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items 117 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 118 | Int2Type successor_warp) 119 | { 120 | if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) 121 | { 122 | T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP]; 123 | warp_aggregate = reduction_op(warp_aggregate, addend); 124 | } 125 | return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type()); 126 | } 127 | 128 | template 129 | __device__ __forceinline__ T ApplyWarpAggregates( 130 | ReductionOp reduction_op, ///< [in] Binary scan operator 131 | T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items 132 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 133 | Int2Type successor_warp) 134 | { 135 | return warp_aggregate; 136 | } 137 | 138 | 139 | /// Returns block-wide aggregate in thread0. 
140 | template < 141 | bool FULL_TILE, 142 | typename ReductionOp> 143 | __device__ __forceinline__ T ApplyWarpAggregates( 144 | ReductionOp reduction_op, ///< [in] Binary scan operator 145 | T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items 146 | int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 147 | { 148 | // Share lane aggregates 149 | if (lane_id == 0) 150 | { 151 | temp_storage.warp_aggregates[warp_id] = warp_aggregate; 152 | } 153 | 154 | __syncthreads(); 155 | 156 | // Update total aggregate in warp 0, lane 0 157 | if (linear_tid == 0) 158 | { 159 | warp_aggregate = ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type<1>()); 160 | } 161 | 162 | return warp_aggregate; 163 | } 164 | 165 | 166 | /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 167 | template 168 | __device__ __forceinline__ T Sum( 169 | T input, ///< [in] Calling thread's input partial reductions 170 | int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 171 | { 172 | cub::Sum reduction_op; 173 | unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; 174 | unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? 175 | LOGICAL_WARP_SIZE : 176 | (warp_offset < num_valid) ? 177 | num_valid - warp_offset : 178 | 0; 179 | 180 | // Warp reduction in every warp 181 | T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( 182 | input, 183 | warp_num_valid, 184 | cub::Sum()); 185 | 186 | // Update outputs and block_aggregate with warp-wide aggregates from lane-0s 187 | return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); 188 | } 189 | 190 | 191 | /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 192 | template < 193 | bool FULL_TILE, 194 | typename ReductionOp> 195 | __device__ __forceinline__ T Reduce( 196 | T input, ///< [in] Calling thread's input partial reductions 197 | int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) 198 | ReductionOp reduction_op) ///< [in] Binary reduction operator 199 | { 200 | unsigned int warp_offset = warp_id * LOGICAL_WARP_SIZE; 201 | unsigned int warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ? 202 | LOGICAL_WARP_SIZE : 203 | (warp_offset < num_valid) ? 204 | num_valid - warp_offset : 205 | 0; 206 | 207 | // Warp reduction in every warp 208 | T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>( 209 | input, 210 | warp_num_valid, 211 | reduction_op); 212 | 213 | // Update outputs and block_aggregate with warp-wide aggregates from lane-0s 214 | return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); 215 | } 216 | 217 | }; 218 | 219 | 220 | } // CUB namespace 221 | CUB_NS_POSTFIX // Optional outer namespace(s) 222 | 223 | -------------------------------------------------------------------------------- /cub/cub.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. 
All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * CUB umbrella include file 32 | */ 33 | 34 | #pragma once 35 | 36 | 37 | // Block 38 | #include "block/block_histogram.cuh" 39 | #include "block/block_discontinuity.cuh" 40 | #include "block/block_exchange.cuh" 41 | #include "block/block_load.cuh" 42 | #include "block/block_radix_rank.cuh" 43 | #include "block/block_radix_sort.cuh" 44 | #include "block/block_reduce.cuh" 45 | #include "block/block_scan.cuh" 46 | #include "block/block_store.cuh" 47 | //#include "block/block_shift.cuh" 48 | 49 | // Device 50 | #include "device/device_histogram.cuh" 51 | #include "device/device_partition.cuh" 52 | #include "device/device_radix_sort.cuh" 53 | #include "device/device_reduce.cuh" 54 | #include "device/device_run_length_encode.cuh" 55 | #include "device/device_scan.cuh" 56 | #include "device/device_segmented_radix_sort.cuh" 57 | #include "device/device_segmented_reduce.cuh" 58 | #include "device/device_select.cuh" 59 | #include "device/device_spmv.cuh" 60 | 61 | // Grid 62 | //#include "grid/grid_barrier.cuh" 63 | #include "grid/grid_even_share.cuh" 64 | #include "grid/grid_mapping.cuh" 65 | #include "grid/grid_queue.cuh" 66 | 67 | // Thread 68 | #include "thread/thread_load.cuh" 69 | #include "thread/thread_operators.cuh" 70 | #include "thread/thread_reduce.cuh" 71 | #include "thread/thread_scan.cuh" 72 | #include "thread/thread_store.cuh" 73 | 74 | // Warp 75 | #include "warp/warp_reduce.cuh" 76 | #include "warp/warp_scan.cuh" 77 | 78 | // Iterator 79 | #include "iterator/arg_index_input_iterator.cuh" 80 | #include "iterator/cache_modified_input_iterator.cuh" 81 | #include "iterator/cache_modified_output_iterator.cuh" 82 | #include "iterator/constant_input_iterator.cuh" 83 | #include "iterator/counting_input_iterator.cuh" 84 | #include "iterator/tex_obj_input_iterator.cuh" 85 | #include "iterator/tex_ref_input_iterator.cuh" 86 | 
#include "iterator/transform_input_iterator.cuh" 87 | 88 | // Util 89 | #include "util_allocator.cuh" 90 | #include "util_arch.cuh" 91 | #include "util_debug.cuh" 92 | #include "util_device.cuh" 93 | #include "util_macro.cuh" 94 | #include "util_ptx.cuh" 95 | #include "util_type.cuh" 96 | 97 | -------------------------------------------------------------------------------- /cub/device/device_spmv.cuh: -------------------------------------------------------------------------------- 1 | 2 | /****************************************************************************** 3 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 4 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * * Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * * Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * * Neither the name of the NVIDIA CORPORATION nor the 14 | * names of its contributors may be used to endorse or promote products 15 | * derived from this software without specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 21 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | * 28 | ******************************************************************************/ 29 | 30 | /** 31 | * \file 32 | * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). 33 | */ 34 | 35 | #pragma once 36 | 37 | #include 38 | #include 39 | #include 40 | 41 | #include "dispatch/dispatch_spmv_orig.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | /// Optional outer namespace(s) 45 | CUB_NS_PREFIX 46 | 47 | /// CUB namespace 48 | namespace cub { 49 | 50 | 51 | /** 52 | * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV). 
53 | * \ingroup SingleModule 54 | * 55 | * \par Overview 56 | * The [SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication) 57 | * performs the matrix-vector operation 58 | * y = alpha*A*x + beta*y, 59 | * where: 60 | * - A is an mxn sparse matrix whose non-zero structure is specified in 61 | * [compressed-storage-row (CSR) format](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29) 62 | * (i.e., three arrays: values, row_offsets, and column_indices) 63 | * - x and y are dense vectors 64 | * - alpha and beta are scalar multiplicands 65 | * 66 | * \par Usage Considerations 67 | * \cdp_class{DeviceSpmv} 68 | * 69 | */ 70 | struct DeviceSpmv 71 | { 72 | /******************************************************************//** 73 | * \name CSR matrix operations 74 | *********************************************************************/ 75 | //@{ 76 | 77 | /** 78 | * \brief This function performs the matrix-vector operation y = A*x. 79 | * 80 | * \par Snippet 81 | * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A 82 | * representing a 3x3 lattice (24 non-zeros). 83 | * 84 | * \par 85 | * \code 86 | * #include // or equivalently 87 | * 88 | * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x, 89 | * // and output vector y 90 | * int num_rows = 9; 91 | * int num_cols = 9; 92 | * int num_nonzeros = 24; 93 | * 94 | * float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 95 | * // 1, 1, 1, 1, 1, 1, 1, 1, 96 | * // 1, 1, 1, 1, 1, 1, 1, 1] 97 | * 98 | * int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, 99 | * // 4, 6, 1, 3, 5, 7, 2, 4, 100 | * // 8, 3, 7, 4, 6, 8, 5, 7] 101 | * 102 | * int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] 103 | * 104 | * float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] 105 | * float* d_vector_y; // e.g., [ , , , , , , , , ] 106 | * ... 107 | * 108 | * // Determine temporary device storage requirements 109 | * void* d_temp_storage = NULL; 110 | * size_t temp_storage_bytes = 0; 111 | * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, 112 | * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, 113 | * num_rows, num_cols, num_nonzeros, alpha, beta); 114 | * 115 | * // Allocate temporary storage 116 | * cudaMalloc(&d_temp_storage, temp_storage_bytes); 117 | * 118 | * // Run SpMV 119 | * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, 120 | * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, 121 | * num_rows, num_cols, num_nonzeros, alpha, beta); 122 | * 123 | * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] 124 | * 125 | * \endcode 126 | * 127 | * \tparam ValueT [inferred] Matrix and vector value type (e.g., /p float, /p double, etc.) 128 | */ 129 | template < 130 | typename ValueT> 131 | CUB_RUNTIME_FUNCTION 132 | static cudaError_t CsrMV( 133 | void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 134 | size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation 135 | ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. 
136 | int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros) 137 | int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) 138 | ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x 139 | ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y 140 | int num_rows, ///< [in] number of rows of matrix A. 141 | int num_cols, ///< [in] number of columns of matrix A. 142 | int num_nonzeros, ///< [in] number of nonzero elements of matrix A. 143 | cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 144 | bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 145 | { 146 | SpmvParams spmv_params; 147 | spmv_params.d_values = d_values; 148 | spmv_params.d_row_end_offsets = d_row_offsets + 1; 149 | spmv_params.d_column_indices = d_column_indices; 150 | spmv_params.d_vector_x = d_vector_x; 151 | spmv_params.d_vector_y = d_vector_y; 152 | spmv_params.num_rows = num_rows; 153 | spmv_params.num_cols = num_cols; 154 | spmv_params.num_nonzeros = num_nonzeros; 155 | spmv_params.alpha = 1.0; 156 | spmv_params.beta = 0.0; 157 | 158 | return DispatchSpmv::Dispatch( 159 | d_temp_storage, 160 | temp_storage_bytes, 161 | spmv_params, 162 | stream, 163 | debug_synchronous); 164 | } 165 | 166 | //@} end member group 167 | }; 168 | 169 | 170 | 171 | } // CUB namespace 172 | CUB_NS_POSTFIX // Optional outer namespace(s) 173 | 174 | 175 | -------------------------------------------------------------------------------- /cub/grid/grid_barrier.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_debug.cuh" 37 | #include "../util_namespace.cuh" 38 | #include "../thread/thread_load.cuh" 39 | 40 | /// Optional outer namespace(s) 41 | CUB_NS_PREFIX 42 | 43 | /// CUB namespace 44 | namespace cub { 45 | 46 | 47 | /** 48 | * \addtogroup GridModule 49 | * @{ 50 | */ 51 | 52 | 53 | /** 54 | * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid 55 | */ 56 | class GridBarrier 57 | { 58 | protected : 59 | 60 | typedef unsigned int SyncFlag; 61 | 62 | // Counters in global device memory 63 | SyncFlag* d_sync; 64 | 65 | public: 66 | 67 | /** 68 | * Constructor 69 | */ 70 | GridBarrier() : d_sync(NULL) {} 71 | 72 | 73 | /** 74 | * Synchronize 75 | */ 76 | __device__ __forceinline__ void Sync() const 77 | { 78 | volatile SyncFlag *d_vol_sync = d_sync; 79 | 80 | // Threadfence and syncthreads to make sure global writes are visible before 81 | // thread-0 reports in with its sync counter 82 | __threadfence(); 83 | __syncthreads(); 84 | 85 | if (blockIdx.x == 0) 86 | { 87 | // Report in ourselves 88 | if (threadIdx.x == 0) 89 | { 90 | d_vol_sync[blockIdx.x] = 1; 91 | } 92 | 93 | __syncthreads(); 94 | 95 | // Wait for everyone else to report in 96 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 97 | { 98 | while (ThreadLoad(d_sync + peer_block) == 0) 99 | { 100 | __threadfence_block(); 101 | } 102 | } 103 | 104 | __syncthreads(); 105 | 106 | // Let everyone know it's safe to proceed 107 | for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) 108 | { 109 | d_vol_sync[peer_block] = 0; 110 | } 111 | } 112 | else 113 | { 114 | if (threadIdx.x == 0) 115 | { 116 | // Report in 117 | d_vol_sync[blockIdx.x] = 1; 118 | 119 | // Wait for acknowledgment 120 | while (ThreadLoad(d_sync + blockIdx.x) == 1) 121 | { 122 | __threadfence_block(); 123 | } 124 | } 125 | 126 | __syncthreads(); 127 | } 128 | } 129 | }; 130 | 131 | 132 | /** 133 | * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. 134 | * 135 | * Uses RAII for lifetime, i.e., device resources are reclaimed when 136 | * the destructor is called. 
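 *
 * \par
 * (Illustrative usage sketch added for clarity; it is not part of the original
 * documentation.  The kernel, launch configuration, and device allocations are
 * hypothetical, and the software barrier is only valid when all \p grid_size
 * thread blocks can be co-resident on the device.)
 * \code
 * // Device side: two phases separated by a grid-wide software barrier
 * __global__ void TwoPhaseKernel(cub::GridBarrier barrier, int *d_in, int *d_out)
 * {
 *     if (threadIdx.x == 0)
 *         d_in[blockIdx.x] = blockIdx.x;                            // phase 1: each block publishes a value
 *
 *     barrier.Sync();                                               // every block (and every thread) rendezvous here
 *
 *     if (threadIdx.x == 0)
 *         d_out[blockIdx.x] = d_in[(blockIdx.x + 1) % gridDim.x];   // phase 2: safely read a peer's value
 * }
 *
 * // Host side: GridBarrierLifetime owns (and lazily allocates) the sync counters
 * int block_size = 128;
 * int grid_size  = 32;                     // must not exceed the co-resident block count
 * int *d_in, *d_out;                       // device allocations of grid_size ints (allocation omitted)
 * cub::GridBarrierLifetime barrier;
 * barrier.Setup(grid_size);                // lazily allocates and zero-initializes the counters
 * TwoPhaseKernel<<<grid_size, block_size>>>(barrier, d_in, d_out);
 * \endcode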
137 | */ 138 | class GridBarrierLifetime : public GridBarrier 139 | { 140 | protected: 141 | 142 | // Number of bytes backed by d_sync 143 | size_t sync_bytes; 144 | 145 | public: 146 | 147 | /** 148 | * Constructor 149 | */ 150 | GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} 151 | 152 | 153 | /** 154 | * DeviceFrees and resets the progress counters 155 | */ 156 | cudaError_t HostReset() 157 | { 158 | cudaError_t retval = cudaSuccess; 159 | if (d_sync) 160 | { 161 | CubDebug(retval = cudaFree(d_sync)); 162 | d_sync = NULL; 163 | } 164 | sync_bytes = 0; 165 | return retval; 166 | } 167 | 168 | 169 | /** 170 | * Destructor 171 | */ 172 | virtual ~GridBarrierLifetime() 173 | { 174 | HostReset(); 175 | } 176 | 177 | 178 | /** 179 | * Sets up the progress counters for the next kernel launch (lazily 180 | * allocating and initializing them if necessary) 181 | */ 182 | cudaError_t Setup(int sweep_grid_size) 183 | { 184 | cudaError_t retval = cudaSuccess; 185 | do { 186 | size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); 187 | if (new_sync_bytes > sync_bytes) 188 | { 189 | if (d_sync) 190 | { 191 | if (CubDebug(retval = cudaFree(d_sync))) break; 192 | } 193 | 194 | sync_bytes = new_sync_bytes; 195 | 196 | // Allocate and initialize to zero 197 | if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; 198 | if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; 199 | } 200 | } while (0); 201 | 202 | return retval; 203 | } 204 | }; 205 | 206 | 207 | /** @} */ // end group GridModule 208 | 209 | } // CUB namespace 210 | CUB_NS_POSTFIX // Optional outer namespace(s) 211 | 212 | -------------------------------------------------------------------------------- /cub/grid/grid_even_share.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #include "../util_namespace.cuh" 38 | #include "../util_macro.cuh" 39 | 40 | /// Optional outer namespace(s) 41 | CUB_NS_PREFIX 42 | 43 | /// CUB namespace 44 | namespace cub { 45 | 46 | 47 | /** 48 | * \addtogroup GridModule 49 | * @{ 50 | */ 51 | 52 | 53 | /** 54 | * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion. Each threadblock gets roughly the same number of fixed-size work units (grains). 55 | * 56 | * \par Overview 57 | * GridEvenShare indicates which sections of input are to be mapped onto which threadblocks. 58 | * Threadblocks may receive one of three different amounts of work: "big", "normal", 59 | * and "last". The "big" workloads are one scheduling grain larger than "normal". The "last" work unit 60 | * for the last threadblock may be partially-full if the input is not an even multiple of 61 | * the scheduling grain size. 62 | * 63 | * \par 64 | * Before invoking a child grid, a parent thread will typically construct an instance of 65 | * GridEvenShare. The instance can be passed to child threadblocks which can 66 | * initialize their per-threadblock offsets using \p BlockInit(). 67 | * 68 | * \tparam OffsetT Signed integer type for global offsets 69 | */ 70 | template 71 | struct GridEvenShare 72 | { 73 | OffsetT total_grains; 74 | int big_blocks; 75 | OffsetT big_share; 76 | OffsetT normal_share; 77 | OffsetT normal_base_offset; 78 | 79 | /// Total number of input items 80 | OffsetT num_items; 81 | 82 | /// Grid size in threadblocks 83 | int grid_size; 84 | 85 | /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles 86 | OffsetT block_offset; 87 | 88 | /// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles 89 | OffsetT block_end; 90 | 91 | /** 92 | * \brief Default constructor. Zero-initializes block-specific fields. 93 | */ 94 | __host__ __device__ __forceinline__ GridEvenShare() : 95 | num_items(0), 96 | grid_size(0), 97 | block_offset(0), 98 | block_end(0) {} 99 | 100 | /** 101 | * \brief Constructor. Initializes the grid-specific members \p num_items and \p grid_size. To be called prior prior to kernel launch) 102 | */ 103 | __host__ __device__ __forceinline__ GridEvenShare( 104 | OffsetT num_items, ///< Total number of input items 105 | int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) 106 | int schedule_granularity) ///< Granularity by which the input can be parcelled into and distributed among threablocks. 
Usually the thread block's native tile size (or a multiple thereof. 107 | { 108 | this->num_items = num_items; 109 | this->block_offset = num_items; 110 | this->block_end = num_items; 111 | this->total_grains = (num_items + schedule_granularity - 1) / schedule_granularity; 112 | this->grid_size = CUB_MIN(total_grains, max_grid_size); 113 | OffsetT grains_per_block = total_grains / grid_size; 114 | this->big_blocks = total_grains - (grains_per_block * grid_size); // leftover grains go to big blocks 115 | this->normal_share = grains_per_block * schedule_granularity; 116 | this->normal_base_offset = big_blocks * schedule_granularity; 117 | this->big_share = normal_share + schedule_granularity; 118 | } 119 | 120 | 121 | 122 | /** 123 | * \brief Initializes ranges for the specified partition index 124 | */ 125 | __device__ __forceinline__ void Init(int partition_id) 126 | { 127 | if (partition_id < big_blocks) 128 | { 129 | // This threadblock gets a big share of grains (grains_per_block + 1) 130 | block_offset = (partition_id * big_share); 131 | block_end = block_offset + big_share; 132 | } 133 | else if (partition_id < total_grains) 134 | { 135 | // This threadblock gets a normal share of grains (grains_per_block) 136 | block_offset = normal_base_offset + (partition_id * normal_share); 137 | block_end = CUB_MIN(num_items, block_offset + normal_share); 138 | } 139 | } 140 | 141 | 142 | /** 143 | * \brief Initializes ranges for the current thread block (e.g., to be called by each threadblock after startup) 144 | */ 145 | __device__ __forceinline__ void BlockInit() 146 | { 147 | Init(blockIdx.x); 148 | } 149 | 150 | 151 | /** 152 | * Print to stdout 153 | */ 154 | __host__ __device__ __forceinline__ void Print() 155 | { 156 | printf( 157 | #if (CUB_PTX_ARCH > 0) 158 | "\tthreadblock(%d) " 159 | "block_offset(%lu) " 160 | "block_end(%lu) " 161 | #endif 162 | "num_items(%lu) " 163 | "total_grains(%lu) " 164 | "big_blocks(%lu) " 165 | "big_share(%lu) " 166 | "normal_share(%lu)\n", 167 | #if (CUB_PTX_ARCH > 0) 168 | blockIdx.x, 169 | (unsigned long) block_offset, 170 | (unsigned long) block_end, 171 | #endif 172 | (unsigned long) num_items, 173 | (unsigned long) total_grains, 174 | (unsigned long) big_blocks, 175 | (unsigned long) big_share, 176 | (unsigned long) normal_share); 177 | } 178 | }; 179 | 180 | 181 | 182 | /** @} */ // end group GridModule 183 | 184 | } // CUB namespace 185 | CUB_NS_POSTFIX // Optional outer namespace(s) 186 | -------------------------------------------------------------------------------- /cub/grid/grid_mapping.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 
12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * \addtogroup GridModule 47 | * @{ 48 | */ 49 | 50 | 51 | /****************************************************************************** 52 | * Mapping policies 53 | *****************************************************************************/ 54 | 55 | 56 | /** 57 | * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. 58 | */ 59 | enum GridMappingStrategy 60 | { 61 | /** 62 | * \brief An "even-share" strategy for assigning input tiles to thread blocks. 63 | * 64 | * \par Overview 65 | * The input is evenly partitioned into \p p segments, where \p p is 66 | * constant and corresponds loosely to the number of thread blocks that may 67 | * actively reside on the target device. Each segment is comprised of 68 | * consecutive tiles, where a tile is a small, constant-sized unit of input 69 | * to be processed to completion before the thread block terminates or 70 | * obtains more work. The kernel invokes \p p thread blocks, each 71 | * of which iteratively consumes a segment of n/p elements 72 | * in tile-size increments. 73 | */ 74 | GRID_MAPPING_EVEN_SHARE, 75 | 76 | /** 77 | * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. 78 | * 79 | * \par Overview 80 | * The input is treated as a queue to be dynamically consumed by a grid of 81 | * thread blocks. Work is atomically dequeued in tiles, where a tile is a 82 | * unit of input to be processed to completion before the thread block 83 | * terminates or obtains more work. The grid size \p p is constant, 84 | * loosely corresponding to the number of thread blocks that may actively 85 | * reside on the target device. 
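     *
     * \par
     * (Illustrative sketch added for clarity, not part of the original
     * documentation.  TILE_ITEMS, ConsumeTile, and the enclosing kernel are
     * hypothetical; \p queue is a cub::GridQueue and \p num_items is the
     * queue's fill-size.)
     * \code
     * __shared__ int tile_offset;
     * while (true)
     * {
     *     if (threadIdx.x == 0)
     *         tile_offset = queue.Drain(TILE_ITEMS);    // atomically claim the next tile
     *     __syncthreads();
     *
     *     if (tile_offset >= num_items)
     *         break;                                    // queue exhausted
     *
     *     ConsumeTile(tile_offset);                     // process items [tile_offset, tile_offset + TILE_ITEMS)
     *     __syncthreads();                              // keep tile_offset stable until all threads are done
     * }
     * \endcode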
86 | */ 87 | GRID_MAPPING_DYNAMIC, 88 | }; 89 | 90 | 91 | /** @} */ // end group GridModule 92 | 93 | } // CUB namespace 94 | CUB_NS_POSTFIX // Optional outer namespace(s) 95 | 96 | -------------------------------------------------------------------------------- /cub/grid/grid_queue.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * cub::GridQueue is a descriptor utility for dynamic queue management. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | #include "../util_debug.cuh" 38 | 39 | /// Optional outer namespace(s) 40 | CUB_NS_PREFIX 41 | 42 | /// CUB namespace 43 | namespace cub { 44 | 45 | 46 | /** 47 | * \addtogroup GridModule 48 | * @{ 49 | */ 50 | 51 | 52 | /** 53 | * \brief GridQueue is a descriptor utility for dynamic queue management. 54 | * 55 | * \par Overview 56 | * GridQueue descriptors provides abstractions for "filling" or 57 | * "draining" globally-shared vectors. 58 | * 59 | * \par 60 | * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, 61 | * returning a unique offset for the calling thread to write its items. 62 | * The GridQueue maintains the total "fill-size". The fill counter must be reset 63 | * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that 64 | * will be filling. 65 | * 66 | * \par 67 | * Similarly, a "draining" GridQueue works by works by atomically-incrementing a 68 | * zero-initialized counter, returning a unique offset for the calling thread to 69 | * read its items. Threads can safely drain until the array's logical fill-size is 70 | * exceeded. 
The drain counter must be reset using GridQueue::ResetDrain or 71 | * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that 72 | * will be filling. (For dynamic work distribution of existing data, the corresponding fill-size 73 | * is simply the number of elements in the array.) 74 | * 75 | * \par 76 | * Iterative work management can be implemented simply with a pair of flip-flopping 77 | * work buffers, each with an associated set of fill and drain GridQueue descriptors. 78 | * 79 | * \tparam OffsetT Signed integer type for global offsets 80 | */ 81 | template 82 | class GridQueue 83 | { 84 | private: 85 | 86 | /// Counter indices 87 | enum 88 | { 89 | FILL = 0, 90 | DRAIN = 1, 91 | }; 92 | 93 | /// Pair of counters 94 | OffsetT *d_counters; 95 | 96 | public: 97 | 98 | /// Returns the device allocation size in bytes needed to construct a GridQueue instance 99 | __host__ __device__ __forceinline__ 100 | static size_t AllocationSize() 101 | { 102 | return sizeof(OffsetT) * 2; 103 | } 104 | 105 | 106 | /// Constructs an invalid GridQueue descriptor 107 | __host__ __device__ __forceinline__ GridQueue() 108 | : 109 | d_counters(NULL) 110 | {} 111 | 112 | 113 | /// Constructs a GridQueue descriptor around the device storage allocation 114 | __host__ __device__ __forceinline__ GridQueue( 115 | void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). 116 | : 117 | d_counters((OffsetT*) d_storage) 118 | {} 119 | 120 | 121 | /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. 122 | __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain( 123 | OffsetT fill_size, 124 | cudaStream_t stream = 0) 125 | { 126 | #if (CUB_PTX_ARCH > 0) 127 | d_counters[FILL] = fill_size; 128 | d_counters[DRAIN] = 0; 129 | return cudaSuccess; 130 | #else 131 | OffsetT counters[2]; 132 | counters[FILL] = fill_size; 133 | counters[DRAIN] = 0; 134 | return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream)); 135 | #endif 136 | } 137 | 138 | 139 | /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining. 140 | __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) 141 | { 142 | #if (CUB_PTX_ARCH > 0) 143 | d_counters[DRAIN] = 0; 144 | return cudaSuccess; 145 | #else 146 | return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); 147 | #endif 148 | } 149 | 150 | 151 | /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. 152 | __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0) 153 | { 154 | #if (CUB_PTX_ARCH > 0) 155 | d_counters[FILL] = 0; 156 | return cudaSuccess; 157 | #else 158 | return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); 159 | #endif 160 | } 161 | 162 | 163 | /// Returns the fill-size established by the parent or by the previous kernel. 
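    // (Editorial sketch, not part of the original source.)  Typical host-side
    // preparation for a kernel that drains num_items pre-existing elements;
    // d_queue_storage, ConsumeKernel, grid_size, and block_size are hypothetical:
    //
    //     cub::GridQueue<int> queue(d_queue_storage);   // at least GridQueue<int>::AllocationSize() bytes
    //     queue.FillAndResetDrain(num_items, stream);   // establish the fill-size, zero the drain counter
    //     ConsumeKernel<<<grid_size, block_size, 0, stream>>>(queue, num_items);
    //
    // Device code then claims work via queue.Drain(tile_items) until the
    // returned offset reaches num_items.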
164 | __host__ __device__ __forceinline__ cudaError_t FillSize( 165 | OffsetT &fill_size, 166 | cudaStream_t stream = 0) 167 | { 168 | #if (CUB_PTX_ARCH > 0) 169 | fill_size = d_counters[FILL]; 170 | return cudaSuccess; 171 | #else 172 | return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream)); 173 | #endif 174 | } 175 | 176 | 177 | /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from CUDA kernel. 178 | __device__ __forceinline__ OffsetT Drain(OffsetT num_items) 179 | { 180 | return atomicAdd(d_counters + DRAIN, num_items); 181 | } 182 | 183 | 184 | /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from CUDA kernel. 185 | __device__ __forceinline__ OffsetT Fill(OffsetT num_items) 186 | { 187 | return atomicAdd(d_counters + FILL, num_items); 188 | } 189 | }; 190 | 191 | 192 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 193 | 194 | 195 | /** 196 | * Reset grid queue (call with 1 block of 1 thread) 197 | */ 198 | template 199 | __global__ void FillAndResetDrainKernel( 200 | GridQueue grid_queue, 201 | OffsetT num_items) 202 | { 203 | grid_queue.FillAndResetDrain(num_items); 204 | } 205 | 206 | 207 | 208 | #endif // DOXYGEN_SHOULD_SKIP_THIS 209 | 210 | 211 | /** @} */ // end group GridModule 212 | 213 | } // CUB namespace 214 | CUB_NS_POSTFIX // Optional outer namespace(s) 215 | 216 | 217 | -------------------------------------------------------------------------------- /cub/host/mutex.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple portable mutex 32 | */ 33 | 34 | 35 | #pragma once 36 | 37 | #if __cplusplus > 199711L 38 | #include 39 | #else 40 | #if defined(_WIN32) || defined(_WIN64) 41 | #include 42 | #include 43 | #undef small // Windows is terrible for polluting macro namespace 44 | 45 | /** 46 | * Compiler read/write barrier 47 | */ 48 | #pragma intrinsic(_ReadWriteBarrier) 49 | 50 | #endif 51 | #endif 52 | 53 | #include "../util_namespace.cuh" 54 | 55 | 56 | /// Optional outer namespace(s) 57 | CUB_NS_PREFIX 58 | 59 | /// CUB namespace 60 | namespace cub { 61 | 62 | 63 | /** 64 | * Simple portable mutex 65 | * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) 66 | * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) 67 | */ 68 | struct Mutex 69 | { 70 | #if __cplusplus > 199711L 71 | 72 | std::mutex mtx; 73 | 74 | void Lock() 75 | { 76 | mtx.lock(); 77 | } 78 | 79 | void Unlock() 80 | { 81 | mtx.unlock(); 82 | } 83 | 84 | void TryLock() 85 | { 86 | mtx.try_lock(); 87 | } 88 | 89 | #else //__cplusplus > 199711L 90 | 91 | #if defined(_MSC_VER) 92 | 93 | // Microsoft VC++ 94 | typedef long Spinlock; 95 | 96 | #else 97 | 98 | // GNU g++ 99 | typedef int Spinlock; 100 | 101 | /** 102 | * Compiler read/write barrier 103 | */ 104 | __forceinline__ void _ReadWriteBarrier() 105 | { 106 | __sync_synchronize(); 107 | } 108 | 109 | /** 110 | * Atomic exchange 111 | */ 112 | __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) 113 | { 114 | // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier 115 | _ReadWriteBarrier(); 116 | return __sync_lock_test_and_set(Target, Value); 117 | } 118 | 119 | /** 120 | * Pause instruction to prevent excess processor bus usage 121 | */ 122 | __forceinline__ void YieldProcessor() 123 | { 124 | #ifndef __arm__ 125 | asm volatile("pause\n": : :"memory"); 126 | #endif // __arm__ 127 | } 128 | 129 | #endif // defined(_MSC_VER) 130 | 131 | /// Lock member 132 | volatile Spinlock lock; 133 | 134 | /** 135 | * Constructor 136 | */ 137 | Mutex() : lock(0) {} 138 | 139 | /** 140 | * Return when the specified spinlock has been acquired 141 | */ 142 | __forceinline__ void Lock() 143 | { 144 | while (1) 145 | { 146 | if (!_InterlockedExchange(&lock, 1)) return; 147 | while (lock) YieldProcessor(); 148 | } 149 | } 150 | 151 | 152 | /** 153 | * Release the specified spinlock 154 | */ 155 | __forceinline__ void Unlock() 156 | { 157 | _ReadWriteBarrier(); 158 | lock = 0; 159 | } 160 | 161 | #endif // __cplusplus > 199711L 162 | 163 | }; 164 | 165 | 166 | 167 | 168 | } // CUB namespace 169 | CUB_NS_POSTFIX // Optional outer namespace(s) 170 | 171 | -------------------------------------------------------------------------------- /cub/iterator/arg_index_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #include 45 | 46 | #if (THRUST_VERSION >= 100700) 47 | // This iterator is compatible with Thrust API 1.7 and newer 48 | #include 49 | #include 50 | #endif // THRUST_VERSION 51 | 52 | /// Optional outer namespace(s) 53 | CUB_NS_PREFIX 54 | 55 | /// CUB namespace 56 | namespace cub { 57 | 58 | /** 59 | * \addtogroup UtilIterator 60 | * @{ 61 | */ 62 | 63 | 64 | /** 65 | * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples). 66 | * 67 | * \par Overview 68 | * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT. 69 | * Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose 70 | * \p key field is \p i and whose \p value field is itr[i]. 71 | * - Can be used with any data type. 72 | * - Can be constructed, manipulated, and exchanged within and between host and device 73 | * functions. Wrapped host memory can only be dereferenced on the host, and wrapped 74 | * device memory can only be dereferenced on the device. 75 | * - Compatible with Thrust API v1.7 or newer. 
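 * - The wrapper is parameterized by the wrapped iterator type (plus an optional
 *   \p OffsetT, defaulting to \p ptrdiff_t); e.g., a \p double* array is wrapped
 *   as cub::ArgIndexInputIterator<double*>.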
76 | * 77 | * \par Snippet 78 | * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto 79 | * dereference an array of doubles 80 | * \par 81 | * \code 82 | * #include // or equivalently 83 | * 84 | * // Declare, allocate, and initialize a device array 85 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] 86 | * 87 | * // Create an iterator wrapper 88 | * cub::ArgIndexInputIterator itr(d_in); 89 | * 90 | * // Within device code: 91 | * typedef typename cub::ArgIndexInputIterator::value_type Tuple; 92 | * Tuple item_offset_pair.key = *itr; 93 | * printf("%f @ %d\n", 94 | * item_offset_pair.value, 95 | * item_offset_pair.key); // 8.0 @ 0 96 | * 97 | * itr = itr + 6; 98 | * item_offset_pair.key = *itr; 99 | * printf("%f @ %d\n", 100 | * item_offset_pair.value, 101 | * item_offset_pair.key); // 9.0 @ 6 102 | * 103 | * \endcode 104 | * 105 | * \tparam InputIteratorT The type of the wrapped input iterator 106 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 107 | */ 108 | template < 109 | typename InputIteratorT, 110 | typename OffsetT = ptrdiff_t> 111 | class ArgIndexInputIterator 112 | { 113 | private: 114 | 115 | // Data type of input iterator 116 | typedef typename std::iterator_traits::value_type T; 117 | 118 | public: 119 | 120 | 121 | // Required iterator traits 122 | typedef ArgIndexInputIterator self_type; ///< My own type 123 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 124 | typedef KeyValuePair value_type; ///< The type of the element the iterator can point to 125 | typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to 126 | typedef value_type reference; ///< The type of a reference to an element the iterator can point to 127 | 128 | #if (THRUST_VERSION >= 100700) 129 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 130 | typedef typename thrust::detail::iterator_facade_category< 131 | thrust::any_system_tag, 132 | thrust::random_access_traversal_tag, 133 | value_type, 134 | reference 135 | >::type iterator_category; ///< The iterator category 136 | #else 137 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 138 | #endif // THRUST_VERSION 139 | 140 | private: 141 | 142 | InputIteratorT itr; 143 | difference_type offset; 144 | 145 | public: 146 | 147 | /// Constructor 148 | __host__ __device__ __forceinline__ ArgIndexInputIterator( 149 | InputIteratorT itr, ///< Input iterator to wrap 150 | difference_type offset = 0) ///< OffsetT (in items) from \p itr denoting the position of the iterator 151 | : 152 | itr(itr), 153 | offset(offset) 154 | {} 155 | 156 | /// Postfix increment 157 | __host__ __device__ __forceinline__ self_type operator++(int) 158 | { 159 | self_type retval = *this; 160 | offset++; 161 | return retval; 162 | } 163 | 164 | /// Prefix increment 165 | __host__ __device__ __forceinline__ self_type operator++() 166 | { 167 | offset++; 168 | return *this; 169 | } 170 | 171 | /// Indirection 172 | __host__ __device__ __forceinline__ reference operator*() const 173 | { 174 | value_type retval; 175 | retval.value = itr[offset]; 176 | retval.key = offset; 177 | return retval; 178 | } 179 | 180 | /// Addition 181 | template 182 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 183 | { 184 | self_type retval(itr, offset + n); 185 | return retval; 186 | } 187 | 188 | /// Addition assignment 189 | 
template 190 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 191 | { 192 | offset += n; 193 | return *this; 194 | } 195 | 196 | /// Subtraction 197 | template 198 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 199 | { 200 | self_type retval(itr, offset - n); 201 | return retval; 202 | } 203 | 204 | /// Subtraction assignment 205 | template 206 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 207 | { 208 | offset -= n; 209 | return *this; 210 | } 211 | 212 | /// Distance 213 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 214 | { 215 | return offset - other.offset; 216 | } 217 | 218 | /// Array subscript 219 | template 220 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 221 | { 222 | self_type offset = (*this) + n; 223 | return *offset; 224 | } 225 | 226 | /// Structure dereference 227 | __host__ __device__ __forceinline__ pointer operator->() 228 | { 229 | return &(*(*this)); 230 | } 231 | 232 | /// Equal to 233 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 234 | { 235 | return ((itr == rhs.itr) && (offset == rhs.offset)); 236 | } 237 | 238 | /// Not equal to 239 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 240 | { 241 | return ((itr != rhs.itr) || (offset != rhs.offset)); 242 | } 243 | 244 | /// Normalize 245 | __host__ __device__ __forceinline__ void normalize() 246 | { 247 | itr += offset; 248 | offset = 0; 249 | } 250 | 251 | /// ostream operator 252 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 253 | { 254 | return os; 255 | } 256 | }; 257 | 258 | 259 | 260 | /** @} */ // end group UtilIterator 261 | 262 | } // CUB namespace 263 | CUB_NS_POSTFIX // Optional outer namespace(s) 264 | -------------------------------------------------------------------------------- /cub/iterator/cache_modified_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | 58 | 59 | /** 60 | * \addtogroup UtilIterator 61 | * @{ 62 | */ 63 | 64 | 65 | /** 66 | * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. 67 | * 68 | * \par Overview 69 | * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native 70 | * device pointer of type ValueType*. \p ValueType references are 71 | * made by reading \p ValueType values through loads modified by \p MODIFIER. 72 | * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", 73 | * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). 74 | * - Can be constructed, manipulated, and exchanged within and between host and device 75 | * functions, but can only be dereferenced within device functions. 76 | * - Compatible with Thrust API v1.7 or newer. 77 | * 78 | * \par Snippet 79 | * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto 80 | * dereference a device array of double using the "ldg" PTX load modifier 81 | * (i.e., load values through texture cache). 
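 * For the "ldg" modifier and \p double values the instantiation is
 * cub::CacheModifiedInputIterator<cub::LOAD_LDG, double>, constructed directly
 * from the device pointer.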
82 | * \par 83 | * \code 84 | * #include // or equivalently 85 | * 86 | * // Declare, allocate, and initialize a device array 87 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] 88 | * 89 | * // Create an iterator wrapper 90 | * cub::CacheModifiedInputIterator itr(d_in); 91 | * 92 | * // Within device code: 93 | * printf("%f\n", itr[0]); // 8.0 94 | * printf("%f\n", itr[1]); // 6.0 95 | * printf("%f\n", itr[6]); // 9.0 96 | * 97 | * \endcode 98 | * 99 | * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data 100 | * \tparam ValueType The value type of this iterator 101 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 102 | */ 103 | template < 104 | CacheLoadModifier MODIFIER, 105 | typename ValueType, 106 | typename OffsetT = ptrdiff_t> 107 | class CacheModifiedInputIterator 108 | { 109 | public: 110 | 111 | // Required iterator traits 112 | typedef CacheModifiedInputIterator self_type; ///< My own type 113 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 114 | typedef ValueType value_type; ///< The type of the element the iterator can point to 115 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 116 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 117 | 118 | #if (THRUST_VERSION >= 100700) 119 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 120 | typedef typename thrust::detail::iterator_facade_category< 121 | thrust::device_system_tag, 122 | thrust::random_access_traversal_tag, 123 | value_type, 124 | reference 125 | >::type iterator_category; ///< The iterator category 126 | #else 127 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 128 | #endif // THRUST_VERSION 129 | 130 | 131 | public: 132 | 133 | /// Wrapped native pointer 134 | ValueType* ptr; 135 | 136 | /// Constructor 137 | template 138 | __host__ __device__ __forceinline__ CacheModifiedInputIterator( 139 | QualifiedValueType* ptr) ///< Native pointer to wrap 140 | : 141 | ptr(const_cast::Type *>(ptr)) 142 | {} 143 | 144 | /// Postfix increment 145 | __host__ __device__ __forceinline__ self_type operator++(int) 146 | { 147 | self_type retval = *this; 148 | ptr++; 149 | return retval; 150 | } 151 | 152 | /// Prefix increment 153 | __host__ __device__ __forceinline__ self_type operator++() 154 | { 155 | ptr++; 156 | return *this; 157 | } 158 | 159 | /// Indirection 160 | __device__ __forceinline__ reference operator*() const 161 | { 162 | return ThreadLoad(ptr); 163 | } 164 | 165 | /// Addition 166 | template 167 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 168 | { 169 | self_type retval(ptr + n); 170 | return retval; 171 | } 172 | 173 | /// Addition assignment 174 | template 175 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 176 | { 177 | ptr += n; 178 | return *this; 179 | } 180 | 181 | /// Subtraction 182 | template 183 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 184 | { 185 | self_type retval(ptr - n); 186 | return retval; 187 | } 188 | 189 | /// Subtraction assignment 190 | template 191 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 192 | { 193 | ptr -= n; 194 | return *this; 195 | } 196 | 197 | /// Distance 198 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 
199 | { 200 | return ptr - other.ptr; 201 | } 202 | 203 | /// Array subscript 204 | template 205 | __device__ __forceinline__ reference operator[](Distance n) const 206 | { 207 | return ThreadLoad(ptr + n); 208 | } 209 | 210 | /// Structure dereference 211 | __device__ __forceinline__ pointer operator->() 212 | { 213 | return &ThreadLoad(ptr); 214 | } 215 | 216 | /// Equal to 217 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 218 | { 219 | return (ptr == rhs.ptr); 220 | } 221 | 222 | /// Not equal to 223 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 224 | { 225 | return (ptr != rhs.ptr); 226 | } 227 | 228 | /// ostream operator 229 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 230 | { 231 | return os; 232 | } 233 | }; 234 | 235 | 236 | 237 | /** @} */ // end group UtilIterator 238 | 239 | } // CUB namespace 240 | CUB_NS_POSTFIX // Optional outer namespace(s) 241 | -------------------------------------------------------------------------------- /cub/iterator/cache_modified_output_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
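A minimal kernel-level sketch of CacheModifiedInputIterator, complementing the doxygen snippet above: it wraps a device pointer with LOAD_LDG so each dereference goes through a read-only (texture-path) load where the hardware supports it. The kernel name, the scaling factor, and the assumption that the repository root is on the include path are illustrative, not part of the library.

#include <cuda_runtime.h>
#include <cub/iterator/cache_modified_input_iterator.cuh>

// Scale a device array, reading the input through LOAD_LDG loads.
// Each dereference in[i] lowers to ThreadLoad<LOAD_LDG>(d_in + i).
__global__ void ScaleLdg(const double *d_in, double *d_out, int n)
{
    cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> in(d_in);
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d_out[i] = in[i] * 2.0;
}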
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | 58 | /** 59 | * \addtogroup UtilIterator 60 | * @{ 61 | */ 62 | 63 | 64 | /** 65 | * \brief A random-access output wrapper for storing array values using a PTX cache-modifier. 66 | * 67 | * \par Overview 68 | * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native 69 | * device pointer of type ValueType*. \p ValueType references are 70 | * made by writing \p ValueType values through stores modified by \p MODIFIER. 71 | * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB", 72 | * "STORE_CG", "STORE_CS", "STORE_WT", etc.). 73 | * - Can be constructed, manipulated, and exchanged within and between host and device 74 | * functions, but can only be dereferenced within device functions. 75 | * - Compatible with Thrust API v1.7 or newer. 76 | * 77 | * \par Snippet 78 | * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to 79 | * dereference a device array of doubles using the "wt" PTX load modifier 80 | * (i.e., write-through to system memory). 81 | * \par 82 | * \code 83 | * #include // or equivalently 84 | * 85 | * // Declare, allocate, and initialize a device array 86 | * double *d_out; // e.g., [, , , , , , ] 87 | * 88 | * // Create an iterator wrapper 89 | * cub::CacheModifiedOutputIterator itr(d_out); 90 | * 91 | * // Within device code: 92 | * itr[0] = 8.0; 93 | * itr[1] = 66.0; 94 | * itr[55] = 24.0; 95 | * 96 | * \endcode 97 | * 98 | * \par Usage Considerations 99 | * - Can only be dereferenced within device code 100 | * 101 | * \tparam CacheStoreModifier The cub::CacheStoreModifier to use when accessing data 102 | * \tparam ValueType The value type of this iterator 103 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 104 | */ 105 | template < 106 | CacheStoreModifier MODIFIER, 107 | typename ValueType, 108 | typename OffsetT = ptrdiff_t> 109 | class CacheModifiedOutputIterator 110 | { 111 | private: 112 | 113 | // Proxy object 114 | struct Reference 115 | { 116 | ValueType* ptr; 117 | 118 | /// Constructor 119 | __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {} 120 | 121 | /// Assignment 122 | __device__ __forceinline__ ValueType operator =(ValueType val) 123 | { 124 | ThreadStore(ptr, val); 125 | return val; 126 | } 127 | }; 128 | 129 | public: 130 | 131 | // Required iterator traits 132 | typedef CacheModifiedOutputIterator self_type; ///< My own type 133 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 134 | typedef ValueType value_type; ///< The type of the element the iterator can point to 135 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 136 | typedef Reference reference; ///< The type of a reference to an element the iterator can point to 137 | 138 | 
#if (THRUST_VERSION >= 100700) 139 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 140 | typedef typename thrust::detail::iterator_facade_category< 141 | thrust::device_system_tag, 142 | thrust::random_access_traversal_tag, 143 | value_type, 144 | reference 145 | >::type iterator_category; ///< The iterator category 146 | #else 147 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 148 | #endif // THRUST_VERSION 149 | 150 | private: 151 | 152 | ValueType* ptr; 153 | 154 | public: 155 | 156 | /// Constructor 157 | template 158 | __host__ __device__ __forceinline__ CacheModifiedOutputIterator( 159 | QualifiedValueType* ptr) ///< Native pointer to wrap 160 | : 161 | ptr(const_cast::Type *>(ptr)) 162 | {} 163 | 164 | /// Postfix increment 165 | __host__ __device__ __forceinline__ self_type operator++(int) 166 | { 167 | self_type retval = *this; 168 | ptr++; 169 | return retval; 170 | } 171 | 172 | 173 | /// Prefix increment 174 | __host__ __device__ __forceinline__ self_type operator++() 175 | { 176 | ptr++; 177 | return *this; 178 | } 179 | 180 | /// Indirection 181 | __host__ __device__ __forceinline__ reference operator*() const 182 | { 183 | return Reference(ptr); 184 | } 185 | 186 | /// Addition 187 | template 188 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 189 | { 190 | self_type retval(ptr + n); 191 | return retval; 192 | } 193 | 194 | /// Addition assignment 195 | template 196 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 197 | { 198 | ptr += n; 199 | return *this; 200 | } 201 | 202 | /// Subtraction 203 | template 204 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 205 | { 206 | self_type retval(ptr - n); 207 | return retval; 208 | } 209 | 210 | /// Subtraction assignment 211 | template 212 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 213 | { 214 | ptr -= n; 215 | return *this; 216 | } 217 | 218 | /// Distance 219 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 220 | { 221 | return ptr - other.ptr; 222 | } 223 | 224 | /// Array subscript 225 | template 226 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 227 | { 228 | return Reference(ptr + n); 229 | } 230 | 231 | /// Equal to 232 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 233 | { 234 | return (ptr == rhs.ptr); 235 | } 236 | 237 | /// Not equal to 238 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 239 | { 240 | return (ptr != rhs.ptr); 241 | } 242 | 243 | /// ostream operator 244 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 245 | { 246 | return os; 247 | } 248 | }; 249 | 250 | 251 | /** @} */ // end group UtilIterator 252 | 253 | } // CUB namespace 254 | CUB_NS_POSTFIX // Optional outer namespace(s) 255 | -------------------------------------------------------------------------------- /cub/iterator/constant_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 
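As a companion to the output-iterator overview above, a hedged streaming-copy sketch: reads are issued with LOAD_CS (evict-first) and writes with STORE_CG (bypass L1), a common pattern for data touched exactly once. The kernel name and the particular modifier choices are illustrative; the assignment to out[i] goes through the Reference proxy shown above and compiles to a ThreadStore<STORE_CG>.

#include <cuda_runtime.h>
#include <cub/iterator/cache_modified_input_iterator.cuh>
#include <cub/iterator/cache_modified_output_iterator.cuh>

// Streaming copy with cache-modified loads and stores.
__global__ void StreamCopy(const double *d_in, double *d_out, int n)
{
    cub::CacheModifiedInputIterator<cub::LOAD_CS, double>   in(d_in);
    cub::CacheModifiedOutputIterator<cub::STORE_CG, double> out(d_out);
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = in[i];   // Reference proxy assignment -> ThreadStore<STORE_CG>(d_out + i, ...)
}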
4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_namespace.cuh" 42 | 43 | #if (THRUST_VERSION >= 100700) 44 | // This iterator is compatible with Thrust API 1.7 and newer 45 | #include 46 | #include 47 | #endif // THRUST_VERSION 48 | 49 | 50 | /// Optional outer namespace(s) 51 | CUB_NS_PREFIX 52 | 53 | /// CUB namespace 54 | namespace cub { 55 | 56 | 57 | /** 58 | * \addtogroup UtilIterator 59 | * @{ 60 | */ 61 | 62 | 63 | /** 64 | * \brief A random-access input generator for dereferencing a sequence of homogeneous values 65 | * 66 | * \par Overview 67 | * - Read references to a ConstantInputIteratorTiterator always return the supplied constant 68 | * of type \p ValueType. 69 | * - Can be used with any data type. 70 | * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device 71 | * functions. 72 | * - Compatible with Thrust API v1.7 or newer. 73 | * 74 | * \par Snippet 75 | * The code snippet below illustrates the use of \p ConstantInputIteratorTto 76 | * dereference a sequence of homogeneous doubles. 
77 | * \par 78 | * \code 79 | * #include // or equivalently 80 | * 81 | * cub::ConstantInputIterator itr(5.0); 82 | * 83 | * printf("%f\n", itr[0]); // 5.0 84 | * printf("%f\n", itr[1]); // 5.0 85 | * printf("%f\n", itr[2]); // 5.0 86 | * printf("%f\n", itr[50]); // 5.0 87 | * 88 | * \endcode 89 | * 90 | * \tparam ValueType The value type of this iterator 91 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 92 | */ 93 | template < 94 | typename ValueType, 95 | typename OffsetT = ptrdiff_t> 96 | class ConstantInputIterator 97 | { 98 | public: 99 | 100 | // Required iterator traits 101 | typedef ConstantInputIterator self_type; ///< My own type 102 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 103 | typedef ValueType value_type; ///< The type of the element the iterator can point to 104 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 105 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 106 | 107 | #if (THRUST_VERSION >= 100700) 108 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 109 | typedef typename thrust::detail::iterator_facade_category< 110 | thrust::any_system_tag, 111 | thrust::random_access_traversal_tag, 112 | value_type, 113 | reference 114 | >::type iterator_category; ///< The iterator category 115 | #else 116 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 117 | #endif // THRUST_VERSION 118 | 119 | private: 120 | 121 | ValueType val; 122 | OffsetT offset; 123 | #ifdef _WIN32 124 | OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) 125 | #endif 126 | 127 | public: 128 | 129 | /// Constructor 130 | __host__ __device__ __forceinline__ ConstantInputIterator( 131 | ValueType val, ///< Starting value for the iterator instance to report 132 | OffsetT offset = 0) ///< Base offset 133 | : 134 | val(val), 135 | offset(offset) 136 | {} 137 | 138 | /// Postfix increment 139 | __host__ __device__ __forceinline__ self_type operator++(int) 140 | { 141 | self_type retval = *this; 142 | offset++; 143 | return retval; 144 | } 145 | 146 | /// Prefix increment 147 | __host__ __device__ __forceinline__ self_type operator++() 148 | { 149 | offset++; 150 | return *this; 151 | } 152 | 153 | /// Indirection 154 | __host__ __device__ __forceinline__ reference operator*() const 155 | { 156 | return val; 157 | } 158 | 159 | /// Addition 160 | template 161 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 162 | { 163 | self_type retval(val, offset + n); 164 | return retval; 165 | } 166 | 167 | /// Addition assignment 168 | template 169 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 170 | { 171 | offset += n; 172 | return *this; 173 | } 174 | 175 | /// Subtraction 176 | template 177 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 178 | { 179 | self_type retval(val, offset - n); 180 | return retval; 181 | } 182 | 183 | /// Subtraction assignment 184 | template 185 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 186 | { 187 | offset -= n; 188 | return *this; 189 | } 190 | 191 | /// Distance 192 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 193 | { 194 | return offset - other.offset; 195 | } 196 | 197 | /// 
Array subscript 198 | template 199 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 200 | { 201 | return val; 202 | } 203 | 204 | /// Structure dereference 205 | __host__ __device__ __forceinline__ pointer operator->() 206 | { 207 | return &val; 208 | } 209 | 210 | /// Equal to 211 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 212 | { 213 | return (offset == rhs.offset) && ((val == rhs.val)); 214 | } 215 | 216 | /// Not equal to 217 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 218 | { 219 | return (offset != rhs.offset) || (val!= rhs.val); 220 | } 221 | 222 | /// ostream operator 223 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 224 | { 225 | os << "[" << itr.val << "," << itr.offset << "]"; 226 | return os; 227 | } 228 | 229 | }; 230 | 231 | 232 | /** @} */ // end group UtilIterator 233 | 234 | } // CUB namespace 235 | CUB_NS_POSTFIX // Optional outer namespace(s) 236 | -------------------------------------------------------------------------------- /cub/iterator/counting_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
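A small host-side sketch of ConstantInputIterator (it can be dereferenced on both host and device): every read yields the supplied constant regardless of the offset, which makes it handy, for example, as a stand-in for an all-ones vector. The concrete values below are illustrative.

#include <cstdio>
#include <cub/iterator/constant_input_iterator.cuh>

// Every read of the iterator returns the constant it was constructed with.
int main()
{
    cub::ConstantInputIterator<double> ones(1.0);
    printf("%f %f %f\n", ones[0], ones[100], *(ones + 12345));   // 1.0 1.0 1.0
    return 0;
}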
26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | /** 58 | * \addtogroup UtilIterator 59 | * @{ 60 | */ 61 | 62 | /** 63 | * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. 64 | * 65 | * \par Overview 66 | * - After initializing a CountingInputIteratorTto a certain integer \p base, read references 67 | * at \p offset will return the value \p base + \p offset. 68 | * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device 69 | * functions. 70 | * - Compatible with Thrust API v1.7 or newer. 71 | * 72 | * \par Snippet 73 | * The code snippet below illustrates the use of \p CountingInputIteratorTto 74 | * dereference a sequence of incrementing integers. 75 | * \par 76 | * \code 77 | * #include // or equivalently 78 | * 79 | * cub::CountingInputIterator itr(5); 80 | * 81 | * printf("%d\n", itr[0]); // 5 82 | * printf("%d\n", itr[1]); // 6 83 | * printf("%d\n", itr[2]); // 7 84 | * printf("%d\n", itr[50]); // 55 85 | * 86 | * \endcode 87 | * 88 | * \tparam ValueType The value type of this iterator 89 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 90 | */ 91 | template < 92 | typename ValueType, 93 | typename OffsetT = ptrdiff_t> 94 | class CountingInputIterator 95 | { 96 | public: 97 | 98 | // Required iterator traits 99 | typedef CountingInputIterator self_type; ///< My own type 100 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 101 | typedef ValueType value_type; ///< The type of the element the iterator can point to 102 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 103 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 104 | 105 | #if (THRUST_VERSION >= 100700) 106 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 107 | typedef typename thrust::detail::iterator_facade_category< 108 | thrust::any_system_tag, 109 | thrust::random_access_traversal_tag, 110 | value_type, 111 | reference 112 | >::type iterator_category; ///< The iterator category 113 | #else 114 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 115 | #endif // THRUST_VERSION 116 | 117 | private: 118 | 119 | ValueType val; 120 | 121 | public: 122 | 123 | /// Constructor 124 | __host__ __device__ __forceinline__ CountingInputIterator( 125 | const ValueType &val) ///< Starting value for the iterator instance to report 126 | : 127 | val(val) 128 | {} 129 | 130 | /// Postfix increment 131 | __host__ __device__ __forceinline__ self_type operator++(int) 132 | { 133 | self_type retval = *this; 134 | val++; 135 | return retval; 136 | } 137 | 138 | /// Prefix increment 139 | __host__ __device__ __forceinline__ self_type operator++() 140 | { 141 | val++; 142 | 
return *this; 143 | } 144 | 145 | /// Indirection 146 | __host__ __device__ __forceinline__ reference operator*() const 147 | { 148 | return val; 149 | } 150 | 151 | /// Addition 152 | template 153 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 154 | { 155 | self_type retval(val + n); 156 | return retval; 157 | } 158 | 159 | /// Addition assignment 160 | template 161 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 162 | { 163 | val += n; 164 | return *this; 165 | } 166 | 167 | /// Subtraction 168 | template 169 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 170 | { 171 | self_type retval(val - n); 172 | return retval; 173 | } 174 | 175 | /// Subtraction assignment 176 | template 177 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 178 | { 179 | val -= n; 180 | return *this; 181 | } 182 | 183 | /// Distance 184 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 185 | { 186 | return val - other.val; 187 | } 188 | 189 | /// Array subscript 190 | template 191 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 192 | { 193 | return val + n; 194 | } 195 | 196 | /// Structure dereference 197 | __host__ __device__ __forceinline__ pointer operator->() 198 | { 199 | return &val; 200 | } 201 | 202 | /// Equal to 203 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 204 | { 205 | return (val == rhs.val); 206 | } 207 | 208 | /// Not equal to 209 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 210 | { 211 | return (val != rhs.val); 212 | } 213 | 214 | /// ostream operator 215 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 216 | { 217 | os << "[" << itr.val << "]"; 218 | return os; 219 | } 220 | 221 | }; 222 | 223 | 224 | 225 | /** @} */ // end group UtilIterator 226 | 227 | } // CUB namespace 228 | CUB_NS_POSTFIX // Optional outer namespace(s) 229 | -------------------------------------------------------------------------------- /cub/iterator/tex_obj_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
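A kernel-level sketch of CountingInputIterator: it generates the sequence base, base+1, base+2, ... on the fly, so an index sequence never has to be materialized in memory. The kernel name and launch shape are illustrative.

#include <cuda_runtime.h>
#include <cub/iterator/counting_input_iterator.cuh>

// Fill d_out[i] = base + i without storing the index sequence anywhere.
__global__ void Iota(int base, int *d_out, int n)
{
    cub::CountingInputIterator<int> counting(base);
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d_out[i] = counting[i];   // reads return base + i
}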
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_debug.cuh" 43 | #include "../util_namespace.cuh" 44 | 45 | #if (THRUST_VERSION >= 100700) 46 | // This iterator is compatible with Thrust API 1.7 and newer 47 | #include 48 | #include 49 | #endif // THRUST_VERSION 50 | 51 | 52 | /// Optional outer namespace(s) 53 | CUB_NS_PREFIX 54 | 55 | /// CUB namespace 56 | namespace cub { 57 | 58 | /** 59 | * \addtogroup UtilIterator 60 | * @{ 61 | */ 62 | 63 | 64 | 65 | /** 66 | * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses newer Kepler-style texture objects. 67 | * 68 | * \par Overview 69 | * - TexObjInputIteratorTwraps a native device pointer of type ValueType*. References 70 | * to elements are to be loaded through texture cache. 71 | * - Can be used to load any data type from memory through texture cache. 72 | * - Can be manipulated and exchanged within and between host and device 73 | * functions, can only be constructed within host functions, and can only be 74 | * dereferenced within device functions. 75 | * - With regard to nested/dynamic parallelism, TexObjInputIteratorTiterators may only be 76 | * created by the host thread, but can be used by any descendant kernel. 77 | * - Compatible with Thrust API v1.7 or newer. 78 | * 79 | * \par Snippet 80 | * The code snippet below illustrates the use of \p TexRefInputIteratorTto 81 | * dereference a device array of doubles through texture cache. 82 | * \par 83 | * \code 84 | * #include // or equivalently 85 | * 86 | * // Declare, allocate, and initialize a device array 87 | * int num_items; // e.g., 7 88 | * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] 89 | * 90 | * // Create an iterator wrapper 91 | * cub::TexObjInputIterator itr; 92 | * itr.BindTexture(d_in, sizeof(double) * num_items); 93 | * ... 94 | * 95 | * // Within device code: 96 | * printf("%f\n", itr[0]); // 8.0 97 | * printf("%f\n", itr[1]); // 6.0 98 | * printf("%f\n", itr[6]); // 9.0 99 | * 100 | * ... 
101 | * itr.UnbindTexture(); 102 | * 103 | * \endcode 104 | * 105 | * \tparam T The value type of this iterator 106 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 107 | */ 108 | template < 109 | typename T, 110 | typename OffsetT = ptrdiff_t> 111 | class TexObjInputIterator 112 | { 113 | public: 114 | 115 | // Required iterator traits 116 | typedef TexObjInputIterator self_type; ///< My own type 117 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 118 | typedef T value_type; ///< The type of the element the iterator can point to 119 | typedef T* pointer; ///< The type of a pointer to an element the iterator can point to 120 | typedef T reference; ///< The type of a reference to an element the iterator can point to 121 | 122 | #if (THRUST_VERSION >= 100700) 123 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 124 | typedef typename thrust::detail::iterator_facade_category< 125 | thrust::device_system_tag, 126 | thrust::random_access_traversal_tag, 127 | value_type, 128 | reference 129 | >::type iterator_category; ///< The iterator category 130 | #else 131 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 132 | #endif // THRUST_VERSION 133 | 134 | private: 135 | 136 | // Largest texture word we can use in device 137 | typedef typename UnitWord::TextureWord TextureWord; 138 | 139 | // Number of texture words per T 140 | enum { 141 | TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) 142 | }; 143 | 144 | private: 145 | 146 | T* ptr; 147 | difference_type tex_offset; 148 | cudaTextureObject_t tex_obj; 149 | 150 | public: 151 | 152 | /// Constructor 153 | __host__ __device__ __forceinline__ TexObjInputIterator() 154 | : 155 | ptr(NULL), 156 | tex_offset(0), 157 | tex_obj(0) 158 | {} 159 | 160 | /// Use this iterator to bind \p ptr with a texture reference 161 | template 162 | cudaError_t BindTexture( 163 | QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment 164 | size_t bytes = size_t(-1), ///< Number of bytes in the range 165 | size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator 166 | { 167 | this->ptr = const_cast::Type *>(ptr); 168 | this->tex_offset = tex_offset; 169 | 170 | cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); 171 | cudaResourceDesc res_desc; 172 | cudaTextureDesc tex_desc; 173 | memset(&res_desc, 0, sizeof(cudaResourceDesc)); 174 | memset(&tex_desc, 0, sizeof(cudaTextureDesc)); 175 | res_desc.resType = cudaResourceTypeLinear; 176 | res_desc.res.linear.devPtr = this->ptr; 177 | res_desc.res.linear.desc = channel_desc; 178 | res_desc.res.linear.sizeInBytes = bytes; 179 | tex_desc.readMode = cudaReadModeElementType; 180 | return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL); 181 | } 182 | 183 | /// Unbind this iterator from its texture reference 184 | cudaError_t UnbindTexture() 185 | { 186 | return cudaDestroyTextureObject(tex_obj); 187 | } 188 | 189 | /// Postfix increment 190 | __host__ __device__ __forceinline__ self_type operator++(int) 191 | { 192 | self_type retval = *this; 193 | tex_offset++; 194 | return retval; 195 | } 196 | 197 | /// Prefix increment 198 | __host__ __device__ __forceinline__ self_type operator++() 199 | { 200 | tex_offset++; 201 | return *this; 202 | } 203 | 204 | /// Indirection 205 | __host__ __device__ __forceinline__ reference operator*() const 206 | { 
207 | #if (CUB_PTX_ARCH == 0) 208 | // Simply dereference the pointer on the host 209 | return ptr[tex_offset]; 210 | #else 211 | // Move array of uninitialized words, then alias and assign to return value 212 | TextureWord words[TEXTURE_MULTIPLE]; 213 | 214 | #pragma unroll 215 | for (int i = 0; i < TEXTURE_MULTIPLE; ++i) 216 | { 217 | words[i] = tex1Dfetch( 218 | tex_obj, 219 | (tex_offset * TEXTURE_MULTIPLE) + i); 220 | } 221 | 222 | // Load from words 223 | return *reinterpret_cast(words); 224 | #endif 225 | } 226 | 227 | /// Addition 228 | template 229 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 230 | { 231 | self_type retval; 232 | retval.ptr = ptr; 233 | retval.tex_obj = tex_obj; 234 | retval.tex_offset = tex_offset + n; 235 | return retval; 236 | } 237 | 238 | /// Addition assignment 239 | template 240 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 241 | { 242 | tex_offset += n; 243 | return *this; 244 | } 245 | 246 | /// Subtraction 247 | template 248 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 249 | { 250 | self_type retval; 251 | retval.ptr = ptr; 252 | retval.tex_obj = tex_obj; 253 | retval.tex_offset = tex_offset - n; 254 | return retval; 255 | } 256 | 257 | /// Subtraction assignment 258 | template 259 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 260 | { 261 | tex_offset -= n; 262 | return *this; 263 | } 264 | 265 | /// Distance 266 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 267 | { 268 | return tex_offset - other.tex_offset; 269 | } 270 | 271 | /// Array subscript 272 | template 273 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 274 | { 275 | self_type offset = (*this) + n; 276 | return *offset; 277 | } 278 | 279 | /// Structure dereference 280 | __host__ __device__ __forceinline__ pointer operator->() 281 | { 282 | return &(*(*this)); 283 | } 284 | 285 | /// Equal to 286 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 287 | { 288 | return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj)); 289 | } 290 | 291 | /// Not equal to 292 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 293 | { 294 | return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj)); 295 | } 296 | 297 | /// ostream operator 298 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 299 | { 300 | return os; 301 | } 302 | 303 | }; 304 | 305 | 306 | 307 | /** @} */ // end group UtilIterator 308 | 309 | } // CUB namespace 310 | CUB_NS_POSTFIX // Optional outer namespace(s) 311 | -------------------------------------------------------------------------------- /cub/iterator/transform_input_iterator.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 
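A hedged host-plus-kernel sketch of the bind/launch/unbind flow described above: the iterator is default-constructed and bound on the host, passed to the kernel by value, dereferenced on the device (via tex1Dfetch), and unbound after the kernel has finished. The kernel name, launch configuration, and error-handling style are illustrative.

#include <cuda_runtime.h>
#include <cub/iterator/tex_obj_input_iterator.cuh>

// Device side: the bound iterator is used like an ordinary pointer.
__global__ void CopyThroughTex(cub::TexObjInputIterator<double> in, double *d_out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d_out[i] = in[i];   // fetched through the texture object
}

// Host side: construct, bind, launch, synchronize, unbind.
cudaError_t RunCopy(double *d_in, double *d_out, int n)
{
    cub::TexObjInputIterator<double> in;
    cudaError_t error = in.BindTexture(d_in, sizeof(double) * n);
    if (error != cudaSuccess) return error;

    CopyThroughTex<<<(n + 255) / 256, 256>>>(in, d_out, n);

    error = cudaDeviceSynchronize();
    if (error != cudaSuccess) return error;
    return in.UnbindTexture();
}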
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Random-access iterator types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include 37 | #include 38 | 39 | #include "../thread/thread_load.cuh" 40 | #include "../thread/thread_store.cuh" 41 | #include "../util_device.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | #if (THRUST_VERSION >= 100700) 45 | // This iterator is compatible with Thrust API 1.7 and newer 46 | #include 47 | #include 48 | #endif // THRUST_VERSION 49 | 50 | 51 | /// Optional outer namespace(s) 52 | CUB_NS_PREFIX 53 | 54 | /// CUB namespace 55 | namespace cub { 56 | 57 | /** 58 | * \addtogroup UtilIterator 59 | * @{ 60 | */ 61 | 62 | 63 | /** 64 | * \brief A random-access input wrapper for transforming dereferenced values. 65 | * 66 | * \par Overview 67 | * - TransformInputIteratorTwraps a unary conversion functor of type \p 68 | * ConversionOp and a random-access input iterator of type InputIteratorT, 69 | * using the former to produce references of type \p ValueType from the latter. 70 | * - Can be used with any data type. 71 | * - Can be constructed, manipulated, and exchanged within and between host and device 72 | * functions. Wrapped host memory can only be dereferenced on the host, and wrapped 73 | * device memory can only be dereferenced on the device. 74 | * - Compatible with Thrust API v1.7 or newer. 75 | * 76 | * \par Snippet 77 | * The code snippet below illustrates the use of \p TransformInputIteratorTto 78 | * dereference an array of integers, tripling the values and converting them to doubles. 
79 | * \par 80 | * \code 81 | * #include // or equivalently 82 | * 83 | * // Functor for tripling integer values and converting to doubles 84 | * struct TripleDoubler 85 | * { 86 | * __host__ __device__ __forceinline__ 87 | * double operator()(const int &a) const { 88 | * return double(a * 2); 89 | * } 90 | * }; 91 | * 92 | * // Declare, allocate, and initialize a device array 93 | * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] 94 | * TripleDoubler conversion_op; 95 | * 96 | * // Create an iterator wrapper 97 | * cub::TransformInputIterator itr(d_in, conversion_op); 98 | * 99 | * // Within device code: 100 | * printf("%f\n", itr[0]); // 24.0 101 | * printf("%f\n", itr[1]); // 18.0 102 | * printf("%f\n", itr[6]); // 27.0 103 | * 104 | * \endcode 105 | * 106 | * \tparam ValueType The value type of this iterator 107 | * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p ValueType. Must have member ValueType operator()(const InputType &datum). 108 | * \tparam InputIteratorT The type of the wrapped input iterator 109 | * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) 110 | * 111 | */ 112 | template < 113 | typename ValueType, 114 | typename ConversionOp, 115 | typename InputIteratorT, 116 | typename OffsetT = ptrdiff_t> 117 | class TransformInputIterator 118 | { 119 | public: 120 | 121 | // Required iterator traits 122 | typedef TransformInputIterator self_type; ///< My own type 123 | typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another 124 | typedef ValueType value_type; ///< The type of the element the iterator can point to 125 | typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to 126 | typedef ValueType reference; ///< The type of a reference to an element the iterator can point to 127 | 128 | #if (THRUST_VERSION >= 100700) 129 | // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods 130 | typedef typename thrust::detail::iterator_facade_category< 131 | thrust::any_system_tag, 132 | thrust::random_access_traversal_tag, 133 | value_type, 134 | reference 135 | >::type iterator_category; ///< The iterator category 136 | #else 137 | typedef std::random_access_iterator_tag iterator_category; ///< The iterator category 138 | #endif // THRUST_VERSION 139 | 140 | private: 141 | 142 | ConversionOp conversion_op; 143 | InputIteratorT input_itr; 144 | 145 | public: 146 | 147 | /// Constructor 148 | __host__ __device__ __forceinline__ TransformInputIterator( 149 | InputIteratorT input_itr, ///< Input iterator to wrap 150 | ConversionOp conversion_op) ///< Conversion functor to wrap 151 | : 152 | conversion_op(conversion_op), 153 | input_itr(input_itr) 154 | {} 155 | 156 | /// Postfix increment 157 | __host__ __device__ __forceinline__ self_type operator++(int) 158 | { 159 | self_type retval = *this; 160 | input_itr++; 161 | return retval; 162 | } 163 | 164 | /// Prefix increment 165 | __host__ __device__ __forceinline__ self_type operator++() 166 | { 167 | input_itr++; 168 | return *this; 169 | } 170 | 171 | /// Indirection 172 | __host__ __device__ __forceinline__ reference operator*() const 173 | { 174 | return conversion_op(*input_itr); 175 | } 176 | 177 | /// Addition 178 | template 179 | __host__ __device__ __forceinline__ self_type operator+(Distance n) const 180 | { 181 | self_type retval(input_itr + n, conversion_op); 182 | return retval; 183 | } 184 | 185 | /// Addition assignment 186 | 
template 187 | __host__ __device__ __forceinline__ self_type& operator+=(Distance n) 188 | { 189 | input_itr += n; 190 | return *this; 191 | } 192 | 193 | /// Subtraction 194 | template 195 | __host__ __device__ __forceinline__ self_type operator-(Distance n) const 196 | { 197 | self_type retval(input_itr - n, conversion_op); 198 | return retval; 199 | } 200 | 201 | /// Subtraction assignment 202 | template 203 | __host__ __device__ __forceinline__ self_type& operator-=(Distance n) 204 | { 205 | input_itr -= n; 206 | return *this; 207 | } 208 | 209 | /// Distance 210 | __host__ __device__ __forceinline__ difference_type operator-(self_type other) const 211 | { 212 | return input_itr - other.input_itr; 213 | } 214 | 215 | /// Array subscript 216 | template 217 | __host__ __device__ __forceinline__ reference operator[](Distance n) const 218 | { 219 | return conversion_op(input_itr[n]); 220 | } 221 | 222 | /// Structure dereference 223 | __host__ __device__ __forceinline__ pointer operator->() 224 | { 225 | return &conversion_op(*input_itr); 226 | } 227 | 228 | /// Equal to 229 | __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) 230 | { 231 | return (input_itr == rhs.input_itr); 232 | } 233 | 234 | /// Not equal to 235 | __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) 236 | { 237 | return (input_itr != rhs.input_itr); 238 | } 239 | 240 | /// ostream operator 241 | friend std::ostream& operator<<(std::ostream& os, const self_type& itr) 242 | { 243 | return os; 244 | } 245 | }; 246 | 247 | 248 | 249 | /** @} */ // end group UtilIterator 250 | 251 | } // CUB namespace 252 | CUB_NS_POSTFIX // Optional outer namespace(s) 253 | -------------------------------------------------------------------------------- /cub/thread/thread_operators.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
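The snippet above is slightly inconsistent: its TripleDoubler functor multiplies by 2 while the printed results (24.0, 18.0, 27.0) assume tripling. Below is a hedged, self-consistent sketch whose functor actually triples and converts to double; the functor name, kernel name, and launch shape are illustrative.

#include <cuda_runtime.h>
#include <cub/iterator/transform_input_iterator.cuh>

// Unary conversion functor: map an int to three times its value, as a double.
struct TripleToDouble
{
    __host__ __device__ __forceinline__ double operator()(const int &a) const
    {
        return 3.0 * a;
    }
};

// d_out[i] = 3.0 * d_in[i], with the conversion fused into the load.
__global__ void TripleKernel(const int *d_in, double *d_out, int n)
{
    cub::TransformInputIterator<double, TripleToDouble, const int*> in(d_in, TripleToDouble());
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d_out[i] = in[i];   // conversion_op(d_in[i])
}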
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Simple binary operator functor types 32 | */ 33 | 34 | /****************************************************************************** 35 | * Simple functor operators 36 | ******************************************************************************/ 37 | 38 | #pragma once 39 | 40 | #include "../util_macro.cuh" 41 | #include "../util_type.cuh" 42 | #include "../util_namespace.cuh" 43 | 44 | /// Optional outer namespace(s) 45 | CUB_NS_PREFIX 46 | 47 | /// CUB namespace 48 | namespace cub { 49 | 50 | 51 | /** 52 | * \addtogroup UtilModule 53 | * @{ 54 | */ 55 | 56 | /** 57 | * \brief Default equality functor 58 | */ 59 | struct Equality 60 | { 61 | /// Boolean equality operator, returns (a == b) 62 | template 63 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const 64 | { 65 | return a == b; 66 | } 67 | }; 68 | 69 | 70 | /** 71 | * \brief Default inequality functor 72 | */ 73 | struct Inequality 74 | { 75 | /// Boolean inequality operator, returns (a != b) 76 | template 77 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const 78 | { 79 | return a != b; 80 | } 81 | }; 82 | 83 | 84 | /** 85 | * \brief Inequality functor (wraps equality functor) 86 | */ 87 | template 88 | struct InequalityWrapper 89 | { 90 | /// Wrapped equality operator 91 | EqualityOp op; 92 | 93 | /// Constructor 94 | __host__ __device__ __forceinline__ 95 | InequalityWrapper(EqualityOp op) : op(op) {} 96 | 97 | /// Boolean inequality operator, returns (a != b) 98 | template 99 | __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const 100 | { 101 | return !op(a, b); 102 | } 103 | }; 104 | 105 | 106 | /** 107 | * \brief Default sum functor 108 | */ 109 | struct Sum 110 | { 111 | /// Boolean sum operator, returns a + b 112 | template 113 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const 114 | { 115 | return a + b; 116 | } 117 | }; 118 | 119 | 120 | /** 121 | * \brief Default max functor 122 | */ 123 | struct Max 124 | { 125 | /// Boolean max operator, returns (a > b) ? a : b 126 | template 127 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const 128 | { 129 | return CUB_MAX(a, b); 130 | } 131 | }; 132 | 133 | 134 | /** 135 | * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item) 136 | */ 137 | struct ArgMax 138 | { 139 | /// Boolean max operator, preferring the item having the smaller offset in case of ties 140 | template 141 | __host__ __device__ __forceinline__ KeyValuePair operator()( 142 | const KeyValuePair &a, 143 | const KeyValuePair &b) const 144 | { 145 | // Mooch BUG (device reduce argmax gk110 3.2 million random fp32) 146 | // return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? 
b : a; 147 | 148 | if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) 149 | return b; 150 | return a; 151 | } 152 | }; 153 | 154 | 155 | /** 156 | * \brief Default min functor 157 | */ 158 | struct Min 159 | { 160 | /// Boolean min operator, returns (a < b) ? a : b 161 | template 162 | __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const 163 | { 164 | return CUB_MIN(a, b); 165 | } 166 | }; 167 | 168 | 169 | /** 170 | * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item) 171 | */ 172 | struct ArgMin 173 | { 174 | /// Boolean min operator, preferring the item having the smaller offset in case of ties 175 | template 176 | __host__ __device__ __forceinline__ KeyValuePair operator()( 177 | const KeyValuePair &a, 178 | const KeyValuePair &b) const 179 | { 180 | // Mooch BUG (device reduce argmax gk110 3.2 million random fp32) 181 | // return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; 182 | 183 | if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) 184 | return b; 185 | return a; 186 | } 187 | }; 188 | 189 | 190 | /** 191 | * \brief Default cast functor 192 | */ 193 | template 194 | struct Cast 195 | { 196 | /// Cast operator, returns (B) a 197 | template 198 | __host__ __device__ __forceinline__ B operator()(const A &a) const 199 | { 200 | return (B) a; 201 | } 202 | }; 203 | 204 | 205 | /** 206 | * \brief Binary operator wrapper for switching non-commutative scan arguments 207 | */ 208 | template 209 | class SwizzleScanOp 210 | { 211 | private: 212 | 213 | /// Wrapped scan operator 214 | ScanOp scan_op; 215 | 216 | public: 217 | 218 | /// Constructor 219 | __host__ __device__ __forceinline__ 220 | SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {} 221 | 222 | /// Switch the scan arguments 223 | template 224 | __host__ __device__ __forceinline__ 225 | T operator()(const T &a, const T &b) 226 | { 227 | return scan_op(b, a); 228 | } 229 | }; 230 | 231 | 232 | /** 233 | * \brief Reduce-by-segment functor. 234 | * 235 | * Given two cub::KeyValuePair inputs \p a and \p b and a 236 | * binary associative combining operator \p f(const T &x, const T &y), 237 | * an instance of this functor returns a cub::KeyValuePair whose \p key 238 | * field is a.key + a.key, and whose \p value field 239 | * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise. 240 | * 241 | * ReduceBySegmentOp is an associative, non-commutative binary combining operator 242 | * for input sequences of cub::KeyValuePair pairings. Such 243 | * sequences are typically used to represent a segmented set of values to be reduced 244 | * and a corresponding set of {0,1}-valued integer "head flags" demarcating the 245 | * first value of each segment. 
246 | * 247 | */ 248 | template ///< Binary reduction operator to apply to values 249 | struct ReduceBySegmentOp 250 | { 251 | /// Wrapped reduction operator 252 | ReductionOpT op; 253 | 254 | /// Constructor 255 | __host__ __device__ __forceinline__ ReduceBySegmentOp() {} 256 | 257 | /// Constructor 258 | __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {} 259 | 260 | /// Scan operator 261 | template ///< KeyValuePair pairing of T (value) and OffsetT (head flag) 262 | __host__ __device__ __forceinline__ KeyValuePairT operator()( 263 | const KeyValuePairT &first, ///< First partial reduction 264 | const KeyValuePairT &second) ///< Second partial reduction 265 | { 266 | KeyValuePairT retval; 267 | retval.key = first.key + second.key; 268 | retval.value = (second.key) ? 269 | second.value : // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate 270 | op(first.value, second.value); // The second partial reduction does not span a reset, so accumulate both into the running aggregate 271 | return retval; 272 | } 273 | }; 274 | 275 | 276 | 277 | template ///< Binary reduction operator to apply to values 278 | struct ReduceByKeyOp 279 | { 280 | /// Wrapped reduction operator 281 | ReductionOpT op; 282 | 283 | /// Constructor 284 | __host__ __device__ __forceinline__ ReduceByKeyOp() {} 285 | 286 | /// Constructor 287 | __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {} 288 | 289 | /// Scan operator 290 | template 291 | __host__ __device__ __forceinline__ KeyValuePairT operator()( 292 | const KeyValuePairT &first, ///< First partial reduction 293 | const KeyValuePairT &second) ///< Second partial reduction 294 | { 295 | KeyValuePairT retval = second; 296 | 297 | if (first.key == second.key) 298 | retval.value = op(first.value, retval.value); 299 | 300 | return retval; 301 | } 302 | }; 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | /** @} */ // end group UtilModule 311 | 312 | 313 | } // CUB namespace 314 | CUB_NS_POSTFIX // Optional outer namespace(s) 315 | -------------------------------------------------------------------------------- /cub/thread/thread_reduce.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
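A host-side sketch illustrating ReduceBySegmentOp semantics (its operator() is __host__ __device__, so it can be exercised on the host). Note that the result's key is first.key + second.key, i.e. the head-flag counts accumulate; the overview's "a.key + a.key" reads like a typo. The concrete values below are illustrative.

#include <cstdio>
#include <cub/thread/thread_operators.cuh>   // also pulls in KeyValuePair via util_type.cuh

int main()
{
    typedef cub::KeyValuePair<int, double> PairT;
    cub::ReduceBySegmentOp<cub::Sum> op;

    PairT a, b, c;
    a.key = 0; a.value = 10.0;              // running aggregate, no head flag yet
    b.key = 0; b.value = 5.0;               // same segment: values accumulate
    c = op(a, b);
    printf("(%d, %f)\n", c.key, c.value);   // (0, 15.000000)

    b.key = 1; b.value = 7.0;               // b starts a new segment: value resets
    c = op(a, b);
    printf("(%d, %f)\n", c.key, c.value);   // (1, 7.000000)

    return 0;
}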
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential reduction over statically-sized array types 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../thread/thread_operators.cuh" 37 | #include "../util_namespace.cuh" 38 | 39 | /// Optional outer namespace(s) 40 | CUB_NS_PREFIX 41 | 42 | /// CUB namespace 43 | namespace cub { 44 | 45 | /** 46 | * \addtogroup UtilModule 47 | * @{ 48 | */ 49 | 50 | /** 51 | * \name Sequential reduction over statically-sized array types 52 | * @{ 53 | */ 54 | 55 | 56 | template < 57 | int LENGTH, 58 | typename T, 59 | typename ReductionOp> 60 | __device__ __forceinline__ T ThreadReduce( 61 | T* input, ///< [in] Input array 62 | ReductionOp reduction_op, ///< [in] Binary reduction operator 63 | T prefix, ///< [in] Prefix to seed reduction with 64 | Int2Type length) 65 | { 66 | T addend = *input; 67 | prefix = reduction_op(prefix, addend); 68 | 69 | return ThreadReduce(input + 1, reduction_op, prefix, Int2Type()); 70 | } 71 | 72 | template < 73 | typename T, 74 | typename ReductionOp> 75 | __device__ __forceinline__ T ThreadReduce( 76 | T* input, ///< [in] Input array 77 | ReductionOp reduction_op, ///< [in] Binary reduction operator 78 | T prefix, ///< [in] Prefix to seed reduction with 79 | Int2Type<0> length) 80 | { 81 | return prefix; 82 | } 83 | 84 | 85 | /** 86 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. 87 | * 88 | * \tparam LENGTH LengthT of input array 89 | * \tparam T [inferred] The data type to be reduced. 90 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 91 | */ 92 | template < 93 | int LENGTH, 94 | typename T, 95 | typename ReductionOp> 96 | __device__ __forceinline__ T ThreadReduce( 97 | T* input, ///< [in] Input array 98 | ReductionOp reduction_op, ///< [in] Binary reduction operator 99 | T prefix) ///< [in] Prefix to seed reduction with 100 | { 101 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 102 | } 103 | 104 | 105 | /** 106 | * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. 107 | * 108 | * \tparam LENGTH LengthT of input array 109 | * \tparam T [inferred] The data type to be reduced. 
110 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 111 | */ 112 | template < 113 | int LENGTH, 114 | typename T, 115 | typename ReductionOp> 116 | __device__ __forceinline__ T ThreadReduce( 117 | T* input, ///< [in] Input array 118 | ReductionOp reduction_op) ///< [in] Binary reduction operator 119 | { 120 | T prefix = input[0]; 121 | return ThreadReduce(input + 1, reduction_op, prefix); 122 | } 123 | 124 | 125 | /** 126 | * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. 127 | * 128 | * \tparam LENGTH [inferred] LengthT of \p input array 129 | * \tparam T [inferred] The data type to be reduced. 130 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 131 | */ 132 | template < 133 | int LENGTH, 134 | typename T, 135 | typename ReductionOp> 136 | __device__ __forceinline__ T ThreadReduce( 137 | T (&input)[LENGTH], ///< [in] Input array 138 | ReductionOp reduction_op, ///< [in] Binary reduction operator 139 | T prefix) ///< [in] Prefix to seed reduction with 140 | { 141 | return ThreadReduce(input, reduction_op, prefix, Int2Type()); 142 | } 143 | 144 | 145 | /** 146 | * \brief Serial reduction with the specified operator 147 | * 148 | * \tparam LENGTH [inferred] LengthT of \p input array 149 | * \tparam T [inferred] The data type to be reduced. 150 | * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) 151 | */ 152 | template < 153 | int LENGTH, 154 | typename T, 155 | typename ReductionOp> 156 | __device__ __forceinline__ T ThreadReduce( 157 | T (&input)[LENGTH], ///< [in] Input array 158 | ReductionOp reduction_op) ///< [in] Binary reduction operator 159 | { 160 | return ThreadReduce((T*) input, reduction_op); 161 | } 162 | 163 | 164 | //@} end member group 165 | 166 | /** @} */ // end group UtilModule 167 | 168 | } // CUB namespace 169 | CUB_NS_POSTFIX // Optional outer namespace(s) 170 | -------------------------------------------------------------------------------- /cub/thread/thread_search.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. 
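The ThreadReduce overloads in cub/thread/thread_reduce.cuh above unroll a sequential reduction over a statically-sized array via Int2Type recursion, peeling one element per step down to the Int2Type<0> terminator. A minimal device-side usage sketch of the array-reference form, assuming this repository's cub/ directory is on the include path:

#include <cstdio>
#include <cuda_runtime.h>
#include <cub/cub.cuh>            // assumes this repository's headers are on the include path

// Hypothetical kernel: each thread loads a private 4-item run and reduces it serially.
__global__ void ThreadReduceExample(const double *d_in, double *d_out)
{
    double items[4];
    for (int i = 0; i < 4; ++i)
        items[i] = d_in[threadIdx.x * 4 + i];

    // Statically-sized array form: LENGTH is inferred from the array reference.
    d_out[threadIdx.x] = cub::ThreadReduce(items, cub::Sum());
}

int main()
{
    const int threads = 32;
    double h_in[threads * 4], h_out[threads];
    for (int i = 0; i < threads * 4; ++i) h_in[i] = 1.0;

    double *d_in, *d_out;
    cudaMalloc(&d_in,  sizeof(h_in));
    cudaMalloc(&d_out, sizeof(h_out));
    cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

    ThreadReduceExample<<<1, threads>>>(d_in, d_out);

    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    printf("thread 0 aggregate: %f\n", h_out[0]);   // 4.0 when every input item is 1.0

    cudaFree(d_in); cudaFree(d_out);
    return 0;
}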
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Thread utilities for sequential search 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "../util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | 45 | /** 46 | * Computes the begin offsets into A and B for the specific diagonal 47 | */ 48 | template < 49 | typename AIteratorT, 50 | typename BIteratorT, 51 | typename OffsetT, 52 | typename CoordinateT> 53 | __host__ __device__ __forceinline__ void MergePathSearch( 54 | OffsetT diagonal, 55 | AIteratorT a, 56 | BIteratorT b, 57 | OffsetT a_len, 58 | OffsetT b_len, 59 | CoordinateT& path_coordinate) 60 | { 61 | /// The value type of the input iterator 62 | typedef typename std::iterator_traits::value_type T; 63 | 64 | OffsetT split_min = CUB_MAX(diagonal - b_len, 0); 65 | OffsetT split_max = CUB_MIN(diagonal, a_len); 66 | 67 | while (split_min < split_max) 68 | { 69 | OffsetT split_pivot = (split_min + split_max) >> 1; 70 | if (a[split_pivot] <= b[diagonal - split_pivot - 1]) 71 | { 72 | // Move candidate split range up A, down B 73 | split_min = split_pivot + 1; 74 | } 75 | else 76 | { 77 | // Move candidate split range up B, down A 78 | split_max = split_pivot; 79 | } 80 | } 81 | 82 | path_coordinate.x = CUB_MIN(split_min, a_len); 83 | path_coordinate.y = diagonal - split_min; 84 | } 85 | 86 | 87 | 88 | /** 89 | * \brief Returns the offset of the first value within \p input which does not compare less than \p val 90 | */ 91 | template < 92 | typename InputIteratorT, 93 | typename OffsetT, 94 | typename T> 95 | __device__ __forceinline__ OffsetT LowerBound( 96 | InputIteratorT input, ///< [in] Input sequence 97 | OffsetT num_items, ///< [in] Input sequence length 98 | T val) ///< [in] Search key 99 | { 100 | OffsetT retval = 0; 101 | while (num_items > 0) 102 | { 103 | OffsetT half = num_items >> 1; 104 | if (input[retval + half] < val) 105 | { 106 | retval = retval + (half + 1); 107 | num_items = num_items - (half + 1); 108 | } 109 | else 110 | { 111 | num_items = half; 112 | } 113 | } 114 | 115 | return retval; 116 | } 117 | 118 | 119 | /** 120 | * \brief Returns the offset of the first value within \p input which compares greater than \p val 121 | */ 122 | template < 123 | typename InputIteratorT, 124 | typename OffsetT, 125 | typename T> 126 | __device__ __forceinline__ OffsetT UpperBound( 127 | InputIteratorT input, ///< [in] Input sequence 128 | OffsetT num_items, ///< [in] Input sequence length 129 | T val) ///< [in] Search key 130 | { 131 | OffsetT retval = 0; 132 | while (num_items > 0) 133 | { 134 | OffsetT half = num_items >> 1; 135 | if (val < input[retval + half]) 136 | { 137 | num_items = half; 138 | } 139 | else 140 | { 141 | retval = retval + (half + 1); 142 | num_items = num_items - (half + 1); 143 | } 144 | } 145 | 146 | return retval; 147 | } 148 | 149 | 150 | 151 | 152 | 153 | } // 
CUB namespace 154 | CUB_NS_POSTFIX // Optional outer namespace(s) 155 | -------------------------------------------------------------------------------- /cub/util_arch.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Static architectural properties by SM version. 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "util_namespace.cuh" 37 | 38 | /// Optional outer namespace(s) 39 | CUB_NS_PREFIX 40 | 41 | /// CUB namespace 42 | namespace cub { 43 | 44 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 45 | 46 | 47 | /// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). 48 | #ifndef CUB_PTX_ARCH 49 | #ifndef __CUDA_ARCH__ 50 | #define CUB_PTX_ARCH 0 51 | #else 52 | #define CUB_PTX_ARCH __CUDA_ARCH__ 53 | #endif 54 | #endif 55 | 56 | 57 | /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. 
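Back in cub/thread/thread_search.cuh above, MergePathSearch is the routine the merge-based SpMV decomposition is built on: for a given diagonal d of the conceptual merge of A (the CSR row-end offsets) and B (the natural numbers indexing the nonzeros), it binary-searches for the split (x, y) with x + y = d, so each thread block can be assigned an equal-sized share of the combined row/nonzero workload. A host-only restatement with simplified types and no CUB dependencies, shown only to make that behavior concrete:

#include <cstdio>

struct Coord { int x, y; };

// Simplified host-only restatement of MergePathSearch from thread_search.cuh.
void MergePathSearchSketch(int diagonal, const int *a, const int *b,
                           int a_len, int b_len, Coord &coord)
{
    int split_min = (diagonal > b_len) ? diagonal - b_len : 0;
    int split_max = (diagonal < a_len) ? diagonal : a_len;

    while (split_min < split_max)
    {
        int pivot = (split_min + split_max) >> 1;
        if (a[pivot] <= b[diagonal - pivot - 1])
            split_min = pivot + 1;      // consume more of A
        else
            split_max = pivot;          // consume more of B
    }
    coord.x = (split_min < a_len) ? split_min : a_len;
    coord.y = diagonal - split_min;
}

int main()
{
    // A: CSR row-end offsets of a 4x? matrix with 8 nonzeros; B: nonzero indices 0..7
    int row_end_offsets[] = {2, 5, 5, 8};
    int nz_indices[]      = {0, 1, 2, 3, 4, 5, 6, 7};
    Coord c;
    // Split the 12-item merge at its midpoint (diagonal 6):
    MergePathSearchSketch(6, row_end_offsets, nz_indices, 4, 8, c);
    printf("(%d, %d)\n", c.x, c.y);   // prints (1, 5): 1 row-end offset + 5 nonzeros form the first half
    return 0;
}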
58 | #ifndef CUB_RUNTIME_FUNCTION 59 | #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) 60 | #define CUB_RUNTIME_ENABLED 61 | #define CUB_RUNTIME_FUNCTION __host__ __device__ 62 | #else 63 | #define CUB_RUNTIME_FUNCTION __host__ 64 | #endif 65 | #endif 66 | 67 | 68 | /// Number of threads per warp 69 | #ifndef CUB_LOG_WARP_THREADS 70 | #define CUB_LOG_WARP_THREADS(arch) \ 71 | (5) 72 | #define CUB_WARP_THREADS(arch) \ 73 | (1 << CUB_LOG_WARP_THREADS(arch)) 74 | 75 | #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) 76 | #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) 77 | #endif 78 | 79 | 80 | /// Number of smem banks 81 | #ifndef CUB_LOG_SMEM_BANKS 82 | #define CUB_LOG_SMEM_BANKS(arch) \ 83 | ((arch >= 200) ? \ 84 | (5) : \ 85 | (4)) 86 | #define CUB_SMEM_BANKS(arch) \ 87 | (1 << CUB_LOG_SMEM_BANKS(arch)) 88 | 89 | #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) 90 | #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) 91 | #endif 92 | 93 | 94 | /// Oversubscription factor 95 | #ifndef CUB_SUBSCRIPTION_FACTOR 96 | #define CUB_SUBSCRIPTION_FACTOR(arch) \ 97 | ((arch >= 300) ? \ 98 | (5) : \ 99 | ((arch >= 200) ? \ 100 | (3) : \ 101 | (10))) 102 | #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) 103 | #endif 104 | 105 | 106 | /// Prefer padding overhead vs X-way conflicts greater than this threshold 107 | #ifndef CUB_PREFER_CONFLICT_OVER_PADDING 108 | #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ 109 | ((arch >= 300) ? \ 110 | (1) : \ 111 | (4)) 112 | #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) 113 | #endif 114 | 115 | 116 | /// Scale the number of warps to keep same amount of "tile" storage as the nominal configuration for 4B data. Minimum of two warps. 117 | #define CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ 118 | (CUB_MIN(NOMINAL_4B_BLOCK_THREADS, CUB_MAX(3, ((NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4) / sizeof(T)) * CUB_WARP_THREADS(PTX_ARCH))) 119 | 120 | /// If necessary, scale down number of items per thread to keep the same amount of "tile" storage as the nominal configuration for 4B data. Minimum 1 item per thread 121 | #define CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ 122 | (CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4 / sizeof(T)) / CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)))) 123 | 124 | 125 | 126 | #endif // Do not document 127 | 128 | } // CUB namespace 129 | CUB_NS_POSTFIX // Optional outer namespace(s) 130 | -------------------------------------------------------------------------------- /cub/util_debug.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 
9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Error and event logging routines. 32 | * 33 | * The following macros definitions are supported: 34 | * - \p CUB_LOG. Simple event messages are printed to \p stdout. 35 | */ 36 | 37 | #pragma once 38 | 39 | #include 40 | #include "util_namespace.cuh" 41 | #include "util_arch.cuh" 42 | 43 | /// Optional outer namespace(s) 44 | CUB_NS_PREFIX 45 | 46 | /// CUB namespace 47 | namespace cub { 48 | 49 | 50 | /** 51 | * \addtogroup UtilMgmt 52 | * @{ 53 | */ 54 | 55 | 56 | /// CUB error reporting macro (prints error messages to stderr) 57 | #if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) 58 | #define CUB_STDERR 59 | #endif 60 | 61 | 62 | 63 | /** 64 | * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. 65 | * 66 | * \return The CUDA error. 67 | */ 68 | __host__ __device__ __forceinline__ cudaError_t Debug( 69 | cudaError_t error, 70 | const char* filename, 71 | int line) 72 | { 73 | #ifdef CUB_STDERR 74 | if (error) 75 | { 76 | #if (CUB_PTX_ARCH == 0) 77 | fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); 78 | fflush(stderr); 79 | #elif (CUB_PTX_ARCH >= 200) 80 | printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); 81 | #endif 82 | } 83 | #endif 84 | return error; 85 | } 86 | 87 | 88 | /** 89 | * \brief Debug macro 90 | */ 91 | #ifndef CubDebug 92 | #define CubDebug(e) cub::Debug((e), __FILE__, __LINE__) 93 | #endif 94 | 95 | 96 | /** 97 | * \brief Debug macro with exit 98 | */ 99 | #ifndef CubDebugExit 100 | #define CubDebugExit(e) if (cub::Debug((e), __FILE__, __LINE__)) { exit(1); } 101 | #endif 102 | 103 | 104 | /** 105 | * \brief Log macro for printf statements. 106 | */ 107 | #if !defined(_CubLog) 108 | #if (CUB_PTX_ARCH == 0) 109 | #define _CubLog(format, ...) printf(format,__VA_ARGS__); 110 | #elif (CUB_PTX_ARCH >= 200) 111 | #define _CubLog(format, ...) 
printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); 112 | #endif 113 | #endif 114 | 115 | 116 | 117 | 118 | /** @} */ // end group UtilMgmt 119 | 120 | } // CUB namespace 121 | CUB_NS_POSTFIX // Optional outer namespace(s) 122 | -------------------------------------------------------------------------------- /cub/util_device.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Properties of a given CUDA device and the corresponding PTX bundle 32 | */ 33 | 34 | #pragma once 35 | 36 | #include "util_type.cuh" 37 | #include "util_arch.cuh" 38 | #include "util_debug.cuh" 39 | #include "util_namespace.cuh" 40 | #include "util_macro.cuh" 41 | 42 | /// Optional outer namespace(s) 43 | CUB_NS_PREFIX 44 | 45 | /// CUB namespace 46 | namespace cub { 47 | 48 | 49 | /** 50 | * \addtogroup UtilMgmt 51 | * @{ 52 | */ 53 | 54 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 55 | 56 | 57 | /** 58 | * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). 59 | */ 60 | template 61 | CUB_RUNTIME_FUNCTION __forceinline__ 62 | cudaError_t AliasTemporaries( 63 | void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
64 | size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation 65 | void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed 66 | size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed 67 | { 68 | const int ALIGN_BYTES = 256; 69 | const int ALIGN_MASK = ~(ALIGN_BYTES - 1); 70 | 71 | // Compute exclusive prefix sum over allocation requests 72 | size_t allocation_offsets[ALLOCATIONS]; 73 | size_t bytes_needed = 0; 74 | for (int i = 0; i < ALLOCATIONS; ++i) 75 | { 76 | size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; 77 | allocation_offsets[i] = bytes_needed; 78 | bytes_needed += allocation_bytes; 79 | } 80 | bytes_needed += ALIGN_BYTES - 1; 81 | 82 | // Check if the caller is simply requesting the size of the storage allocation 83 | if (!d_temp_storage) 84 | { 85 | temp_storage_bytes = bytes_needed; 86 | return cudaSuccess; 87 | } 88 | 89 | // Check if enough storage provided 90 | if (temp_storage_bytes < bytes_needed) 91 | { 92 | return CubDebug(cudaErrorInvalidValue); 93 | } 94 | 95 | // Alias 96 | d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK); 97 | for (int i = 0; i < ALLOCATIONS; ++i) 98 | { 99 | allocations[i] = static_cast(d_temp_storage) + allocation_offsets[i]; 100 | } 101 | 102 | return cudaSuccess; 103 | } 104 | 105 | 106 | /** 107 | * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device 108 | */ 109 | template 110 | __global__ void EmptyKernel(void) { } 111 | 112 | 113 | #endif // DOXYGEN_SHOULD_SKIP_THIS 114 | 115 | /** 116 | * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10) 117 | */ 118 | CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version) 119 | { 120 | struct Dummy 121 | { 122 | /// Type definition of the EmptyKernel kernel entry point 123 | typedef void (*EmptyKernelPtr)(); 124 | 125 | /// Force EmptyKernel to be generated if this class is used 126 | CUB_RUNTIME_FUNCTION __forceinline__ 127 | EmptyKernelPtr Empty() 128 | { 129 | return EmptyKernel; 130 | } 131 | }; 132 | 133 | 134 | #ifndef CUB_RUNTIME_ENABLED 135 | 136 | // CUDA API calls not supported from this device 137 | return cudaErrorInvalidConfiguration; 138 | 139 | #elif (CUB_PTX_ARCH > 0) 140 | 141 | ptx_version = CUB_PTX_ARCH; 142 | return cudaSuccess; 143 | 144 | #else 145 | 146 | cudaError_t error = cudaSuccess; 147 | do 148 | { 149 | cudaFuncAttributes empty_kernel_attrs; 150 | if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel))) break; 151 | ptx_version = empty_kernel_attrs.ptxVersion * 10; 152 | } 153 | while (0); 154 | 155 | return error; 156 | 157 | #endif 158 | } 159 | 160 | 161 | /** 162 | * \brief Retrieves the SM version (major * 100 + minor * 10) 163 | */ 164 | CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal) 165 | { 166 | #ifndef CUB_RUNTIME_ENABLED 167 | 168 | // CUDA API calls not supported from this device 169 | return cudaErrorInvalidConfiguration; 170 | 171 | #else 172 | 173 | cudaError_t error = cudaSuccess; 174 | do 175 | { 176 | // Fill in SM version 177 | int major, minor; 178 | if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break; 179 | if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break; 180 | sm_version = major * 100 + minor * 
10; 181 | } 182 | while (0); 183 | 184 | return error; 185 | 186 | #endif 187 | } 188 | 189 | 190 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 191 | 192 | /** 193 | * Synchronize the stream if specified 194 | */ 195 | CUB_RUNTIME_FUNCTION __forceinline__ 196 | static cudaError_t SyncStream(cudaStream_t stream) 197 | { 198 | #if (CUB_PTX_ARCH == 0) 199 | return cudaStreamSynchronize(stream); 200 | #else 201 | // Device can't yet sync on a specific stream 202 | return cudaDeviceSynchronize(); 203 | #endif 204 | } 205 | 206 | 207 | /** 208 | * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block. 209 | * 210 | * \par Snippet 211 | * The code snippet below illustrates the use of the MaxSmOccupancy function. 212 | * \par 213 | * \code 214 | * #include // or equivalently 215 | * 216 | * template 217 | * __global__ void ExampleKernel() 218 | * { 219 | * // Allocate shared memory for BlockScan 220 | * __shared__ volatile T buffer[4096]; 221 | * 222 | * ... 223 | * } 224 | * 225 | * ... 226 | * 227 | * // Determine SM occupancy for ExampleKernel specialized for unsigned char 228 | * int max_sm_occupancy; 229 | * MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 64); 230 | * 231 | * // max_sm_occupancy <-- 4 on SM10 232 | * // max_sm_occupancy <-- 8 on SM20 233 | * // max_sm_occupancy <-- 12 on SM35 234 | * 235 | * \endcode 236 | * 237 | */ 238 | template 239 | CUB_RUNTIME_FUNCTION __forceinline__ 240 | cudaError_t MaxSmOccupancy( 241 | int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM 242 | KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy 243 | int block_threads, ///< [in] Number of threads per thread block 244 | int dynamic_smem_bytes = 0) 245 | { 246 | #ifndef CUB_RUNTIME_ENABLED 247 | 248 | // CUDA API calls not supported from this device 249 | return CubDebug(cudaErrorInvalidConfiguration); 250 | 251 | #else 252 | 253 | return cudaOccupancyMaxActiveBlocksPerMultiprocessor ( 254 | &max_sm_occupancy, 255 | kernel_ptr, 256 | block_threads, 257 | dynamic_smem_bytes); 258 | 259 | #endif // CUB_RUNTIME_ENABLED 260 | } 261 | 262 | 263 | /****************************************************************************** 264 | * Policy management 265 | ******************************************************************************/ 266 | 267 | /** 268 | * Kernel dispatch configuration 269 | */ 270 | struct KernelConfig 271 | { 272 | int block_threads; 273 | int items_per_thread; 274 | int tile_size; 275 | int sm_occupancy; 276 | 277 | CUB_RUNTIME_FUNCTION __forceinline__ 278 | KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {} 279 | 280 | template 281 | CUB_RUNTIME_FUNCTION __forceinline__ 282 | cudaError_t Init(KernelPtrT kernel_ptr) 283 | { 284 | block_threads = AgentPolicyT::BLOCK_THREADS; 285 | items_per_thread = AgentPolicyT::ITEMS_PER_THREAD; 286 | tile_size = block_threads * items_per_thread; 287 | cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads); 288 | return retval; 289 | } 290 | }; 291 | 292 | 293 | 294 | /// Helper for dispatching into a policy chain 295 | template 296 | struct ChainedPolicy 297 | { 298 | /// The policy for the active compiler pass 299 | typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy; 300 | 301 | /// Specializes and dispatches op in accordance 
to the first policy in the chain of adequate PTX version 302 | template 303 | CUB_RUNTIME_FUNCTION __forceinline__ 304 | static cudaError_t Invoke(int ptx_version, FunctorT &op) 305 | { 306 | if (ptx_version < PTX_VERSION) { 307 | return PrevPolicyT::Invoke(ptx_version, op); 308 | } 309 | return op.template Invoke(); 310 | } 311 | }; 312 | 313 | /// Helper for dispatching into a policy chain (end-of-chain specialization) 314 | template 315 | struct ChainedPolicy 316 | { 317 | /// The policy for the active compiler pass 318 | typedef PolicyT ActivePolicy; 319 | 320 | /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version 321 | template 322 | CUB_RUNTIME_FUNCTION __forceinline__ 323 | static cudaError_t Invoke(int ptx_version, FunctorT &op) { 324 | return op.template Invoke(); 325 | } 326 | }; 327 | 328 | 329 | 330 | 331 | #endif // Do not document 332 | 333 | 334 | 335 | 336 | /** @} */ // end group UtilMgmt 337 | 338 | } // CUB namespace 339 | CUB_NS_POSTFIX // Optional outer namespace(s) 340 | -------------------------------------------------------------------------------- /cub/util_macro.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
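Stepping back to cub/util_device.cuh above: AliasTemporaries implements CUB's two-phase temporary-storage idiom. Called with a NULL d_temp_storage it only reports the total number of bytes needed (each request padded to 256-byte alignment); called again with a real allocation it carves that block into the individual sub-allocations. A host-side sketch of the pattern with two requested allocations, assuming this repository's headers are on the include path:

#include <cstdio>
#include <cuda_runtime.h>
#include <cub/util_device.cuh>    // AliasTemporaries (assumes this repository's headers)

int main()
{
    // Two sub-allocations to be carved out of one device buffer
    size_t allocation_sizes[2] = {1000 * sizeof(int), 500 * sizeof(double)};
    void  *allocations[2]      = {NULL, NULL};

    // Phase 1: query the required size (NULL d_temp_storage writes temp_storage_bytes only)
    void  *d_temp_storage      = NULL;
    size_t temp_storage_bytes  = 0;
    cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);

    // Phase 2: allocate once, then carve into aligned pieces
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);

    printf("total temporary storage: %zu bytes\n", temp_storage_bytes);
    // allocations[0] and allocations[1] now point into d_temp_storage

    cudaFree(d_temp_storage);
    return 0;
}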
26 | * 27 | ******************************************************************************/ 28 | 29 | /****************************************************************************** 30 | * Common C/C++ macro utilities 31 | ******************************************************************************/ 32 | 33 | #pragma once 34 | 35 | #include "util_namespace.cuh" 36 | 37 | /// Optional outer namespace(s) 38 | CUB_NS_PREFIX 39 | 40 | /// CUB namespace 41 | namespace cub { 42 | 43 | 44 | /** 45 | * \addtogroup UtilModule 46 | * @{ 47 | */ 48 | 49 | #ifndef CUB_ALIGN 50 | #if defined(_WIN32) || defined(_WIN64) 51 | /// Align struct 52 | #define CUB_ALIGN(bytes) __declspec(align(32)) 53 | #else 54 | /// Align struct 55 | #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) 56 | #endif 57 | #endif 58 | 59 | #ifndef CUB_MAX 60 | /// Select maximum(a, b) 61 | #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) 62 | #endif 63 | 64 | #ifndef CUB_MIN 65 | /// Select minimum(a, b) 66 | #define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a)) 67 | #endif 68 | 69 | #ifndef CUB_QUOTIENT_FLOOR 70 | /// Quotient of x/y rounded down to nearest integer 71 | #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) 72 | #endif 73 | 74 | #ifndef CUB_QUOTIENT_CEILING 75 | /// Quotient of x/y rounded up to nearest integer 76 | #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) 77 | #endif 78 | 79 | #ifndef CUB_ROUND_UP_NEAREST 80 | /// x rounded up to the nearest multiple of y 81 | #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) 82 | #endif 83 | 84 | #ifndef CUB_ROUND_DOWN_NEAREST 85 | /// x rounded down to the nearest multiple of y 86 | #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) 87 | #endif 88 | 89 | 90 | #ifndef CUB_STATIC_ASSERT 91 | #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 92 | #define CUB_CAT_(a, b) a ## b 93 | #define CUB_CAT(a, b) CUB_CAT_(a, b) 94 | #endif // DOXYGEN_SHOULD_SKIP_THIS 95 | 96 | /// Static assert 97 | #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] 98 | #endif 99 | 100 | /** @} */ // end group UtilModule 101 | 102 | } // CUB namespace 103 | CUB_NS_POSTFIX // Optional outer namespace(s) 104 | -------------------------------------------------------------------------------- /cub/util_namespace.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2011, Duane Merrill. All rights reserved. 3 | * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of the NVIDIA CORPORATION nor the 13 | * names of its contributors may be used to endorse or promote products 14 | * derived from this software without specific prior written permission. 
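The macros in cub/util_macro.cuh above carry the small integer arithmetic used elsewhere in the headers (for instance, CUB_MIN and CUB_MAX appear in the util_arch.cuh tile-scaling macros earlier). Two of them in a short worked example, restated locally so the snippet stands alone:

#include <cstdio>

// Same definitions as in cub/util_macro.cuh, restated so this snippet is self-contained.
#define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
#define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y)

int main()
{
    int num_items  = 1000;
    int tile_items = 128;

    int num_tiles  = CUB_QUOTIENT_CEILING(num_items, tile_items);   // 8 tiles cover 1000 items
    int padded     = CUB_ROUND_UP_NEAREST(num_items, 256);          // 1024: next multiple of 256

    printf("%d tiles, padded to %d bytes\n", num_tiles, padded);
    return 0;
}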
15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 20 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | * 27 | ******************************************************************************/ 28 | 29 | /** 30 | * \file 31 | * Place-holder for prefixing the cub namespace 32 | */ 33 | 34 | #pragma once 35 | 36 | // For example: 37 | //#define CUB_NS_PREFIX namespace thrust{ namespace detail { 38 | //#define CUB_NS_POSTFIX } } 39 | 40 | #define CUB_NS_PREFIX 41 | #define CUB_NS_POSTFIX 42 | -------------------------------------------------------------------------------- /eval_csrmv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if (( $# != 2 )); then 4 | echo "$0 " 5 | exit 0 6 | fi 7 | 8 | echo "file, num_rows, num_cols, num_nonzeros, row_length_mean, row_length_std_dev, row_length_variation, row_length_skewness, method_name, setup_ms, avg_spmv_ms, gflops, effective_GBs" 9 | 10 | MTX_DIR=$1 11 | 12 | shift 13 | 14 | for i in `find $MTX_DIR -name *.mtx` 15 | do 16 | ./$@ --quiet --mtx=$i 17 | done 18 | -------------------------------------------------------------------------------- /get_uf_datasets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$#" -eq 0 ]; then 4 | MTX_DIR = mtx 5 | else 6 | MTX_DIR = $1 7 | fi 8 | 9 | # Make temporary directory for download/unpack 10 | mkdir -p tgz 11 | cd tgz 12 | 13 | # Download 14 | for i in `cat ../ufl_urls.txt`; do echo $i; wget $i; done 15 | 16 | # Unpack 17 | for i in `cat ../ufl_matrices.txt`; do gunzip $i.tar.gz; tar -xvf $i.tar; rm $i.tar; done 18 | 19 | # Relocate 20 | mkdir -p ../$MTX_DIR 21 | for i in `find . -name *.mtx`; do echo $i; mv $i ../$MTX_DIR/; done 22 | 23 | # Cleanup 24 | cd .. 
25 | rm -rf tgz 26 | -------------------------------------------------------------------------------- /gpu_spmv: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ./_gpu_spmv_driver $@ 4 | -------------------------------------------------------------------------------- /merge-based-spmv-sc16-preprint.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dumerrill/merge-spmv/18895571ce9af960ee207dae541b0ffc701ea4bb/merge-based-spmv-sc16-preprint.pdf -------------------------------------------------------------------------------- /merge_decomposition.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dumerrill/merge-spmv/18895571ce9af960ee207dae541b0ffc701ea4bb/merge_decomposition.png -------------------------------------------------------------------------------- /merge_spmv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dumerrill/merge-spmv/18895571ce9af960ee207dae541b0ffc701ea4bb/merge_spmv.png --------------------------------------------------------------------------------
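Finally, the shell drivers tie the pieces together: get_uf_datasets.sh fetches and unpacks the UF/SuiteSparse matrices listed in ufl_urls.txt, eval_csrmv.sh prints a CSV header and then invokes a given driver once per .mtx file with --quiet --mtx=<file>, and gpu_spmv simply forwards its arguments to the compiled _gpu_spmv_driver binary. A hypothetical evaluation run (output filenames and the single matrix name are illustrative; the driver binaries are assumed to have been built first):

#!/bin/bash
# Sweep every MatrixMarket file under ./mtx, capturing one CSV row of timings per matrix.
./eval_csrmv.sh mtx gpu_spmv > gpu_results.csv
./eval_csrmv.sh mtx cpu_spmv > cpu_results.csv   # cpu_spmv is assumed to accept the same flags

# Or time a single matrix by hand (filename illustrative).
./gpu_spmv --mtx=mtx/example.mtx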