├── LICENSE
├── README.md
├── results
└── res.jpg
└── src
├── cuCompactor.cuh
├── cuda_error_check.cu
└── tests.cu
/LICENSE:
--------------------------------------------------------------------------------
1 | NU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # cuStreamComp
2 | Efficient CUDA Stream Compaction Library
3 |
4 | Based on the following works:
5 |
6 | 1. Markus Billeter et al. Efficient Stream Compaction on Wide SIMD Many-Core Architectures
7 |
8 | 2. InK-Compact-: In kernel Stream Compaction and Its Application to Multi-kernel Data Visualization on GPGPU- D.M. Hughes
9 |
10 | It is an efficient CUDA implementation of the stream compaction algorithm based on the **warp ballot intrinsic**.
11 |
12 | # How to use it
13 | Its usage is straightforward:
14 |
15 | - Create a predicate functor to decide whether an element is valid or not.
16 | ```
17 | struct predicate
18 | {
19 | __host__ __device__
20 | bool operator()(const int x)
21 | {
22 | return x>0;
23 | }
24 | };
25 | ```
26 |
27 | - Call the compact procedure to obtain the compacted array `d_output`.
28 |
29 | ```
30 | cuCompactor::compact(d_data,d_output,length,predicate(),blockSize);
31 | ```
32 |
33 | Note that both the input `d_data` and the output `d_output` arrays have to be allocated on device.
34 |
35 |
36 | *PERFORMANCE*
37 |
38 | 
39 |
--------------------------------------------------------------------------------
/results/res.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/knotman90/cuStreamComp/a8d2718a456981b2aad6ae08e37884288a4eb51d/results/res.jpg
--------------------------------------------------------------------------------
/src/cuCompactor.cuh:
--------------------------------------------------------------------------------
1 |
2 | /*
3 | * cuCompactor.h
4 | *
5 | * Created on: 21/mag/2015
6 | * Author: knotman
7 | */
8 |
9 | #ifndef CUCOMPACTOR_H_
10 | #define CUCOMPACTOR_H_
11 |
12 | #include <thrust/scan.h>
13 | #include <thrust/device_vector.h>
14 | #include "cuda_error_check.cu"
15 |
16 | namespace cuCompactor {
17 |
18 | #define warpSize (32)
19 | #define FULL_MASK 0xffffffff
20 |
21 | __host__ __device__ int divup(int x, int y) { return x / y + (x % y ? 1 : 0); } // ceiling division for positive x,y: number of blocks of size y needed to cover x elements
22 |
23 | __device__ __inline__ int pow2i (int e){
24 | return 1<<e;
25 | }
26 | 
27 | 
28 | template <typename T,typename Predicate>
29 | __global__ void computeBlockCounts(T* d_input,int length,int*d_BlockCounts,Predicate predicate){ // phase 1 kernel: write, per thread block, the count of elements satisfying the predicate into d_BlockCounts[blockIdx.x]
30 | int idx = threadIdx.x + blockIdx.x*blockDim.x;
31 | if(idx < length){
32 | int pred = predicate(d_input[idx]);
33 | int BC=__syncthreads_count(pred); // NOTE(review): this barrier is inside the divergent `idx < length` branch — threads of the last, partially filled block that fail the guard never reach it, which is undefined behavior per the CUDA programming guide; consider computing `pred = (idx < length) ? predicate(...) : 0` and calling __syncthreads_count unconditionally
34 | 
35 | if(threadIdx.x==0){
36 | d_BlockCounts[blockIdx.x]=BC; // BC will contain the number of valid elements in all threads of this thread block
37 | }
38 | }
39 | }
40 |
41 |
42 |
43 | template <typename T,typename Predicate>
44 | __global__ void compactK(T* d_input,int length, T* d_output,int* d_BlocksOffset,Predicate predicate ){
45 | int idx = threadIdx.x + blockIdx.x*blockDim.x;
46 | extern __shared__ int warpTotals[];
47 | if(idx < length){
48 | int pred = predicate(d_input[idx]);
49 | int w_i = threadIdx.x/warpSize; //warp index
50 | int w_l = idx % warpSize;//thread index within a warp
51 |
52 | // compute exclusive prefix sum based on predicate validity to get output offset for thread in warp
53 | int t_m = FULL_MASK >> (warpSize-w_l); //thread mask
54 | #if (CUDART_VERSION < 9000)
55 | int b = __ballot(pred) & t_m; //ballot result = number whose ith bit is one if the ith's thread pred is true masked up to the current index in warp
56 | #else
57 | int b = __ballot_sync(FULL_MASK,pred) & t_m;
58 | #endif
59 | int t_u = __popc(b); // popc count the number of bit one. simply count the number predicated true BEFORE MY INDEX
60 |
61 | // last thread in warp computes total valid counts for the warp
62 | if(w_l==warpSize-1){
63 | warpTotals[w_i]=t_u+pred;
64 | }
65 |
66 | // need all warps in thread block to fill in warpTotals before proceeding
67 | __syncthreads();
68 |
69 | // first numWarps threads in first warp compute exclusive prefix sum to get output offset for each warp in thread block
70 | int numWarps = blockDim.x/warpSize;
71 | unsigned int numWarpsMask = FULL_MASK >> (warpSize-numWarps);
72 | if(w_i==0 && w_l<numWarps){
73 | int w_i_u=0;
74 | for(int j=0;j<=5;j++){ // must include j=5 in loop in case any elements of warpTotals are identically equal to 32
75 | #if (CUDART_VERSION < 9000)
76 | int b_j =__ballot( warpTotals[w_l] & pow2i(j) ); //# of the ones in the j'th digit of the warp offsets
77 | #else
78 | int b_j =__ballot_sync(numWarpsMask, warpTotals[w_l] & pow2i(j) );
79 | #endif
80 | w_i_u += (__popc(b_j & t_m)  ) << j;
81 | }
82 | warpTotals[w_l]=w_i_u;
83 | }
84 | 
85 | // need all warps in thread block to wait until prefix sum of warp totals is computed
86 | __syncthreads();
87 | 
88 | // if valid element, scatter it to its output slot: thread offset in warp + warp offset in block + block offset in grid
89 | if(pred){
90 | d_output[t_u+warpTotals[w_i]+d_BlocksOffset[blockIdx.x]]= d_input[idx];
91 | }
92 | 
93 | }
94 | }
95 | 
96 | 
97 | 
98 | template <typename T>
99 | __global__ void printArray_GPU(T* hd_data, int size,int newline){
100 | int w=0;
101 | for(int i=0;i<size;i++){
102 | if(i%newline==0){
103 | printf("\n%i -> ",w);
104 | w++;
105 | }
106 | printf("%i ",hd_data[i]);
107 | }
108 | printf("\n");
109 | }
110 |
111 | template <typename T,typename Predicate>
112 | int compact(T* d_input,T* d_output,int length, Predicate predicate, int blockSize){
113 | int numBlocks = divup(length,blockSize);
114 | int* d_BlocksCount;
115 | int* d_BlocksOffset;
116 | CUDASAFECALL (cudaMalloc(&d_BlocksCount,sizeof(int)*numBlocks));
117 | CUDASAFECALL (cudaMalloc(&d_BlocksOffset,sizeof(int)*numBlocks));
118 | thrust::device_ptr<int> thrustPrt_bCount(d_BlocksCount);
119 | thrust::device_ptr<int> thrustPrt_bOffset(d_BlocksOffset);
120 |
121 | //phase 1: count number of valid elements in each thread block
122 | computeBlockCounts<<<numBlocks,blockSize>>>(d_input,length,d_BlocksCount,predicate);
123 |
124 | //phase 2: compute exclusive prefix sum of valid block counts to get output offset for each thread block in grid
125 | thrust::exclusive_scan(thrustPrt_bCount, thrustPrt_bCount + numBlocks, thrustPrt_bOffset);
126 |
127 | //phase 3: compute output offset for each thread in warp and each warp in thread block, then output valid elements
128 | compactK<<<numBlocks,blockSize,sizeof(int)*(blockSize/warpSize)>>>(d_input,length,d_output,d_BlocksOffset,predicate);
129 |
130 | // determine number of elements in the compacted list
131 | int compact_length = thrustPrt_bOffset[numBlocks-1] + thrustPrt_bCount[numBlocks-1];
132 |
133 | cudaFree(d_BlocksCount);
134 | cudaFree(d_BlocksOffset);
135 |
136 | return compact_length;
137 | }
138 |
139 |
140 |
141 | } /* namespace cuCompactor */
142 | #endif /* CUCOMPACTOR_H_ */
143 |
--------------------------------------------------------------------------------
/src/cuda_error_check.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 | #include <stdio.h>
3 |
4 | // Define this to turn on error checking
5 | #define CUDA_ERROR_CHECK
6 |
7 | #define CUDASAFECALL( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
8 | #define CUDACHECKERROR() __cudaCheckError( __FILE__, __LINE__ )
9 |
10 | inline void __cudaSafeCall( cudaError err, const char *file, const int line ) // reports a non-success cudaError_t (with call site) on both stderr and stdout, then exits; no-op when CUDA_ERROR_CHECK is undefined
11 | {
12 | #ifdef CUDA_ERROR_CHECK
13 | if ( cudaSuccess != err )
14 | {
15 | fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n",
16 | file, line, cudaGetErrorString( err ) );
17 | 
18 | fprintf( stdout, "cudaSafeCall() failed at %s:%i : %s\n", // duplicated on stdout so the failure shows up even when only one stream is captured
19 | file, line, cudaGetErrorString( err ) );
20 | exit( -1 );
21 | }
22 | #endif
23 | 
24 | return;
25 | }
26 |
27 | inline void __cudaCheckError( const char *file, const int line ) // checks cudaGetLastError() (catches bad kernel launches), then synchronizes the device to surface asynchronous execution errors; exits on any failure
28 | {
29 | #ifdef CUDA_ERROR_CHECK
30 | cudaError err = cudaGetLastError();
31 | if ( cudaSuccess != err )
32 | {
33 | fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n",
34 | file, line, cudaGetErrorString( err ) );
35 | 
36 | fprintf( stdout, "cudaCheckError() failed at %s:%i : %s\n",
37 | file, line, cudaGetErrorString( err ) );
38 | exit( -1 );
39 | }
40 | 
41 | // More careful checking. However, this will affect performance.
42 | // Comment away if needed.
43 | err = cudaDeviceSynchronize(); // blocking: serializes host and device, so keep out of hot loops
44 | if( cudaSuccess != err )
45 | {
46 | fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
47 | file, line, cudaGetErrorString( err ) );
48 | 
49 | fprintf( stdout, "cudaCheckError() with sync failed at %s:%i : %s\n",
50 | file, line, cudaGetErrorString( err ) );
51 | 
52 | exit( -1 );
53 | }
54 | #endif
55 | 
56 | return;
57 | }
58 |
--------------------------------------------------------------------------------
/src/tests.cu:
--------------------------------------------------------------------------------
1 | #include <cuda_runtime.h>
2 | #include <stdio.h>
3 | #include "cuCompactor.cuh"
4 | #include <iostream>
5 | #include <ctime>
6 | using namespace std;
7 |
8 |
9 | struct int_predicate // predicate functor for cuCompactor::compact: keeps strictly positive ints
10 | {
11 | __host__ __device__
12 | bool operator()(const int x)
13 | {
14 | return x>0;
15 | }
16 | };
17 | #define randBound (50)//<100
18 | void initiData(int *h_data, uint NELEMENTS,uint &goodElements,bool randomOrStride){ // fills h_data with 0/non-zero test values and accumulates into goodElements the count of non-zero entries; assumes the caller zero-initializes goodElements — TODO confirm
19 | 
20 | ushort stride = 4;
21 | for (int i = 0; i < NELEMENTS; ++i) {
22 | if(randomOrStride)
23 | h_data[i] = i%stride; // NOTE(review): true selects the STRIDE pattern (i%4), false the random one — the parameter name reads the other way around; confirm intent at call sites
24 | else
25 | h_data[i] =(rand()%100 <= randBound) ? 1 : 0; // NOTE(review): with randBound=50, `<=` makes 51 of 100 outcomes valid, not exactly 50%
26 | if(h_data[i])
27 | goodElements++;
28 | }
29 | }
30 |
31 |
32 | void printData(int *h_data, uint NELEMENTS){
33 | for (int i = 0; i < NELEMENTS; ++i) {
34 | cout<(d_data,d_output,NELEMENTS,int_predicate(),blockSize);
88 | cudaDeviceSynchronize();
89 | clock_t end = clock();
90 | unsigned long millis = (end - start) * 1000 / CLOCKS_PER_SEC;
91 |
92 |
93 | //copy back results to host
94 | cudaMemcpy(h_data,d_output,datasize,cudaMemcpyDeviceToHost);
95 | //printData(h_data,NELEMENTS);
96 | checkVector(h_data,NELEMENTS,NgoodElements);
97 | //device memory free
98 | cudaFree(d_data);
99 | cudaFree(d_output);
100 | //host free memory
101 | free(h_data);
102 | printf("(%i,%i,%i)\n",NELEMENTS,blockSize,millis);
103 | }//for blocksize
104 | }//for elements
105 | printf("ALL TEST PASSED");
106 |
107 | }
108 |
--------------------------------------------------------------------------------