├── LICENSE ├── README.md ├── results └── res.jpg └── src ├── cuCompactor.cuh ├── cuda_error_check.cu └── tests.cu /LICENSE: -------------------------------------------------------------------------------- 1 | NU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 
36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 
75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. 
If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cuStreamComp 2 | Efficient CUDA Stream Compaction Library 3 | 4 | Based on the following works: 5 | 6 | 1. Markus Billeter et al. Efficient Stream Compaction on Wide SIMD Many-Core Architectures 7 | 8 | 2. InK-Compact-: In kernel Stream Compaction and Its Application to Multi-kernel Data Visualization on GPGPU- D.M. Hughes 9 | 10 | It is an efficient CUDA implementation of the stream compaction algorithm based on **warp balloting intrinsic**. 11 | 12 | # How to use it 13 | Its usage is straightforward: 14 | 15 | - Create a predicate functor to decide whether an element is valid or not. 16 | ``` 17 | struct predicate 18 | { 19 | __host__ __device__ 20 | bool operator()(const int x) 21 | { 22 | return x>0; 23 | } 24 | }; 25 | ``` 26 | 27 | - Call the compact procedure to obtain the compacted array `d_output`.
```
cuCompactor::compact(d_data,d_output,length,predicate(),blockSize);
```

Note that both the input `d_data` and the output `d_output` arrays have to be allocated on device.


*PERFORMANCE*

![Alt text](/results/res.jpg?raw=true "Thrust Performance Comparison")
--------------------------------------------------------------------------------
/results/res.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/knotman90/cuStreamComp/a8d2718a456981b2aad6ae08e37884288a4eb51d/results/res.jpg
--------------------------------------------------------------------------------
/src/cuCompactor.cuh:
--------------------------------------------------------------------------------

/*
 * cuCompactor.cuh
 *
 * Created on: 21/mag/2015
 *     Author: knotman
 *
 * Warp-ballot based stream compaction, following Billeter et al. and
 * Hughes' InK-Compact.
 */

#ifndef CUCOMPACTOR_H_
#define CUCOMPACTOR_H_

// NOTE(review): the original #include targets were swallowed by the dump
// (angle brackets lost); thrust scan + device_ptr are what the code below
// uses — verify against the repository.
#include <thrust/scan.h>
#include <thrust/device_ptr.h>
#include "cuda_error_check.cu"

namespace cuCompactor {

// NOTE(review): this macro shadows CUDA's built-in device variable
// `warpSize`; kept for source compatibility with existing users, but a
// differently-named constant would be safer.
#define warpSize (32)
#define FULL_MASK 0xffffffff

// Ceiling division: divup(x, y) == ceil(x / y) for positive y.
// FIX: `inline` added — a non-inline function *definition* in a header
// violates the One Definition Rule as soon as two translation units
// include this file.
__host__ __device__ inline int divup(int x, int y) { return x / y + (x % y ? 1 : 0); }

// 2^e for small non-negative e; bit-mask helper for the warp-total scan.
__device__ __inline__ int pow2i(int e){
    return 1 << e;
}

/*
 * Phase 1 kernel: writes one entry of d_BlockCounts per thread block,
 * holding the number of elements of that block for which `predicate` is
 * true. Launch as a 1D grid covering `length` elements (the last block may
 * be partial).
 */
template <typename T, typename Predicate>
__global__ void computeBlockCounts(T* d_input, int length, int* d_BlockCounts, Predicate predicate){
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    // FIX: __syncthreads_count is a block-wide barrier. The original
    // executed it inside `if(idx < length)`, so in the tail block (length
    // not a multiple of blockDim.x) the out-of-range threads skipped the
    // barrier — undefined behavior. They now participate with pred == 0.
    int pred = (idx < length) ? (predicate(d_input[idx]) ? 1 : 0) : 0;
    int BC = __syncthreads_count(pred); // number of valid elements in this block
    if(threadIdx.x == 0){
        d_BlockCounts[blockIdx.x] = BC;
    }
}

/*
 * Phase 3 kernel: scatters the predicate-true elements of d_input to their
 * final compacted position in d_output. The output index of an element is
 * (valid lanes before it in its warp) + (warp offset within the block,
 * computed in shared memory) + d_BlocksOffset[blockIdx.x].
 *
 * Requires blockDim.x to be a multiple of warpSize. Dynamic shared memory:
 * (blockDim.x / warpSize) ints — one total per warp.
 */
template <typename T, typename Predicate>
__global__ void compactK(T* d_input, int length, T* d_output, int* d_BlocksOffset, Predicate predicate){
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    extern __shared__ int warpTotals[];

    // FIX: as in computeBlockCounts, the barriers and ballots below are
    // block-/warp-wide collectives; the original guarded the whole body
    // with `if(idx < length)`, making them divergent in the tail block.
    // Out-of-range threads now take part with pred == 0.
    int pred = (idx < length) ? (predicate(d_input[idx]) ? 1 : 0) : 0;
    int w_i = threadIdx.x / warpSize; // warp index within the block
    int w_l = idx % warpSize;         // lane index within the warp

    // Exclusive prefix of the predicate inside the warp: mask off the lanes
    // at or above this one, ballot, and popcount what remains.
    int t_m = FULL_MASK >> (warpSize - w_l); // mask of lanes strictly below w_l
#if (CUDART_VERSION < 9000)
    int b = __ballot(pred) & t_m;
#else
    int b = __ballot_sync(FULL_MASK, pred) & t_m;
#endif
    int t_u = __popc(b); // number of predicated-true lanes BEFORE this lane

    // Last lane of each warp publishes the warp's total valid count.
    if(w_l == warpSize-1){
        warpTotals[w_i] = t_u + pred;
    }

    // All warps must have filled warpTotals before the scan below.
    __syncthreads();

    // First warp turns warpTotals into an exclusive prefix sum — the output
    // offset of each warp inside the block.
    // NOTE(review): original lines 72-98 were lost in this dump; this
    // section is reconstructed from the upstream cuStreamComp sources —
    // verify against the repository.
    int numWarps = blockDim.x / warpSize;
    unsigned int numWarpsMask = FULL_MASK >> (warpSize - numWarps);
    if(w_i == 0 && w_l < numWarps){
        int w_i_u = 0;
        // warpTotals[w_l] <= warpSize == 32, so bits 0..5 suffice.
        for(int j = 0; j <= 5; j++){
#if (CUDART_VERSION < 9000)
            int b_j = __ballot(warpTotals[w_l] & pow2i(j));
#else
            int b_j = __ballot_sync(numWarpsMask, warpTotals[w_l] & pow2i(j));
#endif
            w_i_u += (__popc(b_j & t_m)) << j;
        }
        warpTotals[w_l] = w_i_u;
    }

    // Warp offsets must be final before any thread reads them.
    __syncthreads();

    // Scatter. pred != 0 implies idx < length, so the read is in range.
    if(pred){
        d_output[t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]] = d_input[idx];
    }
}

// Debug helper: prints `size` elements from a device array, starting a new
// row (prefixed with a running row counter) every `newline` elements.
// NOTE(review): the printf row-header format was partially lost in the
// dump and is reconstructed; only meaningful for integer T.
template <typename T>
__global__ void printArray_GPU(T* hd_data, int size, int newline){
    int w = 0;
    for(int i = 0; i < size; i++){
        if(i % newline == 0){
            printf("\n%i -> ", w);
            w++;
        }
        printf("%i ", hd_data[i]);
    }
    printf("\n");
}

/*
 * Host entry point.
 * Compacts d_input (device array of `length` elements) into d_output
 * (device array, preallocated with room for at least `length` elements),
 * keeping the elements for which `predicate` returns true, in order.
 * blockSize must be a multiple of warpSize (compactK requirement).
 * Returns the number of valid elements written to d_output.
 */
template <typename T, typename Predicate>
int compact(T* d_input, T* d_output, int length, Predicate predicate, int blockSize){
    // FIX: guard the empty case — a <<<0, blockSize>>> launch is an
    // invalid configuration, and cudaMalloc(…, 0) is pointless.
    if(length <= 0) return 0;

    int numBlocks = divup(length, blockSize);
    int* d_BlocksCount;
    int* d_BlocksOffset;
    CUDASAFECALL(cudaMalloc(&d_BlocksCount, sizeof(int)*numBlocks));
    CUDASAFECALL(cudaMalloc(&d_BlocksOffset, sizeof(int)*numBlocks));
    thrust::device_ptr<int> thrustPrt_bCount(d_BlocksCount);
    thrust::device_ptr<int> thrustPrt_bOffset(d_BlocksOffset);

    // phase 1: count number of valid elements in each thread block
    computeBlockCounts<<<numBlocks,blockSize>>>(d_input, length, d_BlocksCount, predicate);
    CUDACHECKERROR(); // surface launch-configuration errors early

    // phase 2: exclusive prefix sum of the block counts -> per-block output offsets
    thrust::exclusive_scan(thrustPrt_bCount, thrustPrt_bCount + numBlocks, thrustPrt_bOffset);

    // phase 3: per-warp and per-thread offsets, then scatter valid elements
    compactK<<<numBlocks,blockSize,sizeof(int)*(blockSize/warpSize)>>>(d_input, length, d_output, d_BlocksOffset, predicate);
    CUDACHECKERROR();

    // total = offset of the last block + its count
    // (dereferencing a thrust::device_ptr performs a blocking D2H copy)
    int compact_length = thrustPrt_bOffset[numBlocks-1] + thrustPrt_bCount[numBlocks-1];

    cudaFree(d_BlocksCount);
    cudaFree(d_BlocksOffset);

    return compact_length;
}

} /* namespace cuCompactor */
#endif /* CUCOMPACTOR_H_ */
--------------------------------------------------------------------------------
/src/cuda_error_check.cu:
--------------------------------------------------------------------------------
// NOTE(review): the original #include targets were lost in the dump;
// <cstdio>/<cstdlib> cover fprintf and exit used below.
#include <cstdio>
#include <cstdlib>

// Define this to turn on error checking
#define CUDA_ERROR_CHECK

#define CUDASAFECALL( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CUDACHECKERROR() __cudaCheckError( __FILE__, __LINE__ )

// Abort the process with file:line context if a CUDA runtime call
// returned an error code.
inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
    if ( cudaSuccess != err )
    {
        fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n",
                file, line, cudaGetErrorString( err ) );

        fprintf( stdout, "cudaSafeCall() failed at %s:%i : %s\n",
                file, line, cudaGetErrorString( err ) );
        exit( -1 );
    }
#endif

    return;
}

// Check for a pending (sticky) error — e.g. a bad kernel launch — and then
// synchronize the device to surface asynchronous execution errors too.
inline void __cudaCheckError( const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
    cudaError err = cudaGetLastError();
    if ( cudaSuccess != err )
    {
        fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n",
                file, line, cudaGetErrorString( err ) );

        fprintf( stdout, "cudaCheckError() failed at %s:%i : %s\n",
                file, line, cudaGetErrorString( err ) );
        exit( -1 );
    }

    // More careful checking. However, this will affect performance.
    // Comment away if needed.
    err = cudaDeviceSynchronize();
    if( cudaSuccess != err )
    {
        fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
                file, line, cudaGetErrorString( err ) );

        fprintf( stdout, "cudaCheckError() with sync failed at %s:%i : %s\n",
                file, line, cudaGetErrorString( err ) );

        exit( -1 );
    }
#endif

    return;
}
--------------------------------------------------------------------------------
/src/tests.cu:
--------------------------------------------------------------------------------
// NOTE(review): the original #include targets were lost in the dump; the
// headers below cover the names used (printf, clock, rand, cout).
#include <cstdio>
#include <ctime>
#include "cuCompactor.cuh"
#include <iostream>
#include <cstdlib>
using namespace std;


// Predicate used throughout the tests: keep strictly positive values.
struct int_predicate
{
    __host__ __device__
    bool operator()(const int x)
    {
        return x > 0;
    }
};

#define randBound (50) //<100 : percent probability that an element is valid

// Fills h_data with NELEMENTS ints — either a fixed 0..stride-1 repeating
// pattern (randomOrStride == true) or a random 0/1 pattern — and counts in
// goodElements how many entries are nonzero (i.e. predicate-true).
void initiData(int *h_data, uint NELEMENTS, uint &goodElements, bool randomOrStride){
    ushort stride = 4;
    for (int i = 0; i < NELEMENTS; ++i) {
        if(randomOrStride)
            h_data[i] = i % stride;
        else
            h_data[i] = (rand()%100 <= randBound) ? 1 : 0;
        if(h_data[i])
            goodElements++;
    }
}

// Prints a host array, space-separated, for debugging.
// NOTE(review): the body past `cout <<` was lost in the dump; reconstructed.
void printData(int *h_data, uint NELEMENTS){
    for (int i = 0; i < NELEMENTS; ++i) {
        cout << h_data[i] << " ";
    }
    cout << endl;
}

// NOTE(review): checkVector and the first half of main() (original lines
// 36-87) were lost in this dump; everything from here to the surviving
// tail of main() is a reconstruction consistent with that tail — verify
// against the repository.
// Verifies that the first goodElements entries of the compacted array all
// satisfy the predicate; aborts the test run on the first mismatch.
void checkVector(int *h_data, uint NELEMENTS, uint goodElements){
    for (uint i = 0; i < goodElements; ++i) {
        if(!int_predicate()(h_data[i])){
            printf("TEST FAILED at index %u\n", i);
            exit(EXIT_FAILURE);
        }
    }
}

int main(){
    // Sweep problem sizes and block sizes; print (N, blockSize, millis) per run.
    for (uint NELEMENTS = 1024; NELEMENTS <= (1u << 24); NELEMENTS *= 2) {
        for (int blockSize = 32; blockSize <= 1024; blockSize *= 2) {
            uint NgoodElements = 0;
            size_t datasize = NELEMENTS * sizeof(int);

            // host allocation + initialization
            int *h_data = (int*)malloc(datasize);
            initiData(h_data, NELEMENTS, NgoodElements, false);

            // device allocation + upload
            int *d_data, *d_output;
            CUDASAFECALL(cudaMalloc(&d_data, datasize));
            CUDASAFECALL(cudaMalloc(&d_output, datasize));
            CUDASAFECALL(cudaMemcpy(d_data, h_data, datasize, cudaMemcpyHostToDevice));

            clock_t start = clock();
            cuCompactor::compact<int>(d_data, d_output, NELEMENTS, int_predicate(), blockSize);
            cudaDeviceSynchronize();
            clock_t end = clock();
            unsigned long millis = (end - start) * 1000 / CLOCKS_PER_SEC;


            //copy back results to host
            cudaMemcpy(h_data, d_output, datasize, cudaMemcpyDeviceToHost);
            //printData(h_data,NELEMENTS);
            checkVector(h_data, NELEMENTS, NgoodElements);
            //device memory free
            cudaFree(d_data);
            cudaFree(d_output);
            //host free memory
            free(h_data);
            // FIX: millis is unsigned long — the original printed it with %i
            // (undefined behavior); use %lu, and %u for the unsigned count.
            printf("(%u,%i,%lu)\n", NELEMENTS, blockSize, millis);
        }//for blocksize
    }//for elements
    printf("ALL TEST PASSED");

}