├── LICENSE ├── README.md ├── results └── res.jpg └── src ├── cuCompactor.cuh ├── cuda_error_check.cu └── tests.cu /LICENSE: -------------------------------------------------------------------------------- 1 | NU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 
36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 
75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. 
If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cuStreamComp 2 | Efficient CUDA Stream Compaction Library 3 | 4 | Based on the following works: 5 | 6 | 1. Markus Billeter et al. Efficient Stream Compaction on Wide SIMD Many-Core Architectures 7 | 8 | 2. InK-Compact-: In kernel Stream Compaction and Its Application to Multi-kernel Data Visualization on GPGPU- D.M. Hughes 9 | 10 | It is an efficient CUDA implementation of the stream compaction algorithm based on **warp balloting intrinsic**. 11 | 12 | # How to use it 13 | Its usage is straightforward: 14 | 15 | - Create a predicate functor to decide whether an element is valid or not. 16 | ``` 17 | struct predicate 18 | { 19 | __host__ __device__ 20 | bool operator()(const int x) 21 | { 22 | return x>0; 23 | } 24 | }; 25 | ``` 26 | 27 | - Call the compact procedure to obtain the compacted array `d_output`.
```
cuCompactor::compact(d_data,d_output,length,predicate(),blockSize);
```

Note that both the input `d_data` and the output `d_output` arrays have to be allocated on device.


*PERFORMANCE*

![Alt text](/results/res.jpg?raw=true "Thrust Performance Comparison")
--------------------------------------------------------------------------------
/results/res.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/knotman90/cuStreamComp/a8d2718a456981b2aad6ae08e37884288a4eb51d/results/res.jpg
--------------------------------------------------------------------------------
/src/cuCompactor.cuh:
--------------------------------------------------------------------------------

/*
 * cuCompactor.cuh
 *
 * Created on: 21/mag/2015
 *     Author: knotman
 *
 * Warp-ballot based stream compaction, following Billeter et al. and
 * Hughes' InK-Compact.
 */

#ifndef CUCOMPACTOR_H_
#define CUCOMPACTOR_H_

// NOTE(review): the original #include targets were swallowed by the dump
// (angle brackets lost); thrust scan + device_ptr are what the code below
// uses — verify against the repository.
#include <thrust/scan.h>
#include <thrust/device_ptr.h>
#include "cuda_error_check.cu"

namespace cuCompactor {

// NOTE(review): this macro shadows CUDA's built-in device variable
// `warpSize`; kept for source compatibility with existing users, but a
// differently-named constant would be safer.
#define warpSize (32)
#define FULL_MASK 0xffffffff

// Ceiling division: divup(x, y) == ceil(x / y) for positive y.
// FIX: `inline` added — a non-inline function *definition* in a header
// violates the One Definition Rule as soon as two translation units
// include this file.
__host__ __device__ inline int divup(int x, int y) { return x / y + (x % y ? 1 : 0); }

// 2^e for small non-negative e; bit-mask helper for the warp-total scan.
__device__ __inline__ int pow2i(int e){
    return 1 << e;
}

/*
 * Phase 1 kernel: writes one entry of d_BlockCounts per thread block,
 * holding the number of elements of that block for which `predicate` is
 * true. Launch as a 1D grid covering `length` elements (the last block may
 * be partial).
 */
template <typename T, typename Predicate>
__global__ void computeBlockCounts(T* d_input, int length, int* d_BlockCounts, Predicate predicate){
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    // FIX: __syncthreads_count is a block-wide barrier. The original
    // executed it inside `if(idx < length)`, so in the tail block (length
    // not a multiple of blockDim.x) the out-of-range threads skipped the
    // barrier — undefined behavior. They now participate with pred == 0.
    int pred = (idx < length) ? (predicate(d_input[idx]) ? 1 : 0) : 0;
    int BC = __syncthreads_count(pred); // number of valid elements in this block
    if(threadIdx.x == 0){
        d_BlockCounts[blockIdx.x] = BC;
    }
}

/*
 * Phase 3 kernel: scatters the predicate-true elements of d_input to their
 * final compacted position in d_output. The output index of an element is
 * (valid lanes before it in its warp) + (warp offset within the block,
 * computed in shared memory) + d_BlocksOffset[blockIdx.x].
 *
 * Requires blockDim.x to be a multiple of warpSize. Dynamic shared memory:
 * (blockDim.x / warpSize) ints — one total per warp.
 */
template <typename T, typename Predicate>
__global__ void compactK(T* d_input, int length, T* d_output, int* d_BlocksOffset, Predicate predicate){
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    extern __shared__ int warpTotals[];

    // FIX: as in computeBlockCounts, the barriers and ballots below are
    // block-/warp-wide collectives; the original guarded the whole body
    // with `if(idx < length)`, making them divergent in the tail block.
    // Out-of-range threads now take part with pred == 0.
    int pred = (idx < length) ? (predicate(d_input[idx]) ? 1 : 0) : 0;
    int w_i = threadIdx.x / warpSize; // warp index within the block
    int w_l = idx % warpSize;         // lane index within the warp

    // Exclusive prefix of the predicate inside the warp: mask off the lanes
    // at or above this one, ballot, and popcount what remains.
    int t_m = FULL_MASK >> (warpSize - w_l); // mask of lanes strictly below w_l
#if (CUDART_VERSION < 9000)
    int b = __ballot(pred) & t_m;
#else
    int b = __ballot_sync(FULL_MASK, pred) & t_m;
#endif
    int t_u = __popc(b); // number of predicated-true lanes BEFORE this lane

    // Last lane of each warp publishes the warp's total valid count.
    if(w_l == warpSize-1){
        warpTotals[w_i] = t_u + pred;
    }

    // All warps must have filled warpTotals before the scan below.
    __syncthreads();

    // First warp turns warpTotals into an exclusive prefix sum — the output
    // offset of each warp inside the block.
    // NOTE(review): original lines 72-98 were lost in this dump; this
    // section is reconstructed from the upstream cuStreamComp sources —
    // verify against the repository.
    int numWarps = blockDim.x / warpSize;
    unsigned int numWarpsMask = FULL_MASK >> (warpSize - numWarps);
    if(w_i == 0 && w_l < numWarps){
        int w_i_u = 0;
        // warpTotals[w_l] <= warpSize == 32, so bits 0..5 suffice.
        for(int j = 0; j <= 5; j++){
#if (CUDART_VERSION < 9000)
            int b_j = __ballot(warpTotals[w_l] & pow2i(j));
#else
            int b_j = __ballot_sync(numWarpsMask, warpTotals[w_l] & pow2i(j));
#endif
            w_i_u += (__popc(b_j & t_m)) << j;
        }
        warpTotals[w_l] = w_i_u;
    }

    // Warp offsets must be final before any thread reads them.
    __syncthreads();

    // Scatter. pred != 0 implies idx < length, so the read is in range.
    if(pred){
        d_output[t_u + warpTotals[w_i] + d_BlocksOffset[blockIdx.x]] = d_input[idx];
    }
}

// Debug helper: prints `size` elements from a device array, starting a new
// row (prefixed with a running row counter) every `newline` elements.
// NOTE(review): the printf row-header format was partially lost in the
// dump and is reconstructed; only meaningful for integer T.
template <typename T>
__global__ void printArray_GPU(T* hd_data, int size, int newline){
    int w = 0;
    for(int i = 0; i < size; i++){
        if(i % newline == 0){
            printf("\n%i -> ", w);
            w++;
        }
        printf("%i ", hd_data[i]);
    }
    printf("\n");
}

/*
 * Host entry point.
 * Compacts d_input (device array of `length` elements) into d_output
 * (device array, preallocated with room for at least `length` elements),
 * keeping the elements for which `predicate` returns true, in order.
 * blockSize must be a multiple of warpSize (compactK requirement).
 * Returns the number of valid elements written to d_output.
 */
template <typename T, typename Predicate>
int compact(T* d_input, T* d_output, int length, Predicate predicate, int blockSize){
    // FIX: guard the empty case — a <<<0, blockSize>>> launch is an
    // invalid configuration, and cudaMalloc(…, 0) is pointless.
    if(length <= 0) return 0;

    int numBlocks = divup(length, blockSize);
    int* d_BlocksCount;
    int* d_BlocksOffset;
    CUDASAFECALL(cudaMalloc(&d_BlocksCount, sizeof(int)*numBlocks));
    CUDASAFECALL(cudaMalloc(&d_BlocksOffset, sizeof(int)*numBlocks));
    thrust::device_ptr<int> thrustPrt_bCount(d_BlocksCount);
    thrust::device_ptr<int> thrustPrt_bOffset(d_BlocksOffset);

    // phase 1: count number of valid elements in each thread block
    computeBlockCounts<<<numBlocks,blockSize>>>(d_input, length, d_BlocksCount, predicate);
    CUDACHECKERROR(); // surface launch-configuration errors early

    // phase 2: exclusive prefix sum of the block counts -> per-block output offsets
    thrust::exclusive_scan(thrustPrt_bCount, thrustPrt_bCount + numBlocks, thrustPrt_bOffset);

    // phase 3: per-warp and per-thread offsets, then scatter valid elements
    compactK<<<numBlocks,blockSize,sizeof(int)*(blockSize/warpSize)>>>(d_input, length, d_output, d_BlocksOffset, predicate);
    CUDACHECKERROR();

    // total = offset of the last block + its count
    // (dereferencing a thrust::device_ptr performs a blocking D2H copy)
    int compact_length = thrustPrt_bOffset[numBlocks-1] + thrustPrt_bCount[numBlocks-1];

    cudaFree(d_BlocksCount);
    cudaFree(d_BlocksOffset);

    return compact_length;
}

} /* namespace cuCompactor */
#endif /* CUCOMPACTOR_H_ */
--------------------------------------------------------------------------------
/src/cuda_error_check.cu:
--------------------------------------------------------------------------------
// NOTE(review): the original #include targets were lost in the dump;
// <cstdio>/<cstdlib> cover fprintf and exit used below.
#include <cstdio>
#include <cstdlib>

// Define this to turn on error checking
#define CUDA_ERROR_CHECK

#define CUDASAFECALL( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CUDACHECKERROR() __cudaCheckError( __FILE__, __LINE__ )

// Abort the process with file:line context if a CUDA runtime call
// returned an error code.
inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
    if ( cudaSuccess != err )
    {
        fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n",
                file, line, cudaGetErrorString( err ) );

        fprintf( stdout, "cudaSafeCall() failed at %s:%i : %s\n",
                file, line, cudaGetErrorString( err ) );
        exit( -1 );
    }
#endif

    return;
}

// Check for a pending (sticky) error — e.g. a bad kernel launch — and then
// synchronize the device to surface asynchronous execution errors too.
inline void __cudaCheckError( const char *file, const int line )
{
#ifdef CUDA_ERROR_CHECK
    cudaError err = cudaGetLastError();
    if ( cudaSuccess != err )
    {
        fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n",
                file, line, cudaGetErrorString( err ) );

        fprintf( stdout, "cudaCheckError() failed at %s:%i : %s\n",
                file, line, cudaGetErrorString( err ) );
        exit( -1 );
    }

    // More careful checking. However, this will affect performance.
    // Comment away if needed.
    err = cudaDeviceSynchronize();
    if( cudaSuccess != err )
    {
        fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
                file, line, cudaGetErrorString( err ) );

        fprintf( stdout, "cudaCheckError() with sync failed at %s:%i : %s\n",
                file, line, cudaGetErrorString( err ) );

        exit( -1 );
    }
#endif

    return;
}
--------------------------------------------------------------------------------
/src/tests.cu:
--------------------------------------------------------------------------------
// NOTE(review): the original #include targets were lost in the dump; the
// headers below cover the names used (printf, clock, rand, cout).
#include <cstdio>
#include <ctime>
#include "cuCompactor.cuh"
#include <iostream>
#include <cstdlib>
using namespace std;


// Predicate used throughout the tests: keep strictly positive values.
struct int_predicate
{
    __host__ __device__
    bool operator()(const int x)
    {
        return x > 0;
    }
};

#define randBound (50) //<100 : percent probability that an element is valid

// Fills h_data with NELEMENTS ints — either a fixed 0..stride-1 repeating
// pattern (randomOrStride == true) or a random 0/1 pattern — and counts in
// goodElements how many entries are nonzero (i.e. predicate-true).
void initiData(int *h_data, uint NELEMENTS, uint &goodElements, bool randomOrStride){
    ushort stride = 4;
    for (int i = 0; i < NELEMENTS; ++i) {
        if(randomOrStride)
            h_data[i] = i % stride;
        else
            h_data[i] = (rand()%100 <= randBound) ? 1 : 0;
        if(h_data[i])
            goodElements++;
    }
}

// Prints a host array, space-separated, for debugging.
// NOTE(review): the body past `cout <<` was lost in the dump; reconstructed.
void printData(int *h_data, uint NELEMENTS){
    for (int i = 0; i < NELEMENTS; ++i) {
        cout << h_data[i] << " ";
    }
    cout << endl;
}

// NOTE(review): checkVector and the first half of main() (original lines
// 36-87) were lost in this dump; everything from here to the surviving
// tail of main() is a reconstruction consistent with that tail — verify
// against the repository.
// Verifies that the first goodElements entries of the compacted array all
// satisfy the predicate; aborts the test run on the first mismatch.
void checkVector(int *h_data, uint NELEMENTS, uint goodElements){
    for (uint i = 0; i < goodElements; ++i) {
        if(!int_predicate()(h_data[i])){
            printf("TEST FAILED at index %u\n", i);
            exit(EXIT_FAILURE);
        }
    }
}

int main(){
    // Sweep problem sizes and block sizes; print (N, blockSize, millis) per run.
    for (uint NELEMENTS = 1024; NELEMENTS <= (1u << 24); NELEMENTS *= 2) {
        for (int blockSize = 32; blockSize <= 1024; blockSize *= 2) {
            uint NgoodElements = 0;
            size_t datasize = NELEMENTS * sizeof(int);

            // host allocation + initialization
            int *h_data = (int*)malloc(datasize);
            initiData(h_data, NELEMENTS, NgoodElements, false);

            // device allocation + upload
            int *d_data, *d_output;
            CUDASAFECALL(cudaMalloc(&d_data, datasize));
            CUDASAFECALL(cudaMalloc(&d_output, datasize));
            CUDASAFECALL(cudaMemcpy(d_data, h_data, datasize, cudaMemcpyHostToDevice));

            clock_t start = clock();
            cuCompactor::compact<int>(d_data, d_output, NELEMENTS, int_predicate(), blockSize);
            cudaDeviceSynchronize();
            clock_t end = clock();
            unsigned long millis = (end - start) * 1000 / CLOCKS_PER_SEC;


            //copy back results to host
            cudaMemcpy(h_data, d_output, datasize, cudaMemcpyDeviceToHost);
            //printData(h_data,NELEMENTS);
            checkVector(h_data, NELEMENTS, NgoodElements);
            //device memory free
            cudaFree(d_data);
            cudaFree(d_output);
            //host free memory
            free(h_data);
            // FIX: millis is unsigned long — the original printed it with %i
            // (undefined behavior); use %lu, and %u for the unsigned count.
            printf("(%u,%i,%lu)\n", NELEMENTS, blockSize, millis);
        }//for blocksize
    }//for elements
    printf("ALL TEST PASSED");

}