├── LICENSE ├── Makefile ├── README.md └── src ├── common-host.cpp ├── common-host.h ├── common.cu ├── common.h ├── generator_ETHZ.cu ├── generator_ETHZ.cuh ├── hash_join_clustered_probe.cu ├── join-primitives.cu ├── join-primitives.cuh ├── main.cu ├── partition-primitives.cu └── partition-primitives.cuh /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 4 | Ecole Polytechnique Federale de Lausanne 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXX=nvcc 2 | 3 | ARCH=sm_61 4 | #ARCH=sm_20 5 | 6 | #Use CXXFLAGS=_DENABLE_NVPROF in command line to compile selection with nvprof 7 | #CXXFLAGS+=-g 8 | #CXXFLAGS+=-G 9 | #CXXFLAGS+=-Xptxas 10 | #CXXFLAGS+=-v 11 | # CXXFLAGS+= -O3 -lineinfo -Xcompiler -fopenmp -std=c++11 12 | # CXXFLAGS+= -lineinfo -Xcompiler -fopenmp -std=c++11 --ptxas-options=-v,-preserve-relocs 13 | 14 | DEBUGFLAGS+= -g -G 15 | RELEASEFLAGS+= 16 | # RELEASEFLAGS+= -DNDEBUG 17 | 18 | CUDA_INSTALL_PATH?=/usr/local/cuda 19 | 20 | LDLIBS=-lgomp -lnuma 21 | 22 | INCLUDE_PATH =-I. 
-Icub 23 | 24 | # CXXFLAGS+= -lnuma 25 | CXXFLAGS+= -O3 -arch=$(ARCH) -lineinfo --std=c++11 26 | # -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED 27 | # CXXFLAGS+= -DNTESTMEMCPY 28 | # CXXFLAGS+= --maxrregcount=32 29 | CXXFLAGS+= -lineinfo -rdc=true 30 | CXXFLAGS+= --default-stream per-thread --expt-relaxed-constexpr 31 | CXXFLAGS+= --compiler-options='-O3 -fopenmp -mavx2 -mbmi2' 32 | #-Wall -Wunsafe-loop-optimizations 33 | 34 | # PROFFLAGS+= -L/usr/local/cuda/lib64 -lnvToolsExt 35 | 36 | CXXFLAGS+= $(INCLUDE_PATH) 37 | 38 | DBG_DIR=debug 39 | RLS_DIR=release 40 | 41 | BIN_ROOT=bin 42 | OBJ_ROOT=obj 43 | SRC_ROOT=src 44 | DEP_ROOT=.depend 45 | 46 | BIN_DBG=$(BIN_ROOT)/$(DBG_DIR)/ 47 | BIN_RLS=$(BIN_ROOT)/$(RLS_DIR)/ 48 | 49 | OBJ_DBG=$(OBJ_ROOT)/$(DBG_DIR)/ 50 | OBJ_RLS=$(OBJ_ROOT)/$(RLS_DIR)/ 51 | 52 | DEP_DBG=$(DEP_ROOT)/$(DBG_DIR)/ 53 | DEP_RLS=$(DEP_ROOT)/$(RLS_DIR)/ 54 | 55 | SED_ODD=$(subst /,\/,$(OBJ_DBG)) 56 | SED_ORD=$(subst /,\/,$(OBJ_RLS)) 57 | 58 | SED_DDD=$(subst /,\/,$(DEP_DBG)) 59 | SED_DRD=$(subst /,\/,$(DEP_RLS)) 60 | 61 | EXCLUDE_SOURCES+= src/exclude_me.cu 62 | EXCLUDE_SOURCES+= src/cub/% 63 | 64 | CXX_SOURCESD= $(shell find $(SRC_ROOT) -name "*.cpp") 65 | CUDA_SOURCESD= $(shell find $(SRC_ROOT) -name "*.cu") 66 | CXX_SOURCESD:= $(filter-out $(EXCLUDE_SOURCES),$(CXX_SOURCESD)) 67 | CUDA_SOURCESD:= $(filter-out $(EXCLUDE_SOURCES),$(CUDA_SOURCESD)) 68 | CXX_SOURCES= $(subst $(SRC_ROOT)/,,$(CXX_SOURCESD)) 69 | CUDA_SOURCES= $(subst $(SRC_ROOT)/,,$(CUDA_SOURCESD)) 70 | CXX_OBJECTS= $(CXX_SOURCES:.cpp=.o) 71 | CUDA_OBJECTS= $(CUDA_SOURCES:.cu=.o) 72 | 73 | OBJ_FILES:=$(addprefix $(OBJ_DBG), $(CXX_OBJECTS)) $(addprefix $(OBJ_RLS), $(CXX_OBJECTS)) $(addprefix $(OBJ_DBG), $(CUDA_OBJECTS)) $(addprefix $(OBJ_RLS), $(CUDA_OBJECTS)) 74 | 75 | # .DEFAULT_GOAL := release 76 | all: debug release 77 | 78 | debug:CXXFLAGS+= $(DEBUGFLAGS) $(PROFFLAGS) 79 | release:CXXFLAGS+= $(OPTFLAGS) $(PROFFLAGS) 80 | 81 | release:BIN_DIR:= $(BIN_RLS) 82 | release:IMP_DIR:= $(RLS_DIR) 83 | release:OBJ_DIR:= $(OBJ_RLS) 84 | # release:CXX_OBJ_D:= $(addprefix $(OBJ_RLS), $(CXX_OBJECTS)) $(addprefix $(OBJ_DBG), $(CUDA_OBJECTS)) 85 | 86 | debug:BIN_DIR:= $(BIN_DBG) 87 | debug:IMP_DIR:= $(DBG_DIR) 88 | debug:OBJ_DIR:= $(OBJ_DBG) 89 | # debug:CXX_OBJ_D:= $(addprefix $(OBJ_DBG), $(CXX_OBJECTS)) $(addprefix $(OBJ_DBG), $(CUDA_OBJECTS)) 90 | 91 | -include $(addprefix $(DEP_DBG), $(CUDA_SOURCES:.cu=.d)) 92 | -include $(addprefix $(DEP_RLS), $(CUDA_SOURCES:.cu=.d)) 93 | -include $(addprefix $(DEP_DBG), $(CXX_SOURCES:.cpp=.d)) 94 | -include $(addprefix $(DEP_RLS), $(CXX_SOURCES:.cpp=.d)) 95 | 96 | $(BIN_RLS)bench:$(addprefix $(OBJ_RLS), $(CXX_OBJECTS)) $(addprefix $(OBJ_RLS), $(CUDA_OBJECTS)) 97 | $(BIN_DBG)bench:$(addprefix $(OBJ_DBG), $(CXX_OBJECTS)) $(addprefix $(OBJ_DBG), $(CUDA_OBJECTS)) 98 | 99 | release: $(BIN_RLS)bench 100 | debug: $(BIN_DBG)bench 101 | 102 | .PHONY: all debug release 103 | 104 | space= 105 | #do not remove these lines!!! needed!!!
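# (The empty 'space=' assignment above together with the 'space+=' below is the usual GNU Make trick for building a variable that holds a single space: '+=' joins the old, empty value and the appended, empty text with one blank. That space is then used in the $(subst $(space),:,...) calls further down to turn the directory lists into colon-separated vpath search paths.)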
106 | space+= 107 | 108 | vpath %.o $(subst $(space),:,$(dir $(OBJ_FILES))) 109 | vpath %.cu $(subst $(space),:,$(dir $(CXX_SOURCESD))) 110 | vpath %.cpp $(subst $(space),:,$(dir $(CUDA_SOURCESD))) 111 | 112 | $(sort $(subst //,/,$(dir $(OBJ_FILES)))): 113 | mkdir -p $@ 114 | 115 | %.o: 116 | $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(filter $(subst $(OBJ_DIR),$(SRC_ROOT)/,$(@:.o=.cu)),$(CUDA_SOURCESD)) $(filter $(subst $(OBJ_DIR),$(SRC_ROOT)/,$(@:.o=.cpp)),$(CXX_SOURCESD)) -o $@ 117 | 118 | %bench: 119 | $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDLIBS) -o $@ $^ 120 | 121 | clean: 122 | -rm -r $(OBJ_ROOT) $(BIN_ROOT) $(DEP_ROOT) 123 | mkdir -p $(BIN_DBG) $(BIN_RLS) $(OBJ_DBG) $(OBJ_RLS) $(DEP_DBG) $(DEP_RLS) 124 | 125 | $(DEP_DBG)%.d: %.cu Makefile 126 | @mkdir -p $(@D) 127 | $(CXX) -E -Xcompiler "-isystem $(CUDA_INSTALL_PATH)/include -MM" $(CPPFLAGS) $(CXXFLAGS) $< | sed -r 's/^(\S+).(\S+):/$(SED_ODD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(<:.cu=.o))) $(SED_DDD)$(subst /,\/,$(<:.cu=.d)): \\\n Makefile \\\n/g' | sed -r 's/(\w)\s+(\w)/\1 \\\n \2/g' | sed '$$s/$$/\\\n | $(SED_ODD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(dir $<)))/g' | sed -r 's/(\w)+\/\.\.\///g' | awk '!x[$$0]++' > $@ 128 | 129 | $(DEP_RLS)%.d: %.cu Makefile 130 | @mkdir -p $(@D) 131 | $(CXX) -E -Xcompiler "-isystem $(CUDA_INSTALL_PATH)/include -MM" $(CPPFLAGS) $(CXXFLAGS) $< | sed -r 's/^(\S+).(\S+):/$(SED_ORD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(<:.cu=.o))) $(SED_DRD)$(subst /,\/,$(<:.cu=.d)): \\\n Makefile \\\n/g' | sed -r 's/(\w)\s+(\w)/\1 \\\n \2/g' | sed '$$s/$$/\\\n | $(SED_ORD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(dir $<)))/g' | sed -r 's/(\w)+\/\.\.\///g' | awk '!x[$$0]++' > $@ 132 | 133 | $(DEP_DBG)%.d: %.cpp Makefile 134 | @mkdir -p $(@D) 135 | $(CXX) -E -Xcompiler "-isystem $(CUDA_INSTALL_PATH)/include -MM" $(CPPFLAGS) $(CXXFLAGS) $< | sed -r 's/^(\S+).(\S+):/$(SED_ODD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(<:.cpp=.o))) $(SED_DDD)$(subst /,\/,$(<:.cpp=.d)): \\\n Makefile \\\n/g' | sed -r 's/(\w)\s+(\w)/\1 \\\n \2/g' | sed '$$s/$$/\\\n | $(SED_ODD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(dir $<)))/g' | sed -r 's/(\w)+\/\.\.\///g' | awk '!x[$$0]++' > $@ 136 | 137 | $(DEP_RLS)%.d: %.cpp Makefile 138 | @mkdir -p $(@D) 139 | $(CXX) -E -Xcompiler "-isystem $(CUDA_INSTALL_PATH)/include -MM" $(CPPFLAGS) $(CXXFLAGS) $< | sed -r 's/^(\S+).(\S+):/$(SED_ORD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(<:.cpp=.o))) $(SED_DRD)$(subst /,\/,$(<:.cpp=.d)): \\\n Makefile \\\n/g' | sed -r 's/(\w)\s+(\w)/\1 \\\n \2/g' | sed '$$s/$$/\\\n | $(SED_ORD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(dir $<)))/g' | sed -r 's/(\w)+\/\.\.\///g' | awk '!x[$$0]++' > $@ 140 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Code repository for the paper "Hardware-conscious Hash-Joins on GPUs" presented in ICDE 2019 2 | 3 | The publicly available version is a work-in-progress. Soon we will be adding more detailed documentation, 4 | tips for configurations and tuning, a better interface and some inline explanations for some design choices. 
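Building is driven by the Makefile in the repository root. As a rough sketch of the intended flow (assuming CUDA under `/usr/local/cuda` or `CUDA_INSTALL_PATH`, the CUB headers reachable through the `-Icub` include path, and OpenMP plus `libnuma` available for linking): `make release` compiles the sources under `src/` with `nvcc` for `ARCH=sm_61` (adjust `ARCH` in the Makefile for other GPUs) and links the benchmark driver to `bin/release/bench`, while `make debug` produces a `-g -G` build under `bin/debug/`.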
5 | -------------------------------------------------------------------------------- /src/common-host.cpp: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #include "common-host.h" 23 | 24 | #include 25 | 26 | double cpuSeconds() { 27 | struct timeval tp; 28 | gettimeofday(&tp, NULL); 29 | return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6); 30 | } 31 | 32 | void initializeSeq(int *in, size_t size) { 33 | for(int i = 0 ; i < size; i++) { 34 | in[i] = i; 35 | } 36 | } 37 | 38 | void initializeUniform(int *in, size_t size) { 39 | //srand (time(NULL)); 40 | //We want the input to be the same for every test 41 | //BUT: If seed is set to 1, the generator is reinitialized to 42 | //its initial value and produces the same values 43 | //as before any call to rand or srand. 44 | // srand (1); 45 | // for(int i = 0 ; i < size; i++) { 46 | // in[i] = rand() % size; 47 | // } 48 | 49 | for (int i = 0; i < size; i++) { 50 | in[i] = rand() % size; 51 | } 52 | } 53 | 54 | void initializeUniform(int *in, size_t size, int seed) { 55 | // //srand (time(NULL)); 56 | // //We want the input to be the same for every test 57 | // //BUT: If seed is set to 1, the generator is reinitialized to 58 | // //its initial value and produces the same values 59 | // //as before any call to rand or srand. 
60 | // srand(seed + 10); 61 | // for(int i = 0 ; i < size; i++) { 62 | // in[i] = rand() % size; 63 | // } 64 | 65 | struct random_data* rand_states; 66 | char* rand_statebufs; 67 | int nthreads = 1; 68 | int bufferSize = 32; 69 | rand_states = (struct random_data*) calloc(nthreads, 70 | sizeof(struct random_data)); 71 | rand_statebufs = (char*) calloc(nthreads, bufferSize); 72 | 73 | /* for each 'thread', initialize a PRNG (the seed is the first argument) */ 74 | //initstate_r(random(), &rand_statebufs[t], PRNG_BUFSZ, &rand_states[t]); 75 | initstate_r(seed + 10, &rand_statebufs[0], bufferSize, &rand_states[0]); 76 | int state1; 77 | 78 | for (int i = 0; i < size; i++) { 79 | random_r(&rand_states[0], &state1); 80 | in[i] = state1 % size; 81 | } 82 | 83 | free(rand_states); 84 | free(rand_statebufs); 85 | } 86 | 87 | void initializeUniform(int *in, size_t size, int maxNo, int seed) { 88 | //srand (time(NULL)); 89 | //We want the input to be the same for every test 90 | //BUT: If seed is set to 1, the generator is reinitialized to 91 | //its initial value and produces the same values 92 | //as before any call to rand or srand. 93 | // srand(seed + 10); 94 | // for(int i = 0 ; i < size; i++) { 95 | // in[i] = rand() % maxNo; 96 | // } 97 | 98 | struct random_data* rand_states; 99 | char* rand_statebufs; 100 | int nthreads = 1; 101 | int bufferSize = 32; 102 | rand_states = (struct random_data*) calloc(nthreads, 103 | sizeof(struct random_data)); 104 | rand_statebufs = (char*) calloc(nthreads, bufferSize); 105 | 106 | /* for each 'thread', initialize a PRNG (the seed is the first argument) */ 107 | //initstate_r(random(), &rand_statebufs[t], PRNG_BUFSZ, &rand_states[t]); 108 | initstate_r(seed + 10, &rand_statebufs[0], bufferSize, &rand_states[0]); 109 | int state1; 110 | for (int i = 0; i < size; i++) { 111 | random_r(&rand_states[0], &state1); 112 | in[i] = state1 % maxNo; 113 | } 114 | 115 | free(rand_states); 116 | free(rand_statebufs); 117 | } 118 | 119 | void initializeZero(int *in, size_t size) { 120 | for(int i = 0 ; i < size; i++) { 121 | in[i] = 0; 122 | } 123 | } 124 | 125 | int NumberOfSetBits(int i) //uint32_t 126 | { 127 | // Java: use >>> instead of >> 128 | // C or C++: use uint32_t 129 | i = i - ((i >> 1) & 0x55555555); 130 | i = (i & 0x33333333) + ((i >> 2) & 0x33333333); 131 | return (((i + (i >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; 132 | } 133 | -------------------------------------------------------------------------------- /src/common-host.h: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #ifndef COMMON_HOST_H_ 23 | #define COMMON_HOST_H_ 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | #define PAGESIZE 65568 //16384 * sizeof(int) + 32 // 30 | //#define PAGESIZE 16416 //16384 - 32 31 | //#define PAGESIZE 4096 32 | 33 | #include 34 | 35 | //nsight complains about cstdint (?) 36 | typedef unsigned int uint32_t; 37 | typedef unsigned long int uint64_t; 38 | 39 | typedef struct args { 40 | int *S; 41 | size_t S_els; 42 | char S_filename[50]; 43 | int *R; 44 | size_t R_els; 45 | char R_filename[50]; 46 | int threadsNum; 47 | // int blocksNum_max; 48 | // int valuesPerThread; 49 | unsigned int sharedMem; 50 | unsigned int pivotsNum; 51 | 52 | } args; 53 | 54 | /* Timing */ 55 | double cpuSeconds(); 56 | 57 | /* Benchmarking */ 58 | void initializeSeq(int *in, size_t size); 59 | void initializeUniform(int *in, size_t size); 60 | void initializeUniform(int *in, size_t size, int seed); 61 | void initializeUniform(int *in, size_t size, int maxNo, int seed); 62 | void initializeZero(int *in, size_t size); 63 | 64 | /* Bitmap Ops */ 65 | int NumberOfSetBits(int i); 66 | 67 | class time_block{ 68 | private: 69 | std::chrono::time_point start; 70 | std::string text ; 71 | public: 72 | inline time_block(std::string text = ""): 73 | text(text), start(std::chrono::system_clock::now()){} 74 | 75 | inline ~time_block(){ 76 | auto end = std::chrono::system_clock::now(); 77 | std::cout << text; 78 | std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 79 | } 80 | }; 81 | 82 | #endif /* COMMON_HOST_H_ */ 83 | -------------------------------------------------------------------------------- /src/common.cu: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #include "common.h" 23 | 24 | #include 25 | #include 26 | #include /* gettimeofday */ 27 | #include 28 | 29 | __constant__ unsigned int valuesToProcess; 30 | __device__ maxSize_type sum_dev; 31 | 32 | 33 | void recordTime(time_st *t) { 34 | gettimeofday(t, NULL); 35 | } 36 | 37 | unsigned int smallestGreaterPowerOf2(const unsigned int num) { 38 | unsigned int x = (UINT_MAX >> 1) + 1; //the greatest possible power of 2 39 | while (!(x & num)) 40 | x >>= 1; 41 | if (x ^ num) return x << 1; 42 | return x; /*size is already a power of 2*/ 43 | 44 | } 45 | 46 | unsigned int greatestLowerPowerOf2(const unsigned int num) { 47 | unsigned int x = (UINT_MAX >> 1) + 1; //the greatest possible power of 2 48 | while (!(x & num)) 49 | x >>= 1; 50 | if (x ^ num) return x; 51 | return x >> 1; /*size is already a power of 2*/ 52 | } 53 | 54 | void initialise_float(float *A, int N) { 55 | int i; 56 | for (i = 0; i < N; i++) 57 | A[i] = (float) (rand() & 0xff) / 10.0f; 58 | } 59 | 60 | void initialise_int(int *A, const int N) { 61 | int i; 62 | for (i = 0; i < N; i++) 63 | A[i] = rand() % 100 + 1; 64 | } 65 | 66 | void printArray_int(int *A, const size_t N) { 67 | if (N > (1 << 10)) return; 68 | int i; 69 | for (i = 0; i < N; i++) { 70 | printf("%5d", A[i]); 71 | if ((i + 1) % 35 == 0) printf("\n"); 72 | } 73 | printf("\n"); 74 | } 75 | 76 | void printArray_uint(unsigned int *A, const size_t N) { 77 | if (N > (1 << 22)) return; 78 | int i; 79 | for (i = 0; i < N; i++) { 80 | printf("%6u", A[i]); 81 | if ((i + 1) % 35 == 0) printf("\n"); 82 | } 83 | printf("\n"); 84 | } 85 | 86 | void printArray_maxSize_type(maxSize_type *A, const maxSize_type N) { 87 | if (N > (1 << 10)) return; 88 | int i; 89 | for (i = 0; i < N; i++) { 90 | printf("%5lu", A[i]); 91 | if ((i + 1) % 35 == 0) printf("\n"); 92 | } 93 | printf("\n"); 94 | } 95 | 96 | void printArray_char(char *A, const maxSize_type N) { 97 | if (N > (1 << 22)) return; 98 | maxSize_type i; 99 | for (i = 0; i < N; i++) { 100 | printf("%6d", A[i]); 101 | if ((i + 1) % 30 == 0) printf("\n"); 102 | } 103 | printf("\n"); 104 | } 105 | 106 | __global__ void copy(data_type *dataTO, data_type *dataFROM, const maxSize_type size) { 107 | maxSize_type gidx = blockIdx.x*blockDim.x + threadIdx.x; 108 | if(gidx >= size) return; 109 | dataTO[gidx] = dataFROM[gidx]; 110 | } 111 | 112 | __global__ void copy(maxSize_type *dataTO, maxSize_type *dataFROM, const maxSize_type size) { 113 | maxSize_type gidx = blockIdx.x*blockDim.x + threadIdx.x; 114 | if(gidx >= size) return; 115 | dataTO[gidx] = dataFROM[gidx]; 116 | } 117 | 118 | __global__ void scatter(data_type *dataIN, data_type *dataOUT, const maxSize_type size, maxSize_type *pos) { 119 | uint64_t gidx = blockIdx.x*blockDim.x + threadIdx.x; 120 | if(gidx >= size) return; 121 | // if(pos[gidx] > 1010) { 122 | // printf("(%d,%d) : data[%lu] <- %d\n", blockIdx.x, threadIdx.x, pos[gidx], dataIN[gidx]); 123 | // } 124 | dataOUT[pos[gidx]] = dataIN[gidx]; 125 | } 126 | 127 | /*Processes at most threadsNum elements of type uint64_t*/ 128 | __device__ void prefixSum_after(maxSize_type *data, const unsigned int size, maxSize_type *total) { 129 | unsigned int idx = threadIdx.x, idx_f, idx_s; 130 | 131 | /*iterate until the final result is computed */ 
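/* (Overview of the scan below, based on the code as written: this routine computes an exclusive suffix sum in place. The up-sweep loop folds pairs into the lower index (data[idx_f] += data[idx_s]), so when it finishes data[0] holds the sum of all 'size' elements; that value is returned through *total, data[0] is reset to the identity 0, and the down-sweep redistributes the partial sums. On exit data[i] holds the sum of the original data[i+1..size-1], e.g. {a,b,c,d} becomes {b+c+d, c+d, d, 0}. prefixSum_before further down is the mirrored variant, indexing from size-1, and yields the usual exclusive prefix sum {0, a, a+b, a+b+c}.) */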
132 | unsigned int stride = 1; 133 | for (stride = 1; stride < size; stride <<= 1) { 134 | idx_f = stride * (2 * idx); 135 | idx_s = stride * (2 * idx + 1); 136 | 137 | if (idx_s < size) data[idx_f] += data[idx_s]; 138 | 139 | /*wait for all the threads in the block to finish before going to the next iteration*/ 140 | __syncthreads(); 141 | } 142 | 143 | *total = data[0]; //all threads get the result; 144 | __syncthreads(); 145 | 146 | /*store the final results*/ 147 | if (threadIdx.x == 0) { 148 | data[0] = 0; 149 | } 150 | __syncthreads(); 151 | 152 | /*now go the other direction*/ 153 | for (stride >>= 1; stride > 0; stride >>= 1) { 154 | idx_f = stride * (2 * idx); 155 | idx_s = stride * (2 * idx + 1); 156 | 157 | if (idx_s < size) { 158 | int tmp = data[idx_s]; 159 | data[idx_s] = data[idx_f]; 160 | data[idx_f] = tmp + data[idx_s]; 161 | } 162 | __syncthreads(); 163 | } 164 | } 165 | 166 | /*Processes at most threadsNum elements of type maxSize_type*/ 167 | __device__ void prefixSum_before(maxSize_type *data, const unsigned int size, maxSize_type *total) { 168 | unsigned int idx = threadIdx.x; 169 | unsigned int idx_f, idx_s; 170 | 171 | 172 | unsigned int stride; 173 | 174 | /*iterate until the final result is computed */ 175 | for (stride = 1; stride < size; stride <<= 1) { 176 | idx_f = stride * (2 * idx); 177 | idx_s = stride * (2 * idx + 1); 178 | 179 | if (idx_s < size) data[size - 1 - idx_f] += data[size - 1 - idx_s]; 180 | 181 | /*wait for all the threads in the block to finish before going to the next iteration*/ 182 | __syncthreads(); 183 | } 184 | 185 | *total = data[size-1]; 186 | 187 | __syncthreads(); 188 | 189 | /*store the final results*/ 190 | if (threadIdx.x == 0) { 191 | data[size - 1] = 0; 192 | // printf("*total = %lu\n", *total); 193 | } 194 | 195 | __syncthreads(); 196 | 197 | /*now go the other direction*/ 198 | for (stride >>= 1; stride > 0; stride >>= 1) { 199 | idx_f = stride * (2 * idx); 200 | idx_s = stride * (2 * idx + 1); 201 | 202 | if (idx_s < size) { 203 | maxSize_type tmp = data[size - 1 - idx_s]; 204 | data[size - 1 - idx_s] = data[size - 1 - idx_f]; 205 | data[size - 1 - idx_f] = tmp + data[size - 1 - idx_s]; 206 | } 207 | __syncthreads(); 208 | } 209 | } 210 | 211 | /*Processes at most threadsNum elements of type data_type*/ 212 | __device__ void prefixSum_before(data_type *const data, const unsigned int size, data_type *total) { 213 | unsigned int idx = threadIdx.x; 214 | unsigned int idx_f, idx_s; 215 | 216 | 217 | unsigned int stride; 218 | 219 | /*iterate until the final result is computed */ 220 | for (stride = 1; stride < size; stride <<= 1) { 221 | idx_f = stride * (2 * idx); 222 | idx_s = stride * (2 * idx + 1); 223 | 224 | if (idx_s < size) data[size - 1 - idx_f] += data[size - 1 - idx_s]; 225 | 226 | /*wait for all the threads in the block to finish before going to the next iteration*/ 227 | __syncthreads(); 228 | } 229 | 230 | *total = data[size-1]; 231 | 232 | __syncthreads(); 233 | 234 | /*store the final results*/ 235 | if (threadIdx.x == 0) { 236 | data[size - 1] = 0; 237 | // printf("*total = %lu\n", *total); 238 | } 239 | 240 | __syncthreads(); 241 | 242 | /*now go the other direction*/ 243 | for (stride >>= 1; stride > 0; stride >>= 1) { 244 | idx_f = stride * (2 * idx); 245 | idx_s = stride * (2 * idx + 1); 246 | 247 | if (idx_s < size) { 248 | maxSize_type tmp = data[size - 1 - idx_s]; 249 | data[size - 1 - idx_s] = data[size - 1 - idx_f]; 250 | data[size - 1 - idx_f] = tmp + data[size - 1 - idx_s]; 251 | } 252 | 
__syncthreads(); 253 | } 254 | } 255 | 256 | __device__ void prefixSum_before_multiple(data_type *const data, const unsigned int size, data_type *total, unsigned int num) { 257 | unsigned int idx = threadIdx.x; 258 | unsigned int idx_f, idx_s; 259 | unsigned int i; 260 | 261 | unsigned int stride; 262 | 263 | for (i = 0; i < num; i++) { 264 | data_type *data_local = data + i * size; 265 | // printf("%d/%d (%d,%d) : data_local [%p, %p]\n", i, num, blockIdx.x, threadIdx.x, data_local, data_local+size); 266 | 267 | /*iterate until the final result is computed */ 268 | for (stride = 1; stride < size; stride <<= 1) { 269 | idx_f = stride * (2 * idx); 270 | idx_s = stride * (2 * idx + 1); 271 | 272 | if (idx_s < size) { 273 | // if(data_local + size - 1 - idx_f == addr) 274 | // printf("%d/%d U (%d,%d) -> data_local[%d] (%p) = %d, data_local[%d] (%p) = %d\n", i, num, blockIdx.x, threadIdx.x, size - 1 - idx_f, data_local + size - 1 - idx_f, 275 | // data_local[size - 1 - idx_f], size - 1 - idx_s, data_local + size - 1 - idx_s, data_local[size - 1 - idx_s]); 276 | data_local[size - 1 - idx_f] += data_local[size - 1 - idx_s]; 277 | // if(data_local + size - 1 - idx_f == addr) printf("%d/%d U (%d,%d) -> data_local[%d] = %d\n", i, num, blockIdx.x, threadIdx.x, size - 1 - idx_f, data_local[size - 1 - idx_f]); 278 | } 279 | 280 | /*wait for all the threads in the block to finish before going to the next iteration*/ 281 | __syncthreads(); 282 | } 283 | 284 | total[i] = data_local[size - 1]; 285 | // printf("%u : (%d,%d) -> total[%d] = %d\n", size, blockIdx.x, threadIdx.x, i, total[i]); 286 | 287 | __syncthreads(); 288 | // } 289 | 290 | // /*store the final results (assumes more threads than num)*/ 291 | // if (threadIdx.x < num) { 292 | // (data+threadIdx.x*size)[size - 1] = 0; 293 | //// printf("(%d,%d) : total[%d] = %d\n", blockIdx.x, threadIdx.x, threadIdx.x, total[threadIdx.x]); 294 | // } 295 | // 296 | // __syncthreads(); 297 | 298 | // for (i = 0; i < num; i++) { 299 | // data_type *data_local = data + i * size; 300 | // if(blockIdx.x == 0) printf("%d/%d\n", i,num); 301 | 302 | if (threadIdx.x == 0) data_local[size - 1] = 0; 303 | __syncthreads(); 304 | 305 | // printf("%p\n", addr); 306 | // if(data_local + threadIdx.x == addr) 307 | // printf("%d/%d D0 (%d,%d) -> (%p) data_local[%d] = %d\n", i, num, blockIdx.x, threadIdx.x, data_local + threadIdx.x, threadIdx.x, data_local[threadIdx.x]); 308 | 309 | /*now go the other direction*/ 310 | for (stride >>= 1; stride > 0; stride >>= 1) { 311 | idx_f = stride * (2 * idx); 312 | idx_s = stride * (2 * idx + 1); 313 | 314 | // if(blockIdx.x == 0) printf("%d/%d D0 (%d,%d) -> data_local[%d], data_local[%d]\n", i, num, blockIdx.x, threadIdx.x, size - 1 - idx_f, size - 1 - idx_s); 315 | 316 | if (idx_s < size) { 317 | // if(data_local + size - 1 - idx_f == addr) 318 | // printf("%d/%d D0 (%d,%d) -> (%p) data_local[%d] = %d, (%p) data_local[%d] = %d\n", i, num, blockIdx.x, threadIdx.x, data_local + size - 1 - idx_f, size - 1 - idx_f, 319 | // data_local[size - 1 - idx_f], data_local + size - 1 - idx_s, size - 1 - idx_s, data_local[size - 1 - idx_s]); 320 | // data_type tmps = data_local[size - 1 - idx_s]; 321 | // data_type tmpf = data_local[size - 1 - idx_f]; 322 | data_type tmp = data_local[size - 1 - idx_s]; 323 | // if (data_local + size - 1 - idx_f == addr) 324 | // printf("%d/%d D1 (%d,%d) -> data_local[%d] = %d, data_local[%d] = %d, %d\n", i, num, blockIdx.x, threadIdx.x, size - 1 - idx_f, data_local[size - 1 - idx_f], size - 1 - idx_s, 325 | // 
data_local[size - 1 - idx_s], tmp); 326 | data_local[size - 1 - idx_s] = data_local[size - 1 - idx_f]; 327 | // data_local[size - 1 - idx_f] = tmpf + tmps; 328 | // if (data_local + size - 1 - idx_f == addr) 329 | // printf("%d/%d D2 (%d,%d) -> data_local[%d] = %d, data_local[%d] = %d, %d\n", i, num, blockIdx.x, threadIdx.x, size - 1 - idx_f, data_local[size - 1 - idx_f], size - 1 - idx_s, 330 | // data_local[size - 1 - idx_s], tmp); 331 | data_local[size - 1 - idx_f] = tmp + data_local[size - 1 - idx_s]; 332 | // data_local[size - 1 - idx_s] = tmpf; 333 | // if (data_local + size - 1 - idx_f == addr) 334 | // printf("%d/%d D (%d,%d) -> data_local[%d] = %d, data_local[%d] = %d, %d\n", i, num, blockIdx.x, threadIdx.x, size - 1 - idx_f, data_local[size - 1 - idx_f], size - 1 - idx_s, 335 | // data_local[size - 1 - idx_s], tmp); 336 | } 337 | __syncthreads(); 338 | } 339 | 340 | __syncthreads(); 341 | } 342 | } 343 | 344 | /*Processes at most threadsNum elements of type uint64_t. Write the total to the sum_dev*/ 345 | __device__ void prefixSum_before_device(maxSize_type *data, const unsigned int size) { 346 | unsigned int idx = threadIdx.x; 347 | unsigned int idx_f, idx_s; 348 | unsigned int stride; 349 | 350 | // if(threadIdx.x == 0) printf("(%d,%d) : %u\n", blockIdx.x, threadIdx.x, size); 351 | 352 | // printf("B: %u data[%u] = %lu (%u,%u)\n", blockIdx.x, idx, data[idx], stride, size); 353 | 354 | /*iterate until the final result is computed */ 355 | for (stride = 1; stride < size; stride <<= 1) { 356 | idx_f = stride * (2 * idx); 357 | idx_s = stride * (2 * idx + 1); 358 | 359 | // if (idx_s < size) printf("%u (%d,%d) : %u, %u\n", stride, blockIdx.x, threadIdx.x, size - 1 - idx_f, size - 1 - idx_s); 360 | 361 | if (idx_s < size) data[size - 1 - idx_f] += data[size - 1 - idx_s]; 362 | 363 | /*wait for all the threads in the block to finish before going to the next iteration*/ 364 | __syncthreads(); 365 | } 366 | 367 | // printf("A: %u data[%u] = %lu\n", blockIdx.x, idx, data[idx]); 368 | 369 | /*store the final results*/ 370 | if (threadIdx.x == 0) { 371 | atomicAdd(&sum_dev, data[size-1]); 372 | data[size - 1] = 0; 373 | } 374 | 375 | __syncthreads(); 376 | 377 | /*now go the other direction*/ 378 | for (stride >>= 1; stride > 0; stride >>= 1) { 379 | idx_f = stride * (2 * idx); 380 | idx_s = stride * (2 * idx + 1); 381 | 382 | if (idx_s < size) { 383 | maxSize_type tmp = data[size - 1 - idx_s]; 384 | data[size - 1 - idx_s] = data[size - 1 - idx_f]; 385 | data[size - 1 - idx_f] = tmp + data[size - 1 - idx_s]; 386 | } 387 | __syncthreads(); 388 | } 389 | } 390 | 391 | __device__ void prefixSum_before_multipleSeq(data_type *const data, const unsigned int size, data_type *const borders, const unsigned int bordersNum, data_type *totalPerSeq) { 392 | /*set the ids and the data according to the ranges*/ 393 | unsigned int lidx = threadIdx.x; 394 | data_type range = data[lidx]; 395 | data_type border = borders[range]; 396 | 397 | lidx -= border; 398 | 399 | data_type *data_local = data+border; 400 | data_type *totalPerSeq_local = totalPerSeq + range; 401 | 402 | unsigned int size_local = 0; 403 | if(range == 0) size_local = borders[range+1]; 404 | else if(range == bordersNum-1) size_local = size-border; 405 | else size_local = borders[range+1]-border; 406 | 407 | if(threadIdx.x < bordersNum) 408 | totalPerSeq[threadIdx.x] = 0; //initialise 409 | 410 | __syncthreads(); 411 | 412 | // printf("(%d,%d) : range = %d, border = %d -> idx = %d, size = %d, data = %d\n", blockIdx.x, threadIdx.x, range, 
border, lidx, size_local, data_local[lidx]); 413 | /*continue with prefixSum as usual*/ 414 | unsigned int idx_f, idx_s; 415 | unsigned int stride; 416 | 417 | /*iterate until the final result is computed */ 418 | for (stride = 1; stride < size_local; stride <<= 1) { 419 | idx_f = stride * (2 * lidx); 420 | idx_s = stride * (2 * lidx + 1); 421 | 422 | if (idx_s < size_local) data_local[size_local - 1 - idx_f] += data_local[size_local - 1 - idx_s]; 423 | 424 | /*wait for all the threads in the block to finish before going to the next iteration*/ 425 | __syncthreads(); 426 | } 427 | 428 | *totalPerSeq_local = data_local[size_local-1]; 429 | // printf("(%d,%d) : total = %d (%d)\n", blockIdx.x, threadIdx.x, *totalPerSeq_local, range); 430 | 431 | __syncthreads(); 432 | 433 | /*store the final results*/ 434 | if (lidx == 0) { 435 | data_local[size_local - 1] = 0; 436 | 437 | } 438 | 439 | __syncthreads(); 440 | 441 | /*now go the other direction*/ 442 | for (stride >>= 1; stride > 0; stride >>= 1) { 443 | idx_f = stride * (2 * lidx); 444 | idx_s = stride * (2 * lidx + 1); 445 | 446 | if (idx_s < size_local) { 447 | maxSize_type tmp = data_local[size_local - 1 - idx_s]; 448 | data_local[size_local - 1 - idx_s] = data_local[size_local - 1 - idx_f]; 449 | data_local[size_local - 1 - idx_f] = tmp + data_local[size_local - 1 - idx_s]; 450 | } 451 | __syncthreads(); 452 | } 453 | 454 | // printf("(%d,%d) : data = %d\n", blockIdx.x, threadIdx.x, data_local[lidx]); 455 | } 456 | 457 | 458 | __global__ void prefixSum_before(maxSize_type *data, const maxSize_type size, maxSize_type *total) { 459 | if(size <= blockDim.x*blockIdx.x) return; /*nothing for this block*/ 460 | 461 | maxSize_type gidx = blockIdx.x*blockDim.x+threadIdx.x; 462 | maxSize_type blockTotal; 463 | 464 | /*write data to shared memory*/ 465 | if(gidx < size) uint64_shared[threadIdx.x] = data[gidx]; 466 | 467 | __syncthreads(); /*wait for all threads to finish writing in shared memory*/ 468 | 469 | unsigned int blockSize = blockDim.x; 470 | if(blockSize > size - blockDim.x*blockIdx.x) blockSize = size - blockDim.x*blockIdx.x; 471 | 472 | prefixSum_before(uint64_shared, blockSize, &blockTotal); 473 | 474 | /*write the results back from shared memory*/ 475 | if(gidx < size) data[gidx] = uint64_shared[threadIdx.x]; 476 | 477 | /*all threads in the block get the blockTotal but only one writes it in the output*/ 478 | if(threadIdx.x == 0) *(total+blockIdx.x) = blockTotal; 479 | } 480 | 481 | /*there is no total per block, there is only the sum of all totals of all blocks 482 | * If there is only one block then the sum of all totals is the total sum of the block. 483 | * The kernel does not return the total. 
it just computes it and stores it in case another 484 | * kernel wants to use it later*/ 485 | __global__ void prefixSum_before_device(maxSize_type *data, const maxSize_type size) { 486 | 487 | if(size <= blockDim.x*blockIdx.x) return; /*nothing for this block*/ 488 | 489 | maxSize_type gidx = blockIdx.x*blockDim.x+threadIdx.x; 490 | 491 | if(threadIdx.x == 0) sum_dev = 0; //initialise the sum to clean from any old result 492 | 493 | /*write data to shared memory*/ 494 | if(gidx < size) uint64_shared[threadIdx.x] = data[gidx]; 495 | 496 | __syncthreads(); /*wait for all threads to finish writing in shared memory*/ 497 | 498 | unsigned int blockSize = blockDim.x; 499 | if(blockSize > size - blockDim.x*blockIdx.x) blockSize = size - blockDim.x*blockIdx.x; 500 | 501 | prefixSum_before_device(uint64_shared, blockSize); 502 | 503 | /*write the results back from shared memory*/ 504 | if(gidx < size) data[gidx] = uint64_shared[threadIdx.x]; 505 | 506 | // if(threadIdx.x == 0) printf("%d : blockSum = %lu\n", blockIdx.x, sum_dev); 507 | } 508 | 509 | __device__ void sum(maxSize_type *data, const unsigned int size, maxSize_type *res) { 510 | /*iterate until the final result is computed */ 511 | maxSize_type stride = 1; 512 | for (stride = 1; stride < size; stride <<= 1) { 513 | maxSize_type idx_f = stride * (2 * threadIdx.x); 514 | maxSize_type idx_s = stride * (2 * threadIdx.x + 1); 515 | 516 | // if(idx_f < size) 517 | // printf("(%d, %d) : data[%lu] = %lu\n", blockIdx.x, threadIdx.x, idx_f, data[idx_f]); 518 | // if(idx_s < size) 519 | // printf("(%d, %d) : data[%lu] = %lu\n", blockIdx.x, threadIdx.x, idx_s, data[idx_s]); 520 | 521 | if (idx_s < size) data[idx_f] += data[idx_s]; 522 | 523 | /*wait for all the threads in the block to finish before going to the next iteration*/ 524 | __syncthreads(); 525 | } 526 | 527 | /*store the final results*/ 528 | *res = data[0]; 529 | // if (threadIdx.x == 0) printf("(%d,%d): %u %lu - %lu\n", blockIdx.x, threadIdx.x, size, *res, data[0]); 530 | } 531 | 532 | __device__ void sum_device(maxSize_type *data, const unsigned int size) { 533 | /*iterate until the final result is computed */ 534 | maxSize_type stride = 1; 535 | for (stride = 1; stride < size; stride <<= 1) { 536 | maxSize_type idx_f = stride * (2 * threadIdx.x); 537 | maxSize_type idx_s = stride * (2 * threadIdx.x + 1); 538 | 539 | // if(idx_f < size) 540 | // printf("(%d, %d) : data[%lu] = %lu\n", blockIdx.x, threadIdx.x, idx_f, data[idx_f]); 541 | // if(idx_s < size) 542 | // printf("(%d, %d) : data[%lu] = %lu\n", blockIdx.x, threadIdx.x, idx_s, data[idx_s]); 543 | 544 | if (idx_s < size) data[idx_f] += data[idx_s]; 545 | 546 | /*wait for all the threads in the block to finish before going to the next iteration*/ 547 | __syncthreads(); 548 | } 549 | 550 | /*store the final results*/ 551 | if(threadIdx.x == 0) sum_dev = data[0]; 552 | // if (threadIdx.x == 0) printf("(%d,%d): %u %lu - %lu\n", blockIdx.x, threadIdx.x, size, *res, data[0]); 553 | } 554 | 555 | __global__ void sum(maxSize_type *data, const maxSize_type size, maxSize_type *res) { 556 | if(size <= blockDim.x*blockIdx.x) return; /*nothing for this block*/ 557 | 558 | maxSize_type gidx = blockIdx.x*blockDim.x + threadIdx.x; 559 | 560 | maxSize_type total; 561 | 562 | /*write data to shared memory*/ 563 | if(gidx < size) uint64_shared[threadIdx.x] = data[gidx]; 564 | 565 | __syncthreads(); 566 | 567 | unsigned int blockSize = blockDim.x; 568 | if (blockSize > size - blockDim.x * blockIdx.x) blockSize = size - blockDim.x * blockIdx.x; 569 | 
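/* (The call below reduces this block's slice, which was staged into shared memory above; the device-level sum() leaves the block total in uint64_shared[0] and copies it into each thread's local 'total'. Note that thread 0 of every block then writes its block total to the same *res, so *res equals the grand total only when the kernel is launched with a single block.) */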
sum(uint64_shared, blockSize, &total); 570 | 571 | /*I want to write to output only once. For that I use a 572 | * local variable for the local and then have only thread 0 573 | * write to output 574 | */ 575 | if (threadIdx.x == 0) 576 | *res = total; 577 | } 578 | 579 | __global__ void sum_device(maxSize_type *data, const maxSize_type size) { 580 | if(size <= blockDim.x*blockIdx.x) return; /*nothing for this block*/ 581 | 582 | maxSize_type gidx = blockIdx.x*blockDim.x + threadIdx.x; 583 | 584 | /*write data to shared memory*/ 585 | if(gidx < size) uint64_shared[threadIdx.x] = data[gidx]; 586 | // printf("(%d,%d) : data[%lu] = %lu\n", blockIdx.x, threadIdx.x, gidx, data[gidx]); 587 | 588 | __syncthreads(); 589 | 590 | unsigned int blockSize = blockDim.x; 591 | if (blockSize > size - blockDim.x * blockIdx.x) blockSize = size - blockDim.x * blockIdx.x; 592 | sum_device(uint64_shared, blockSize); 593 | } 594 | 595 | 596 | /*I assume threads equal to the total number of elements per range. Data are already at shared memory*/ 597 | __device__ void prefixSum_sharedMem_before_multipleRanges(maxSize_type *data, maxSize_type data_els, maxSize_type *total1, 598 | maxSize_type *total2, maxSize_type *total3, unsigned int partitionsNum) { 599 | maxSize_type idx_f, idx_s; 600 | maxSize_type gidx = blockIdx.x * blockDim.x + threadIdx.x; 601 | unsigned int lidx = threadIdx.x; 602 | unsigned int size = blockDim.x; 603 | maxSize_type stride; 604 | unsigned int i; 605 | 606 | // if(gidx >= data_els) return; /* I want all threads to terminate so that all of them can have the total value*/ 607 | 608 | /*find exactly how many elements this block has in shared memory*/ 609 | if (size > data_els - blockIdx.x * blockDim.x) size = data_els - blockIdx.x * blockDim.x; 610 | 611 | // printf("\n(%d,%d) : shared[%u] = data[%lu] = %lu", blockIdx.x, lidx, lidx, gidx, uint64_shared[lidx]); 612 | 613 | maxSize_type *data_ptr; 614 | 615 | /*iterate until the final result is computed */ 616 | for (stride = 1; stride < size; stride <<= 1) { 617 | idx_f = stride * (2 * lidx); 618 | idx_s = stride * (2 * lidx + 1); 619 | 620 | data_ptr = data; 621 | for (i = 0; i < partitionsNum; i++) { 622 | 623 | // printf("\n%lu (%d,%d) : %lu, %lu -> %lu, %lu", stride, blockIdx.x, lidx, idx_f, idx_s, size - 1 - idx_f,size - 1 - idx_s); 624 | 625 | if (idx_s < size) { 626 | // uint64_t tmp = uint64_shared[size - 1 - idx_f]; 627 | data_ptr[size - 1 - idx_f] += data_ptr[size - 1 - idx_s]; 628 | // printf("\nUP %lu (%d,%d) : shared[%u] = %lu+%lu=%lu", stride, blockIdx.x, lidx, size - 1 - idx_f, tmp, uint64_shared[size - 1 - idx_s], uint64_shared[size - 1 - idx_f]); 629 | } 630 | data_ptr += blockDim.x; 631 | } 632 | 633 | /*wait for all the threads in the block to finish before going to the next iteration*/ 634 | __syncthreads(); 635 | } 636 | 637 | /*store the final results*/ 638 | // if (threadIdx.x == 0) { 639 | *total1 = data[size - 1]; 640 | *total2 = data[blockDim.x + size - 1]; 641 | *total3 = data[2 * blockDim.x + size - 1]; 642 | 643 | __syncthreads(); 644 | //printf("\n-> (%d,%d) : %d", blockIdx.x, threadIdx.x, *total); 645 | if (threadIdx.x == 0) { 646 | for (i = 0; i < partitionsNum; i++) 647 | data[i * blockDim.x + size - 1] = 0; 648 | } 649 | 650 | __syncthreads(); 651 | 652 | /*now go the other direction*/ 653 | for (stride >>= 1; stride > 0; stride >>= 1) { 654 | idx_f = stride * (2 * lidx); 655 | idx_s = stride * (2 * lidx + 1); 656 | 657 | data_ptr = data; 658 | for (i = 0; i < partitionsNum; i++) { 659 | 660 | if (idx_s < 
size) { 661 | maxSize_type tmp = data_ptr[size - 1 - idx_s]; 662 | data_ptr[size - 1 - idx_s] = data_ptr[size - 1 - idx_f]; 663 | data_ptr[size - 1 - idx_f] = tmp + data_ptr[size - 1 - idx_s]; 664 | // printf("\nDOWN %lu (%d,%d) : shared[%u] = %lu - shared[%u] = %lu", stride, blockIdx.x, lidx, size - 1 - idx_f, uint64_shared[size - 1 - idx_f], size - 1 - idx_s, uint64_shared[size - 1 - idx_s]); 665 | } 666 | data_ptr += blockDim.x; 667 | } 668 | __syncthreads(); 669 | } 670 | } 671 | 672 | __device__ void sum(int *data, unsigned int size, int *res) { 673 | unsigned int idx, idx_f, idx_s; 674 | 675 | /*iterate until the final result is computed */ 676 | int stride = 1; 677 | for (stride = 1; stride < size; stride <<= 1) { 678 | idx = threadIdx.x; 679 | 680 | idx_f = stride * (2 * idx); 681 | idx_s = stride * (2 * idx + 1); 682 | 683 | if (idx_s < size) data[idx_f] += data[idx_s]; 684 | 685 | /*wait for all the threads in the block to finish before going to the next iteration*/ 686 | __syncthreads(); 687 | } 688 | 689 | // /*store the final results*/ 690 | // if (threadIdx.x == 0) { 691 | *res = data[0]; 692 | // } 693 | } 694 | 695 | 696 | 697 | __device__ void max(int *data, unsigned int size, int *res) { 698 | unsigned int idx, idx_f, idx_s; 699 | 700 | /*iterate until the final result is computed */ 701 | int stride = 1; 702 | for (stride = 1; stride < size; stride <<= 1) { 703 | idx = threadIdx.x; 704 | 705 | idx_f = stride * (2 * idx); 706 | idx_s = stride * (2 * idx + 1); 707 | 708 | while (idx_s < size) { 709 | // int tmp = data[idx_f]; 710 | if (data[idx_f] < data[idx_s]) data[idx_f] = data[idx_s]; 711 | // printf("UP %d (%d,%d): %d [%d]=[%d]+[%d]=%d+%d=%d\n", size, blockIdx.x, threadIdx.x, stride, idx_f, idx_f, idx_s, tmp, data[idx_s], data[idx_f]); 712 | 713 | idx += blockDim.x; 714 | 715 | idx_f = stride * (2 * idx); 716 | idx_s = stride * (2 * idx + 1); 717 | } 718 | 719 | /*wait for all the threads in the block to finish before going to the next iteration*/ 720 | __syncthreads(); 721 | } 722 | 723 | /*store the final results*/ 724 | if (threadIdx.x == 0) { 725 | *res = data[0]; 726 | // printf("%d: Max = %d\n", blockIdx.x, *res); 727 | } 728 | } 729 | 730 | __device__ void min(int *data, unsigned int size, int *res) { 731 | unsigned int idx, idx_f, idx_s; 732 | 733 | /*iterate until the final result is computed */ 734 | int stride = 1; 735 | for (stride = 1; stride < size; stride <<= 1) { 736 | idx = threadIdx.x; 737 | 738 | idx_f = stride * (2 * idx); 739 | idx_s = stride * (2 * idx + 1); 740 | 741 | while (idx_s < size) { 742 | // int tmp = data[idx_f]; 743 | if (data[idx_f] > data[idx_s]) data[idx_f] = data[idx_s]; 744 | // printf("UP %d (%d,%d): %d [%d]=[%d]+[%d]=%d+%d=%d\n", size, blockIdx.x, threadIdx.x, stride, idx_f, idx_f, idx_s, tmp, data[idx_s], data[idx_f]); 745 | 746 | idx += blockDim.x; 747 | 748 | idx_f = stride * (2 * idx); 749 | idx_s = stride * (2 * idx + 1); 750 | } 751 | 752 | /*wait for all the threads in the block to finish before going to the next iteration*/ 753 | __syncthreads(); 754 | } 755 | 756 | /*store the final results*/ 757 | if (threadIdx.x == 0) { 758 | *res = data[0]; 759 | // printf("%d: Min = %d\n", blockIdx.x, *res); 760 | } 761 | } 762 | 763 | __global__ void aggregate(int *data, unsigned int size, int *res, int funcId) { 764 | /*all processing should be done in the same block*/ 765 | if (blockIdx.x > 0) return; 766 | 767 | unsigned int idx; 768 | for (idx = threadIdx.x; idx < size; idx += blockDim.x) 769 | int_shared[idx] = data[idx]; 770 
| 771 | __syncthreads(); 772 | 773 | if (funcId == 1) 774 | min(int_shared, size, res); 775 | else if (funcId == 2) 776 | max(int_shared, size, res); 777 | else if (funcId == 3) sum(int_shared, size, res); 778 | } 779 | 780 | void* test(size_t sz) { 781 | //CUDA UVA 782 | 783 | void* mem; 784 | cudaHostAlloc((void **) &mem, sz, cudaHostAllocMapped); 785 | 786 | if (mem) 787 | return mem; 788 | else 789 | return NULL; 790 | } 791 | 792 | #define PREFIX_SUM2 793 | #define SUM 794 | 795 | static __global__ void addWithStepANDsum_device(maxSize_type *data, maxSize_type size, maxSize_type *total, maxSize_type total_els, maxSize_type group) { 796 | maxSize_type gidx = blockDim.x * blockIdx.x + threadIdx.x; 797 | 798 | if (blockIdx.x <= group) return; 799 | 800 | // if(threadIdx.x == 0) printf("(%d,%d) : blockSum = %lu\n", blockIdx.x, threadIdx.x, sum_dev); 801 | 802 | unsigned int blockIdx_normalised = blockIdx.x - group; 803 | #if defined(SUM) || defined(PREFIX_SUM) 804 | if (blockIdx_normalised >= total_els && gidx < size) { 805 | /*use the pre-computed sum*/ 806 | // if(gidx == 20) printf("(%d,%d) : blockSum = %lu\n", blockIdx.x, threadIdx.x, sum_dev); 807 | data[gidx] += sum_dev; 808 | } else { 809 | #endif 810 | /*use the pre-computed prefixSum*/ 811 | // if(gidx == 20) printf("(%d,%d) : blockSum = %lu\n", blockIdx.x, threadIdx.x, total[blockIdx_normalised]); 812 | #if defined(PREFIX_SUM) 813 | if(gidx < size) data[gidx] += total[blockIdx_normalised]; 814 | #else 815 | if (threadIdx.x < blockIdx_normalised && threadIdx.x < total_els) 816 | uint64_shared[threadIdx.x] = total[threadIdx.x]; 817 | 818 | // if(threadIdx.x == 0) printf("(%d,%d) : shared memory\n", blockIdx.x, threadIdx.x); 819 | 820 | __syncthreads(); 821 | 822 | maxSize_type sumOfAll; 823 | unsigned int elsNum = blockDim.x; 824 | if (blockIdx.x - group < blockDim.x) elsNum = blockIdx.x - group; 825 | 826 | sum(uint64_shared, elsNum, &sumOfAll); 827 | 828 | // if(gidx == 4) 829 | // printf("(%d,%d) : %lu elsNum = %lu, sum = %lu\n", blockIdx.x, threadIdx.x, gidx, elsNum, sumOfAll); 830 | 831 | if(gidx < size) data[gidx] += sumOfAll; 832 | #endif 833 | #if defined(SUM) || defined(PREFIX_SUM) 834 | } 835 | #endif 836 | } 837 | 838 | void totalPrefixSum(maxSize_type *data, maxSize_type size, maxSize_type *total, maxSize_type *sumOfAll, uint threadsNum, unsigned int iterNum) { 839 | unsigned int blocksNum = (size + threadsNum - 1) / threadsNum; 840 | dim3 block(threadsNum); 841 | dim3 grid(blocksNum); 842 | unsigned int sharedMemSize = threadsNum * sizeof(maxSize_type); /*one positions for each thread*/ 843 | 844 | unsigned int i = 0; 845 | maxSize_type s = data[size - 1]; 846 | 847 | //printf("%u --> s: %u d: %u - %u t: %u - %u\n", iterNum, sumOfAll, data, data+size, total, total+block.x); 848 | /*compute first prefix sum*/ 849 | prefixSum_before<<>>(data, size, total); 850 | 851 | 852 | if (blocksNum > threadsNum) { 853 | for (i = 0; i < blocksNum - threadsNum; i += threadsNum) { 854 | // printf("Adding %lu (%d, %d)\n", i, blocksNum, threadsNum); 855 | /*compute the sum of as many elements as the number of threads*/ 856 | #if defined(PREFIX_SUM) 857 | prefixSum_before_device<<>>(total+i, (maxSize_type)threadsNum); 858 | #elif defined(SUM) 859 | sum_device<<>>(total+i, (maxSize_type)threadsNum); 860 | #endif 861 | // CHK_ERROR(cudaDeviceSynchronize()); 862 | addWithStepANDsum_device<<>>(data, size, total+i, threadsNum, i); 863 | CHK_ERROR(cudaDeviceSynchronize()); 864 | } 865 | } 866 | 867 | /*last iteration*/ 868 | // printf("Last 
adding %lu (%d, %d) remaining %lu\n", i, blocksNum, threadsNum, blocksNum-i); 869 | #if defined(PREFIX_SUM) 870 | prefixSum_before_device<<>>(total+i, (maxSize_type)blocksNum-i); 871 | #endif 872 | addWithStepANDsum_device<<>>(data, size, total+i, blocksNum-i, i); 873 | CHK_ERROR(cudaDeviceSynchronize()); 874 | 875 | s += data[size - 1]; 876 | *sumOfAll = s; 877 | 878 | // printf("R sumOfAll = %lu \n", *sumOfAll); 879 | 880 | } 881 | ///* Globally added mem allocators */ 882 | //__host__ void* operator new(size_t sz) throw (std::bad_alloc) 883 | //{ 884 | //// cerr << "allocating " << sz << " bytes\n"; 885 | //// void* mem = malloc(sz); 886 | //// if (mem) 887 | //// return mem; 888 | //// else 889 | //// return NULL; 890 | //// //throw std::bad_alloc(); 891 | // 892 | //// //CUDA UVA 893 | // cerr << "[UVA: ] allocating " << sz << " bytes\n"; 894 | // void* mem; 895 | // cudaHostAlloc((void **) &mem, sz, cudaHostAllocMapped); 896 | // 897 | // if (mem) 898 | // return mem; 899 | // else { 900 | // cerr << "[UVA: ] error during allocation!" << endl; 901 | // return NULL; 902 | // //throw std::bad_alloc(); 903 | // } 904 | // 905 | // //throw std::bad_alloc(); 906 | //} 907 | // 908 | //__host__ void operator delete(void* ptr) throw() 909 | //{ 910 | // cerr << "deallocating at " << ptr << endl; 911 | // //free(ptr); 912 | // 913 | // cudaFreeHost(ptr); 914 | // 915 | //} 916 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #ifndef COMMON_H_ 23 | #define COMMON_H_ 24 | 25 | #include /*uint8_t, uint16_t, uint32_t, uint64_t*/ 26 | #include 27 | #include 28 | 29 | /* Constants */ 30 | #define WARP_SZ 32 31 | #define NSTREAM 16//32 32 | #define BDIM 1024 33 | //Must be equal to BDIM, no? 
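// (Presumably yes: the per-block kernels in common.cu stage one element per thread into the extern __shared__ buffers, and the launch in totalPrefixSum sizes the dynamic shared memory as threadsNum * sizeof(maxSize_type), one slot per thread, so the shared-memory element count is expected to track the block dimension, i.e. SHMEMDIM == BDIM.)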
34 | #define SHMEMDIM 1024 35 | 36 | #define DELIM ',' 37 | 38 | #define COMPUTE_CAPABILITY_5 39 | 40 | #define BANKSNUM 4 41 | #define BANKSIZE 8 42 | #define PADSTEP BANKSNUM*BANKSIZE/sizeof(int); 43 | #define SHIFT log2((double)BANKSNUM*BANKSIZE/sizeof(int)) 44 | 45 | __host__ __device__ __forceinline__ uint32_t hasht(uint32_t x) { 46 | return x; 47 | } 48 | 49 | #define CHUNK_SIZE ((uint64_t) (1 << 31)) 50 | 51 | constexpr uint32_t log_parts1 = 8;//9; //< 12 2^(log_parts1 + log_parts2 + p_d + 5) ~= 'hash table size" ~= 2 * input size 52 | constexpr uint32_t log_parts2 = 5;//6;//8; //< 12 53 | 54 | constexpr int32_t g_d = log_parts1 + log_parts2; 55 | constexpr int32_t p_d = 3; 56 | 57 | constexpr int32_t max_chain = (32 - 1) * 1 - 1; //(32 - 1) * 2 - 1; 58 | 59 | #define hj_d (5 + p_d + g_d) 60 | 61 | constexpr uint32_t hj_mask = ((1 << hj_d) - 1); 62 | 63 | constexpr int32_t partitions = 1 << p_d; 64 | constexpr int32_t partitions_mask = partitions - 1; 65 | 66 | constexpr int32_t grid_parts = 1 << g_d; 67 | constexpr int32_t grid_parts_mask = grid_parts - 1; 68 | 69 | constexpr uint32_t log2_bucket_size = 12; 70 | constexpr uint32_t bucket_size = 1 << log2_bucket_size; 71 | constexpr uint32_t bucket_size_mask = bucket_size - 1; 72 | 73 | 74 | #define MEM_TYPE 0 75 | 76 | #if MEM_TYPE == 0 77 | #define MEM_HOST 78 | #elif MEM_TYPE == 1 79 | #define MEM_DEVICE 80 | #elif MEM_TYPE == 2 81 | #define MEM_MANAGED 82 | #elif MEM_TYPE == 3 83 | #define MEM_S_DEVICE 84 | #else 85 | #define MEM_HOST 86 | #endif 87 | 88 | #define data_type int 89 | #define maxSize_type unsigned long long int 90 | #define data_min INT_MIN 91 | 92 | extern __shared__ data_type int_shared[]; 93 | extern __shared__ maxSize_type uint64_shared[]; 94 | 95 | extern __constant__ unsigned int valuesToProcess; 96 | 97 | extern __device__ maxSize_type sum_dev; 98 | 99 | typedef struct timeval time_st; 100 | 101 | typedef struct timingInfo { 102 | unsigned int n = 5; 103 | time_st start[5]; 104 | time_st end[5]; 105 | double greaterTime = 0; 106 | double reduce_usecs = 0; 107 | double fixPositions_usecs = 0; 108 | double scatter_usecs = 0; 109 | double copy_usecs = 0; 110 | double bitonic_usecs = 0; 111 | double total_usecs = 0; 112 | 113 | double greaterEventTime = 0; 114 | 115 | unsigned int greaterCallsNum = 0; 116 | unsigned int bitonicCallsNum = 0; 117 | unsigned int reduceCallsNum = 0; 118 | unsigned int fixPositionsCallsNum = 0; 119 | } timingInfo; 120 | 121 | union vec4{ 122 | int4 vec ; 123 | int32_t i[4]; 124 | }; 125 | 126 | union vec2{ 127 | long2 vec ; 128 | int64_t i[4]; 129 | }; 130 | 131 | /* Error Checking*/ 132 | #define CHK_ERROR(call) \ 133 | { \ 134 | const cudaError_t error = call; \ 135 | if (error != cudaSuccess) \ 136 | { \ 137 | fprintf(stderr, "GPU Error: %s:%d, ", __FILE__, __LINE__); \ 138 | fprintf(stderr, "code:%d, reason: %s\n", error, cudaGetErrorString(error)); \ 139 | exit(-10*error); \ 140 | } \ 141 | } 142 | 143 | __device__ __forceinline__ uint32_t get_laneid(){ 144 | uint32_t laneid; 145 | asm("mov.u32 %0, %%laneid;" : "=r"(laneid)); 146 | return laneid; 147 | } 148 | 149 | //__host__ void* operator new(size_t sz) throw (std::bad_alloc); 150 | //__host__ void operator delete(void* ptr) throw(); 151 | 152 | #define USECS(start, end) (((end)->tv_sec * 1000000L + (end)->tv_usec) - ((start)->tv_sec * 1000000L + (start)->tv_usec)) 153 | #define MSECS(start, end) (((end)->tv_sec * 1000000L + (end)->tv_usec) - ((start)->tv_sec * 1000000L + (start)->tv_usec))/1000.0 154 | 155 | void 
recordTime(time_st *t); 156 | 157 | unsigned int smallestGreaterPowerOf2(const unsigned int num); 158 | unsigned int greatestLowerPowerOf2(const unsigned int num); 159 | 160 | void initialise_float(float *A, int N); 161 | void initialise_int(int *A, const int N); 162 | void printArray_int(int *A, const size_t N); 163 | void printArray_uint(unsigned int *A, const size_t N); 164 | void printArray_char(char *A, const maxSize_type N); 165 | void printArray_maxSize_type(maxSize_type *A, const maxSize_type N); 166 | 167 | void totalPrefixSum(maxSize_type *data, maxSize_type size, maxSize_type *total, maxSize_type *sumOfAll, unsigned int threadsNum, unsigned int iterNum); 168 | 169 | /*per block*/ 170 | static __device__ void prefixSum_after(maxSize_type *data, const unsigned int size, maxSize_type *total); 171 | __device__ void prefixSum_before(maxSize_type *data, const unsigned int size, maxSize_type *total); 172 | __device__ void prefixSum_before(data_type *const data, const unsigned int size, data_type *total); 173 | __device__ void prefixSum_before_multiple(data_type *const data, const unsigned int size, data_type *total, unsigned int num); 174 | static __device__ void prefixSum_before_device(maxSize_type *data, const unsigned int size); 175 | __device__ void prefixSum_before_multipleSeq(data_type *const data, const unsigned int sise, data_type *const borders, const unsigned int bordersNum, data_type *totalPerSeq); 176 | 177 | static __device__ void sum(maxSize_type *data, const unsigned int size, maxSize_type *res); 178 | static __device__ void sum_device(maxSize_type *data, const unsigned int size); 179 | 180 | /*the whole dataset*/ 181 | __global__ void prefixSum_before(maxSize_type *data, const maxSize_type size, maxSize_type *total); 182 | __global__ void prefixSum_before_device(maxSize_type *data, const maxSize_type size); 183 | 184 | __global__ void sum(maxSize_type *data, const maxSize_type size, maxSize_type *res); 185 | __global__ void sum_device(maxSize_type *data, const maxSize_type size); 186 | 187 | __global__ void copy(data_type *dataTO, data_type *dataFROM, const maxSize_type size); 188 | __global__ void copy(maxSize_type *dataTO, maxSize_type *dataFROM, const maxSize_type size); 189 | 190 | __global__ void scatter(data_type *dataIN, data_type *dataOUT, maxSize_type size, maxSize_type *pos); 191 | 192 | 193 | __device__ void prefixSum_sharedMem_before_multipleRanges(maxSize_type *data, maxSize_type data_els, maxSize_type *total1, 194 | maxSize_type *total2, maxSize_type *total3, unsigned int partitionsNum); 195 | 196 | 197 | __device__ void sum(int *data, unsigned int size, int *res); 198 | 199 | __device__ void max(int *data, unsigned int size, int *res); 200 | __device__ void min(int *data, unsigned int size, int *res); 201 | 202 | __global__ void aggregate(int *data, unsigned int size, int *res, int funcId); 203 | 204 | // Handle missmatch of atomics for (u)int64/32_t with cuda's definitions 205 | template::type = 0> 208 | __device__ __forceinline__ T atomicExch(T *address, T val){ 209 | return (T) atomicExch((unsigned long long int*) address, (unsigned long long int) val); 210 | } 211 | 212 | template::value, 214 | int>::type = 0> 215 | __device__ __forceinline__ T atomicExch(T *address, T val){ 216 | return (T) atomicExch((unsigned int*) address, (unsigned int) val); 217 | } 218 | 219 | template::type = 0> 222 | __device__ __forceinline__ T atomicExch_block(T *address, T val){ 223 | return (T) atomicExch_block((unsigned long long int*) address, (unsigned long long int) 
val); 224 | } 225 | 226 | template::value, 228 | int>::type = 0> 229 | __device__ __forceinline__ T atomicExch_block(T *address, T val){ 230 | return (T) atomicExch_block((unsigned int*) address, (unsigned int) val); 231 | } 232 | 233 | 234 | template::value, 236 | int>::type = 0> 237 | __device__ __forceinline__ T atomicExch(T *address, T val){ 238 | return (T) atomicExch((int*) address, (int) val); 239 | } 240 | 241 | template::type = 0> 244 | __device__ __forceinline__ T atomicOr(T *address, T val){ 245 | return (T) atomicOr((unsigned long long int*) address, (unsigned long long int) val); 246 | } 247 | 248 | template::value, 250 | int>::type = 0> 251 | __device__ __forceinline__ T atomicOr(T *address, T val){ 252 | return (T) atomicOr((unsigned int*) address, (unsigned int) val); 253 | } 254 | 255 | template::type = 0> 258 | __device__ __forceinline__ T atomicOr_block(T *address, T val){ 259 | return (T) atomicOr_block((unsigned long long int*) address, (unsigned long long int) val); 260 | } 261 | 262 | template::value, 264 | int>::type = 0> 265 | __device__ __forceinline__ T atomicOr_block(T *address, T val){ 266 | return (T) atomicOr_block((unsigned int*) address, (unsigned int) val); 267 | } 268 | 269 | 270 | template::value, 272 | int>::type = 0> 273 | __device__ __forceinline__ T atomicOr(T *address, T val){ 274 | return (T) atomicOr((int*) address, (int) val); 275 | } 276 | 277 | template::value && !std::is_signed::value, 279 | int>::type = 0> 280 | __device__ __forceinline__ T atomicMin(T *address, T val){ 281 | return (T) atomicMin((unsigned long long int*) address, (unsigned long long int) val); 282 | } 283 | 284 | template::value && !std::is_signed::value, 286 | int>::type = 0> 287 | __device__ __forceinline__ T atomicMin(T *address, T val){ 288 | return (T) atomicMin((unsigned int*) address, (unsigned int) val); 289 | } 290 | 291 | template::value && std::is_signed::value, 293 | int>::type = 0> 294 | __device__ __forceinline__ T atomicMin(T *address, T val){ 295 | return (T) atomicMin((int*) address, (int) val); 296 | } 297 | 298 | template::value && !std::is_signed::value, 300 | int>::type = 0> 301 | __device__ __forceinline__ T atomicMin_block(T *address, T val){ 302 | return (T) atomicMin_block((unsigned long long int*) address, (unsigned long long int) val); 303 | } 304 | 305 | template::value && !std::is_signed::value, 307 | int>::type = 0> 308 | __device__ __forceinline__ T atomicMin_block(T *address, T val){ 309 | return (T) atomicMin_block((unsigned int*) address, (unsigned int) val); 310 | } 311 | 312 | template::value && std::is_signed::value, 314 | int>::type = 0> 315 | __device__ __forceinline__ T atomicMin_block(T *address, T val){ 316 | return (T) atomicMin_block((int*) address, (int) val); 317 | } 318 | 319 | template::value && !std::is_signed::value, 321 | int>::type = 0> 322 | __device__ __forceinline__ T atomicAdd(T *address, T val){ 323 | return (T) atomicAdd((unsigned long long int*) address, (unsigned long long int) val); 324 | } 325 | 326 | template::value && !std::is_signed::value, 328 | int>::type = 0> 329 | __device__ __forceinline__ T atomicAdd(T *address, T val){ 330 | return (T) atomicAdd((unsigned int*) address, (unsigned int) val); 331 | } 332 | 333 | template::value && std::is_signed::value, 335 | int>::type = 0> 336 | __device__ __forceinline__ T atomicAdd(T *address, T val){ 337 | return (T) atomicAdd((int*) address, (int) val); 338 | } 339 | 340 | template::value && !std::is_signed::value, 342 | int>::type = 0> 343 | __device__ 
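/* Illustrative note: CUDA's built-in atomics are declared only for a fixed set of types (for the integer widths used here: int, unsigned int and unsigned long long int), so calls on values typed as uint64_t, int64_t or size_t would not resolve to any overload on their own. The enable_if-guarded wrappers in this section dispatch on the size and signedness of T and forward to the matching built-in through a cast. A rough usage sketch (the counter name is hypothetical):

       uint64_t* matches = ...;            // device-side counter
       atomicAdd(matches, (uint64_t) 1);   // picks the 8-byte unsigned wrapper, which calls
                                           // atomicAdd((unsigned long long int*) matches, 1ULL)

   The exact enable_if conditions mirror the cast target of each overload: unsigned long long int for 8-byte types, unsigned int or int for 4-byte unsigned/signed types. */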
__forceinline__ T atomicAdd_block(T *address, T val){ 344 | return (T) atomicAdd_block((unsigned long long int*) address, (unsigned long long int) val); 345 | } 346 | 347 | template::value && !std::is_signed::value, 349 | int>::type = 0> 350 | __device__ __forceinline__ T atomicAdd_block(T *address, T val){ 351 | return (T) atomicAdd_block((unsigned int*) address, (unsigned int) val); 352 | } 353 | 354 | template::value && std::is_signed::value, 356 | int>::type = 0> 357 | __device__ __forceinline__ T atomicAdd_block(T *address, T val){ 358 | return (T) atomicAdd_block((int*) address, (int) val); 359 | } 360 | 361 | #endif /* COMMON_H_ */ 362 | -------------------------------------------------------------------------------- /src/generator_ETHZ.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Code adapted from multicore-hashjoins-0.2@https://www.systems.ethz.ch/node/334 3 | All credit to the original author: Cagri Balkesen 4 | */ 5 | 6 | #include "generator_ETHZ.cuh" 7 | 8 | #include 9 | #include 10 | #include /*printf*/ 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #define RAND_RANGE(N) ((double)rand() / ((double)RAND_MAX + 1) * (N)) 17 | #define RAND_RANGE48(N,STATE) ((double)nrand48(STATE)/((double)RAND_MAX+1)*(N)) 18 | 19 | 20 | static int seeded = 0; 21 | static unsigned int seedValue; 22 | 23 | void seed_generator(unsigned int seed) { 24 | srand(seed); 25 | seedValue = seed; 26 | seeded = 1; 27 | } 28 | 29 | /** Check whether seeded, if not seed the generator with current time */ 30 | static void check_seed() { 31 | if (!seeded) { 32 | seedValue = time(NULL); 33 | srand(seedValue); 34 | seeded = 1; 35 | } 36 | } 37 | 38 | int readFromFile(const char * filename, int *relation, uint64_t num_tuples) { 39 | char path[100]; 40 | sprintf(path, "%s", filename); 41 | FILE *fp = fopen(path, "rb"); 42 | 43 | if (!fp) return 1; 44 | 45 | printf("Reading file %s ", path); 46 | fflush(stdout); 47 | 48 | fread(relation, sizeof(int), num_tuples, fp); 49 | 50 | /*for (int i = 0; i < num_tuples; i++) { 51 | int k = rand() % num_tuples; 52 | int tmp = relation[k]; 53 | relation[k] = relation[i]; 54 | relation[i] = tmp; 55 | }*/ 56 | 57 | fclose(fp); 58 | return 0; 59 | } 60 | 61 | static int writeToFile(const char * filename, int *relation, uint64_t num_tuples) { 62 | FILE *fp = fopen(filename, "wb"); 63 | if (!fp) return 1; 64 | 65 | fwrite(relation, sizeof(int), num_tuples, fp); 66 | fclose(fp); 67 | 68 | char path[100]; 69 | sprintf(path, "%s", filename); 70 | rename(filename, path); 71 | return 0; 72 | } 73 | 74 | int create_relation_nonunique(const char *filename, int *relation, uint64_t num_tuples, const int64_t maxid) { 75 | /*first try to read from a file*/ 76 | if (readFromFile(filename, relation, num_tuples)) { 77 | check_seed(); 78 | random_gen(relation, num_tuples, maxid); 79 | 80 | return writeToFile(filename, relation, num_tuples); 81 | } 82 | return 0; 83 | } 84 | 85 | 86 | int create_relation_unique(const char *filename, int *relation, uint64_t num_tuples, const int64_t maxid) { 87 | /*first try to read from a file*/ 88 | if (readFromFile(filename, relation, num_tuples)) { 89 | random_unique_gen(relation, num_tuples, maxid); 90 | return writeToFile(filename, relation, num_tuples); 91 | } 92 | 93 | return 0; 94 | } 95 | 96 | 97 | int create_relation_n(int* in_relation, int* out_relation, uint64_t num_tuples, uint64_t n) { 98 | for (int i = 0; i < n; i++) { 99 | memcpy (out_relation + i * num_tuples, in_relation, 
num_tuples * sizeof(int)); 100 | } 101 | 102 | /*unsigned short state[3] = {0, 0, 0}; 103 | unsigned int seed = time(NULL); 104 | memcpy(state, &seed, sizeof(seed)); 105 | 106 | knuth_shuffle48(out_relation, num_tuples * n, state);*/ 107 | 108 | return 0; 109 | } 110 | 111 | /** 112 | * Generate tuple IDs -> random distribution 113 | * relation must have been allocated 114 | */ 115 | void random_gen(int *rel, uint64_t elsNum, const int64_t maxid) { 116 | uint64_t i; 117 | 118 | for (i = 0; i < elsNum; i++) { 119 | rel[i] = RAND_RANGE(maxid); 120 | // printf("%d: rel[%d] = %d\n", maxid, i, rel[i]); 121 | } 122 | } 123 | 124 | /** 125 | * Create random unique keys starting from firstkey 126 | */ 127 | void random_unique_gen(int *rel, uint64_t elsNum, const int64_t maxid) { 128 | uint64_t i; 129 | 130 | uint64_t firstkey = 0; 131 | 132 | /* for randomly seeding nrand48() */ 133 | unsigned short state[3] = {0, 0, 0}; 134 | unsigned int seed = time(NULL); 135 | memcpy(state, &seed, sizeof(seed)); 136 | 137 | for (i = 0; i < elsNum; i++) { 138 | rel[i] = firstkey; 139 | 140 | if(firstkey == maxid) 141 | firstkey = 0; 142 | 143 | firstkey ++; 144 | } 145 | 146 | /* randomly shuffle elements */ 147 | knuth_shuffle48(rel, elsNum, state); 148 | 149 | } 150 | 151 | /** 152 | * Create a foreign-key relation using the given primary-key relation and 153 | * foreign-key relation size. Keys in pkrel is randomly distributed in the full 154 | * integer range. 155 | * 156 | * @param fkrel [output] foreign-key relation 157 | * @param pkrel [input] primary-key relation 158 | * @param num_tuples 159 | * 160 | * @return 161 | */ 162 | int create_relation_fk_from_pk(const char *fkrelFilename, int *fkrel, uint64_t fkrelElsNum, int *pkrel, 163 | uint64_t pkrelElsNum) { 164 | /*first try to read from a file*/ 165 | if (readFromFile(fkrelFilename, fkrel, fkrelElsNum)) { 166 | int i, iters; 167 | int64_t remainder; 168 | 169 | /* alternative generation method */ 170 | iters = fkrelElsNum / pkrelElsNum; 171 | for (i = 0; i < iters; i++) { 172 | memcpy(fkrel + i * pkrelElsNum, pkrel, pkrelElsNum * sizeof(int)); 173 | } 174 | 175 | /* if num_tuples is not an exact multiple of pkrel->num_tuples */ 176 | remainder = fkrelElsNum % pkrelElsNum; 177 | if (remainder > 0) { 178 | memcpy(fkrel + i * pkrelElsNum, pkrel, remainder * sizeof(int)); 179 | } 180 | 181 | knuth_shuffle(fkrel, fkrelElsNum); 182 | 183 | return writeToFile(fkrelFilename, fkrel, fkrelElsNum); 184 | } 185 | 186 | return 0; 187 | } 188 | 189 | /** 190 | * Shuffle tuples of the relation using Knuth shuffle. 
191 | * 192 | * @param relation 193 | */ 194 | void knuth_shuffle(int *relation, uint64_t elsNum) { 195 | int64_t i; 196 | for (i = elsNum - 1; i > 0; i--) { 197 | int64_t j = RAND_RANGE(i); 198 | int tmp = relation[i]; 199 | relation[i] = relation[j]; 200 | relation[j] = tmp; 201 | } 202 | } 203 | 204 | void knuth_shuffle48(int *relation, uint64_t elsNum, unsigned short * state) { 205 | int64_t i; 206 | for (i = elsNum - 1; i > 0; i--) { 207 | int64_t j = RAND_RANGE48(i, state); 208 | int tmp = relation[i]; 209 | relation[i] = relation[j]; 210 | relation[j] = tmp; 211 | } 212 | } 213 | 214 | int create_relation_zipf(const char *filename, int *relation, uint64_t elsNum, const int64_t maxid, 215 | const double zipf_param) { 216 | /*first try to read from a file*/ 217 | if (readFromFile(filename, relation, elsNum)) { 218 | check_seed(); 219 | 220 | gen_zipf(elsNum, maxid, zipf_param, relation); 221 | 222 | return writeToFile(filename, relation, elsNum); 223 | } 224 | return 0; 225 | } 226 | 227 | /** 228 | * Create an alphabet, an array of size @a size with randomly 229 | * permuted values 0..size-1. 230 | * 231 | * @param size alphabet size 232 | * @return an item_t array with @a size elements; 233 | * contains values 0..size-1 in a random permutation; the 234 | * return value is malloc'ed, don't forget to free it afterward. 235 | */ 236 | static uint32_t *gen_alphabet(unsigned int size) { 237 | uint32_t *alphabet; 238 | 239 | /* allocate */ 240 | alphabet = (uint32_t *) malloc(size * sizeof(*alphabet)); 241 | assert(alphabet); 242 | 243 | /* populate */ 244 | for (unsigned int i = 0; i < size; i++) 245 | alphabet[i] = i + 1; /* don't let 0 be in the alphabet */ 246 | 247 | /* permute */ 248 | for (unsigned int i = size - 1; i > 0; i--) { 249 | unsigned int k = (unsigned long) i * rand() / RAND_MAX; 250 | unsigned int tmp; 251 | 252 | tmp = alphabet[i]; 253 | alphabet[i] = alphabet[k]; 254 | alphabet[k] = tmp; 255 | } 256 | 257 | return alphabet; 258 | } 259 | 260 | /** 261 | * Generate a lookup table with the cumulated density function 262 | * 263 | * (This is derived from code originally written by Rene Mueller.) 264 | */ 265 | static double *gen_zipf_lut(double zipf_factor, unsigned int alphabet_size) { 266 | double scaling_factor; 267 | double sum; 268 | 269 | double *lut; /**< return value */ 270 | 271 | lut = (double *) malloc(alphabet_size * sizeof(*lut)); 272 | assert(lut); 273 | 274 | /* 275 | * Compute scaling factor such that 276 | * 277 | * sum (lut[i], i=1..alphabet_size) = 1.0 278 | * 279 | */ 280 | scaling_factor = 0.0; 281 | for (unsigned int i = 1; i <= alphabet_size; i++) 282 | scaling_factor += 1.0 / pow(i, zipf_factor); 283 | 284 | /* 285 | * Generate the lookup table 286 | */ 287 | sum = 0.0; 288 | for (unsigned int i = 1; i <= alphabet_size; i++) { 289 | sum += 1.0 / pow(i, zipf_factor); 290 | lut[i - 1] = sum / scaling_factor; 291 | } 292 | 293 | return lut; 294 | } 295 | 296 | /** 297 | * Generate a stream with Zipf-distributed content. 
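 *
 * Example (illustrative numbers): with alphabet_size = 4 and zipf_factor = 1.0 the raw
 * weights are 1, 1/2, 1/3, 1/4, so gen_zipf_lut produces the cumulative table
 * lut = {0.48, 0.72, 0.88, 1.00}. A uniform draw r = 0.6 binary-searches to the first
 * entry >= r, i.e. pos = 1, and alphabet[1] is emitted; low positions are selected most
 * often, which produces the skew.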
298 | */ 299 | void gen_zipf(uint64_t stream_size, unsigned int alphabet_size, double zipf_factor, int *ret) { 300 | //uint64_t i; 301 | /* prepare stuff for Zipf generation */ 302 | uint32_t *alphabet = gen_alphabet(alphabet_size); 303 | assert(alphabet); 304 | 305 | double *lut = gen_zipf_lut(zipf_factor, alphabet_size); 306 | assert(lut); 307 | 308 | uint32_t seeds[64]; 309 | 310 | for (int i = 0; i < 64; i++) 311 | seeds[i] = rand(); 312 | 313 | for (uint64_t i = 0; i < stream_size; i++) { 314 | if (i % 1000000 == 0) 315 | printf("live %llu\n", (unsigned long long) (i / 1000000)); 316 | 317 | /* take random number */ 318 | double r; 319 | 320 | r = ((double) (rand())) / RAND_MAX; 321 | 322 | /* binary search in lookup table to determine item */ 323 | unsigned int left = 0; 324 | unsigned int right = alphabet_size - 1; 325 | unsigned int m; /* middle between left and right */ 326 | unsigned int pos; /* position to take */ 327 | 328 | if (lut[0] >= r) 329 | pos = 0; 330 | else { 331 | while (right - left > 1) { 332 | m = (left + right) / 2; 333 | 334 | if (lut[m] < r) 335 | left = m; 336 | else 337 | right = m; 338 | } 339 | 340 | pos = right; 341 | } 342 | 343 | ret[i] = alphabet[pos]; 344 | } 345 | 346 | free(lut); 347 | free(alphabet); 348 | } 349 | 350 | -------------------------------------------------------------------------------- /src/generator_ETHZ.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Code adapted from multicore-hashjoins-0.2@https://www.systems.ethz.ch/node/334 3 | All credit to the original author: Cagri Balkesen 4 | */ 5 | 6 | #ifndef GENERATOR_ETHZ_CUH_ 7 | #define GENERATOR_ETHZ_CUH_ 8 | 9 | #include <stdint.h> /*uint64_t*/ 10 | 11 | void seed_generator(unsigned int seed); 12 | 13 | int readFromFile(const char * filename, int *relation, uint64_t num_tuples); 14 | int create_relation_nonunique(const char *filename, int *relation, uint64_t num_tuples, const int64_t maxid); 15 | int create_relation_unique(const char *filename, int *relation, uint64_t num_tuples, const int64_t maxid); 16 | void random_gen(int *rel, uint64_t elsNum, const int64_t maxid); 17 | void random_unique_gen(int *rel, uint64_t elsNum, const int64_t maxid); 18 | int create_relation_fk_from_pk(const char *filename, int *fkrel, uint64_t fkrelElsNum, int *pkrel, uint64_t pkrelElsNum); 19 | void knuth_shuffle(int *relation, uint64_t elsNum); 20 | void knuth_shuffle48(int *relation, uint64_t elsNum, unsigned short * state); 21 | int create_relation_zipf(const char *filename, int *relation, uint64_t elsNum, const int64_t maxid, const double zipf_param); 22 | void gen_zipf(uint64_t stream_size, unsigned int alphabet_size, double zipf_factor, int *ret); 23 | int create_relation_n(int* in_relation, int* out_relation, uint64_t num_tuples, uint64_t n); 24 | 25 | #endif /* GENERATOR_ETHZ_CUH_ */ 26 | -------------------------------------------------------------------------------- /src/join-primitives.cu: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the 
Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | 28 | #include "join-primitives.cuh" 29 | 30 | __global__ void init_payload (int* R, int n) { 31 | for (int i = threadIdx.x + blockIdx.x*blockDim.x; i < n; i += blockDim.x*gridDim.x) 32 | R[i] = i; 33 | } 34 | 35 | /* 36 | S= keys of data to be partitioned 37 | P= payloads of data to be partitioned 38 | heads= keeps information on first bucket per partition and number of elements in it, packet in one 64-bit integer (only used here) 39 | chains= the successor of a bucket in the bucket list 40 | out_cnts= number of elements per partition 41 | buckets_used= how many buckets are reserved by the partitioning already 42 | offsets= describe the segments that occur due to partitioning 43 | note: multithreaded partitioning creates partitions that consist of contiguous segments 44 | => iterate over these segments to avoid handling empty slots 45 | 46 | output_S= bucketized partitions of data keys 47 | output_P= bucketized partitions of data payloads 48 | cnt= number of elements to partition on total 49 | log_parts- log of number of partitions 50 | first_bit= shift the keys before "hashing" 51 | num_threads= number of threads used in CPU side, used together with offsets 52 | 53 | preconditions: 54 | heads: current bucket (1 << 18) [special value for no bucket] and -1 elements (first write allocates bucket) 55 | out_cnts: 0 56 | buckets_used= number of partitions (first num_parts buckets are reserved) 57 | */ 58 | __global__ void partition_pass_one ( 59 | const int32_t * __restrict__ S, 60 | const int32_t * __restrict__ P, 61 | const size_t * __restrict__ offsets, 62 | uint64_t * __restrict__ heads, 63 | uint32_t * __restrict__ buckets_used, 64 | uint32_t * __restrict__ chains, 65 | uint32_t * __restrict__ out_cnts, 66 | int32_t * __restrict__ output_S, 67 | int32_t * __restrict__ output_P, 68 | size_t cnt, 69 | uint32_t log_parts, 70 | uint32_t first_bit, 71 | uint32_t num_threads) { 72 | assert((((size_t) bucket_size) + ((size_t) blockDim.x) * gridDim.x) < (((size_t) 1) << 32)); 73 | const uint32_t parts = 1 << log_parts; 74 | const int32_t parts_mask = parts - 1; 75 | 76 | uint32_t * router = (uint32_t *) int_shared; 77 | 78 | uint32_t segment = 0; 79 | size_t segment_limit = offsets[1]; 80 | size_t segment_next = offsets[2]; 81 | 82 | size_t* shared_offsets = (size_t*) (int_shared + 1024*4 + 4*parts); 83 | 84 | /*if no segmentation in input use one segment with all data, else copy the segment info*/ 85 | if (offsets != NULL) { 86 | for (int i = threadIdx.x; i < 4*num_threads; i += blockDim.x) { 87 | shared_offsets[i] = offsets[i]; 88 | } 89 | } else { 90 | for (int i = threadIdx.x; i < 4*num_threads; i += blockDim.x) { 91 | if (i == 1) 92 | shared_offsets[i] = cnt; 93 | else 94 | shared_offsets[i] = 0; 95 | } 96 | } 97 | 98 
| shared_offsets[4*num_threads] = cnt+4096; 99 | shared_offsets[4*num_threads+1] = cnt+4096; 100 | 101 | /*partition element counter starts at 0*/ 102 | for (size_t j = threadIdx.x ; j < parts ; j += blockDim.x ) 103 | router[1024*4 + parts + j] = 0; 104 | 105 | if (threadIdx.x == 0) 106 | router[0] = 0; 107 | 108 | __syncthreads(); 109 | 110 | 111 | /*iterate over the segments*/ 112 | for (int u = 0; u < 2*num_threads; u++) { 113 | size_t segment_start = shared_offsets[2*u]; 114 | size_t segment_limit = shared_offsets[2*u + 1]; 115 | size_t segment_end = segment_start + ((segment_limit - segment_start + 4096 - 1)/4096)*4096; 116 | 117 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x) + segment_start; i < segment_end ; i += 4 * blockDim.x * gridDim.x) { 118 | vec4 thread_vals = *(reinterpret_cast(S + i)); 119 | 120 | uint32_t thread_keys[4]; 121 | 122 | /*compute local histogram for a chunk of 4*blockDim.x elements*/ 123 | #pragma unroll 124 | for (int k = 0 ; k < 4 ; ++k){ 125 | if (i + k < segment_limit){ 126 | uint32_t partition = (hasht(thread_vals.i[k]) >> first_bit) & parts_mask; 127 | 128 | atomicAdd(router + (1024 * 4 + parts + partition), 1); 129 | 130 | thread_keys[k] = partition; 131 | } else { 132 | thread_keys[k] = 0; 133 | } 134 | } 135 | 136 | __syncthreads(); 137 | 138 | for (size_t j = threadIdx.x; j < parts ; j += blockDim.x ) { 139 | uint32_t cnt = router[1024 * 4 + parts + j]; 140 | 141 | if (cnt > 0){ 142 | atomicAdd(out_cnts + j, cnt); 143 | 144 | uint32_t pcnt ; 145 | uint32_t bucket ; 146 | uint32_t next_buck; 147 | 148 | bool repeat = true; 149 | 150 | while (__any(repeat)){ 151 | if (repeat){ 152 | /*check if any of the output bucket is filling up*/ 153 | uint64_t old_heads = atomicAdd(heads + j, ((uint64_t) cnt) << 32); 154 | 155 | atomicMin(heads + j, ((uint64_t) (2*bucket_size)) << 32); 156 | 157 | pcnt = ((uint32_t) (old_heads >> 32)); 158 | bucket = (uint32_t) old_heads ; 159 | 160 | /*now there are two cases: 161 | // 2) old_heads.cnt > bucket_size ( => locked => retry) 162 | // if (pcnt >= bucket_size) continue;*/ 163 | 164 | if (pcnt < bucket_size){ 165 | /* 1) old_heads.cnt <= bucket_size*/ 166 | 167 | /*check if the bucket was filled*/ 168 | if (pcnt + cnt >= bucket_size){ 169 | if (bucket < (1 << 18)) { 170 | next_buck = atomicAdd(buckets_used, 1); 171 | chains[bucket] = next_buck; 172 | } else { 173 | next_buck = j; 174 | } 175 | uint64_t tmp = next_buck + (((uint64_t) (pcnt + cnt - bucket_size)) << 32); 176 | 177 | atomicExch(heads + j, tmp); 178 | } else { 179 | next_buck = bucket; 180 | } 181 | 182 | repeat = false; 183 | } 184 | } 185 | } 186 | 187 | router[1024 * 4 + j] = atomicAdd(router, cnt); 188 | router[1024 * 4 + parts + j] = 0;//cnt;//pcnt ; 189 | router[1024 * 4 + 2 * parts + j] = (bucket << log2_bucket_size) + pcnt; 190 | router[1024 * 4 + 3 * parts + j] = next_buck << log2_bucket_size ; 191 | } 192 | } 193 | 194 | __syncthreads(); 195 | 196 | 197 | uint32_t total_cnt = router[0]; 198 | 199 | __syncthreads(); 200 | 201 | /*calculate write positions for block-wise shuffle => atomicAdd on start of partition*/ 202 | #pragma unroll 203 | for (int k = 0 ; k < 4 ; ++k){ 204 | if (i + k < segment_limit) 205 | thread_keys[k] = atomicAdd(router + (1024 * 4 + thread_keys[k]), 1); 206 | } 207 | 208 | /*write the keys in shared memory*/ 209 | #pragma unroll 210 | for (int k = 0 ; k < 4 ; ++k) 211 | if (i + k < segment_limit) 212 | router[thread_keys[k]] = thread_vals.i[k]; 213 | 214 | __syncthreads(); 215 | 216 | int32_t thread_parts[4]; 
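/* Note on the heads[] bookkeeping used above (an illustrative walkthrough; the concrete numbers are assumed): each 64-bit heads[j] packs the number of elements already placed in partition j's open bucket in its high 32 bits and that bucket's id in its low 32 bits, with a bucket id of (1 << 18) acting as the "no bucket yet" sentinel set at initialization. atomicAdd reserves cnt slots for the calling block, and the following atomicMin clamps the packed word at ((uint64_t) (2*bucket_size)) << 32 so the counter cannot overflow while blocks that observed a full bucket (pcnt >= bucket_size) retry. The block whose reservation crosses the bucket boundary allocates the next bucket, links it via chains[], and publishes it with atomicExch. For example, with bucket_size = 4096: if heads[j] holds count 4000 and a block reserves cnt = 200, it sees pcnt = 4000 < 4096 and 4000 + 200 >= 4096, so it grabs a fresh bucket b, sets chains[old_bucket] = b, and stores ((uint64_t) 104 << 32) | b as the new head. */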
217 | 218 | /*read shuffled keys and write them to output partitions "somewhat" coalesced*/ 219 | #pragma unroll 220 | for (int k = 0 ; k < 4 ; ++k){ 221 | if (threadIdx.x + 1024 * k < total_cnt) { 222 | int32_t val = router[threadIdx.x + 1024 * k]; 223 | uint32_t partition = (hasht(val) >> first_bit) & parts_mask; 224 | 225 | uint32_t cnt = router[1024 * 4 + partition] - (threadIdx.x + 1024 * k); 226 | 227 | uint32_t bucket = router[1024 * 4 + 2 * parts + partition]; 228 | 229 | if (((bucket + cnt) ^ bucket) & ~bucket_size_mask){ 230 | uint32_t next_buck = router[1024 * 4 + 3 * parts + partition]; 231 | cnt = ((bucket + cnt) & bucket_size_mask); 232 | bucket = next_buck; 233 | } 234 | 235 | bucket += cnt; 236 | 237 | output_S[bucket] = val; 238 | 239 | thread_parts[k] = partition; 240 | } 241 | } 242 | 243 | __syncthreads(); 244 | 245 | /*read payloads of original data*/ 246 | thread_vals = *(reinterpret_cast(P + i)); 247 | 248 | /*shuffle payloads in shared memory, in the same offsets that we used for their corresponding keys*/ 249 | #pragma unroll 250 | for (int k = 0 ; k < 4 ; ++k) 251 | if (i + k < segment_limit) { 252 | router[thread_keys[k]] = thread_vals.i[k]; 253 | } 254 | 255 | __syncthreads(); 256 | 257 | /*write payloads to partition buckets in "somewhat coalesced manner"*/ 258 | #pragma unroll 259 | for (int k = 0 ; k < 4 ; ++k){ 260 | if (threadIdx.x + 1024 * k < total_cnt) { 261 | int32_t val = router[threadIdx.x + 1024 * k]; 262 | 263 | int32_t partition = thread_parts[k]; 264 | 265 | uint32_t cnt = router[1024 * 4 + partition] - (threadIdx.x + 1024 * k); 266 | 267 | uint32_t bucket = router[1024 * 4 + 2 * parts + partition]; 268 | 269 | if (((bucket + cnt) ^ bucket) & ~bucket_size_mask){ 270 | uint32_t next_buck = router[1024 * 4 + 3 * parts + partition]; 271 | cnt = ((bucket + cnt) & bucket_size_mask); 272 | bucket = next_buck; 273 | } 274 | bucket += cnt; 275 | 276 | output_P[bucket] = val; 277 | } 278 | } 279 | 280 | if (threadIdx.x == 0) router[0] = 0; 281 | } 282 | } 283 | } 284 | 285 | /* 286 | compute information for the second partitioning pass 287 | 288 | input: 289 | chains=points to the successor in the bucket list for each bucket (hint: we append new buckets to the end) 290 | out_cnts=count of elements per partition 291 | output: 292 | chains=packed value of element count in bucket and the partition the bucket belongs to 293 | */ 294 | __global__ void compute_bucket_info (uint32_t* chains, uint32_t* out_cnts, uint32_t log_parts) { 295 | uint32_t parts = 1 << log_parts; 296 | 297 | for (int p = threadIdx.x + blockIdx.x*blockDim.x; p < parts; p += gridDim.x*blockDim.x) { 298 | uint32_t cur = p; 299 | int32_t cnt = out_cnts[p]; 300 | 301 | while (cnt > 0) { 302 | uint32_t local_cnt = (cnt >= 4096)? 
4096 : cnt; 303 | uint32_t val = (p << 13) + local_cnt; 304 | 305 | uint32_t next = chains[cur]; 306 | chains[cur] = val; 307 | 308 | cur = next; 309 | cnt -= 4096; 310 | } 311 | } 312 | } 313 | 314 | /* 315 | S= keys of data to be re-partitioned 316 | P= payloads of data to be re-partitioned 317 | heads= keeps information on first bucket per partition and number of elements in it, packet in one 64-bit integer (only used here) 318 | chains= the successor of a bucket in the bucket list 319 | out_cnts= number of elements per partition 320 | buckets_used= how many buckets are reserved by the partitioning already 321 | offsets= describe the segments that occur due to partitioning 322 | note: multithreaded partitioning creates partitions that consist of contiguous segments 323 | => iterate over these segments to avoid handling empty slots 324 | 325 | output_S= bucketized partitions of data keys (results) 326 | output_P= bucketized partitions of data payloads (results) 327 | 328 | S_log_parts- log of number of partitions for previous pass 329 | log_parts- log of number of partitions for this pass 330 | first_bit= shift the keys before "hashing" 331 | bucket_num_ptr: number of input buckets 332 | 333 | preconditions: 334 | heads: current bucket (1 << 18) [special value for no bucket] and -1 elements (first write allocates bucket) 335 | out_cnts: 0 336 | buckets_used= number of partitions (first num_parts buckets are reserved) 337 | */ 338 | __global__ void partition_pass_two ( 339 | const int32_t * __restrict__ S, 340 | const int32_t * __restrict__ P, 341 | const uint32_t * __restrict__ bucket_info, 342 | uint32_t * __restrict__ buckets_used, 343 | uint64_t * heads, 344 | uint32_t * __restrict__ chains, 345 | uint32_t * __restrict__ out_cnts, 346 | int32_t * __restrict__ output_S, 347 | int32_t * __restrict__ output_P, 348 | uint32_t S_log_parts, 349 | uint32_t log_parts, 350 | uint32_t first_bit, 351 | uint32_t * bucket_num_ptr) { 352 | assert((((size_t) bucket_size) + ((size_t) blockDim.x) * gridDim.x) < (((size_t) 1) << 32)); 353 | const uint32_t S_parts = 1 << S_log_parts; 354 | const uint32_t parts = 1 << log_parts; 355 | const int32_t parts_mask = parts - 1; 356 | 357 | uint32_t buckets_num = *bucket_num_ptr; 358 | 359 | uint32_t * router = (uint32_t *) int_shared; //[1024*4 + parts]; 360 | 361 | for (size_t j = threadIdx.x ; j < parts ; j += blockDim.x ) 362 | router[1024*4 + parts + j] = 0; 363 | 364 | if (threadIdx.x == 0) 365 | router[0] = 0; 366 | 367 | __syncthreads(); 368 | 369 | 370 | /*each CUDA block processes a bucket at a time*/ 371 | for (size_t i = blockIdx.x; i < buckets_num; i += gridDim.x) { 372 | uint32_t info = bucket_info[i]; 373 | /*number of elements per bucket*/ 374 | uint32_t cnt = info & ((1 << 13) - 1); 375 | /*id of original partition*/ 376 | uint32_t pid = info >> 13; 377 | 378 | vec4 thread_vals = *(reinterpret_cast(S + bucket_size * i + 4*threadIdx.x)); 379 | 380 | uint32_t thread_keys[4]; 381 | 382 | /*compute local histogram for the bucket*/ 383 | #pragma unroll 384 | for (int k = 0 ; k < 4 ; ++k){ 385 | if (4*threadIdx.x + k < cnt){ 386 | uint32_t partition = (hasht(thread_vals.i[k]) >> first_bit) & parts_mask; 387 | 388 | atomicAdd(router + (1024 * 4 + parts + partition), 1); 389 | 390 | thread_keys[k] = partition; 391 | } else { 392 | thread_keys[k] = 0; 393 | } 394 | } 395 | 396 | __syncthreads(); 397 | 398 | for (size_t j = threadIdx.x; j < parts ; j += blockDim.x ) { 399 | uint32_t cnt = router[1024 * 4 + parts + j]; 400 | 401 | if (cnt > 0){ 402 | 
atomicAdd(out_cnts + (pid << log_parts) + j, cnt); 403 | 404 | uint32_t pcnt ; 405 | uint32_t bucket ; 406 | uint32_t next_buck; 407 | 408 | bool repeat = true; 409 | 410 | while (__any(repeat)){ 411 | if (repeat){ 412 | uint64_t old_heads = atomicAdd(heads + (pid << log_parts) + j, ((uint64_t) cnt) << 32); 413 | 414 | atomicMin(heads + (pid << log_parts) + j, ((uint64_t) (2*bucket_size)) << 32); 415 | 416 | pcnt = ((uint32_t) (old_heads >> 32)); 417 | bucket = (uint32_t) old_heads ; 418 | 419 | if (pcnt < bucket_size){ 420 | if (pcnt + cnt >= bucket_size){ 421 | if (bucket < (1 << 18)) { 422 | next_buck = atomicAdd(buckets_used, 1); 423 | chains[bucket] = next_buck; 424 | } else { 425 | next_buck = (pid << log_parts) + j; 426 | } 427 | 428 | uint64_t tmp = next_buck + (((uint64_t) (pcnt + cnt - bucket_size)) << 32); 429 | 430 | atomicExch(heads + (pid << log_parts) + j, tmp); 431 | } else { 432 | next_buck = bucket; 433 | } 434 | 435 | repeat = false; 436 | } 437 | } 438 | } 439 | 440 | router[1024 * 4 + j] = atomicAdd(router, cnt); 441 | router[1024 * 4 + parts + j] = 0; 442 | router[1024 * 4 + 2 * parts + j] = (bucket << log2_bucket_size) + pcnt; 443 | router[1024 * 4 + 3 * parts + j] = next_buck << log2_bucket_size ; 444 | } 445 | } 446 | 447 | __syncthreads(); 448 | 449 | 450 | uint32_t total_cnt = router[0]; 451 | 452 | __syncthreads(); 453 | 454 | /*calculate write positions for block-wise shuffle => atomicAdd on start of partition*/ 455 | #pragma unroll 456 | for (int k = 0 ; k < 4 ; ++k){ 457 | if (4*threadIdx.x + k < cnt) 458 | thread_keys[k] = atomicAdd(router + (1024 * 4 + thread_keys[k]), 1); 459 | } 460 | 461 | /*write the keys in shared memory*/ 462 | #pragma unroll 463 | for (int k = 0 ; k < 4 ; ++k) 464 | if (4*threadIdx.x + k < cnt) 465 | router[thread_keys[k]] = thread_vals.i[k]; 466 | 467 | __syncthreads(); 468 | 469 | int32_t thread_parts[4]; 470 | 471 | /*read shuffled keys and write them to output partitions "somewhat" coalesced*/ 472 | #pragma unroll 473 | for (int k = 0 ; k < 4 ; ++k){ 474 | if (threadIdx.x + 1024 * k < total_cnt) { 475 | int32_t val = router[threadIdx.x + 1024 * k]; 476 | uint32_t partition = (hasht(val) >> first_bit) & parts_mask; 477 | 478 | uint32_t cnt = router[1024 * 4 + partition] - (threadIdx.x + 1024 * k); 479 | 480 | uint32_t bucket = router[1024 * 4 + 2 * parts + partition]; 481 | 482 | if (((bucket + cnt) ^ bucket) & ~bucket_size_mask){ 483 | uint32_t next_buck = router[1024 * 4 + 3 * parts + partition]; 484 | cnt = ((bucket + cnt) & bucket_size_mask); 485 | bucket = next_buck; 486 | } 487 | 488 | bucket += cnt; 489 | 490 | output_S[bucket] = val; 491 | 492 | thread_parts[k] = partition; 493 | } 494 | } 495 | 496 | __syncthreads(); 497 | 498 | /*read payloads of original data*/ 499 | thread_vals = *(reinterpret_cast(P + i*bucket_size + 4*threadIdx.x)); 500 | 501 | /*shuffle payloads in shared memory, in the same offsets that we used for their corresponding keys*/ 502 | #pragma unroll 503 | for (int k = 0 ; k < 4 ; ++k) 504 | if (4*threadIdx.x + k < cnt) { 505 | router[thread_keys[k]] = thread_vals.i[k]; 506 | } 507 | 508 | __syncthreads(); 509 | 510 | /*write payloads to partition buckets in "somewhat coalesced manner"*/ 511 | #pragma unroll 512 | for (int k = 0 ; k < 4 ; ++k){ 513 | if (threadIdx.x + 1024 * k < total_cnt) { 514 | int32_t val = router[threadIdx.x + 1024 * k]; 515 | 516 | int32_t partition = thread_parts[k]; 517 | 518 | uint32_t cnt = router[1024 * 4 + partition] - (threadIdx.x + 1024 * k); 519 | 520 | uint32_t bucket = 
router[1024 * 4 + 2 * parts + partition]; 521 | 522 | if (((bucket + cnt) ^ bucket) & ~bucket_size_mask){ 523 | uint32_t next_buck = router[1024 * 4 + 3 * parts + partition]; 524 | cnt = ((bucket + cnt) & bucket_size_mask); 525 | bucket = next_buck; 526 | } 527 | bucket += cnt; 528 | 529 | output_P[bucket] = val; 530 | } 531 | } 532 | 533 | if (threadIdx.x == 0) router[0] = 0; 534 | } 535 | } 536 | 537 | #define LOCAL_BUCKETS_BITS 10 538 | #define LOCAL_BUCKETS ((1 << LOCAL_BUCKETS_BITS)) 539 | 540 | #define MAX_BIT 32 541 | 542 | __device__ int ctzd (int x) { 543 | if (x == 0) 544 | return 32; 545 | 546 | int n = 0; 547 | 548 | if ((n & 0x0000FFFF) == 0) { 549 | n += 16; 550 | x >>= 16; 551 | } 552 | 553 | if ((n & 0x000000FF) == 0) { 554 | n += 8; 555 | x >>= 8; 556 | } 557 | 558 | if ((n & 0x0000000F) == 0) { 559 | n += 4; 560 | x >>= 4; 561 | } 562 | 563 | if ((n & 0x00000003) == 0) { 564 | n += 2; 565 | x >>= 2; 566 | } 567 | 568 | if ((n & 0x00000001) == 0) { 569 | n += 1; 570 | x >>= 1; 571 | } 572 | 573 | return n; 574 | } 575 | 576 | 577 | __global__ void init_metadata_double ( 578 | uint64_t * __restrict__ heads1, 579 | uint32_t * __restrict__ buckets_used1, 580 | uint32_t * __restrict__ chains1, 581 | uint32_t * __restrict__ out_cnts1, 582 | uint32_t parts1, 583 | uint32_t buckets_num1, 584 | uint64_t * __restrict__ heads2, 585 | uint32_t * __restrict__ buckets_used2, 586 | uint32_t * __restrict__ chains2, 587 | uint32_t * __restrict__ out_cnts2, 588 | uint32_t parts2, 589 | uint32_t buckets_num2 590 | ) { 591 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 592 | 593 | for (int i = tid; i < buckets_num1; i += blockDim.x*gridDim.x) 594 | chains1[i] = 0; 595 | 596 | for (int i = tid; i < parts1; i += blockDim.x*gridDim.x) 597 | out_cnts1[i] = 0; 598 | 599 | for (int i = tid; i < parts1; i += blockDim.x*gridDim.x) 600 | heads1[i] = (1 << 18) + (((uint64_t) bucket_size_mask) << 32); 601 | 602 | if (tid == 0) { 603 | *buckets_used1 = parts1; 604 | } 605 | 606 | for (int i = tid; i < buckets_num2; i += blockDim.x*gridDim.x) 607 | chains2[i] = 0; 608 | 609 | for (int i = tid; i < parts2; i += blockDim.x*gridDim.x) 610 | out_cnts2[i] = 0; 611 | 612 | for (int i = tid; i < parts2; i += blockDim.x*gridDim.x) 613 | heads2[i] = (1 << 18) + (((uint64_t) bucket_size_mask) << 32); 614 | 615 | if (tid == 0) { 616 | *buckets_used2 = parts2; 617 | } 618 | } 619 | 620 | /* 621 | Building phase for non-partitioned hash join with perfect hashing (so this property is reflected in the code, we don't follow chains), it is the best case for non-partitioned 622 | 623 | data=array of the keys 624 | payload=array of payloads 625 | n=number of tuples 626 | lookup=lookup table/hashtable that we build => we store the payload at position lookup[key] 627 | */ 628 | __global__ void build_perfect_array (int32_t* data, int32_t* payload, int n, int32_t* lookup) { 629 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x); i < n ; i += 4 * blockDim.x * gridDim.x){ 630 | vec4 thread_vals = *(reinterpret_cast(data + i)); 631 | vec4 thread_payloads = *(reinterpret_cast(payload + i)); 632 | 633 | #pragma unroll 634 | for (int k = 0; k < 4; ++k) { 635 | int32_t val = thread_vals.i[k]; 636 | int32_t payload = thread_payloads.i[k]; 637 | lookup[val] = payload + 1; 638 | } 639 | } 640 | } 641 | 642 | /*Probing phase for non-partitioned hash join with perfect hashing 643 | 644 | data=keys for probe side 645 | payload=payloads for probe side 646 | n=number of elements 647 | lookup=hashtable 648 | aggr=the memory 
location in which we aggregate with atomics at the end*/ 649 | __global__ void probe_perfect_array (int32_t* data, int32_t* payload, int n, int32_t* lookup, int* aggr) { 650 | int count = 0; 651 | 652 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x); i < n ; i += 4 * blockDim.x * gridDim.x){ 653 | vec4 thread_vals = *(reinterpret_cast(data + i)); 654 | vec4 thread_payloads = *(reinterpret_cast(payload + i)); 655 | 656 | #pragma unroll 657 | for (int k = 0; k < 4; ++k) { 658 | int val = thread_vals.i[k]; 659 | int payload = thread_payloads.i[k]; 660 | int res = lookup[val]; 661 | 662 | if (res) 663 | count += (payload * (res - 1)); 664 | } 665 | } 666 | 667 | atomicAdd(aggr, count); 668 | } 669 | 670 | 671 | /* 672 | Building phase for non-partitioned hash join with chaining 673 | 674 | data=array of the keys 675 | payload=array of payloads 676 | n=number of tuples 677 | log_parts=log size of hashtable/chains 678 | output=the chains [the rest of the array stays in place] 679 | head=the first element of each chain 680 | */ 681 | __global__ void build_ht_chains (int32_t* data, int n, uint32_t log_parts, int32_t* output, int* head) { 682 | int parts = 1 << log_parts; 683 | int parts_mask = parts-1; 684 | 685 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x); i < n ; i += 4 * blockDim.x * gridDim.x){ 686 | vec4 thread_vals = *(reinterpret_cast(data + i)); 687 | 688 | #pragma unroll 689 | for (int k = 0; k < 4; ++k) { 690 | int val = thread_vals.i[k]; 691 | int hval = val & parts_mask; 692 | 693 | int last = atomicExch(head + hval, i+k+1); 694 | //int64_t wr = (((int64_t) last) << 32) + val; 695 | output[i + k] = last; 696 | } 697 | } 698 | } 699 | 700 | /* 701 | Probing phase for non-partitioned hash join with chaining 702 | 703 | data=array of the keys 704 | payload=array of payloads 705 | n=number of tuples 706 | log_parts=log size of hashtable/chains 707 | ht=the chains that show the successor for each build element 708 | head=the first element of each chain 709 | ht_key=the keys of the hashtable as an array 710 | ht_pay=the payloads of the hashtable as an array 711 | aggr=the memory location in which we aggregate with atomics at the end 712 | */ 713 | __global__ void chains_probing (int32_t* data, int32_t* payload, int n, uint32_t log_parts, int32_t* ht, int32_t* ht_key, int32_t* ht_pay, int* head, int* aggr) { 714 | int parts = 1 << log_parts; 715 | int parts_mask = parts-1; 716 | int count = 0; 717 | 718 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x); i < n ; i += 4 * blockDim.x * gridDim.x){ 719 | vec4 thread_vals = *(reinterpret_cast(data + i)); 720 | vec4 thread_payloads = *(reinterpret_cast(payload + i)); 721 | 722 | #pragma unroll 723 | for (int k = 0; k < 4; ++k) { 724 | int val = thread_vals.i[k]; 725 | int payload = thread_payloads.i[k]; 726 | int hval = val & parts_mask; 727 | 728 | int next = head[hval]; 729 | 730 | while (next != 0) { 731 | int ht_val = ht_key[next-1]; 732 | 733 | if (ht_val == val) 734 | count += (payload * ht_pay[next-1]); 735 | 736 | next = ht[next-1]; 737 | } 738 | } 739 | } 740 | 741 | atomicAdd(aggr, count); 742 | } 743 | 744 | 745 | /*functions for linear probing 746 | 747 | FIXME: there is a bug so it is not operational yet [was not in paper so this is not urgent] 748 | */ 749 | 750 | __global__ void ht_hist (int* data, int n, int log_parts, int* hist) { 751 | int parts = 1 << log_parts; 752 | int parts_mask = parts-1; 753 | 754 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x); i < n ; i += 4 * blockDim.x 
* gridDim.x){ 755 | vec4 thread_vals = *(reinterpret_cast(data + i)); 756 | 757 | #pragma unroll 758 | for (int k = 0; k < 4; ++k) { 759 | int val = thread_vals.i[k]; 760 | int hval = val & parts_mask; 761 | 762 | int off = atomicAdd(hist + hval, 1); 763 | } 764 | } 765 | } 766 | 767 | __global__ void ht_offsets (int log_parts, int* hist, int* offset, int* aggr) { 768 | int parts = 1 << log_parts; 769 | int parts_mask = parts-1; 770 | 771 | for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < parts; i += blockDim.x * gridDim.x) { 772 | int cur = hist[i]; 773 | int off = atomicAdd(aggr, cur); 774 | hist[i] = off; 775 | offset[i] = off; 776 | } 777 | } 778 | 779 | __global__ void build_ht_linear (int* data, int* payload, size_t n, int log_parts, int* offset, int* ht, int* htp) { 780 | int parts = 1 << log_parts; 781 | int parts_mask = parts-1; 782 | 783 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x); i < n ; i += 4 * blockDim.x * gridDim.x){ 784 | vec4 thread_vals = *(reinterpret_cast(data + i)); 785 | vec4 thread_payloads = *(reinterpret_cast(payload + i)); 786 | 787 | #pragma unroll 788 | for (int k = 0; k < 4; ++k) { 789 | int val = thread_vals.i[k]; 790 | int hval = val & parts_mask; 791 | 792 | int off = atomicAdd(offset + hval, 1); 793 | 794 | ht[off] = val; 795 | htp[off] = thread_payloads.i[k]; 796 | 797 | } 798 | } 799 | } 800 | 801 | __global__ void linear_probing (int* data, int* payload, int* ht, int* htp, int* offset_s, int* offset_e, size_t n, int log_parts, int* aggr) { 802 | int parts = 1 << log_parts; 803 | int parts_mask = parts-1; 804 | int count = 0; 805 | 806 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x); i < n ; i += 4 * blockDim.x * gridDim.x){ 807 | vec4 thread_vals = *(reinterpret_cast(data + i)); 808 | vec4 thread_payloads = *(reinterpret_cast(payload + i)); 809 | 810 | #pragma unroll 811 | for (int k = 0; k < 4; ++k) { 812 | int val = thread_vals.i[k]; 813 | 814 | for (int j = 0; j < 32; j++) { 815 | int probe = __shfl(val, j); 816 | int pay = __shfl(thread_payloads.i[k], j); 817 | int hval = probe & parts_mask; 818 | 819 | int start = offset_s[hval]; 820 | int end = offset_e[hval]; 821 | 822 | for (int p = start + threadIdx.x % 32; p < end; p += 32) { 823 | if (ht[p] == probe) { 824 | count += pay*htp[p]; 825 | } 826 | } 827 | } 828 | } 829 | } 830 | 831 | atomicAdd(aggr, count); 832 | } 833 | 834 | /*break "long" bucket chains to smaller chains 835 | this helps load balancing because we can allocate work at sub-chain granularity 836 | and effectively solve the skew problem 837 | 838 | bucket_info=we store the packed (partition, element count) value for each bucket 839 | chains=successor in partition's bucket list 840 | out_cnts=count of elements in this partition 841 | log_parts= log of number of partitions 842 | threshold=the maximum number of elements per subchain*/ 843 | __global__ void decompose_chains (uint32_t* bucket_info, uint32_t* chains, uint32_t* out_cnts, uint32_t log_parts, int threshold) { 844 | uint32_t parts = 1 << log_parts; 845 | 846 | for (int p = threadIdx.x + blockIdx.x*blockDim.x; p < parts; p += gridDim.x*blockDim.x) { 847 | uint32_t cur = p; 848 | int32_t cnt = out_cnts[p]; 849 | uint32_t first_cnt = (cnt >= threshold)? threshold : cnt; 850 | int32_t cutoff = 0; 851 | 852 | while (cnt > 0) { 853 | cutoff += bucket_size; 854 | cnt -= bucket_size; 855 | 856 | uint32_t next = chains[cur]; 857 | 858 | if (cutoff >= threshold && cnt > 0) { 859 | uint32_t local_cnt = (cnt >= threshold)? 
threshold : cnt; 860 | 861 | bucket_info[next] = (p << 15) + local_cnt; 862 | chains[cur] = 0; 863 | cutoff = 0; 864 | } else if (next != 0) { 865 | bucket_info[next] = 0; 866 | } 867 | 868 | 869 | cur = next; 870 | } 871 | 872 | bucket_info[p] = (p << 15) + first_cnt; 873 | } 874 | } 875 | 876 | /*kernel for performing the join between the partitioned relations 877 | 878 | R,Pr= bucketized keys and payloads for relation R (probe side) 879 | S,Ps= bucketized keys and payloads for relation S (build side) 880 | bucket_info=the info that tells us which partition each bucket belongs to, the number of elements (or whether it belongs to a chain) 881 | S_cnts, S_chain= for build-side we don't pack the info since we operate under the assumption that it is usually one bucket per partition (we don't load balance) 882 | buckets_num=number of buckets for R 883 | results=the memory address where we aggregate 884 | */ 885 | __global__ void join_partitioned_aggregate ( 886 | const int32_t* R, 887 | const int32_t* Pr, 888 | const uint32_t* R_chain, 889 | const uint32_t* bucket_info, 890 | const int32_t* S, 891 | const int32_t* Ps, 892 | const uint32_t* S_cnts, 893 | const uint32_t* S_chain, 894 | int32_t log_parts, 895 | uint32_t* buckets_num, 896 | int32_t* results) { 897 | 898 | /*in order to save space, we discard the partitioning bits, then we can try fitting keys in int16_t [HACK]*/ 899 | __shared__ int16_t elem[4096 + 512]; 900 | __shared__ int32_t payload[4096 + 512]; 901 | __shared__ int16_t next[4096 + 512]; 902 | __shared__ int32_t head[LOCAL_BUCKETS]; 903 | 904 | 905 | int tid = threadIdx.x; 906 | int block = blockIdx.x; 907 | int width = blockDim.x; 908 | int pwidth = gridDim.x; 909 | int parts = 1 << log_parts; 910 | 911 | int lid = tid % 32; 912 | int gnum = blockDim.x/32; 913 | 914 | int count = 0; 915 | 916 | int buckets_cnt = *buckets_num; 917 | 918 | for (uint32_t bucket_r = block; bucket_r < buckets_cnt; bucket_r += pwidth) { 919 | int info = bucket_info[bucket_r]; 920 | 921 | if (info != 0) { 922 | /*unpack information on the subchain*/ 923 | int p = info >> 15; 924 | int len_R = info & ((1 << 15) - 1); 925 | 926 | int len_S = S_cnts[p]; 927 | 928 | /*S partition doesn't fit in shared memory*/ 929 | if (len_S > 4096+512) { 930 | int bucket_r_loop = bucket_r; 931 | 932 | /*now we will build a bucket of R side in the shared memory at a time and then probe it with S-side 933 | sensible because 934 | 1) we have guarantees on size of R from the chain decomposition 935 | 2) this is a skewed scenario so size of S can be arbitrary*/ 936 | for (int offset_r = 0; offset_r < len_R; offset_r += bucket_size) { 937 | for (int i = tid; i < LOCAL_BUCKETS; i += blockDim.x) 938 | head[i] = -1; 939 | __syncthreads(); 940 | 941 | /*build a hashtable from an R bucket*/ 942 | for (int base_r = 0; base_r < bucket_size; base_r += 4*blockDim.x) { 943 | vec4 data_R = *(reinterpret_cast<const vec4 *>(R + bucket_size * bucket_r_loop + base_r + 4*threadIdx.x)); 944 | vec4 data_Pr = *(reinterpret_cast<const vec4 *>(Pr + bucket_size * bucket_r_loop + base_r + 4*threadIdx.x)); 945 | int l_cnt_R = len_R - offset_r - base_r - 4 * threadIdx.x; 946 | 947 | int cnt = 0; 948 | 949 | #pragma unroll 950 | for (int k = 0; k < 4; k++) { 951 | if (k < l_cnt_R) { 952 | int val = data_R.i[k]; 953 | elem[base_r + k*blockDim.x + tid] = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 954 | payload[base_r + k*blockDim.x + tid] = data_Pr.i[k]; 955 | int hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 956 | 957 | int32_t last = atomicExch(&head[hval], 
base_r + k*blockDim.x + tid); 958 | next[base_r + k*blockDim.x + tid] = last; 959 | } 960 | } 961 | } 962 | 963 | bucket_r_loop = R_chain[bucket_r_loop]; 964 | 965 | __syncthreads(); 966 | 967 | int bucket_s_loop = p; 968 | int base_s = 0; 969 | 970 | /*probe hashtable from an S bucket*/ 971 | for (int offset_s = 0; offset_s < len_S; offset_s += 4*blockDim.x) { 972 | vec4 data_S = *(reinterpret_cast(S + bucket_size * bucket_s_loop + base_s + 4*threadIdx.x)); 973 | vec4 data_Ps = *(reinterpret_cast(Ps + bucket_size * bucket_s_loop + base_s + 4*threadIdx.x)); 974 | int l_cnt_S = len_S - offset_s - 4 * threadIdx.x; 975 | 976 | #pragma unroll 977 | for (int k = 0; k < 4; k++) { 978 | int32_t val = data_S.i[k]; 979 | int32_t pval = data_Ps.i[k]; 980 | int16_t tval = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 981 | int32_t hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 982 | 983 | if (k < l_cnt_S) { 984 | int32_t pos = head[hval]; 985 | while (pos >= 0) { 986 | if (elem[pos] == tval) { 987 | count += pval*payload[pos]; 988 | } 989 | 990 | pos = next[pos]; 991 | } 992 | } 993 | } 994 | 995 | base_s += 4*blockDim.x; 996 | if (base_s >= bucket_size) { 997 | bucket_s_loop = S_chain[bucket_s_loop]; 998 | base_s = 0; 999 | } 1000 | } 1001 | 1002 | __syncthreads(); 1003 | } 1004 | } else { 1005 | for (int i = tid; i < LOCAL_BUCKETS; i += blockDim.x) 1006 | head[i] = -1; 1007 | 1008 | int rem_s = len_S % 4096; 1009 | rem_s = (rem_s + 4 - 1)/4; 1010 | 1011 | __syncthreads(); 1012 | 1013 | int off; 1014 | int it; 1015 | int base = 0; 1016 | 1017 | it = p; 1018 | off = 0; 1019 | 1020 | /*build hashtable for S-side*/ 1021 | for (off = 0; off < len_S;) { 1022 | vec4 data_S = *(reinterpret_cast(S + bucket_size * it + base + 4*threadIdx.x)); 1023 | vec4 data_Ps = *(reinterpret_cast(Ps + bucket_size * it + base +4*threadIdx.x)); 1024 | int l_cnt_S = len_S - off - 4 * threadIdx.x; 1025 | 1026 | #pragma unroll 1027 | for (int k = 0; k < 4; k++) { 1028 | if (k < l_cnt_S) { 1029 | int val = data_S.i[k]; 1030 | elem[off + tid] = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1031 | payload[off + tid] = data_Ps.i[k]; 1032 | int hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1033 | 1034 | int32_t last = atomicExch(&head[hval], off + tid); 1035 | next[off + tid] = last; 1036 | } 1037 | 1038 | off += (off < bucket_size)? 
blockDim.x : rem_s; 1039 | base += blockDim.x; 1040 | } 1041 | 1042 | if (base >= bucket_size) { 1043 | it = S_chain[it]; 1044 | base = 0; 1045 | } 1046 | 1047 | 1048 | } 1049 | 1050 | __syncthreads(); 1051 | 1052 | it = bucket_r; 1053 | off = 0; 1054 | 1055 | /*probe from R-side*/ 1056 | for (; 0 < len_R; off += 4*blockDim.x, len_R -= 4*blockDim.x) { 1057 | vec4 data_R = *(reinterpret_cast(R + bucket_size * it + off + 4*threadIdx.x)); 1058 | vec4 data_Pr = *(reinterpret_cast(Pr + bucket_size * it + off + 4*threadIdx.x)); 1059 | int l_cnt_R = len_R - 4 * threadIdx.x; 1060 | 1061 | #pragma unroll 1062 | for (int k = 0; k < 4; k++) { 1063 | int32_t val = data_R.i[k]; 1064 | int32_t pval = data_Pr.i[k]; 1065 | /*hack to fit more data in shared memory*/ 1066 | int16_t tval = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1067 | int32_t hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1068 | 1069 | if (k < l_cnt_R) { 1070 | int32_t pos = head[hval]; 1071 | while (pos >= 0) { 1072 | if (elem[pos] == tval) { 1073 | count += pval*payload[pos]; 1074 | } 1075 | 1076 | pos = next[pos]; 1077 | } 1078 | } 1079 | } 1080 | 1081 | if (off >= bucket_size) { 1082 | it = R_chain[it]; 1083 | off = 0; 1084 | } 1085 | } 1086 | 1087 | __syncthreads(); 1088 | } 1089 | } 1090 | } 1091 | 1092 | atomicAdd(results, count); 1093 | 1094 | __syncthreads(); 1095 | } 1096 | 1097 | /*maximum size of output, we always write at *write_offset MOD (FOLD+1)* 1098 | we use it in order to simulate the cases that output size explodes. we do the actual writes then overwrite them*/ 1099 | #define FOLD ((1 << 24) - 1) 1100 | /*the number of elements that can be stored in a warp-level buffer during the join materialization*/ 1101 | #define SHUFFLE_SIZE 16 1102 | 1103 | 1104 | /*practically the same as join_partitioned_aggregate 1105 | 1106 | i add extra comments for the materialization technique*/ 1107 | __global__ void join_partitioned_results ( 1108 | const int32_t* R, 1109 | const int32_t* Pr, 1110 | const uint32_t* R_chain, 1111 | const uint32_t* bucket_info, 1112 | const int32_t* S, 1113 | const int32_t* Ps, 1114 | const uint32_t* S_cnts, 1115 | const uint32_t* S_chain, 1116 | int32_t log_parts, 1117 | uint32_t* buckets_num, 1118 | int32_t* results, 1119 | int32_t* output) { 1120 | __shared__ int16_t elem[4096 + 512]; 1121 | __shared__ int32_t payload[4096 + 512]; 1122 | __shared__ int16_t next[4096 + 512]; 1123 | __shared__ int32_t head[LOCAL_BUCKETS]; 1124 | __shared__ int32_t shuffle[2*SHUFFLE_SIZE*32]; 1125 | 1126 | 1127 | int tid = threadIdx.x; 1128 | int block = blockIdx.x; 1129 | int width = blockDim.x; 1130 | int pwidth = gridDim.x; 1131 | int parts = 1 << log_parts; 1132 | 1133 | int lid = tid % 32; 1134 | int gid = tid / 32; 1135 | int gnum = blockDim.x/32; 1136 | 1137 | int count = 0; 1138 | 1139 | int ptr; 1140 | 1141 | int threadmask = (lid < 31)? 
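/* Illustrative note on the warp-level materialization buffer (a sketch of the mechanism, using the names defined here): threadmask for lane lid keeps only the ballot bits of lanes with a higher lane id, so after mask = __ballot(wr_intention) each matching lane claims the distinct slot wr_offset = shuffle_ptr + __popc(mask & threadmask), and shuffle_ptr then advances by __popc(mask), the number of matches the warp produced in this step. Once shuffle_ptr reaches SHUFFLE_SIZE, lane 0 reserves room in the global output with atomicAdd(results, 2*SHUFFLE_SIZE) (masked with FOLD so writes wrap, simulating a very large output as described above) and the whole warp flushes the 2*SHUFFLE_SIZE buffered values of the matched pairs. */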
~((1 << (lid+1)) - 1) : 0; 1142 | 1143 | int shuffle_ptr = 0; 1144 | 1145 | int32_t* warp_shuffle = shuffle + gid * 2 * SHUFFLE_SIZE; 1146 | 1147 | int buckets_cnt = *buckets_num; 1148 | 1149 | 1150 | for (uint32_t bucket_r = block; bucket_r < buckets_cnt; bucket_r += pwidth) { 1151 | int info = bucket_info[bucket_r]; 1152 | 1153 | if (info != 0) { 1154 | int p = info >> 15; 1155 | int len_R = info & ((1 << 15) - 1); 1156 | int len_S = S_cnts[p]; 1157 | 1158 | if (len_S > 4096+512) { 1159 | int bucket_r_loop = bucket_r; 1160 | 1161 | for (int offset_r = 0; offset_r < len_R; offset_r += bucket_size) { 1162 | for (int i = tid; i < LOCAL_BUCKETS; i += blockDim.x) 1163 | head[i] = -1; 1164 | __syncthreads(); 1165 | 1166 | for (int base_r = 0; base_r < bucket_size; base_r += 4*blockDim.x) { 1167 | vec4 data_R = *(reinterpret_cast(R + bucket_size * bucket_r_loop + base_r + 4*threadIdx.x)); 1168 | vec4 data_Pr = *(reinterpret_cast(Pr + bucket_size * bucket_r_loop + base_r + 4*threadIdx.x)); 1169 | int l_cnt_R = len_R - offset_r - base_r - 4 * threadIdx.x; 1170 | 1171 | int cnt = 0; 1172 | 1173 | #pragma unroll 1174 | for (int k = 0; k < 4; k++) { 1175 | if (k < l_cnt_R) { 1176 | int val = data_R.i[k]; 1177 | elem[base_r + k*blockDim.x + tid] = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1178 | payload[base_r + k*blockDim.x + tid] = data_Pr.i[k]; 1179 | int hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1180 | 1181 | int32_t last = atomicExch(&head[hval], base_r + k*blockDim.x + tid); 1182 | next[base_r + k*blockDim.x + tid] = last; 1183 | } 1184 | } 1185 | } 1186 | 1187 | bucket_r_loop = R_chain[bucket_r_loop]; 1188 | 1189 | __syncthreads(); 1190 | 1191 | int bucket_s_loop = p; 1192 | int base_s = 0; 1193 | 1194 | for (int offset_s = 0; offset_s < len_S; offset_s += 4*blockDim.x) { 1195 | vec4 data_S = *(reinterpret_cast(S + bucket_size * bucket_s_loop + base_s + 4*threadIdx.x)); 1196 | vec4 data_Ps = *(reinterpret_cast(Ps + bucket_size * bucket_s_loop + base_s + 4*threadIdx.x)); 1197 | int l_cnt_S = len_S - offset_s - 4 * threadIdx.x; 1198 | 1199 | #pragma unroll 1200 | for (int k = 0; k < 4; k++) { 1201 | int32_t val = data_S.i[k]; 1202 | int32_t pval = data_Ps.i[k]; 1203 | int16_t tval = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1204 | int32_t hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1205 | int32_t pay; 1206 | 1207 | int32_t pos = (k < l_cnt_S)? 
head[hval] : -1; 1208 | 1209 | /*check at warp level whether someone is still following chain => this way we can shuffle without risk*/ 1210 | int pred = (pos >= 0); 1211 | 1212 | while (__any(pred)) { 1213 | int wr_intention = 0; 1214 | 1215 | /*we have a match, fetch the data to be written*/ 1216 | if (pred) { 1217 | if (elem[pos] == tval) { 1218 | pay = payload[pos]; 1219 | wr_intention = 1; 1220 | count++; 1221 | } 1222 | 1223 | pos = next[pos]; 1224 | pred = (pos >= 0); 1225 | } 1226 | 1227 | /*find out who had a match in this execution step*/ 1228 | int mask = __ballot(wr_intention); 1229 | 1230 | /*our software managed buffer will overflow, flush it*/ 1231 | int wr_offset = shuffle_ptr + __popc(mask & threadmask); 1232 | shuffle_ptr = shuffle_ptr + __popc(mask); 1233 | 1234 | /*while it overflows, flush 1235 | we flush 16 keys and then the 16 corresponding payloads consecutively, of course other formats might be friendlier*/ 1236 | while (shuffle_ptr >= SHUFFLE_SIZE) { 1237 | if (wr_intention && (wr_offset < SHUFFLE_SIZE)) { 1238 | warp_shuffle[wr_offset] = pay; 1239 | warp_shuffle[wr_offset+SHUFFLE_SIZE] = pval; 1240 | wr_intention = 0; 1241 | } 1242 | 1243 | if (lid == 0) { 1244 | ptr = atomicAdd(results, 2*SHUFFLE_SIZE); 1245 | ptr = ptr & FOLD; 1246 | } 1247 | 1248 | ptr = __shfl(ptr, 0); 1249 | 1250 | output[ptr + lid] = warp_shuffle[lid]; 1251 | 1252 | wr_offset -= SHUFFLE_SIZE; 1253 | shuffle_ptr -= SHUFFLE_SIZE; 1254 | } 1255 | 1256 | /*now the fit, write them in buffer*/ 1257 | if (wr_intention && (wr_offset >= 0)) { 1258 | warp_shuffle[wr_offset] = pay; 1259 | warp_shuffle[wr_offset+SHUFFLE_SIZE] = pval; 1260 | wr_intention = 0; 1261 | } 1262 | } 1263 | } 1264 | 1265 | base_s += 4*blockDim.x; 1266 | if (base_s >= bucket_size) { 1267 | bucket_s_loop = S_chain[bucket_s_loop]; 1268 | base_s = 0; 1269 | } 1270 | } 1271 | 1272 | __syncthreads(); 1273 | } 1274 | } else { 1275 | for (int i = tid; i < LOCAL_BUCKETS; i += blockDim.x) 1276 | head[i] = -1; 1277 | 1278 | int rem_s = len_S % 4096; 1279 | rem_s = (rem_s + 4 - 1)/4; 1280 | 1281 | __syncthreads(); 1282 | 1283 | int off; 1284 | int it; 1285 | int base = 0; 1286 | 1287 | it = p; 1288 | off = 0; 1289 | 1290 | 1291 | for (off = 0; off < len_S;) { 1292 | vec4 data_S = *(reinterpret_cast(S + bucket_size * it + base + 4*threadIdx.x)); 1293 | vec4 data_Ps = *(reinterpret_cast(Ps + bucket_size * it + base +4*threadIdx.x)); 1294 | int l_cnt_S = len_S - off - 4 * threadIdx.x; 1295 | 1296 | #pragma unroll 1297 | for (int k = 0; k < 4; k++) { 1298 | if (k < l_cnt_S) { 1299 | int val = data_S.i[k]; 1300 | elem[off + tid] = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1301 | payload[off + tid] = data_Ps.i[k]; 1302 | int hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1303 | 1304 | int32_t last = atomicExch(&head[hval], off + tid); 1305 | next[off + tid] = last; 1306 | } 1307 | 1308 | off += (off < bucket_size)? 
blockDim.x : rem_s; 1309 | base += blockDim.x; 1310 | } 1311 | 1312 | if (base >= bucket_size) { 1313 | it = S_chain[it]; 1314 | base = 0; 1315 | } 1316 | } 1317 | 1318 | __syncthreads(); 1319 | 1320 | it = bucket_r; 1321 | off = 0; 1322 | 1323 | for (; 0 < len_R; off += 4*blockDim.x, len_R -= 4*blockDim.x) { 1324 | int l_cnt_R = len_R - 4 * threadIdx.x; 1325 | vec4 data_R; 1326 | vec4 data_Pr; 1327 | 1328 | data_R = *(reinterpret_cast(R + bucket_size * it + off + 4*threadIdx.x)); 1329 | data_Pr = *(reinterpret_cast(Pr + bucket_size * it + off + 4*threadIdx.x)); 1330 | 1331 | #pragma unroll 1332 | for (int k = 0; k < 4; k++) { 1333 | int32_t val = data_R.i[k]; 1334 | int32_t pval = data_Pr.i[k]; 1335 | int16_t tval = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1336 | int32_t hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1337 | int32_t pay; 1338 | 1339 | int32_t pos = (k < l_cnt_R)? head[hval] : -1; 1340 | 1341 | /*same as previous code block*/ 1342 | int pred = (pos >= 0); 1343 | 1344 | while (__any(pred)) { 1345 | int wr_intention = 0; 1346 | 1347 | if (pred) { 1348 | if (elem[pos] == tval) { 1349 | pay = payload[pos]; 1350 | wr_intention = 1; 1351 | count++; 1352 | } 1353 | 1354 | pos = next[pos]; 1355 | pred = (pos >= 0); 1356 | } 1357 | 1358 | int mask = __ballot(wr_intention); 1359 | 1360 | int wr_offset = shuffle_ptr + __popc(mask & threadmask); 1361 | shuffle_ptr = shuffle_ptr + __popc(mask); 1362 | 1363 | while (shuffle_ptr >= SHUFFLE_SIZE) { 1364 | if (wr_intention && (wr_offset < SHUFFLE_SIZE)) { 1365 | warp_shuffle[wr_offset] = pval; 1366 | warp_shuffle[wr_offset+SHUFFLE_SIZE] = pay; 1367 | wr_intention = 0; 1368 | } 1369 | 1370 | if (lid == 0) { 1371 | ptr = atomicAdd(results, 2*SHUFFLE_SIZE); 1372 | 1373 | ptr = ptr & FOLD; 1374 | } 1375 | 1376 | ptr = __shfl(ptr, 0); 1377 | 1378 | output[ptr + lid] = warp_shuffle[lid]; 1379 | 1380 | wr_offset -= SHUFFLE_SIZE; 1381 | shuffle_ptr -= SHUFFLE_SIZE; 1382 | } 1383 | 1384 | if (wr_intention && (wr_offset >= 0)) { 1385 | warp_shuffle[wr_offset] = pval; 1386 | warp_shuffle[wr_offset+SHUFFLE_SIZE] = pay; 1387 | wr_intention = 0; 1388 | } 1389 | } 1390 | } 1391 | 1392 | if (off >= bucket_size) { 1393 | it = R_chain[it]; 1394 | off = 0; 1395 | } 1396 | } 1397 | 1398 | __syncthreads(); 1399 | } 1400 | } 1401 | } 1402 | 1403 | if (lid == 0) { 1404 | ptr = atomicAdd(results, 2*shuffle_ptr); 1405 | ptr = ptr & FOLD; 1406 | } 1407 | 1408 | ptr = __shfl(ptr, 0); 1409 | 1410 | if (lid < shuffle_ptr) { 1411 | output[ptr + lid] = warp_shuffle[lid]; 1412 | output[ptr + lid + shuffle_ptr] = warp_shuffle[lid + SHUFFLE_SIZE]; 1413 | } 1414 | 1415 | __syncthreads(); 1416 | } 1417 | 1418 | /*again the same but payload is the virtual tuple id and we late materialize from Dx arrays which store the actual columns that we need 1419 | also here we have no overflows because if we did, we wouldn't fit the data/extra columns :) */ 1420 | __global__ void join_partitioned_varpayload ( 1421 | const int32_t* R, 1422 | const int32_t* Pr, 1423 | const int32_t* Dr, 1424 | const uint32_t* R_chain, 1425 | const uint32_t* bucket_info, 1426 | const int32_t* S, 1427 | const int32_t* Ps, 1428 | const int32_t* Ds, 1429 | const uint32_t* S_cnts, 1430 | const uint32_t* S_chain, 1431 | int32_t log_parts, 1432 | int32_t col_num1, 1433 | int32_t col_num2, 1434 | int32_t rel_size, 1435 | uint32_t* buckets_num, 1436 | int32_t* results) { 1437 | __shared__ int16_t elem[4096 + 512]; 1438 | __shared__ int32_t payload[4096 + 512]; 1439 | __shared__ int16_t next[4096 + 
512]; 1440 | __shared__ int32_t head[LOCAL_BUCKETS]; 1441 | 1442 | 1443 | int tid = threadIdx.x; 1444 | int block = blockIdx.x; 1445 | int width = blockDim.x; 1446 | int pwidth = gridDim.x; 1447 | int parts = 1 << log_parts; 1448 | 1449 | int lid = tid % 32; 1450 | int gnum = blockDim.x/32; 1451 | 1452 | int count = 0; 1453 | 1454 | int buckets_cnt = *buckets_num; 1455 | 1456 | for (uint32_t bucket_r = block; bucket_r < buckets_cnt; bucket_r += pwidth) { 1457 | int info = bucket_info[bucket_r]; 1458 | 1459 | if (info != 0) { 1460 | int p = info >> 15; 1461 | int len_R = info & ((1 << 15) - 1); 1462 | 1463 | int len_S = S_cnts[p]; 1464 | 1465 | for (int i = tid; i < LOCAL_BUCKETS; i += blockDim.x) 1466 | head[i] = -1; 1467 | 1468 | int rem_s = len_S % 4096; 1469 | rem_s = (rem_s + 4 - 1)/4; 1470 | 1471 | __syncthreads(); 1472 | 1473 | int off; 1474 | int it; 1475 | int base = 0; 1476 | 1477 | it = p; 1478 | off = 0; 1479 | 1480 | for (off = 0; off < len_S;) { 1481 | vec4 data_S = *(reinterpret_cast(S + bucket_size * it + base + 4*threadIdx.x)); 1482 | vec4 data_Ps = *(reinterpret_cast(Ps + bucket_size * it + base +4*threadIdx.x)); 1483 | int l_cnt_S = len_S - off - 4 * threadIdx.x; 1484 | 1485 | #pragma unroll 1486 | for (int k = 0; k < 4; k++) { 1487 | if (k < l_cnt_S) { 1488 | int val = data_S.i[k]; 1489 | elem[off + tid] = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1490 | payload[off + tid] = data_Ps.i[k]; 1491 | int hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1492 | 1493 | int32_t last = atomicExch(&head[hval], off + tid); 1494 | next[off + tid] = last; 1495 | } 1496 | 1497 | off += (off < bucket_size)? blockDim.x : rem_s; 1498 | } 1499 | 1500 | if (base >= bucket_size) { 1501 | it = S_chain[it]; 1502 | base = 0; 1503 | } 1504 | 1505 | 1506 | } 1507 | 1508 | __syncthreads(); 1509 | 1510 | it = bucket_r; 1511 | off = 0; 1512 | 1513 | for (; 0 < len_R; off += 4*blockDim.x, len_R -= 4*blockDim.x) { 1514 | vec4 data_R = *(reinterpret_cast(R + bucket_size * it + off + 4*threadIdx.x)); 1515 | vec4 data_Pr = *(reinterpret_cast(Pr + bucket_size * it + off + 4*threadIdx.x)); 1516 | int l_cnt_R = len_R - 4 * threadIdx.x; 1517 | 1518 | #pragma unroll 1519 | for (int k = 0; k < 4; k++) { 1520 | int32_t val = data_R.i[k]; 1521 | int32_t pval = data_Pr.i[k]; 1522 | int16_t tval = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1523 | int32_t hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1524 | 1525 | if (k < l_cnt_R) { 1526 | int32_t pos = head[hval]; 1527 | while (pos >= 0) { 1528 | if (elem[pos] == tval) { 1529 | int32_t bval = payload[pos]; 1530 | 1531 | for (int z = 0; z < col_num1; z++) 1532 | count += Dr[pval + z*rel_size]; 1533 | 1534 | for (int z = 0; z < col_num2; z++) 1535 | count += Ds[bval + z*rel_size]; 1536 | } 1537 | 1538 | pos = next[pos]; 1539 | } 1540 | } 1541 | } 1542 | 1543 | if (off >= bucket_size) { 1544 | it = R_chain[it]; 1545 | off = 0; 1546 | } 1547 | } 1548 | 1549 | __syncthreads(); 1550 | 1551 | } 1552 | } 1553 | 1554 | atomicAdd(results, count); 1555 | 1556 | __syncthreads(); 1557 | } 1558 | 1559 | /*late materialization and perfect hashing*/ 1560 | __global__ void probe_perfect_array_varpay (int32_t* data, int32_t* Dr, int n, int32_t* lookup, int32_t* Ds, int col_num1, int col_num2, int rel_size, int* aggr) { 1561 | int count = 0; 1562 | 1563 | for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < n ; i += blockDim.x * gridDim.x) { 1564 | int val = data[i]; 1565 | int payload = i; 1566 | int res = lookup[val]; 1567 | 1568 | if (res > 0) { 1569 
| res--; 1570 | 1571 | for (int z = 0; z < col_num1; z++) 1572 | count += Dr[payload + z*rel_size]; 1573 | for (int z = 0; z < col_num2; z++) 1574 | count += Ds[res + z*rel_size]; 1575 | } 1576 | } 1577 | 1578 | atomicAdd(aggr, count); 1579 | } 1580 | 1581 | /*partition and compute metadata for relation with key+payload*/ 1582 | void prepare_Relation_payload (int* R, int* R_temp, int* P, int* P_temp, size_t RelsNum, uint32_t buckets_num, uint64_t* heads[2], uint32_t* cnts[2], uint32_t* chains[2], uint32_t* buckets_used[2], uint32_t log_parts1, uint32_t log_parts2, uint32_t first_bit, cudaStream_t streams, size_t* offsets_GPU, uint32_t num_threads) { 1583 | init_metadata_double<<<64, 1024, 0, streams>>> ( 1584 | heads[0], buckets_used[0], chains[0], cnts[0], 1 << log_parts1, buckets_num, 1585 | heads[1], buckets_used[1], chains[1], cnts[1], 1 << (log_parts1 + log_parts2), buckets_num 1586 | ); 1587 | 1588 | partition_pass_one <<<64, 1024, (1024*4 + 4*(1 << log_parts1)) * sizeof(int32_t) + (4*num_threads+2)*sizeof(size_t), streams>>>( 1589 | R, P, 1590 | offsets_GPU, 1591 | heads[0], 1592 | buckets_used[0], 1593 | chains[0], 1594 | cnts[0], 1595 | R_temp, P_temp, 1596 | RelsNum, 1597 | log_parts1, 1598 | first_bit + log_parts2, 1599 | num_threads 1600 | ); 1601 | 1602 | 1603 | compute_bucket_info <<<64, 1024, 0, streams>>> (chains[0], cnts[0], log_parts1); 1604 | 1605 | partition_pass_two <<<64, 1024, (1024*4 + 4*(1 << log_parts2)) * sizeof(int32_t) + ((2 * (1 << log_parts2) + 1)* sizeof(int32_t)), streams>>>( 1606 | R_temp, P_temp, 1607 | chains[0], 1608 | buckets_used[1], heads[1], chains[1], cnts[1], 1609 | R, P, 1610 | log_parts1, log_parts2, first_bit, 1611 | buckets_used[0]); 1612 | 1613 | } 1614 | 1615 | /*partition and compute metadata for relation with key+payload. 
We use different buffers at the end (it makes sense for UVA based techniques)*/ 1616 | void prepare_Relation_payload_triple (int* R, int* R_temp, int* R_final, int* P, int* P_temp, int* P_final, size_t RelsNum, uint32_t buckets_num, uint64_t* heads[2], uint32_t* cnts[2], uint32_t* chains[2], uint32_t* buckets_used[2], uint32_t log_parts1, uint32_t log_parts2, uint32_t first_bit, cudaStream_t streams, size_t* offsets_GPU, uint32_t num_threads) { 1617 | init_metadata_double<<<64, 1024, 0, streams>>> ( 1618 | heads[0], buckets_used[0], chains[0], cnts[0], 1 << log_parts1, buckets_num, 1619 | heads[1], buckets_used[1], chains[1], cnts[1], 1 << (log_parts1 + log_parts2), buckets_num 1620 | ); 1621 | 1622 | partition_pass_one <<<64, 1024, (1024*4 + 4*(1 << log_parts1)) * sizeof(int32_t) + (4*num_threads+2)*sizeof(size_t), streams>>>( 1623 | R, P, 1624 | offsets_GPU, 1625 | heads[0], 1626 | buckets_used[0], 1627 | chains[0], 1628 | cnts[0], 1629 | R_temp, P_temp, 1630 | RelsNum, 1631 | log_parts1, 1632 | first_bit + log_parts2, 1633 | num_threads 1634 | ); 1635 | 1636 | CHK_ERROR(cudaDeviceSynchronize()); 1637 | 1638 | 1639 | compute_bucket_info <<<64, 1024, 0, streams>>> (chains[0], cnts[0], log_parts1); 1640 | 1641 | partition_pass_two <<<64, 1024, (1024*4 + 4*(1 << log_parts2)) * sizeof(int32_t) + ((2 * (1 << log_parts2) + 1)* sizeof(int32_t)), streams>>>( 1642 | R_temp, P_temp, 1643 | chains[0], 1644 | buckets_used[1], heads[1], chains[1], cnts[1], 1645 | R_final, P_final, 1646 | log_parts1, log_parts2, first_bit, 1647 | buckets_used[0]); 1648 | 1649 | 1650 | 1651 | } 1652 | 1653 | template 1654 | struct chain_iterator_ref_generic{ 1655 | Tv x ; 1656 | int cnt; 1657 | }; 1658 | 1659 | template 1660 | class chain_iterator_generic{ 1661 | private: 1662 | const T * __restrict__ S_parts ; 1663 | const uint32_t * __restrict__ S_chains ; 1664 | const uint32_t cnt ; 1665 | 1666 | const T * __restrict__ ptr ; 1667 | 1668 | uint32_t current_bucket ; 1669 | uint32_t next_bucket ; 1670 | uint32_t i ; 1671 | public: 1672 | __device__ __forceinline__ chain_iterator_generic( 1673 | const T * __restrict__ S_parts , 1674 | const uint32_t * __restrict__ S_cnts , 1675 | const uint32_t * __restrict__ S_chains , 1676 | uint32_t current_partition): 1677 | S_parts(S_parts + (16/sizeof(T)) * threadIdx.x), S_chains(S_chains), 1678 | cnt((S_cnts[current_partition]/((16/sizeof(T)) * blockDim.x))*(16/sizeof(T)) + max(((int32_t) (S_cnts[current_partition] % ((16/sizeof(T)) * blockDim.x))) - ((int32_t) ((16/sizeof(T)) * threadIdx.x)), 0)), 1679 | ptr(S_parts + ((size_t) current_partition << log2_bucket_size) + (16/sizeof(T)) * threadIdx.x), 1680 | current_bucket(current_partition), 1681 | next_bucket(S_chains[current_partition]), 1682 | i(0){} 1683 | 1684 | __device__ __forceinline__ chain_iterator_generic( 1685 | const uint32_t * __restrict__ S_cnts, 1686 | uint32_t current_partition): 1687 | cnt(0), 1688 | i(((S_cnts[current_partition] + (16/sizeof(T)) * blockDim.x - 1)/((16/sizeof(T)) * blockDim.x))*(16/sizeof(T))){} 1689 | 1690 | __device__ __forceinline__ chain_iterator_generic& operator++(){ 1691 | i += (16/sizeof(T));// * blockDim.x; 1692 | ptr += (16/sizeof(T)) * blockDim.x; 1693 | 1694 | if ((i * blockDim.x) & bucket_size_mask) return *this; 1695 | 1696 | current_bucket = next_bucket;//int_shared[0]; 1697 | 1698 | ptr = S_parts + (current_bucket << log2_bucket_size); 1699 | 1700 | next_bucket = S_chains[next_bucket]; 1701 | 1702 | return *this; 1703 | } 1704 | 1705 | __device__ __forceinline__ 
chain_iterator_ref_generic operator*() const { 1706 | chain_iterator_ref_generic tmp; 1707 | tmp.x = *reinterpret_cast(ptr); 1708 | tmp.cnt = cnt - i; 1709 | return tmp; 1710 | } 1711 | 1712 | __device__ __forceinline__ bool operator!=(const chain_iterator_generic& o){ 1713 | return i != o.i; 1714 | } 1715 | }; 1716 | 1717 | template 1718 | class chain_generic{ 1719 | private: 1720 | const T * __restrict__ S_parts ; 1721 | const uint32_t * __restrict__ S_cnts ; 1722 | const uint32_t * __restrict__ S_chains ; 1723 | const uint32_t partition; 1724 | public: 1725 | __device__ __host__ __forceinline__ chain_generic( 1726 | const T * __restrict__ S_parts , 1727 | const uint32_t * __restrict__ S_cnts , 1728 | const uint32_t * __restrict__ S_chains , 1729 | uint32_t partition): 1730 | S_parts(S_parts), S_cnts(S_cnts), S_chains(S_chains), partition(partition){} 1731 | 1732 | __device__ __forceinline__ chain_iterator_generic begin() const { 1733 | return chain_iterator_generic(S_parts, S_cnts, S_chains, partition); 1734 | } 1735 | 1736 | __device__ __forceinline__ chain_iterator_generic end() const { 1737 | return chain_iterator_generic(S_cnts, partition); 1738 | } 1739 | }; 1740 | 1741 | template 1742 | class chains_generic { 1743 | private: 1744 | const T * __restrict__ S_parts ; 1745 | const uint32_t * __restrict__ S_cnts ; 1746 | const uint32_t * __restrict__ S_chains ; 1747 | public: 1748 | __device__ __host__ __forceinline__ chains_generic( 1749 | const T * __restrict__ S_parts , 1750 | const uint32_t * __restrict__ S_cnts , 1751 | const uint32_t * __restrict__ S_chains ): 1752 | S_parts(S_parts), S_cnts(S_cnts), S_chains(S_chains){} 1753 | 1754 | __device__ __host__ __forceinline__ chain_generic get_chain(uint32_t partition) const{ 1755 | return chain_generic(S_parts, S_cnts, S_chains, partition); 1756 | } 1757 | 1758 | __device__ __forceinline__ uint32_t get_chain_size(uint32_t partition) const{ 1759 | return S_cnts[partition]; 1760 | } 1761 | }; 1762 | 1763 | struct chain_iterator_ref{ 1764 | vec4 x ; 1765 | int cnt; 1766 | }; 1767 | 1768 | struct chain_iterator_i_ref{ 1769 | int32_t x; 1770 | bool v; 1771 | }; 1772 | 1773 | class chain_iterator{ 1774 | private: 1775 | const int32_t * __restrict__ S_parts ; 1776 | const uint32_t * __restrict__ S_chains ; 1777 | const uint32_t cnt ; 1778 | 1779 | const int32_t * __restrict__ ptr ; 1780 | 1781 | uint32_t current_bucket ; 1782 | uint32_t next_bucket ; 1783 | uint32_t i ; 1784 | public: 1785 | // __device__ __forceinline__ chain_iterator( 1786 | // const int32_t * __restrict__ S_parts , 1787 | // const uint32_t * __restrict__ S_cnts , 1788 | // const uint32_t * __restrict__ S_chains ): 1789 | // S_parts(S_parts), S_chains(S_chains), cnt(S_cnts[blockIdx.x]), current_bucket(blockIdx.x), i(0){} 1790 | 1791 | __device__ __forceinline__ chain_iterator( 1792 | const int32_t * __restrict__ S_parts , 1793 | const uint32_t * __restrict__ S_cnts , 1794 | const uint32_t * __restrict__ S_chains , 1795 | uint32_t current_partition): 1796 | S_parts(S_parts + 4 * threadIdx.x), S_chains(S_chains), cnt((S_cnts[current_partition]/(4 * blockDim.x))*4 + max(((int32_t) (S_cnts[current_partition] % (4 * blockDim.x))) - ((int32_t) (4 * threadIdx.x)), 0)), ptr(S_parts + ((size_t) current_partition << log2_bucket_size) + 4 * threadIdx.x), current_bucket(current_partition), next_bucket(S_chains[current_partition]), i(0){} 1797 | 1798 | // __device__ __forceinline__ chain_iterator( 1799 | // const uint32_t * __restrict__ S_cnts): 1800 | // cnt(0), i(((S_cnts[blockIdx.x] 
+ 4 * blockDim.x - 1)/(4 * blockDim.x)) * 4 * blockDim.x){} 1801 | 1802 | __device__ __forceinline__ chain_iterator( 1803 | const uint32_t * __restrict__ S_cnts, 1804 | uint32_t current_partition): 1805 | cnt(0), i(((S_cnts[current_partition] + 4 * blockDim.x - 1)/(4 * blockDim.x))*4){} 1806 | 1807 | __device__ __forceinline__ chain_iterator& operator++(){ 1808 | i += 4;// * blockDim.x; 1809 | ptr += 4 * blockDim.x; 1810 | 1811 | if ((i * blockDim.x) & bucket_size_mask) return *this; 1812 | 1813 | current_bucket = next_bucket;//int_shared[0]; 1814 | 1815 | ptr = S_parts + (current_bucket << log2_bucket_size); 1816 | 1817 | next_bucket = S_chains[next_bucket]; 1818 | 1819 | return *this; 1820 | } 1821 | 1822 | __device__ __forceinline__ chain_iterator_ref operator*() const { 1823 | chain_iterator_ref tmp; 1824 | tmp.x = *reinterpret_cast(ptr); 1825 | tmp.cnt = cnt - i; 1826 | return tmp; 1827 | } 1828 | 1829 | __device__ __forceinline__ bool operator!=(const chain_iterator& o){ 1830 | return i != o.i; 1831 | } 1832 | }; 1833 | 1834 | class chain_iterator_i{ 1835 | private: 1836 | const int32_t * __restrict__ S_parts ; 1837 | const uint32_t * __restrict__ S_chains ; 1838 | const uint32_t cnt ; 1839 | 1840 | const int32_t * __restrict__ ptr ; 1841 | 1842 | uint32_t current_bucket ; 1843 | uint32_t next_bucket ; 1844 | uint32_t i ; 1845 | public: 1846 | // __device__ __forceinline__ chain_iterator_i( 1847 | // const int32_t * __restrict__ S_parts , 1848 | // const uint32_t * __restrict__ S_cnts , 1849 | // const uint32_t * __restrict__ S_chains ): 1850 | // S_parts(S_parts), S_chains(S_chains), cnt(S_cnts[blockIdx.x]), current_bucket(blockIdx.x), i(0){} 1851 | 1852 | __device__ __forceinline__ chain_iterator_i( 1853 | const int32_t * __restrict__ S_parts , 1854 | const uint32_t * __restrict__ S_cnts , 1855 | const uint32_t * __restrict__ S_chains , 1856 | uint32_t current_partition): 1857 | S_parts(S_parts + threadIdx.x), S_chains(S_chains), cnt((S_cnts[current_partition]/blockDim.x) + max(((int32_t) (S_cnts[current_partition] % (blockDim.x))) - ((int32_t) (threadIdx.x)), 0)), ptr(S_parts + ((size_t) current_partition << log2_bucket_size) + threadIdx.x), current_bucket(current_partition), next_bucket(S_chains[current_partition]), i(0){} 1858 | 1859 | // __device__ __forceinline__ chain_iterator_i( 1860 | // const uint32_t * __restrict__ S_cnts): 1861 | // cnt(0), i(((S_cnts[blockIdx.x] + 4 * blockDim.x - 1)/(4 * blockDim.x)) * 4 * blockDim.x){} 1862 | 1863 | __device__ __forceinline__ chain_iterator_i( 1864 | const uint32_t * __restrict__ S_cnts, 1865 | uint32_t current_partition): 1866 | cnt(0), i(((S_cnts[current_partition] + blockDim.x - 1)/(blockDim.x))){} 1867 | 1868 | __device__ __forceinline__ chain_iterator_i& operator++(){ 1869 | ++i;// * blockDim.x; 1870 | ptr += blockDim.x; 1871 | 1872 | if ((i * blockDim.x) & bucket_size_mask) return *this; 1873 | 1874 | current_bucket = next_bucket;//int_shared[0]; 1875 | 1876 | ptr = S_parts + (current_bucket << log2_bucket_size); 1877 | 1878 | next_bucket = S_chains[next_bucket]; 1879 | 1880 | return *this; 1881 | } 1882 | 1883 | __device__ __forceinline__ chain_iterator_i_ref operator*() const { 1884 | chain_iterator_i_ref tmp; 1885 | tmp.x = *ptr; 1886 | tmp.v = i < cnt; 1887 | return tmp; 1888 | } 1889 | 1890 | __device__ __forceinline__ bool operator!=(const chain_iterator_i& o){ 1891 | return i != o.i; 1892 | } 1893 | }; 1894 | 1895 | class chain_i{ 1896 | private: 1897 | const int32_t * __restrict__ S_parts ; 1898 | const uint32_t * 
__restrict__ S_cnts ; 1899 | const uint32_t * __restrict__ S_chains ; 1900 | const uint32_t partition; 1901 | public: 1902 | __device__ __host__ __forceinline__ chain_i( 1903 | const int32_t * __restrict__ S_parts , 1904 | const uint32_t * __restrict__ S_cnts , 1905 | const uint32_t * __restrict__ S_chains , 1906 | uint32_t partition): 1907 | S_parts(S_parts), S_cnts(S_cnts), S_chains(S_chains), partition(partition){} 1908 | 1909 | __device__ __forceinline__ chain_iterator_i begin() const { 1910 | return chain_iterator_i(S_parts, S_cnts, S_chains, partition); 1911 | } 1912 | 1913 | __device__ __forceinline__ chain_iterator_i end() const { 1914 | return chain_iterator_i(S_cnts, partition); 1915 | } 1916 | }; 1917 | 1918 | class chain{ 1919 | private: 1920 | const int32_t * __restrict__ S_parts ; 1921 | const uint32_t * __restrict__ S_cnts ; 1922 | const uint32_t * __restrict__ S_chains ; 1923 | const uint32_t partition; 1924 | public: 1925 | __device__ __host__ __forceinline__ chain( 1926 | const int32_t * __restrict__ S_parts , 1927 | const uint32_t * __restrict__ S_cnts , 1928 | const uint32_t * __restrict__ S_chains , 1929 | uint32_t partition): 1930 | S_parts(S_parts), S_cnts(S_cnts), S_chains(S_chains), partition(partition){} 1931 | 1932 | __device__ __forceinline__ chain_iterator begin() const { 1933 | return chain_iterator(S_parts, S_cnts, S_chains, partition); 1934 | } 1935 | 1936 | __device__ __forceinline__ chain_iterator end() const { 1937 | return chain_iterator(S_cnts, partition); 1938 | } 1939 | }; 1940 | 1941 | 1942 | class chains{ 1943 | private: 1944 | const int32_t * __restrict__ S_parts ; 1945 | const uint32_t * __restrict__ S_cnts ; 1946 | const uint32_t * __restrict__ S_chains ; 1947 | public: 1948 | __device__ __host__ __forceinline__ chains( 1949 | const int32_t * __restrict__ S_parts , 1950 | const uint32_t * __restrict__ S_cnts , 1951 | const uint32_t * __restrict__ S_chains ): 1952 | S_parts(S_parts), S_cnts(S_cnts), S_chains(S_chains){} 1953 | 1954 | __device__ __host__ __forceinline__ chain get_chain(uint32_t partition) const{ 1955 | return chain(S_parts, S_cnts, S_chains, partition); 1956 | } 1957 | 1958 | __device__ __host__ __forceinline__ chain_i get_chain_i(uint32_t partition) const{ 1959 | return chain_i(S_parts, S_cnts, S_chains, partition); 1960 | } 1961 | 1962 | __device__ __forceinline__ uint32_t get_chain_size(uint32_t partition) const{ 1963 | return S_cnts[partition]; 1964 | } 1965 | }; 1966 | 1967 | /*essentially the join_partitioned_aggregate*/ 1968 | __global__ void join_partitioned_shared ( 1969 | const int32_t* R, 1970 | const int32_t* Pr, 1971 | const uint32_t* R_cnts, 1972 | const uint32_t* R_chain, 1973 | const int32_t* S, 1974 | const int32_t* Ps, 1975 | const uint32_t* S_cnts, 1976 | const uint32_t* S_chain, 1977 | int32_t log_parts, 1978 | int32_t* results) { 1979 | __shared__ int16_t elem[4096 + 512]; 1980 | __shared__ int32_t payload[4096 + 512]; 1981 | __shared__ int16_t next[4096 + 512]; 1982 | __shared__ int32_t head[LOCAL_BUCKETS]; 1983 | 1984 | 1985 | int tid = threadIdx.x; 1986 | int block = blockIdx.x; 1987 | int width = blockDim.x; 1988 | int pwidth = gridDim.x; 1989 | int parts = 1 << log_parts; 1990 | 1991 | int lid = tid % 32; 1992 | int gnum = blockDim.x/32; 1993 | 1994 | int count = 0; 1995 | 1996 | int pr = -1; 1997 | int ps = -1; 1998 | 1999 | 2000 | for (uint32_t p = block; p < parts; p += pwidth) { 2001 | int len_R = R_cnts[p]; 2002 | int len_S = S_cnts[p]; 2003 | 2004 | if (len_S > 4096 + 512) { 2005 | /*it was a 
microbenchmark so I didn't code this part*/ 2006 | continue; 2007 | } else { 2008 | chain R_chains(R, R_cnts, R_chain, p); 2009 | chain Pr_chains(Pr, R_cnts, R_chain, p); 2010 | 2011 | chain S_chains(S, S_cnts, S_chain, p); 2012 | chain Ps_chains(Ps, S_cnts, S_chain, p); 2013 | 2014 | int off = 0; 2015 | 2016 | for (int i = tid; i < LOCAL_BUCKETS; i += blockDim.x) 2017 | head[i] = -1; 2018 | 2019 | int rem_s = len_S % 4096; 2020 | rem_s = (rem_s + 4 - 1)/4; 2021 | 2022 | __syncthreads(); 2023 | 2024 | chain_iterator it_S = S_chains.begin(); 2025 | chain_iterator it_Ps = Ps_chains.begin(); 2026 | 2027 | for (;it_S != S_chains.end(); ++it_S, ++it_Ps) { 2028 | vec4 data_S = (*it_S).x; 2029 | vec4 data_Ps = (*it_Ps).x; 2030 | int l_cnt_S = (*it_S).cnt; 2031 | 2032 | #pragma unroll 2033 | for (int k = 0; k < 4; k++) { 2034 | if (k < l_cnt_S) { 2035 | int val = data_S.i[k]; 2036 | elem[off + tid] = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 2037 | payload[off + tid] = data_Ps.i[k]; 2038 | int hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 2039 | 2040 | int32_t last = atomicExch(&head[hval], off + tid); 2041 | next[off + tid] = last; 2042 | } 2043 | 2044 | off += (off < 4096)? blockDim.x : rem_s; 2045 | } 2046 | } 2047 | 2048 | __syncthreads(); 2049 | 2050 | 2051 | chain_iterator it_R = R_chains.begin(); 2052 | chain_iterator it_Pr = Pr_chains.begin(); 2053 | 2054 | for (;it_R != R_chains.end(); ++it_R, ++it_Pr) { 2055 | vec4 data_R = (*it_R).x; 2056 | vec4 data_Pr = (*it_Pr).x; 2057 | int l_cnt_R = (*it_R).cnt; 2058 | 2059 | #pragma unroll 2060 | for (int k = 0; k < 4; k++) { 2061 | int32_t val = data_R.i[k]; 2062 | int32_t pval = data_Pr.i[k]; 2063 | int16_t tval = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 2064 | int32_t hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 2065 | 2066 | if (k < l_cnt_R) { 2067 | int32_t pos = head[hval]; 2068 | while (pos >= 0) { 2069 | if (elem[pos] == tval) { 2070 | count += pval*payload[pos]; 2071 | } 2072 | 2073 | pos = next[pos]; 2074 | } 2075 | } 2076 | } 2077 | } 2078 | 2079 | 2080 | __syncthreads(); 2081 | } 2082 | } 2083 | 2084 | atomicAdd(results, count); 2085 | 2086 | __syncthreads(); 2087 | } 2088 | 2089 | /*essentially the join_partitioned_aggregate but builds hashtable in GPU memory*/ 2090 | __global__ void join_partitioned_global ( 2091 | const int32_t* R, 2092 | const int32_t* Pr, 2093 | const uint32_t* R_cnts, 2094 | const uint32_t* R_chain, 2095 | const int32_t* S, 2096 | const int32_t* Ps, 2097 | const uint32_t* S_cnts, 2098 | const uint32_t* S_chain, 2099 | int32_t log_parts, 2100 | int32_t* results, 2101 | int32_t* buffer) { 2102 | 2103 | int tid = threadIdx.x; 2104 | int block = blockIdx.x; 2105 | int width = blockDim.x; 2106 | int pwidth = gridDim.x; 2107 | int parts = 1 << log_parts; 2108 | 2109 | buffer += block*8*4096; 2110 | 2111 | int16_t* elem = (int16_t*) buffer; 2112 | int32_t* payload = buffer + 4096 + 512;; 2113 | int16_t* next = (int16_t*) (buffer + 2*(4096 + 512)); 2114 | int32_t* head = buffer + 3*(4096+512); 2115 | 2116 | 2117 | 2118 | int lid = tid % 32; 2119 | int gnum = blockDim.x/32; 2120 | 2121 | int count = 0; 2122 | 2123 | int pr = -1; 2124 | int ps = -1; 2125 | 2126 | 2127 | for (uint32_t p = block; p < parts; p += pwidth) { 2128 | chain R_chains(R, R_cnts, R_chain, p); 2129 | chain Pr_chains(Pr, R_cnts, R_chain, p); 2130 | 2131 | chain S_chains(S, S_cnts, S_chain, p); 2132 | chain Ps_chains(Ps, S_cnts, S_chain, p); 2133 | 2134 | int len_R = R_cnts[p]; 2135 | int len_S = S_cnts[p]; 2136 | 2137 | if 
(len_S > 4096 + 512) { 2138 | /*it was a microbenchmark so I didn't code this part*/ 2139 | continue; 2140 | } else { 2141 | int off = 0; 2142 | 2143 | for (int i = tid; i < LOCAL_BUCKETS; i += blockDim.x) 2144 | head[i] = -1; 2145 | 2146 | int rem_s = len_S % 4096; 2147 | rem_s = (rem_s + 4 - 1)/4; 2148 | 2149 | __syncthreads(); 2150 | 2151 | chain_iterator it_S = S_chains.begin(); 2152 | chain_iterator it_Ps = Ps_chains.begin(); 2153 | 2154 | for (;it_S != S_chains.end(); ++it_S, ++it_Ps) { 2155 | vec4 data_S = (*it_S).x; 2156 | vec4 data_Ps = (*it_Ps).x; 2157 | int l_cnt_S = (*it_S).cnt; 2158 | 2159 | #pragma unroll 2160 | for (int k = 0; k < 4; k++) { 2161 | if (k < l_cnt_S) { 2162 | int val = data_S.i[k]; 2163 | elem[off + tid] = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 2164 | payload[off + tid] = data_Ps.i[k]; 2165 | int hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 2166 | 2167 | int32_t last = atomicExch(&head[hval], off + tid); 2168 | next[off + tid] = last; 2169 | } 2170 | 2171 | off += (off < 4096)? blockDim.x : rem_s; 2172 | } 2173 | } 2174 | 2175 | __syncthreads(); 2176 | 2177 | chain_iterator it_R = R_chains.begin(); 2178 | chain_iterator it_Pr = Pr_chains.begin(); 2179 | 2180 | for (;it_R != R_chains.end(); ++it_R, ++it_Pr) { 2181 | vec4 data_R = (*it_R).x; 2182 | vec4 data_Pr = (*it_Pr).x; 2183 | int l_cnt_R = (*it_R).cnt; 2184 | 2185 | #pragma unroll 2186 | for (int k = 0; k < 4; k++) { 2187 | int32_t val = data_R.i[k]; 2188 | int32_t pval = data_Pr.i[k]; 2189 | int16_t tval = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 2190 | int32_t hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 2191 | 2192 | if (k < l_cnt_R) { 2193 | int32_t pos = head[hval]; 2194 | while (pos >= 0) { 2195 | if (elem[pos] == tval) { 2196 | count += pval*payload[pos]; 2197 | } 2198 | 2199 | pos = next[pos]; 2200 | } 2201 | } 2202 | } 2203 | } 2204 | 2205 | __syncthreads(); 2206 | } 2207 | } 2208 | 2209 | atomicAdd(results, count); 2210 | 2211 | __syncthreads(); 2212 | } -------------------------------------------------------------------------------- /src/join-primitives.cuh: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #ifndef JOIN_PRIMITIVES_HPP_ 23 | #define JOIN_PRIMITIVES_HPP_ 24 | 25 | #include 26 | #include "common.h" 27 | #include "common-host.h" 28 | 29 | #define CLUSTERING_FACTOR 64 30 | 31 | struct alignas(alignof(int64_t)) hj_bucket_2{ 32 | int32_t next; 33 | int32_t val ; 34 | 35 | constexpr __host__ __device__ hj_bucket_2(int32_t next, int32_t value): next(next), val(value){} 36 | }; 37 | 38 | __global__ void init_payload (int* R, int n); 39 | 40 | __global__ void partition_pass_one ( 41 | const int32_t * __restrict__ S, 42 | const int32_t * __restrict__ P, 43 | const size_t * __restrict__ offsets, 44 | uint64_t * __restrict__ heads, 45 | uint32_t * __restrict__ buckets_used, 46 | uint32_t * __restrict__ chains, 47 | uint32_t * __restrict__ out_cnts, 48 | int32_t * __restrict__ output_S, 49 | int32_t * __restrict__ output_P, 50 | size_t cnt, 51 | uint32_t log_parts, 52 | uint32_t first_bit, 53 | uint32_t num_threads); 54 | 55 | __global__ void compute_bucket_info (uint32_t* chains, uint32_t* out_cnts, uint32_t log_parts); 56 | 57 | __global__ void partition_pass_two ( 58 | const int32_t * __restrict__ S, 59 | const int32_t * __restrict__ P, 60 | const uint32_t * __restrict__ bucket_info, 61 | uint32_t * __restrict__ buckets_used, 62 | uint64_t * heads, 63 | uint32_t * __restrict__ chains, 64 | uint32_t * __restrict__ out_cnts, 65 | int32_t * __restrict__ output_S, 66 | int32_t * __restrict__ output_P, 67 | uint32_t S_log_parts, 68 | uint32_t log_parts, 69 | uint32_t first_bit, 70 | uint32_t * bucket_num_ptr); 71 | 72 | __global__ void join_partitioned_shared ( 73 | const int32_t* R, 74 | const int32_t* Pr, 75 | const uint32_t* R_cnts, 76 | const uint32_t* R_chain, 77 | const int32_t* S, 78 | const int32_t* Ps, 79 | const uint32_t* S_cnts, 80 | const uint32_t* S_chain, 81 | int32_t log_parts, 82 | int32_t* results); 83 | 84 | __global__ void join_partitioned_global ( 85 | const int32_t* R, 86 | const int32_t* Pr, 87 | const uint32_t* R_cnts, 88 | const uint32_t* R_chain, 89 | const int32_t* S, 90 | const int32_t* Ps, 91 | const uint32_t* S_cnts, 92 | const uint32_t* S_chain, 93 | int32_t log_parts, 94 | int32_t* results, 95 | int32_t* buffer); 96 | 97 | __global__ void init_metadata_double ( 98 | uint64_t * __restrict__ heads1, 99 | uint32_t * __restrict__ buckets_used1, 100 | uint32_t * __restrict__ chains1, 101 | uint32_t * __restrict__ out_cnts1, 102 | uint32_t parts1, 103 | uint32_t buckets_num1, 104 | uint64_t * __restrict__ heads2, 105 | uint32_t * __restrict__ buckets_used2, 106 | uint32_t * __restrict__ chains2, 107 | uint32_t * __restrict__ out_cnts2, 108 | uint32_t parts2, 109 | uint32_t buckets_num2 110 | ); 111 | 112 | __global__ void build_perfect_array (int32_t* data, int32_t* payload, int n, int32_t* lookup); 113 | 114 | __global__ void probe_perfect_array (int32_t* data, int32_t* payload, int n, int32_t* lookup, int* aggr); 115 | 116 | __global__ void build_ht_chains (int32_t* data, int n, uint32_t log_parts, int32_t* output, int* head); 117 | 118 | __global__ void chains_probing (int32_t* data, int32_t* payload, int n, uint32_t log_parts, int32_t* ht, int32_t* ht_key, int32_t* ht_pay, int* head, int* aggr); 119 | 120 | __global__ void ht_hist (int* data, int n, int log_parts, int* hist); 
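/* Hedged usage sketch (not part of the original header): the kernels declared
   around this point appear to form a two-pass linear hash-table build and probe.
   One plausible host-side call sequence, with illustrative grid/block sizes and
   device buffers assumed to be allocated elsewhere:

       ht_hist        <<<64, 1024>>>(d_R, r_n, log_parts, d_hist);                      // per-bucket key counts
       ht_offsets     <<<64, 1024>>>(log_parts, d_hist, d_offset, d_aggr);              // prefix sum -> bucket offsets
       build_ht_linear<<<64, 1024>>>(d_R, d_Pr, r_n, log_parts, d_offset, d_ht, d_htp); // scatter keys/payloads
       linear_probing <<<64, 1024>>>(d_S, d_Ps, d_ht, d_htp, d_off_s, d_off_e, s_n, log_parts, d_aggr);

   The actual launch configuration and the exact role of offset_s/offset_e are
   defined by the implementations in join-primitives.cu, not by this header. */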
121 | 122 | __global__ void ht_offsets (int log_parts, int* hist, int* offset, int* aggr); 123 | 124 | __global__ void build_ht_linear (int* data, int* payload, size_t n, int log_parts, int* offset, int* ht, int* htp); 125 | 126 | __global__ void linear_probing (int* data, int* payload, int* ht, int* htp, int* offset_s, int* offset_e, size_t n, int log_parts, int* aggr); 127 | 128 | __global__ void decompose_chains (uint32_t* bucket_info, uint32_t* chains, uint32_t* out_cnts, uint32_t log_parts, int threshold); 129 | 130 | __global__ void join_partitioned_aggregate ( 131 | const int32_t* R, 132 | const int32_t* Pr, 133 | const uint32_t* R_chain, 134 | const uint32_t* bucket_info, 135 | const int32_t* S, 136 | const int32_t* Ps, 137 | const uint32_t* S_cnts, 138 | const uint32_t* S_chain, 139 | int32_t log_parts, 140 | uint32_t* buckets_num, 141 | int32_t* results); 142 | 143 | __global__ void join_partitioned_results ( 144 | const int32_t* R, 145 | const int32_t* Pr, 146 | const uint32_t* R_chain, 147 | const uint32_t* bucket_info, 148 | const int32_t* S, 149 | const int32_t* Ps, 150 | const uint32_t* S_cnts, 151 | const uint32_t* S_chain, 152 | int32_t log_parts, 153 | uint32_t* buckets_num, 154 | int32_t* results, 155 | int32_t* output); 156 | 157 | 158 | __global__ void join_partitioned_varpayload ( 159 | const int32_t* R, 160 | const int32_t* Pr, 161 | const int32_t* Dr, 162 | const uint32_t* R_chain, 163 | const uint32_t* bucket_info, 164 | const int32_t* S, 165 | const int32_t* Ps, 166 | const int32_t* Ds, 167 | const uint32_t* S_cnts, 168 | const uint32_t* S_chain, 169 | int32_t log_parts, 170 | int32_t col_num1, 171 | int32_t col_num2, 172 | int32_t rel_size, 173 | uint32_t* buckets_num, 174 | int32_t* results); 175 | 176 | __global__ void probe_perfect_array_varpay (int32_t* data, int32_t* Dr, int n, int32_t* lookup, int32_t* Ds, int col_num1, int col_num2, int res_size, int* aggr); 177 | 178 | void prepare_Relation_payload (int* R, int* R_temp, int* P, int* P_temp, size_t RelsNum, uint32_t buckets_num, uint64_t* heads[2], uint32_t* cnts[2], uint32_t* chains[2], uint32_t* buckets_used[2], uint32_t log_parts1, uint32_t log_parts2, uint32_t first_bit, cudaStream_t streams, size_t* offsets_GPU, uint32_t num_threads); 179 | 180 | void prepare_Relation_payload_triple (int* R, int* R_temp, int* R_final, int* P, int* P_temp, int* P_final, size_t RelsNum, uint32_t buckets_num, uint64_t* heads[2], uint32_t* cnts[2], uint32_t* chains[2], uint32_t* buckets_used[2], uint32_t log_parts1, uint32_t log_parts2, uint32_t first_bit, cudaStream_t streams, size_t* offsets_GPU, uint32_t num_threads); 181 | 182 | 183 | 184 | 185 | #endif -------------------------------------------------------------------------------- /src/main.cu: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial 
portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #include 23 | #include 24 | #include /*INT_MAX*/ 25 | #include 26 | 27 | #include "generator_ETHZ.cuh" 28 | #include "common.h" 29 | #include "common-host.h" 30 | 31 | unsigned int hashJoinClusteredProbe(args *inputAttrs, timingInfo *time); 32 | 33 | typedef struct joinAlg { 34 | char name[4]; 35 | unsigned int (*joinAlg)(args*, timingInfo*); 36 | } joinAlg; 37 | 38 | typedef struct inputArgs { 39 | short option = 0; 40 | joinAlg alg 41 | #ifndef __CUDACC__ 42 | = { "NLJ", nestedLoopsJoin }; // does not play well along --expt-relaxed-constexpr 43 | #else 44 | ; 45 | #endif 46 | uint64_t SelsNum = 0; 47 | uint64_t RelsNum = 0; 48 | int uniqueKeys = 1; 49 | int fullRange = 0; 50 | float skew = 0.0; 51 | int threadsNum = 32; 52 | // int selectivity = 1; 53 | int valuesPerThread = 2; 54 | int sharedMem = 30 << 10; 55 | unsigned int pivotsNum = 1; 56 | int one_to_many = 0; 57 | int RelsMultiplier = 1; 58 | int SelsMultiplier = 1; 59 | const char* R_filename = NULL; 60 | const char* S_filename = NULL; 61 | int fileInput = 0; 62 | } inputArgs; 63 | 64 | static joinAlg algs[] { {"HJC", hashJoinClusteredProbe} 65 | // {"HJ", hashIndexJoin} 66 | }; 67 | 68 | void usage_exit(int op) { 69 | if (op == 0) 70 | printf( 71 | "./benchmark -b \n"); 72 | exit(1); 73 | } 74 | 75 | void print_timing_join(args *input, timingInfo *time, joinAlg *alg); 76 | void parseInputArgs(int argc, char ** argv, inputArgs *input); 77 | int createSingleRelation_filename(inputArgs *input, args *attrs); 78 | void createSingleRelation_data(inputArgs *input, args *attrs, uint64_t bytes); 79 | 80 | int main(int argc, char **argv) { 81 | timingInfo time; 82 | inputArgs input; 83 | parseInputArgs(argc, argv, &input); 84 | 85 | int dev = 0; 86 | 87 | switch (input.option) { 88 | case 7: 89 | case 8: { 90 | //set up device 91 | cudaDeviceProp deviceProp; 92 | CHK_ERROR(cudaGetDeviceProperties(&deviceProp, dev)); 93 | CHK_ERROR(cudaSetDevice(dev)); 94 | 95 | int* Q_r = NULL; 96 | size_t Q_els_r = input.RelsNum; 97 | size_t Q_bytes_r = Q_els_r * sizeof(int); 98 | 99 | int* Q_s = NULL; 100 | size_t Q_els_s = input.SelsNum; 101 | size_t Q_bytes_s = Q_els_s * sizeof(int); 102 | 103 | if (input.SelsMultiplier > 1 || input.RelsMultiplier > 1) { 104 | input.SelsNum = input.SelsNum * input.SelsMultiplier; 105 | input.RelsNum = input.RelsNum * input.RelsMultiplier; 106 | 107 | Q_r = (int*) malloc(Q_bytes_r); 108 | Q_s = (int*) malloc(Q_bytes_s); 109 | } 110 | 111 | args joinArgs; 112 | joinArgs.S_els = input.SelsNum; 113 | joinArgs.R_els = input.RelsNum; 114 | uint64_t S_bytes = joinArgs.S_els * sizeof(int); 115 | uint64_t R_bytes = joinArgs.R_els * sizeof(int); 116 | 117 | 118 | /*fix filenames*/ 119 | if (input.fileInput) { 120 | 121 | } else if (input.fullRange) { 122 | int n = 0; 123 | if ((n = sprintf(joinArgs.S_filename, "fk_S%lu_pk_R%lu.bin", joinArgs.S_els, joinArgs.R_els)) >= 50) { 124 | fprintf(stderr, "ERROR: S_filename is %d characters long\n", n); 125 | return 1; 126 | } 127 | if ((n = 
sprintf(joinArgs.R_filename, "pk_R%lu.bin", joinArgs.R_els)) >= 50) { 128 | fprintf(stderr, "ERROR: R_filename is %d characters long\n", n); 129 | return 1; 130 | } 131 | 132 | } else if (input.uniqueKeys) { 133 | int n = 0; 134 | 135 | if ((n = sprintf(joinArgs.R_filename, "unique_%lu.bin", (input.RelsMultiplier > 1) ? Q_els_r : joinArgs.R_els)) >= 50) { 136 | fprintf(stderr, "ERROR: R_filename is %d characters long\n", n); 137 | return 1; 138 | } 139 | 140 | if (input.skew > 0) 141 | n = sprintf(joinArgs.S_filename, "unique_skew%.2f_S%lu.bin", joinArgs.S_els); 142 | else 143 | n = sprintf(joinArgs.S_filename, "unique_%lu.bin", (input.SelsMultiplier > 1) ? Q_els_s : joinArgs.S_els); 144 | 145 | if (n >= 50) { 146 | fprintf(stderr, "ERROR: S_filename is %d characters long\n", n); 147 | return 1; 148 | } 149 | } else { 150 | int n = 0; 151 | if ((n = sprintf(joinArgs.S_filename, "nonUnique_S%lu.bin", joinArgs.S_els)) >= 50) { 152 | fprintf(stderr, "ERROR: S_filename is %d characters long\n", n); 153 | return 1; 154 | } 155 | if ((n = sprintf(joinArgs.R_filename, "nonUnique_R%lu.bin", joinArgs.R_els)) >= 50) { 156 | fprintf(stderr, "ERROR: R_filename is %d characters long\n", n); 157 | return 1; 158 | } 159 | } 160 | 161 | /*create relations*/ 162 | #if defined(MEM_DEVICE) 163 | joinArgs.S = (int *) malloc(S_bytes); 164 | joinArgs.R = (int *) malloc(R_bytes); 165 | if (!joinArgs.S || !joinArgs.R) { 166 | fprintf(stderr, "Problem allocating space for the relations\n"); 167 | if (joinArgs.S) free(joinArgs.S); 168 | if (joinArgs.R) free(joinArgs.R); 169 | return 0; 170 | } 171 | #elif defined(MEM_S_DEVICE) 172 | joinArgs.S = (int *) malloc(S_bytes); 173 | if (!joinArgs.S) { 174 | fprintf(stderr, "Problem allocating space for the relations\n"); 175 | return 0; 176 | } 177 | CHK_ERROR(cudaHostAlloc((void** )&joinArgs.R, R_bytes, cudaHostAllocMapped)); 178 | #elif defined(MEM_MANAGED) 179 | CHK_ERROR(cudaMallocManaged((void** )&joinArgs.S, S_bytes)); 180 | CHK_ERROR(cudaMallocManaged((void** )&joinArgs.R, R_bytes)); 181 | #elif defined(MEM_HOST) 182 | CHK_ERROR(cudaHostAlloc((void** )&joinArgs.S, S_bytes, cudaHostAllocMapped)); 183 | CHK_ERROR(cudaHostAlloc((void** )&joinArgs.R, R_bytes, cudaHostAllocMapped)); 184 | #endif 185 | 186 | if (input.fileInput) { 187 | printf("Reading from files\n"); 188 | readFromFile(input.R_filename, joinArgs.R, joinArgs.R_els); 189 | readFromFile(input.S_filename, joinArgs.S, joinArgs.S_els); 190 | } else if (input.fullRange) { 191 | printf("Creating relation R with %lu tuples (%d MB) using non-unique keys and full range : ", 192 | joinArgs.R_els, R_bytes / 1024 / 1024); 193 | fflush(stdout); 194 | create_relation_nonunique(joinArgs.R_filename, joinArgs.R, joinArgs.R_els, INT_MAX); 195 | 196 | printf("Creating relation S with %lu tuples (%d MB) using non-unique keys and full range : ", 197 | joinArgs.S_els, S_bytes / 1024 / 1024); 198 | fflush(stdout); 199 | create_relation_fk_from_pk(joinArgs.S_filename, joinArgs.S, joinArgs.S_els, joinArgs.R, 200 | joinArgs.R_els); 201 | fflush(stdout); 202 | 203 | } else if (input.uniqueKeys) { 204 | printf("Creating relation R with %lu tuples (%d MB) using unique keys : ", joinArgs.R_els, 205 | R_bytes / 1024 / 1024); 206 | fflush(stdout); 207 | 208 | if (Q_r == NULL) { 209 | create_relation_unique(joinArgs.R_filename, joinArgs.R, joinArgs.R_els, joinArgs.R_els); 210 | } else { 211 | create_relation_unique(joinArgs.R_filename, Q_r, Q_els_r, Q_els_r); 212 | create_relation_n(Q_r, joinArgs.R, Q_els_r, input.RelsMultiplier); 213 | 
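/* Multiplier path (annotation, not original source): when RelsMultiplier > 1 the
   unique base relation is generated once into Q_r at its original, un-multiplied
   size, and create_relation_n then appears to expand it to the full joinArgs.R_els
   tuples; the same scheme is applied to S via Q_s below. */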
} 214 | 215 | if (Q_s == NULL) { 216 | if (input.skew > 0) { 217 | /* S is skewed */ 218 | printf("Creating relation S with %lu tuples (%d MB) using unique keys and skew %f : ", 219 | joinArgs.S_els, S_bytes / 1024 / 1024, input.skew); 220 | fflush(stdout); 221 | create_relation_zipf(joinArgs.S_filename, joinArgs.S, joinArgs.S_els, joinArgs.R_els, 222 | input.skew); 223 | } else { 224 | /* S is uniform foreign key */ 225 | printf("Creating relation S with %lu tuples (%d MB) using unique keys : ", joinArgs.S_els, 226 | S_bytes / 1024 / 1024); 227 | fflush(stdout); 228 | create_relation_unique(joinArgs.S_filename, joinArgs.S, joinArgs.S_els, joinArgs.R_els); 229 | } 230 | } else { 231 | if (input.skew > 0) { 232 | /* S is skewed */ 233 | printf("Creating relation S with %lu tuples (%d MB) using unique keys and skew %f : ", 234 | joinArgs.S_els, S_bytes / 1024 / 1024, input.skew); 235 | fflush(stdout); 236 | create_relation_zipf(joinArgs.S_filename, Q_s, Q_els_s, Q_els_s, input.skew); 237 | } else { 238 | /* S is uniform foreign key */ 239 | printf("Creating relation S with %lu tuples (%d MB) using unique keys : ", joinArgs.S_els, 240 | S_bytes / 1024 / 1024); 241 | fflush(stdout); 242 | create_relation_unique(joinArgs.S_filename, Q_s, Q_els_s, Q_els_s); 243 | } 244 | 245 | create_relation_n(Q_s, joinArgs.S, Q_els_s, input.SelsMultiplier); 246 | 247 | fflush(stdout); 248 | } 249 | 250 | fflush(stdout); 251 | } else { 252 | printf("Creating relation R with %lu tuples (%d MB) using non-unique keys : ", joinArgs.R_els, 253 | R_bytes / 1024 / 1024); 254 | fflush(stdout); 255 | create_relation_nonunique(joinArgs.R_filename, joinArgs.R, joinArgs.R_els, joinArgs.R_els/2); // |R|/2 to get on average 2entries/value 256 | 257 | printf("Creating relation S with %lu tuples (%d MB) using non-unique keys : ", joinArgs.S_els, 258 | S_bytes / 1024 / 1024); 259 | fflush(stdout); 260 | create_relation_nonunique(joinArgs.S_filename, joinArgs.S, joinArgs.S_els, joinArgs.R_els/2); // |R|/2 and not |S|/2 to get the same range 261 | fflush(stdout); 262 | } 263 | 264 | if (input.option == 7) { 265 | joinArgs.sharedMem = input.sharedMem; 266 | joinArgs.threadsNum = input.threadsNum; 267 | printf("%s : shareMemory = %ld\t#threads = %d\n", input.alg.name, joinArgs.sharedMem, 268 | joinArgs.threadsNum); 269 | fflush(stdout); 270 | 271 | #if defined(MEM_DEVICE) 272 | printf ("memory alloc done\n"); 273 | int *S_host = joinArgs.S; 274 | int *R_host = joinArgs.R; 275 | 276 | cudaDeviceSynchronize(); 277 | 278 | CHK_ERROR(cudaMalloc((int** )&joinArgs.S, S_bytes)); 279 | CHK_ERROR(cudaMalloc((int** )&joinArgs.R, R_bytes)); 280 | CHK_ERROR(cudaMemcpy(joinArgs.S, S_host, S_bytes, cudaMemcpyHostToDevice)); 281 | CHK_ERROR(cudaMemcpy(joinArgs.R, R_host, R_bytes, cudaMemcpyHostToDevice)); 282 | 283 | /*free(S_host); free(R_host);*/ 284 | #elif defined(MEM_S_DEVICE) 285 | int *S_host = joinArgs.S; 286 | CHK_ERROR(cudaMalloc((int** )&joinArgs.S, S_bytes)); 287 | CHK_ERROR(cudaMemcpy(joinArgs.S, S_host, S_bytes, cudaMemcpyHostToDevice)); 288 | free(S_host); 289 | #endif 290 | recordTime(&time.start[time.n - 1]); 291 | uint64_t joinsNum = input.alg.joinAlg(&joinArgs, &time); 292 | recordTime(&time.end[time.n - 1]); 293 | 294 | cudaDeviceReset(); 295 | #if defined(MEM_HOST) 296 | cudaFreeHost(joinArgs.S); 297 | cudaFreeHost(joinArgs.R); 298 | #else 299 | cudaFree(joinArgs.S); cudaFree(joinArgs.R); 300 | #endif 301 | } 302 | 303 | } 304 | 305 | break; 306 | default: 307 | usage_exit(0); 308 | break; 309 | } 310 | } 311 | 312 | 313 | 
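/* Hedged usage example (reconstructed from parseInputArgs further below; the
   concrete values are illustrative only):

       ./benchmark -b 7 -a HJC -R 134217728 -S 134217728 -t 512 -m 30720

   runs option 7 with the clustered-probe hash join ("HJC") on generated relations
   of |R| = |S| = 2^27 unique keys, threadsNum = 512 and sharedMem = 30720 bytes.
   Add -s <zipf factor> for a skewed S, --non-unique or --full-range to change key
   generation, or --file -k <R file> -l <S file> to read the relations from disk. */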
314 | int createSingleRelation_filename(inputArgs *input, args *attrs) { 315 | /*fix filename (no matter which relation store everything in S, name only needed to re-use a file)*/ 316 | int n = 0; 317 | if (input->fullRange) { 318 | if (input->SelsNum) 319 | n = sprintf(attrs->S_filename, "pk_S%lu.bin", attrs->S_els); 320 | else 321 | n = sprintf(attrs->S_filename, "pk_R%lu.bin", attrs->S_els); 322 | } else if (input->uniqueKeys) { 323 | if (input->SelsNum) 324 | n = sprintf(attrs->S_filename, "unique_S%lu.bin", attrs->S_els); 325 | else 326 | n = sprintf(attrs->S_filename, "unique_R%lu.bin", attrs->S_els); 327 | } else { 328 | if (input->SelsNum) 329 | n = sprintf(attrs->S_filename, "nonUnique_S%lu.bin", attrs->S_els); 330 | else 331 | n = sprintf(attrs->S_filename, "nonUnique_R%lu.bin", attrs->S_els); 332 | } 333 | 334 | if (n >= 50) { 335 | fprintf(stderr, "ERROR: filename is %d characters long\n", n); 336 | return 1; 337 | } 338 | 339 | return 0; 340 | } 341 | 342 | void createSingleRelation_data(inputArgs *input, args *attrs, uint64_t bytes) { 343 | if (input->fullRange) { 344 | printf("Creating relation with %lu tuples (%d MB) using non-unique keys and full range : ", 345 | attrs->S_els, bytes / 1024 / 1024); 346 | fflush(stdout); 347 | create_relation_nonunique(attrs->S_filename, attrs->S, attrs->S_els, INT_MAX); 348 | } else if (input->uniqueKeys) { 349 | printf("Creating relation with %lu tuples (%d MB) using unique keys : ", attrs->S_els, 350 | bytes / 1024 / 1024); 351 | fflush(stdout); 352 | create_relation_unique(attrs->S_filename, attrs->S, attrs->S_els, attrs->S_els); 353 | } else { 354 | printf("Creating relation with %lu tuples (%d MB) using non-unique keys : ", attrs->S_els, 355 | bytes / 1024 / 1024); 356 | fflush(stdout); 357 | create_relation_nonunique(attrs->S_filename, attrs->S, attrs->S_els, attrs->S_els); 358 | } 359 | printf("DONE\n"); 360 | fflush(stdout); 361 | } 362 | 363 | void printTimeInfo(uint64_t tuplesNum, time_st *start, time_st *end) { 364 | double diff_usec = (((*end).tv_sec * 1000000L + (*end).tv_usec) 365 | - ((*start).tv_sec * 1000000L + (*start).tv_usec)); 366 | 367 | double tuplesPerSec = tuplesNum / (diff_usec / 1000000.0); 368 | // printf("%10.3f\n", tuplesPerSec); 369 | 370 | printf("total tuples = %10lu time = %.3f msecs = %.3f secs\t", tuplesNum, diff_usec / 1000.0, 371 | diff_usec / 1000000.0); 372 | if (tuplesPerSec < 1024 / sizeof(int)) 373 | printf("throughput = %8.3lf B/sec\n", tuplesPerSec * sizeof(int)); 374 | else if (tuplesPerSec < 1024 * 1024 / sizeof(int)) 375 | printf("throughput = %8.3lf KB/sec\n", tuplesPerSec * sizeof(int) / 1024); 376 | else if (tuplesPerSec < 1024 * 1024 * 1024 / sizeof(int)) 377 | printf("throughput = %8.3lf MB/sec\n", ((tuplesPerSec / 1024) * sizeof(int)) / 1024); 378 | else 379 | printf("throughput = %8.3lf GB/sec\n", ((tuplesPerSec / 1024 / 1024) * sizeof(int)) / 1024); 380 | } 381 | 382 | 383 | void print_timing_join(args *input, timingInfo *time, joinAlg *alg) { 384 | unsigned int blocksNum = (input->R_els + input->threadsNum - 1) / input->threadsNum; 385 | unsigned int memElsNum = (input->sharedMem + sizeof(int) - 1) / sizeof(int); 386 | unsigned int shareMemoryBlocksNum = (input->S_els + memElsNum - 1) / memElsNum; 387 | uint64_t tuplesNum = input->S_els + input->R_els; //if alg==0 388 | 389 | if (strcmp(alg->name, algs[2].name) != 0 && strcmp(alg->name, algs[3].name) != 0 && strcmp(alg->name, algs[4].name) != 0) { 390 | 391 | #if defined(NLJ_SIMPLE) 392 | tuplesNum = input->R_els + 
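/* simple nested-loops accounting: |R| tuples read plus |R|*|S| probe comparisons */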
input->R_els*input->S_els; 393 | #elif defined(SHAREDMEM_LOOPIN) 394 | #if defined(MEM_S_DEVICE) 395 | tuplesNum = input->R_els + input->S_els; 396 | #else 397 | tuplesNum = input->R_els + input->S_els * blocksNum; 398 | #endif 399 | #endif 400 | 401 | } 402 | 403 | printf("blocksNum=%lu\tmemElsNum=%lu\tshareMemBlocksNum=%lu\ttuplesNum=%lu\n", blocksNum, memElsNum, 404 | shareMemoryBlocksNum, tuplesNum); 405 | 406 | if (strcmp(alg->name, algs[1].name) == 0) { 407 | /*SMJ*/ 408 | printf("SORT:\t"); 409 | printTimeInfo(tuplesNum, &(time->start[1]), &(time->end[1])); 410 | // } else if (strcmp(alg->name, algs[4].name) == 0) { 411 | // /*HJ*/ 412 | // printf("INDEX:\t"); 413 | // printTimeInfo(tuplesNum, &(time->start[1]), &(time->end[1])); 414 | } 415 | 416 | 417 | if (strcmp(alg->name, algs[2].name) == 0 || strcmp(alg->name, algs[3].name) == 0 || strcmp(alg->name, algs[4].name) == 0) { 418 | printf("BUILD:\t"); 419 | printTimeInfo(input->R_els, &(time->start[1]), &(time->end[1])); 420 | printf("PROBE:\t"); 421 | printTimeInfo(input->S_els, &(time->start[0]), &(time->end[0])); 422 | } else { 423 | printf("JOIN:\t"); 424 | printTimeInfo(tuplesNum, &(time->start[0]), &(time->end[0])); 425 | } 426 | 427 | printf("AGGR:\t"); 428 | printTimeInfo(tuplesNum, &(time->start[time->n - 2]), &(time->end[time->n - 2])); 429 | printf("TOTAL:\t"); 430 | printTimeInfo(tuplesNum, &(time->start[time->n - 1]), &(time->end[time->n - 1])); 431 | 432 | } 433 | 434 | void parseInputArgs(int argc, char ** argv, inputArgs *input) { 435 | /* flags */ 436 | int uniqueKeys_flag = input->uniqueKeys; 437 | int fullRange_flag = input->fullRange; 438 | int file_flag = input->fileInput; 439 | 440 | int c; 441 | int option_index = 0; 442 | 443 | printf("INPUT: "); 444 | 445 | static struct option long_options[] = { 446 | /*These options set a flag.*/ 447 | { "file", no_argument, &file_flag, 1 }, { "non-unique", no_argument, &uniqueKeys_flag, 0 }, { "full-range", no_argument, &fullRange_flag, 1 }, 448 | /* These options don't set a flag. We distinguish them by their indices. */ 449 | { "benchmark", required_argument, 0, 'b' }, { "alg", required_argument, 0, 'a' }, { "SelsNum", 450 | required_argument, 0, 'S' }, { "RelsNum", required_argument, 0, 'R' }, { "skew", 451 | required_argument, 0, 's' }, { "threadsNum", required_argument, 0, 't' }, { "values", 452 | required_argument, 0, 'v' }, { "memory", required_argument, 0, 'm' }, { "pivotsNum", 453 | required_argument, 0, 'p' }, { "OneToMany", required_argument, 0, 'w' }, { "XSelsMultiplier", 454 | required_argument, 0, 'x' }, { "YRelsMultiplier", required_argument, 0, 'y' }, { "R_filename", 455 | required_argument, 0, 'k' }, { "S_filename", required_argument, 0, 'l' }, { 0, 0, 0, 0 } }; 456 | 457 | while ((c = getopt_long(argc, argv, "b:a:S:R:s:t:v:m:p:x:y:k:l:", long_options, &option_index)) != -1) { 458 | switch (c) { 459 | case 0: 460 | printf("%s\t", long_options[option_index].name); 461 | /* If this option set a flag, do nothing else now. 
*/ 462 | if (long_options[option_index].flag != 0) break; 463 | if (optarg) printf(" with arg %s", optarg); 464 | printf("\n"); 465 | break; 466 | case 'b': 467 | input->option = atoi(optarg); 468 | printf("option = %d\t", input->option); 469 | break; 470 | case 'a': { 471 | int i = 0; 472 | while (algs[i].joinAlg) { 473 | if (strcmp(optarg, algs[i].name) == 0) { 474 | strcpy(input->alg.name, algs[i].name); 475 | input->alg.joinAlg = algs[i].joinAlg; 476 | break; 477 | } 478 | i++; 479 | } 480 | printf("joinAlg = %s\t", input->alg.name); 481 | } 482 | break; 483 | case 'k': 484 | input->R_filename = optarg; 485 | printf("R filename = %s\t", input->R_filename); 486 | break; 487 | case 'l': 488 | input->S_filename = optarg; 489 | printf("S filename = %s\t", input->S_filename); 490 | break; 491 | case 'S': { 492 | uint64_t p = atol(optarg); 493 | if (p > ULONG_MAX / sizeof(int)) { 494 | fprintf(stderr, 495 | "WARNING: SelsNun is too big (%lu). Setting SelsNum to maximum supported value %lu\n", 496 | p, ULONG_MAX / sizeof(int)); 497 | input->SelsNum = ULONG_MAX / sizeof(int); 498 | } else 499 | input->SelsNum = p; 500 | } 501 | printf("||S|| = %lu\t", input->SelsNum); 502 | break; 503 | case 'R': { 504 | uint64_t p = atol(optarg); 505 | if (p > ULONG_MAX / sizeof(int)) { 506 | fprintf(stderr, 507 | "WARNING: RelsNun is too big (%lu). Setting RelsNum to maximum supported value %lu\n", 508 | p, ULONG_MAX / sizeof(int)); 509 | input->RelsNum = ULONG_MAX / sizeof(int); 510 | } else 511 | input->RelsNum = p; 512 | } 513 | printf("||R|| = %lu\t", input->RelsNum); 514 | break; 515 | case 's': 516 | input->skew = atof(optarg); 517 | printf("skew = %f\t", input->skew); 518 | break; 519 | case 't': 520 | input->threadsNum = atoi(optarg); 521 | printf("#threads = %d\t", input->threadsNum); 522 | break; 523 | case 'v': 524 | input->valuesPerThread = atoi(optarg); 525 | printf("values per thread= %d\t", input->valuesPerThread); 526 | break; 527 | case 'm': 528 | input->sharedMem = atoi(optarg); // << 10; 529 | printf("sharedMem = %d\t", input->sharedMem); 530 | break; 531 | case 'p': 532 | input->pivotsNum = atoi(optarg); 533 | printf("pivotsNum = %d\t", input->pivotsNum); 534 | break; 535 | case 'w' : 536 | input->one_to_many = atoi(optarg); 537 | printf("OneToMany = %d\t", input->one_to_many); 538 | break; 539 | case 'x' : 540 | input->SelsMultiplier = atol(optarg); 541 | printf("SelsMultiplier = %d\t", input->SelsMultiplier); 542 | break; 543 | case 'y' : 544 | input->RelsMultiplier = atol(optarg); 545 | printf("RelsMultiplier = %d\t", input->RelsMultiplier); 546 | break; 547 | } 548 | } 549 | 550 | input->uniqueKeys = uniqueKeys_flag; 551 | input->fullRange = fullRange_flag; 552 | input->fileInput = file_flag; 553 | 554 | printf("\n"); 555 | 556 | if (input->option < 1 || (input->option > 9 && input->option < 100) || input->option > 101) usage_exit(0); 557 | } 558 | -------------------------------------------------------------------------------- /src/partition-primitives.cu: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 
| copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #include "partition-primitives.cuh" 23 | 24 | #define LOG_BATCH 8 25 | #define PARTS_CAPACITY 16 26 | 27 | /*CPU-side partitioning 28 | we assume that we already have histograms for M-way partitioning, where M is small like 16 (DBs keep statistics anyway so we do it in single pass) 29 | 30 | S=keys of relation S 31 | P=payload of relation S 32 | out_cnts=count of elements for each partition 33 | output_S=that's where we write partitioned keys 34 | output_P=that's where we write partitioned payloads 35 | cnt=total number of elements in relation 36 | log_parts=number of partitions, logarithmic 37 | first_bit=we shift right before taking bits for radix partitioning 38 | nthreads=the number of threads running this, it helps find out what each thread reads 39 | */ 40 | void partitions_host_omp_nontemporal_payload( 41 | const int32_t * __restrict__ S, 42 | const int32_t * __restrict__ P, 43 | size_t * __restrict__ out_cnts, 44 | int32_t * __restrict__ output_S, 45 | int32_t * __restrict__ output_P, 46 | const size_t cnt, 47 | const uint32_t log_parts, 48 | const uint32_t first_bit, 49 | const uint32_t threadIdx, 50 | const uint32_t nthreads) { 51 | const uint32_t parts = 1 << log_parts; 52 | const int32_t parts_mask = parts - 1; 53 | const int32_t bucket_mask = (1 << log2_bucket_size) - 1; 54 | 55 | size_t out_cnts_local[PARTS_CAPACITY]; 56 | for (int i = 0; i < parts; i++) { 57 | out_cnts_local[i] = out_cnts[threadIdx*OMP_MEMORY_STEP + i]; 58 | } 59 | 60 | /*software-managed caches, they have to be aligned for AVX2*/ 61 | int32_t* cache_S = (int32_t*) aligned_alloc(4096, parts*(1 << LOG_BATCH)*sizeof(int32_t)); 62 | int32_t* cache_P = (int32_t*) aligned_alloc(4096, parts*(1 << LOG_BATCH)*sizeof(int32_t)); 63 | 64 | uint32_t regptr[PARTS_CAPACITY]; 65 | for (int i = 0; i < parts; i++) 66 | regptr[i] = i << LOG_BATCH; 67 | 68 | for (size_t t = threadIdx; t < (cnt + OMP_MEMORY_STEP - 1)/OMP_MEMORY_STEP; t += nthreads) { 69 | const int32_t* chunk_S = S + t*OMP_MEMORY_STEP; 70 | const int32_t* chunk_P = P + t*OMP_MEMORY_STEP; 71 | 72 | int end = ((t+1)*OMP_MEMORY_STEP < cnt) ? 
73 | OMP_MEMORY_STEP : 74 | cnt - t*OMP_MEMORY_STEP; 75 | //#pragma loop unroll 76 | for (int i = 0; i < end; i++) { 77 | int32_t key = chunk_S[i]; 78 | int32_t payload = chunk_P[i]; 79 | uint32_t partition = (hasht(key) >> first_bit) & parts_mask; 80 | 81 | /*write element to cache*/ 82 | uint32_t offset = (regptr[partition])++; 83 | cache_S[offset] = key; 84 | cache_P[offset] = payload; 85 | 86 | /*cache for partition is full, flush it to memory 87 | do it with non-temporal writes in order to avoid reading the output locations first*/ 88 | if ((offset & ((1 << LOG_BATCH) - 1)) == ((1 << LOG_BATCH) - 1)) { 89 | for (int k = 0; k < (1 << (LOG_BATCH - 3)); k++) { 90 | __m256i data = *((__m256i*) &cache_S[(partition << LOG_BATCH) + k*8]); 91 | _mm256_stream_si256 ((__m256i*) &output_S[out_cnts_local[partition] + k*8], data); 92 | } 93 | 94 | for (int k = 0; k < (1 << (LOG_BATCH - 3)); k++) { 95 | __m256i data = *((__m256i*) &cache_P[(partition << LOG_BATCH) + k*8]); 96 | _mm256_stream_si256 ((__m256i*) &output_P[out_cnts_local[partition] + k*8], data); 97 | } 98 | 99 | out_cnts_local[partition] += (1 << LOG_BATCH); 100 | regptr[partition] = partition << LOG_BATCH; 101 | } 102 | } 103 | } 104 | 105 | /*flush half-full caches*/ 106 | for (int p = 0; p < parts; p++) { 107 | for (int k = 0; k < (1 << (LOG_BATCH - 3)); k++) { 108 | if (8*k < regptr[p] - (p << LOG_BATCH)) { 109 | __m256i data = *((__m256i*) &cache_S[(p << LOG_BATCH) + k*8]); 110 | _mm256_stream_si256 ((__m256i*) &output_S[out_cnts_local[p] + k*8], data); 111 | } 112 | } 113 | 114 | for (int k = 0; k < (1 << (LOG_BATCH - 3)); k++) { 115 | if (8*k < regptr[p] - (p << LOG_BATCH)) { 116 | __m256i data = *((__m256i*) &cache_P[(p << LOG_BATCH) + k*8]); 117 | _mm256_stream_si256 ((__m256i*) &output_P[out_cnts_local[p] + k*8], data); 118 | } 119 | } 120 | 121 | out_cnts_local[p] += regptr[p] - (p << LOG_BATCH); 122 | } 123 | 124 | #pragma omp barrier 125 | } 126 | 127 | /*compute the offsets at which each thread writes by doing the count and then doing prefix sum 128 | this is not part of runtime measurements*/ 129 | void partition_prepare_payload (int* R, int* P, size_t n, uint32_t log_parts, uint32_t first_bit, 130 | int* R_sock[2], int* out_sock[2], 131 | int* P_sock[2], int* pout_sock[2], 132 | size_t* out_offsets[2], size_t total[2], size_t* offsets_GPU, uint32_t num_threads) { 133 | uint32_t parts = (1 << log_parts); 134 | uint32_t parts_mask = parts - 1; 135 | 136 | #pragma omp parallel num_threads(num_threads) 137 | { 138 | uint32_t threadIdx = omp_get_thread_num(); 139 | uint32_t socket = sched_getcpu() % 2; 140 | 141 | for (size_t t = threadIdx; t < (n + OMP_MEMORY_STEP - 1)/OMP_MEMORY_STEP; t += num_threads) { 142 | int end = ((t+1)*OMP_MEMORY_STEP < n) ? 
143 | OMP_MEMORY_STEP : 144 | n - t*OMP_MEMORY_STEP; 145 | 146 | for (int i = 0; i < end; i++) { 147 | R_sock[socket][t*OMP_MEMORY_STEP + i] = R[t*OMP_MEMORY_STEP + i]; 148 | P_sock[socket][t*OMP_MEMORY_STEP + i] = P[t*OMP_MEMORY_STEP + i]; 149 | 150 | uint32_t partition = (hasht(R[t*OMP_MEMORY_STEP + i]) >> first_bit) & parts_mask; 151 | out_offsets[socket][partition + threadIdx*OMP_MEMORY_STEP] += 1; 152 | } 153 | } 154 | } 155 | 156 | size_t prefix1 = 0; 157 | 158 | for (int i = 0; i < parts; i++) { 159 | size_t base = prefix1; 160 | 161 | for (int j = 0; j < num_threads; j++) { 162 | size_t temp = out_offsets[0][i + j*OMP_MEMORY_STEP]; 163 | out_offsets[0][i + j*OMP_MEMORY_STEP] = prefix1; 164 | 165 | offsets_GPU[i*num_threads*4 + 2*j] = prefix1 - base; 166 | 167 | prefix1 += temp; 168 | 169 | offsets_GPU[i*num_threads*4 + 2*j + 1] = prefix1 - base; 170 | 171 | prefix1 = ((prefix1 + 31)/32)*32; 172 | } 173 | 174 | for (int j = 0; j < num_threads; j++) { 175 | size_t temp = out_offsets[1][i + j*OMP_MEMORY_STEP]; 176 | out_offsets[1][i + j*OMP_MEMORY_STEP] = prefix1; 177 | 178 | offsets_GPU[i*num_threads*4 + num_threads*2 + 2*j] = prefix1 - base; 179 | 180 | prefix1 += temp; 181 | 182 | offsets_GPU[i*num_threads*4 + num_threads*2 + 2*j + 1] = prefix1 - base; 183 | 184 | 185 | prefix1 = ((prefix1 + 31)/32)*32; 186 | } 187 | 188 | double fraction = ((double) (prefix1 - base))/n; 189 | } 190 | 191 | total[0] = prefix1; 192 | total[1] = prefix1; 193 | 194 | #pragma omp parallel num_threads(num_threads) 195 | { 196 | uint32_t threadIdx = omp_get_thread_num(); 197 | uint32_t socket = sched_getcpu() % 2; 198 | 199 | /*test run, I use it to warm up the memory (make sure it is allocated by the time I access it)*/ 200 | partitions_host_omp_nontemporal_payload(R_sock[socket], P_sock[socket], out_offsets[socket], out_sock[socket], pout_sock[socket], n, log_parts, first_bit, threadIdx, num_threads); 201 | #pragma omp barrier 202 | } 203 | 204 | 205 | double t1 = cpuSeconds(); 206 | 207 | #pragma omp parallel num_threads(num_threads) 208 | { 209 | uint32_t threadIdx = omp_get_thread_num(); 210 | uint32_t socket = sched_getcpu() % 2; 211 | 212 | partitions_host_omp_nontemporal_payload(R_sock[socket], P_sock[socket], out_offsets[socket], out_sock[socket], pout_sock[socket], n, log_parts, first_bit, threadIdx, num_threads); 213 | #pragma omp barrier 214 | } 215 | 216 | double t2 = cpuSeconds(); 217 | 218 | printf("bw %f MB/s\n", (n * sizeof(int)) / 1000000 / (t2 - t1)); 219 | 220 | } 221 | 222 | /*this function handles the multithreaded partitioning*/ 223 | void partition_do_payload (int* R_sock[2], int* out_sock[2], int* P_sock[2], int* pout_sock[2], size_t* out_offsets[2], size_t n, uint32_t log_parts, uint32_t first_bit, uint32_t num_threads) { 224 | #pragma omp parallel num_threads(num_threads) 225 | { 226 | uint32_t threadIdx = omp_get_thread_num(); 227 | uint32_t socket = sched_getcpu() % 2; 228 | 229 | partitions_host_omp_nontemporal_payload(R_sock[socket], P_sock[socket], out_offsets[socket], out_sock[socket], pout_sock[socket], n, log_parts, first_bit, threadIdx, num_threads); 230 | #pragma omp barrier 231 | } 232 | } 233 | 234 | /*this function handles the multithreaded numa copy (useful for staging between sockets before transfer). 
I use only some thread to avoid eating away bandwidth from PCIe*/ 235 | void numa_copy_multithread (int* __restrict__ dest, int* __restrict__ src, int n) { 236 | #pragma omp parallel num_threads(OMP_PARALLELISM2) 237 | { 238 | uint32_t threadIdx = omp_get_thread_num() % (OMP_PARALLELISM2/2); 239 | uint32_t socket = sched_getcpu() % 2; 240 | 241 | if (socket == 1) 242 | for (size_t t = threadIdx; t < (n + OMP_MEMORY_STEP - 1)/OMP_MEMORY_STEP; t += OMP_PARALLELISM2/2) { 243 | int end = ((t+1)*OMP_MEMORY_STEP < n) ? 244 | OMP_MEMORY_STEP : 245 | n - t*OMP_MEMORY_STEP; 246 | 247 | for (int i = 0; i < end; i += 8) { 248 | __m256i data = _mm256_load_si256 ((__m256i*) &src[t*OMP_MEMORY_STEP+i]); 249 | _mm256_stream_si256 ((__m256i*) &dest[t*OMP_MEMORY_STEP+i], data); 250 | } 251 | } 252 | } 253 | } 254 | 255 | 256 | /*functions used to find which partitions to batch together*/ 257 | 258 | 259 | void sort (int* key, int* val, int n) { 260 | if (n <= 1) 261 | return; 262 | 263 | int k = 1; 264 | int pivot = key[0]; 265 | 266 | for (int i = 1; i < n; i++) { 267 | if (key[i] >= pivot) { 268 | int temp = key[i]; 269 | key[i] = key[k]; 270 | key[k] = temp; 271 | k++; 272 | } 273 | } 274 | 275 | key[0] = key[k-1]; 276 | key[k-1] = pivot; 277 | 278 | sort (key, val, k-1); 279 | sort (key + k, val + k, n-k); 280 | } 281 | 282 | void shuffle (std::list& chosen, int* weight_global, int maxw, std::list& output) { 283 | int n = chosen.size(); 284 | int cnt = 0; 285 | int totalw = 0; 286 | 287 | int* alias = new int[n]; 288 | int* weight = new int[n]; 289 | 290 | for (std::list::iterator it = chosen.begin(); it != chosen.end(); ++it) { 291 | alias[cnt] = *it; 292 | weight[cnt] = weight_global[*it]; 293 | totalw = totalw + weight[cnt]; 294 | cnt++; 295 | } 296 | 297 | sort (weight, alias, n); 298 | 299 | for (int i = 0; i < n; i++) 300 | output.push_back(alias[i]); 301 | 302 | delete[] alias; 303 | delete[] weight; 304 | } 305 | 306 | 307 | void knapSack (std::list& candidates, int* weight_global, double* gain_global, std::list& output, std::list& remainder) { 308 | int n = candidates.size(); 309 | int w = PARTS_RESIDENT+1; 310 | int cnt = 0; 311 | 312 | int* weight = new int[n]; 313 | double* gain = new double[n]; 314 | int* alias = new int[n]; 315 | 316 | for (std::list::iterator it = candidates.begin(); it != candidates.end(); ++it) { 317 | alias[cnt] = *it; 318 | gain[cnt] = gain_global[*it]; 319 | weight[cnt] = weight_global[*it]; 320 | cnt++; 321 | } 322 | 323 | 324 | double** matrix = new double*[n+1]; 325 | 326 | for (int i = 0; i < n+1; i++) { 327 | matrix[i] = new double[w+1]; 328 | 329 | for (int j = 0; j < w+1; j++) 330 | matrix[i][j] = 0.0; 331 | } 332 | 333 | for (int i = 0; i < n+1; i++) { 334 | int wt = (i > 0)? weight[i-1] : 0; 335 | double g = (i > 0)? gain[i-1] : 0.0; 336 | 337 | for (int j = 0; j < w+1; j++) { 338 | if (i == 0 || j == 0) 339 | matrix[i][j] = 0.0; 340 | else if (wt <= j) 341 | matrix[i][j] = (matrix[i-1][j] + 0.000001 < matrix[i-1][j-wt] + g)? 
matrix[i-1][j-wt] + g : matrix[i-1][j]; 342 | else 343 | matrix[i][j] = matrix[i-1][j]; 344 | } 345 | } 346 | 347 | int t = PARTS_RESIDENT; 348 | int m = n; 349 | std::list pr_output; 350 | 351 | while (t > 0 && m != 0) { 352 | for (int i = m; i > 0; i--) 353 | if (matrix[i][t] > matrix[i-1][t] + 0.000001) { 354 | pr_output.push_back(alias[i-1]); 355 | t -= weight[i-1]; 356 | m = i-1; 357 | break; 358 | } else { 359 | remainder.push_back(alias[i-1]); 360 | } 361 | } 362 | 363 | shuffle (pr_output, weight_global, PARTS_RESIDENT, output); 364 | 365 | for (int i = m; i > 0; i--) { 366 | remainder.push_back(alias[i-1]); 367 | } 368 | 369 | delete[] weight; 370 | delete[] gain; 371 | delete[] alias; 372 | 373 | for (int i = 0; i < n+1; i++) 374 | delete[] matrix[i]; 375 | 376 | delete[] matrix; 377 | } 378 | 379 | #include 380 | 381 | void groupOptimal (double* gain, int n, std::list >& output) { 382 | std::list candidates; 383 | int* weight = new int[n]; 384 | 385 | for (int i = 0; i < n; i++) { 386 | candidates.push_back(i); 387 | weight[i] = ceil(gain[i]); 388 | } 389 | 390 | while (candidates.empty() == false) { 391 | std::list out; 392 | std::list remainder; 393 | 394 | knapSack (candidates, weight, gain, out, remainder); 395 | 396 | output.push_back(out); 397 | 398 | candidates = remainder; 399 | } 400 | 401 | 402 | delete[] weight; 403 | } 404 | 405 | void groupOptimal2 (double* gain, int n, std::list >& output) { 406 | std::list candidates; 407 | std::list* buckets = new std::list [2]; 408 | int* weight = new int[n]; 409 | 410 | for (int i = 0; i < n; i++) { 411 | candidates.push_back(i); 412 | weight[i] = ceil(gain[i]); 413 | } 414 | 415 | std::list out; 416 | std::list remainder; 417 | knapSack (candidates, weight, gain, out, remainder); 418 | 419 | output.push_back(out); 420 | 421 | for (std::list::iterator it = remainder.begin(); it != remainder.end(); ++it) { 422 | buckets[weight[*it] - 1].push_back(*it); 423 | } 424 | 425 | std::list > out2; 426 | for (std::list::iterator it = buckets[1].begin(); it != buckets[1].end(); ++it) { 427 | std::list new_out; 428 | new_out.push_back(*it); 429 | out2.push_back(new_out); 430 | } 431 | 432 | for (std::list >::iterator it = out2.begin(); it != out2.end(); ++it) { 433 | for (int i = 0; i < 3; i++) { 434 | int next = buckets[0].front(); 435 | (*it).push_back(next); 436 | buckets[0].pop_front(); 437 | 438 | if (buckets[0].empty()) 439 | break; 440 | } 441 | 442 | output.push_back(*it); 443 | 444 | if (buckets[0].empty()) { 445 | ++it; 446 | 447 | while (it != out2.end()) { 448 | output.push_back(*it); 449 | ++it; 450 | } 451 | 452 | break; 453 | } 454 | } 455 | 456 | while (buckets[0].empty() == false) { 457 | std::list last; 458 | 459 | while (last.size() < PARTS_RESIDENT && buckets[0].empty() == false) { 460 | int next = buckets[0].front(); 461 | last.push_back(next); 462 | buckets[0].pop_front(); 463 | } 464 | 465 | output.push_back(last); 466 | } 467 | 468 | delete[] weight; 469 | } -------------------------------------------------------------------------------- /src/partition-primitives.cuh: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 
7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #ifndef PARTITION_PRIMITIVES_HPP_ 23 | #define PARTITION_PRIMITIVES_HPP_ 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include 31 | 32 | #include 33 | #include 34 | 35 | #include "common.h" 36 | #include "common-host.h" 37 | 38 | #define OMP_PARALLELISM1 16 39 | #define OMP_PARALLELISM2 16 40 | #define OMP_MEMORY_STEP 4096 41 | #define LOG_PARTS_OUTER 4 42 | #define PARTS_RESIDENT 5 43 | 44 | void partitions_host_omp_nontemporal_payload( 45 | const int32_t * __restrict__ S, 46 | const int32_t * __restrict__ P, 47 | size_t * __restrict__ out_cnts, 48 | int32_t * __restrict__ output_S, 49 | int32_t * __restrict__ output_P, 50 | const size_t cnt, 51 | const uint32_t log_parts, 52 | const uint32_t first_bit, 53 | const uint32_t threadIdx, 54 | const uint32_t nthreads); 55 | 56 | void partition_prepare_payload (int* R, int* P, size_t n, uint32_t log_parts, uint32_t first_bit, 57 | int* R_sock[2], int* out_sock[2], 58 | int* P_sock[2], int* pout_sock[2], 59 | size_t* out_offsets[2], size_t total[2], size_t* offsets_GPU, uint32_t num_threads); 60 | 61 | void partition_do_payload (int* R_sock[2], int* out_sock[2], int* P_sock[2], int* pout_sock[2], size_t* out_offsets[2], size_t n, uint32_t log_parts, uint32_t first_bit, uint32_t num_threads); 62 | 63 | void numa_copy_multithread (int* __restrict__ dest, int* __restrict__ src, int n); 64 | 65 | void sort (int* key, int* val, int n); 66 | 67 | void shuffle (std::list& chosen, int* weight_global, int maxw, std::list& output); 68 | 69 | void knapSack (std::list& candidates, int* weight_global, double* gain_global, std::list& output, std::list& remainder); 70 | 71 | void groupOptimal (double* gain, int n, std::list >& output); 72 | 73 | void groupOptimal2 (double* gain, int n, std::list >& output); 74 | 75 | #endif --------------------------------------------------------------------------------
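
The software-managed cache in partitions_host_omp_nontemporal_payload interleaves three things: computing the radix partition of each tuple, buffering tuples per partition, and flushing full buffers with AVX2 streaming stores. The stripped-down sketch below shows the same control flow for a single thread and a key column only. It is an illustration, not code from this repository: the hash is a hypothetical multiplicative stand-in for the project's hasht(), the batch size is assumed, and plain memcpy replaces the non-temporal _mm256_stream_si256 writes.

#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <vector>

// Hypothetical stand-in for the project's hasht(); any reasonable 32-bit mixer would do.
static inline uint32_t mix_hash(int32_t k) { return ((uint32_t) k) * 2654435761u; }

// Minimal single-threaded radix partitioning with a small per-partition staging buffer.
// out_off[p] must already hold the first output slot of partition p (produced by the
// histogram + prefix-sum pass); it is advanced as batches are flushed.
void partition_sketch(const int32_t* keys, size_t cnt,
                      uint32_t log_parts, uint32_t first_bit,
                      int32_t* out, size_t* out_off) {
    const uint32_t parts = 1u << log_parts;
    const uint32_t mask  = parts - 1;
    const int      batch = 64;                      // assumed; the real code uses 1 << LOG_BATCH

    std::vector<int32_t> cache(parts * batch);      // one staging lane per partition
    std::vector<int>     fill(parts, 0);            // write cursor inside each lane

    for (size_t i = 0; i < cnt; i++) {
        int32_t  key = keys[i];
        uint32_t p   = (mix_hash(key) >> first_bit) & mask;  // radix bits above first_bit pick the partition

        cache[p * batch + fill[p]++] = key;

        if (fill[p] == batch) {                     // lane full: flush one contiguous batch;
            std::memcpy(&out[out_off[p]],           // the real kernel streams it with
                        &cache[p * batch],          // _mm256_stream_si256 to avoid reading
                        batch * sizeof(int32_t));   // the output locations first
            out_off[p] += batch;
            fill[p] = 0;
        }
    }

    for (uint32_t p = 0; p < parts; p++) {          // flush the half-full lanes at the end
        std::memcpy(&out[out_off[p]], &cache[p * batch], fill[p] * sizeof(int32_t));
        out_off[p] += fill[p];
    }
}

Buffering a full batch before writing is what makes the streaming stores in the real kernel effective: every flush touches one contiguous, aligned run in a single partition instead of scattering individual tuples across all partition outputs.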
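
Before that kernel can run, partition_prepare_payload has to decide where each thread writes: it counts, per thread and per partition, how many tuples the thread will emit, then takes an exclusive prefix sum over (partition, socket, thread), rounding every thread's region up to a multiple of 32 elements so that each region starts on a vector-friendly boundary. The following is a compact, single-socket sketch of that counting-to-offsets step; it is an assumption-laden simplification that drops the two-socket split and the offsets_GPU bookkeeping.

#include <cstddef>
#include <cstdint>
#include <vector>

// Exclusive prefix sum over per-thread, per-partition counts, padded to 32 elements,
// mirroring the layout the partitioning pass expects.
// counts[t][p] = number of tuples thread t will emit into partition p (from a first pass).
std::vector<std::vector<size_t>> compute_offsets(const std::vector<std::vector<size_t>>& counts,
                                                 uint32_t parts, uint32_t num_threads) {
    std::vector<std::vector<size_t>> offsets(num_threads, std::vector<size_t>(parts, 0));
    size_t prefix = 0;

    for (uint32_t p = 0; p < parts; p++) {            // partitions are laid out one after another
        for (uint32_t t = 0; t < num_threads; t++) {  // threads get disjoint ranges inside a partition
            offsets[t][p] = prefix;                   // first slot thread t writes for partition p
            prefix += counts[t][p];
            prefix = ((prefix + 31) / 32) * 32;       // pad to 32 ints, as in the original prefix loop
        }
    }
    return offsets;                                   // prefix now also gives the padded total size
}

Because the offsets are computed up front, the partitioning pass is write-only with respect to the output arrays, which is what permits the non-temporal stores and what the bandwidth figure printed at the end of partition_prepare_payload measures.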
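
The helpers at the end of partition-primitives.cu (sort, shuffle, knapSack, groupOptimal, groupOptimal2) decide which outer partitions are batched together: each partition gets an integer weight (the ceiling of its gain), knapSack packs a group whose total weight stays within PARTS_RESIDENT using the classic 0/1 dynamic program, and groupOptimal keeps re-running it on the leftovers until every partition belongs to a group. A hedged usage sketch follows; it assumes the std::list template arguments that the listing above lost to extraction are int and std::list<int>, which is what the function bodies imply, and the gain values are made up for illustration.

#include <cstdio>
#include <list>

#include "partition-primitives.cuh"        // groupOptimal(double*, int, std::list<std::list<int>>&), LOG_PARTS_OUTER

int main() {
    const int parts = 1 << LOG_PARTS_OUTER;   // 16 outer partitions in this configuration
    double gain[parts];
    for (int i = 0; i < parts; i++)
        gain[i] = 1.0 + 0.2 * (i % 5);         // illustrative per-partition gains; ceil() of these becomes the weight

    std::list<std::list<int>> groups;
    groupOptimal(gain, parts, groups);         // pack partitions into groups that respect the residency budget

    int g = 0;
    for (const std::list<int>& group : groups) {
        printf("group %d:", g++);
        for (int p : group) printf(" %d", p);  // these partitions are processed together
        printf("\n");
    }
    return 0;
}

Each emitted group has total weight at most PARTS_RESIDENT, the budget of partitions that can be kept resident at once, so the groups can be processed one after another without exceeding that limit.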