├── LICENSE ├── Makefile ├── README.md └── src ├── common-host.cpp ├── common-host.h ├── common.cu ├── common.h ├── generator_ETHZ.cu ├── generator_ETHZ.cuh ├── hash_join_clustered_probe.cu ├── join-primitives.cu ├── join-primitives.cuh ├── main.cu ├── partition-primitives.cu └── partition-primitives.cuh /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 4 | Ecole Polytechnique Federale de Lausanne 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXX=nvcc 2 | 3 | ARCH=sm_61 4 | #ARCH=sm_20 5 | 6 | #Use CXXFLAGS=_DENABLE_NVPROF in command line to compile selection with nvprof 7 | #CXXFLAGS+=-g 8 | #CXXFLAGS+=-G 9 | #CXXFLAGS+=-Xptxas 10 | #CXXFLAGS+=-v 11 | # CXXFLAGS+= -O3 -lineinfo -Xcompiler -fopenmp -std=c++11 12 | # CXXFLAGS+= -lineinfo -Xcompiler -fopenmp -std=c++11 --ptxas-options=-v,-preserve-relocs 13 | 14 | DEBUGFLAGS+= -g -G 15 | RELEASEFLAGS+= 16 | # RELEASEFLAGS+= -DNDEBUG 17 | 18 | CUDA_INSTALL_PATH?=/usr/local/cuda 19 | 20 | LDLIBS=-lgomp -lnuma 21 | 22 | INCLUDE_PATH =-I. 
-Icub 23 | 24 | # CXXFLAGS+= -lnuma 25 | CXXFLAGS+= -O3 -arch=$(ARCH) -lineinfo --std=c++11 26 | # -D_FORCE_INLINES -D_MWAITXINTRIN_H_INCLUDED 27 | # CXXFLAGS+= -DNTESTMEMCPY 28 | # CXXFLAGS+= --maxrregcount=32 29 | CXXFLAGS+= -lineinfo -rdc=true 30 | CXXFLAGS+= --default-stream per-thread --expt-relaxed-constexpr 31 | CXXFLAGS+= --compiler-options='-O3 -fopenmp -mavx2 -mbmi2' 32 | #-Wall -Wunsafe-loop-optimizations 33 | 34 | # PROFFLAGS+= -L/usr/local/cuda/lib64 -lnvToolsExt 35 | 36 | CXXFLAGS+= $(INCLUDE_PATH) 37 | 38 | DBG_DIR=debug 39 | RLS_DIR=release 40 | 41 | BIN_ROOT=bin 42 | OBJ_ROOT=obj 43 | SRC_ROOT=src 44 | DEP_ROOT=.depend 45 | 46 | BIN_DBG=$(BIN_ROOT)/$(DBG_DIR)/ 47 | BIN_RLS=$(BIN_ROOT)/$(RLS_DIR)/ 48 | 49 | OBJ_DBG=$(OBJ_ROOT)/$(DBG_DIR)/ 50 | OBJ_RLS=$(OBJ_ROOT)/$(RLS_DIR)/ 51 | 52 | DEP_DBG=$(DEP_ROOT)/$(DBG_DIR)/ 53 | DEP_RLS=$(DEP_ROOT)/$(RLS_DIR)/ 54 | 55 | SED_ODD=$(subst /,\/,$(OBJ_DBG)) 56 | SED_ORD=$(subst /,\/,$(OBJ_RLS)) 57 | 58 | SED_DDD=$(subst /,\/,$(DEP_DBG)) 59 | SED_DRD=$(subst /,\/,$(DEP_RLS)) 60 | 61 | EXCLUDE_SOURCES+= src/exclude_me.cu 62 | EXCLUDE_SOURCES+= src/cub/% 63 | 64 | CXX_SOURCESD= $(shell find $(SRC_ROOT) -name "*.cpp") 65 | CUDA_SOURCESD= $(shell find $(SRC_ROOT) -name "*.cu") 66 | CXX_SOURCESD:= $(filter-out $(EXCLUDE_SOURCES),$(CXX_SOURCESD)) 67 | CUDA_SOURCESD:= $(filter-out $(EXCLUDE_SOURCES),$(CUDA_SOURCESD)) 68 | CXX_SOURCES= $(subst $(SRC_ROOT)/,,$(CXX_SOURCESD)) 69 | CUDA_SOURCES= $(subst $(SRC_ROOT)/,,$(CUDA_SOURCESD)) 70 | CXX_OBJECTS= $(CXX_SOURCES:.cpp=.o) 71 | CUDA_OBJECTS= $(CUDA_SOURCES:.cu=.o) 72 | 73 | OBJ_FILES:=$(addprefix $(OBJ_DBG), $(CXX_OBJECTS)) $(addprefix $(OBJ_RLS), $(CXX_OBJECTS)) $(addprefix $(OBJ_DBG), $(CUDA_OBJECTS)) $(addprefix $(OBJ_RLS), $(CUDA_OBJECTS)) 74 | 75 | # .DEFAULT_GOAL := release 76 | all: debug release 77 | 78 | debug:CXXFLAGS+= $(DEBUGFLAGS) $(PROFFLAGS) 79 | release:CXXFLAGS+= $(OPTFLAGS) $(PROFFLAGS) 80 | 81 | release:BIN_DIR:= $(BIN_RLS) 82 | release:IMP_DIR:= $(RLS_DIR) 83 | release:OBJ_DIR:= $(OBJ_RLS) 84 | # release:CXX_OBJ_D:= $(addprefix $(OBJ_RLS), $(CXX_OBJECTS)) $(addprefix $(OBJ_DBG), $(CUDA_OBJECTS)) 85 | 86 | debug:BIN_DIR:= $(BIN_DBG) 87 | debug:IMP_DIR:= $(DBG_DIR) 88 | debug:OBJ_DIR:= $(OBJ_DBG) 89 | # debug:CXX_OBJ_D:= $(addprefix $(OBJ_DBG), $(CXX_OBJECTS)) $(addprefix $(OBJ_DBG), $(CUDA_OBJECTS)) 90 | 91 | -include $(addprefix $(DEP_DBG), $(CUDA_SOURCES:.cu=.d)) 92 | -include $(addprefix $(DEP_RLS), $(CUDA_SOURCES:.cu=.d)) 93 | -include $(addprefix $(DEP_DBG), $(CXX_SOURCES:.cpp=.d)) 94 | -include $(addprefix $(DEP_RLS), $(CXX_SOURCES:.cpp=.d)) 95 | 96 | $(BIN_RLS)bench:$(addprefix $(OBJ_RLS), $(CXX_OBJECTS)) $(addprefix $(OBJ_RLS), $(CUDA_OBJECTS)) 97 | $(BIN_DBG)bench:$(addprefix $(OBJ_DBG), $(CXX_OBJECTS)) $(addprefix $(OBJ_DBG), $(CUDA_OBJECTS)) 98 | 99 | release: $(BIN_RLS)bench 100 | debug: $(BIN_DBG)bench 101 | 102 | .PHONY: all debug release 103 | 104 | space= 105 | #do not remove these lines!!! needed!!!
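# (The empty 'space=' assignment above together with the 'space+=' below is the usual GNU Make trick for building a variable that holds a single space: '+=' joins the old, empty value and the appended, empty text with one blank. That space is then used in the $(subst $(space),:,...) calls further down to turn the directory lists into colon-separated vpath search paths.)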
106 | space+= 107 | 108 | vpath %.o $(subst $(space),:,$(dir $(OBJ_FILES))) 109 | vpath %.cu $(subst $(space),:,$(dir $(CXX_SOURCESD))) 110 | vpath %.cpp $(subst $(space),:,$(dir $(CUDA_SOURCESD))) 111 | 112 | $(sort $(subst //,/,$(dir $(OBJ_FILES)))): 113 | mkdir -p $@ 114 | 115 | %.o: 116 | $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $(filter $(subst $(OBJ_DIR),$(SRC_ROOT)/,$(@:.o=.cu)),$(CUDA_SOURCESD)) $(filter $(subst $(OBJ_DIR),$(SRC_ROOT)/,$(@:.o=.cpp)),$(CXX_SOURCESD)) -o $@ 117 | 118 | %bench: 119 | $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(LDLIBS) -o $@ $^ 120 | 121 | clean: 122 | -rm -r $(OBJ_ROOT) $(BIN_ROOT) $(DEP_ROOT) 123 | mkdir -p $(BIN_DBG) $(BIN_RLS) $(OBJ_DBG) $(OBJ_RLS) $(DEP_DBG) $(DEP_RLS) 124 | 125 | $(DEP_DBG)%.d: %.cu Makefile 126 | @mkdir -p $(@D) 127 | $(CXX) -E -Xcompiler "-isystem $(CUDA_INSTALL_PATH)/include -MM" $(CPPFLAGS) $(CXXFLAGS) $< | sed -r 's/^(\S+).(\S+):/$(SED_ODD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(<:.cu=.o))) $(SED_DDD)$(subst /,\/,$(<:.cu=.d)): \\\n Makefile \\\n/g' | sed -r 's/(\w)\s+(\w)/\1 \\\n \2/g' | sed '$$s/$$/\\\n | $(SED_ODD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(dir $<)))/g' | sed -r 's/(\w)+\/\.\.\///g' | awk '!x[$$0]++' > $@ 128 | 129 | $(DEP_RLS)%.d: %.cu Makefile 130 | @mkdir -p $(@D) 131 | $(CXX) -E -Xcompiler "-isystem $(CUDA_INSTALL_PATH)/include -MM" $(CPPFLAGS) $(CXXFLAGS) $< | sed -r 's/^(\S+).(\S+):/$(SED_ORD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(<:.cu=.o))) $(SED_DRD)$(subst /,\/,$(<:.cu=.d)): \\\n Makefile \\\n/g' | sed -r 's/(\w)\s+(\w)/\1 \\\n \2/g' | sed '$$s/$$/\\\n | $(SED_ORD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(dir $<)))/g' | sed -r 's/(\w)+\/\.\.\///g' | awk '!x[$$0]++' > $@ 132 | 133 | $(DEP_DBG)%.d: %.cpp Makefile 134 | @mkdir -p $(@D) 135 | $(CXX) -E -Xcompiler "-isystem $(CUDA_INSTALL_PATH)/include -MM" $(CPPFLAGS) $(CXXFLAGS) $< | sed -r 's/^(\S+).(\S+):/$(SED_ODD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(<:.cpp=.o))) $(SED_DDD)$(subst /,\/,$(<:.cpp=.d)): \\\n Makefile \\\n/g' | sed -r 's/(\w)\s+(\w)/\1 \\\n \2/g' | sed '$$s/$$/\\\n | $(SED_ODD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(dir $<)))/g' | sed -r 's/(\w)+\/\.\.\///g' | awk '!x[$$0]++' > $@ 136 | 137 | $(DEP_RLS)%.d: %.cpp Makefile 138 | @mkdir -p $(@D) 139 | $(CXX) -E -Xcompiler "-isystem $(CUDA_INSTALL_PATH)/include -MM" $(CPPFLAGS) $(CXXFLAGS) $< | sed -r 's/^(\S+).(\S+):/$(SED_ORD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(<:.cpp=.o))) $(SED_DRD)$(subst /,\/,$(<:.cpp=.d)): \\\n Makefile \\\n/g' | sed -r 's/(\w)\s+(\w)/\1 \\\n \2/g' | sed '$$s/$$/\\\n | $(SED_ORD)$(subst /,\/,$(subst $(SRC_ROOT)/,,$(dir $<)))/g' | sed -r 's/(\w)+\/\.\.\///g' | awk '!x[$$0]++' > $@ 140 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Code repository for the paper "Hardware-conscious Hash-Joins on GPUs" presented in ICDE 2019 2 | 3 | The publicly available version is a work-in-progress. Soon we will be adding more detailed documentation, 4 | tips for configurations and tuning, a better interface and some inline explanations for some design choices. 
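Building is driven by the Makefile in the repository root. As a rough sketch of the intended flow (assuming CUDA under `/usr/local/cuda` or `CUDA_INSTALL_PATH`, the CUB headers reachable through the `-Icub` include path, and OpenMP plus `libnuma` available for linking): `make release` compiles the sources under `src/` with `nvcc` for `ARCH=sm_61` (adjust `ARCH` in the Makefile for other GPUs) and links the benchmark driver to `bin/release/bench`, while `make debug` produces a `-g -G` build under `bin/debug/`.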
5 | -------------------------------------------------------------------------------- /src/common-host.cpp: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #include "common-host.h" 23 | 24 | #include 25 | 26 | double cpuSeconds() { 27 | struct timeval tp; 28 | gettimeofday(&tp, NULL); 29 | return ((double)tp.tv_sec + (double)tp.tv_usec * 1.e-6); 30 | } 31 | 32 | void initializeSeq(int *in, size_t size) { 33 | for(int i = 0 ; i < size; i++) { 34 | in[i] = i; 35 | } 36 | } 37 | 38 | void initializeUniform(int *in, size_t size) { 39 | //srand (time(NULL)); 40 | //We want the input to be the same for every test 41 | //BUT: If seed is set to 1, the generator is reinitialized to 42 | //its initial value and produces the same values 43 | //as before any call to rand or srand. 44 | // srand (1); 45 | // for(int i = 0 ; i < size; i++) { 46 | // in[i] = rand() % size; 47 | // } 48 | 49 | for (int i = 0; i < size; i++) { 50 | in[i] = rand() % size; 51 | } 52 | } 53 | 54 | void initializeUniform(int *in, size_t size, int seed) { 55 | // //srand (time(NULL)); 56 | // //We want the input to be the same for every test 57 | // //BUT: If seed is set to 1, the generator is reinitialized to 58 | // //its initial value and produces the same values 59 | // //as before any call to rand or srand. 
60 | // srand(seed + 10); 61 | // for(int i = 0 ; i < size; i++) { 62 | // in[i] = rand() % size; 63 | // } 64 | 65 | struct random_data* rand_states; 66 | char* rand_statebufs; 67 | int nthreads = 1; 68 | int bufferSize = 32; 69 | rand_states = (struct random_data*) calloc(nthreads, 70 | sizeof(struct random_data)); 71 | rand_statebufs = (char*) calloc(nthreads, bufferSize); 72 | 73 | /* for each 'thread', initialize a PRNG (the seed is the first argument) */ 74 | //initstate_r(random(), &rand_statebufs[t], PRNG_BUFSZ, &rand_states[t]); 75 | initstate_r(seed + 10, &rand_statebufs[0], bufferSize, &rand_states[0]); 76 | int state1; 77 | 78 | for (int i = 0; i < size; i++) { 79 | random_r(&rand_states[0], &state1); 80 | in[i] = state1 % size; 81 | } 82 | 83 | free(rand_states); 84 | free(rand_statebufs); 85 | } 86 | 87 | void initializeUniform(int *in, size_t size, int maxNo, int seed) { 88 | //srand (time(NULL)); 89 | //We want the input to be the same for every test 90 | //BUT: If seed is set to 1, the generator is reinitialized to 91 | //its initial value and produces the same values 92 | //as before any call to rand or srand. 93 | // srand(seed + 10); 94 | // for(int i = 0 ; i < size; i++) { 95 | // in[i] = rand() % maxNo; 96 | // } 97 | 98 | struct random_data* rand_states; 99 | char* rand_statebufs; 100 | int nthreads = 1; 101 | int bufferSize = 32; 102 | rand_states = (struct random_data*) calloc(nthreads, 103 | sizeof(struct random_data)); 104 | rand_statebufs = (char*) calloc(nthreads, bufferSize); 105 | 106 | /* for each 'thread', initialize a PRNG (the seed is the first argument) */ 107 | //initstate_r(random(), &rand_statebufs[t], PRNG_BUFSZ, &rand_states[t]); 108 | initstate_r(seed + 10, &rand_statebufs[0], bufferSize, &rand_states[0]); 109 | int state1; 110 | for (int i = 0; i < size; i++) { 111 | random_r(&rand_states[0], &state1); 112 | in[i] = state1 % maxNo; 113 | } 114 | 115 | free(rand_states); 116 | free(rand_statebufs); 117 | } 118 | 119 | void initializeZero(int *in, size_t size) { 120 | for(int i = 0 ; i < size; i++) { 121 | in[i] = 0; 122 | } 123 | } 124 | 125 | int NumberOfSetBits(int i) //uint32_t 126 | { 127 | // Java: use >>> instead of >> 128 | // C or C++: use uint32_t 129 | i = i - ((i >> 1) & 0x55555555); 130 | i = (i & 0x33333333) + ((i >> 2) & 0x33333333); 131 | return (((i + (i >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24; 132 | } 133 | -------------------------------------------------------------------------------- /src/common-host.h: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #ifndef COMMON_HOST_H_ 23 | #define COMMON_HOST_H_ 24 | 25 | #include 26 | #include 27 | #include 28 | 29 | #define PAGESIZE 65568 //16384 * sizeof(int) + 32 // 30 | //#define PAGESIZE 16416 //16384 - 32 31 | //#define PAGESIZE 4096 32 | 33 | #include 34 | 35 | //nsight complains about cstdint (?) 36 | typedef unsigned int uint32_t; 37 | typedef unsigned long int uint64_t; 38 | 39 | typedef struct args { 40 | int *S; 41 | size_t S_els; 42 | char S_filename[50]; 43 | int *R; 44 | size_t R_els; 45 | char R_filename[50]; 46 | int threadsNum; 47 | // int blocksNum_max; 48 | // int valuesPerThread; 49 | unsigned int sharedMem; 50 | unsigned int pivotsNum; 51 | 52 | } args; 53 | 54 | /* Timing */ 55 | double cpuSeconds(); 56 | 57 | /* Benchmarking */ 58 | void initializeSeq(int *in, size_t size); 59 | void initializeUniform(int *in, size_t size); 60 | void initializeUniform(int *in, size_t size, int seed); 61 | void initializeUniform(int *in, size_t size, int maxNo, int seed); 62 | void initializeZero(int *in, size_t size); 63 | 64 | /* Bitmap Ops */ 65 | int NumberOfSetBits(int i); 66 | 67 | class time_block{ 68 | private: 69 | std::chrono::time_point start; 70 | std::string text ; 71 | public: 72 | inline time_block(std::string text = ""): 73 | text(text), start(std::chrono::system_clock::now()){} 74 | 75 | inline ~time_block(){ 76 | auto end = std::chrono::system_clock::now(); 77 | std::cout << text; 78 | std::cout << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 79 | } 80 | }; 81 | 82 | #endif /* COMMON_HOST_H_ */ 83 | -------------------------------------------------------------------------------- /src/common.cu: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #include "common.h" 23 | 24 | #include 25 | #include 26 | #include /* gettimeofday */ 27 | #include 28 | 29 | __constant__ unsigned int valuesToProcess; 30 | __device__ maxSize_type sum_dev; 31 | 32 | 33 | void recordTime(time_st *t) { 34 | gettimeofday(t, NULL); 35 | } 36 | 37 | unsigned int smallestGreaterPowerOf2(const unsigned int num) { 38 | unsigned int x = (UINT_MAX >> 1) + 1; //the greatest possible power of 2 39 | while (!(x & num)) 40 | x >>= 1; 41 | if (x ^ num) return x << 1; 42 | return x; /*size is already a power of 2*/ 43 | 44 | } 45 | 46 | unsigned int greatestLowerPowerOf2(const unsigned int num) { 47 | unsigned int x = (UINT_MAX >> 1) + 1; //the greatest possible power of 2 48 | while (!(x & num)) 49 | x >>= 1; 50 | if (x ^ num) return x; 51 | return x >> 1; /*size is already a power of 2*/ 52 | } 53 | 54 | void initialise_float(float *A, int N) { 55 | int i; 56 | for (i = 0; i < N; i++) 57 | A[i] = (float) (rand() & 0xff) / 10.0f; 58 | } 59 | 60 | void initialise_int(int *A, const int N) { 61 | int i; 62 | for (i = 0; i < N; i++) 63 | A[i] = rand() % 100 + 1; 64 | } 65 | 66 | void printArray_int(int *A, const size_t N) { 67 | if (N > (1 << 10)) return; 68 | int i; 69 | for (i = 0; i < N; i++) { 70 | printf("%5d", A[i]); 71 | if ((i + 1) % 35 == 0) printf("\n"); 72 | } 73 | printf("\n"); 74 | } 75 | 76 | void printArray_uint(unsigned int *A, const size_t N) { 77 | if (N > (1 << 22)) return; 78 | int i; 79 | for (i = 0; i < N; i++) { 80 | printf("%6u", A[i]); 81 | if ((i + 1) % 35 == 0) printf("\n"); 82 | } 83 | printf("\n"); 84 | } 85 | 86 | void printArray_maxSize_type(maxSize_type *A, const maxSize_type N) { 87 | if (N > (1 << 10)) return; 88 | int i; 89 | for (i = 0; i < N; i++) { 90 | printf("%5lu", A[i]); 91 | if ((i + 1) % 35 == 0) printf("\n"); 92 | } 93 | printf("\n"); 94 | } 95 | 96 | void printArray_char(char *A, const maxSize_type N) { 97 | if (N > (1 << 22)) return; 98 | maxSize_type i; 99 | for (i = 0; i < N; i++) { 100 | printf("%6d", A[i]); 101 | if ((i + 1) % 30 == 0) printf("\n"); 102 | } 103 | printf("\n"); 104 | } 105 | 106 | __global__ void copy(data_type *dataTO, data_type *dataFROM, const maxSize_type size) { 107 | maxSize_type gidx = blockIdx.x*blockDim.x + threadIdx.x; 108 | if(gidx >= size) return; 109 | dataTO[gidx] = dataFROM[gidx]; 110 | } 111 | 112 | __global__ void copy(maxSize_type *dataTO, maxSize_type *dataFROM, const maxSize_type size) { 113 | maxSize_type gidx = blockIdx.x*blockDim.x + threadIdx.x; 114 | if(gidx >= size) return; 115 | dataTO[gidx] = dataFROM[gidx]; 116 | } 117 | 118 | __global__ void scatter(data_type *dataIN, data_type *dataOUT, const maxSize_type size, maxSize_type *pos) { 119 | uint64_t gidx = blockIdx.x*blockDim.x + threadIdx.x; 120 | if(gidx >= size) return; 121 | // if(pos[gidx] > 1010) { 122 | // printf("(%d,%d) : data[%lu] <- %d\n", blockIdx.x, threadIdx.x, pos[gidx], dataIN[gidx]); 123 | // } 124 | dataOUT[pos[gidx]] = dataIN[gidx]; 125 | } 126 | 127 | /*Processes at most threadsNum elements of type uint64_t*/ 128 | __device__ void prefixSum_after(maxSize_type *data, const unsigned int size, maxSize_type *total) { 129 | unsigned int idx = threadIdx.x, idx_f, idx_s; 130 | 131 | /*iterate until the final result is computed */ 
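/* (Overview of the scan below, based on the code as written: this routine computes an exclusive suffix sum in place. The up-sweep loop folds pairs into the lower index (data[idx_f] += data[idx_s]), so when it finishes data[0] holds the sum of all 'size' elements; that value is returned through *total, data[0] is reset to the identity 0, and the down-sweep redistributes the partial sums. On exit data[i] holds the sum of the original data[i+1..size-1], e.g. {a,b,c,d} becomes {b+c+d, c+d, d, 0}. prefixSum_before further down is the mirrored variant, indexing from size-1, and yields the usual exclusive prefix sum {0, a, a+b, a+b+c}.) */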
132 | unsigned int stride = 1; 133 | for (stride = 1; stride < size; stride <<= 1) { 134 | idx_f = stride * (2 * idx); 135 | idx_s = stride * (2 * idx + 1); 136 | 137 | if (idx_s < size) data[idx_f] += data[idx_s]; 138 | 139 | /*wait for all the threads in the block to finish before going to the next iteration*/ 140 | __syncthreads(); 141 | } 142 | 143 | *total = data[0]; //all threads get the result; 144 | __syncthreads(); 145 | 146 | /*store the final results*/ 147 | if (threadIdx.x == 0) { 148 | data[0] = 0; 149 | } 150 | __syncthreads(); 151 | 152 | /*now go the other direction*/ 153 | for (stride >>= 1; stride > 0; stride >>= 1) { 154 | idx_f = stride * (2 * idx); 155 | idx_s = stride * (2 * idx + 1); 156 | 157 | if (idx_s < size) { 158 | int tmp = data[idx_s]; 159 | data[idx_s] = data[idx_f]; 160 | data[idx_f] = tmp + data[idx_s]; 161 | } 162 | __syncthreads(); 163 | } 164 | } 165 | 166 | /*Processes at most threadsNum elements of type maxSize_type*/ 167 | __device__ void prefixSum_before(maxSize_type *data, const unsigned int size, maxSize_type *total) { 168 | unsigned int idx = threadIdx.x; 169 | unsigned int idx_f, idx_s; 170 | 171 | 172 | unsigned int stride; 173 | 174 | /*iterate until the final result is computed */ 175 | for (stride = 1; stride < size; stride <<= 1) { 176 | idx_f = stride * (2 * idx); 177 | idx_s = stride * (2 * idx + 1); 178 | 179 | if (idx_s < size) data[size - 1 - idx_f] += data[size - 1 - idx_s]; 180 | 181 | /*wait for all the threads in the block to finish before going to the next iteration*/ 182 | __syncthreads(); 183 | } 184 | 185 | *total = data[size-1]; 186 | 187 | __syncthreads(); 188 | 189 | /*store the final results*/ 190 | if (threadIdx.x == 0) { 191 | data[size - 1] = 0; 192 | // printf("*total = %lu\n", *total); 193 | } 194 | 195 | __syncthreads(); 196 | 197 | /*now go the other direction*/ 198 | for (stride >>= 1; stride > 0; stride >>= 1) { 199 | idx_f = stride * (2 * idx); 200 | idx_s = stride * (2 * idx + 1); 201 | 202 | if (idx_s < size) { 203 | maxSize_type tmp = data[size - 1 - idx_s]; 204 | data[size - 1 - idx_s] = data[size - 1 - idx_f]; 205 | data[size - 1 - idx_f] = tmp + data[size - 1 - idx_s]; 206 | } 207 | __syncthreads(); 208 | } 209 | } 210 | 211 | /*Processes at most threadsNum elements of type data_type*/ 212 | __device__ void prefixSum_before(data_type *const data, const unsigned int size, data_type *total) { 213 | unsigned int idx = threadIdx.x; 214 | unsigned int idx_f, idx_s; 215 | 216 | 217 | unsigned int stride; 218 | 219 | /*iterate until the final result is computed */ 220 | for (stride = 1; stride < size; stride <<= 1) { 221 | idx_f = stride * (2 * idx); 222 | idx_s = stride * (2 * idx + 1); 223 | 224 | if (idx_s < size) data[size - 1 - idx_f] += data[size - 1 - idx_s]; 225 | 226 | /*wait for all the threads in the block to finish before going to the next iteration*/ 227 | __syncthreads(); 228 | } 229 | 230 | *total = data[size-1]; 231 | 232 | __syncthreads(); 233 | 234 | /*store the final results*/ 235 | if (threadIdx.x == 0) { 236 | data[size - 1] = 0; 237 | // printf("*total = %lu\n", *total); 238 | } 239 | 240 | __syncthreads(); 241 | 242 | /*now go the other direction*/ 243 | for (stride >>= 1; stride > 0; stride >>= 1) { 244 | idx_f = stride * (2 * idx); 245 | idx_s = stride * (2 * idx + 1); 246 | 247 | if (idx_s < size) { 248 | maxSize_type tmp = data[size - 1 - idx_s]; 249 | data[size - 1 - idx_s] = data[size - 1 - idx_f]; 250 | data[size - 1 - idx_f] = tmp + data[size - 1 - idx_s]; 251 | } 252 | 
__syncthreads(); 253 | } 254 | } 255 | 256 | __device__ void prefixSum_before_multiple(data_type *const data, const unsigned int size, data_type *total, unsigned int num) { 257 | unsigned int idx = threadIdx.x; 258 | unsigned int idx_f, idx_s; 259 | unsigned int i; 260 | 261 | unsigned int stride; 262 | 263 | for (i = 0; i < num; i++) { 264 | data_type *data_local = data + i * size; 265 | // printf("%d/%d (%d,%d) : data_local [%p, %p]\n", i, num, blockIdx.x, threadIdx.x, data_local, data_local+size); 266 | 267 | /*iterate until the final result is computed */ 268 | for (stride = 1; stride < size; stride <<= 1) { 269 | idx_f = stride * (2 * idx); 270 | idx_s = stride * (2 * idx + 1); 271 | 272 | if (idx_s < size) { 273 | // if(data_local + size - 1 - idx_f == addr) 274 | // printf("%d/%d U (%d,%d) -> data_local[%d] (%p) = %d, data_local[%d] (%p) = %d\n", i, num, blockIdx.x, threadIdx.x, size - 1 - idx_f, data_local + size - 1 - idx_f, 275 | // data_local[size - 1 - idx_f], size - 1 - idx_s, data_local + size - 1 - idx_s, data_local[size - 1 - idx_s]); 276 | data_local[size - 1 - idx_f] += data_local[size - 1 - idx_s]; 277 | // if(data_local + size - 1 - idx_f == addr) printf("%d/%d U (%d,%d) -> data_local[%d] = %d\n", i, num, blockIdx.x, threadIdx.x, size - 1 - idx_f, data_local[size - 1 - idx_f]); 278 | } 279 | 280 | /*wait for all the threads in the block to finish before going to the next iteration*/ 281 | __syncthreads(); 282 | } 283 | 284 | total[i] = data_local[size - 1]; 285 | // printf("%u : (%d,%d) -> total[%d] = %d\n", size, blockIdx.x, threadIdx.x, i, total[i]); 286 | 287 | __syncthreads(); 288 | // } 289 | 290 | // /*store the final results (assumes more threads than num)*/ 291 | // if (threadIdx.x < num) { 292 | // (data+threadIdx.x*size)[size - 1] = 0; 293 | //// printf("(%d,%d) : total[%d] = %d\n", blockIdx.x, threadIdx.x, threadIdx.x, total[threadIdx.x]); 294 | // } 295 | // 296 | // __syncthreads(); 297 | 298 | // for (i = 0; i < num; i++) { 299 | // data_type *data_local = data + i * size; 300 | // if(blockIdx.x == 0) printf("%d/%d\n", i,num); 301 | 302 | if (threadIdx.x == 0) data_local[size - 1] = 0; 303 | __syncthreads(); 304 | 305 | // printf("%p\n", addr); 306 | // if(data_local + threadIdx.x == addr) 307 | // printf("%d/%d D0 (%d,%d) -> (%p) data_local[%d] = %d\n", i, num, blockIdx.x, threadIdx.x, data_local + threadIdx.x, threadIdx.x, data_local[threadIdx.x]); 308 | 309 | /*now go the other direction*/ 310 | for (stride >>= 1; stride > 0; stride >>= 1) { 311 | idx_f = stride * (2 * idx); 312 | idx_s = stride * (2 * idx + 1); 313 | 314 | // if(blockIdx.x == 0) printf("%d/%d D0 (%d,%d) -> data_local[%d], data_local[%d]\n", i, num, blockIdx.x, threadIdx.x, size - 1 - idx_f, size - 1 - idx_s); 315 | 316 | if (idx_s < size) { 317 | // if(data_local + size - 1 - idx_f == addr) 318 | // printf("%d/%d D0 (%d,%d) -> (%p) data_local[%d] = %d, (%p) data_local[%d] = %d\n", i, num, blockIdx.x, threadIdx.x, data_local + size - 1 - idx_f, size - 1 - idx_f, 319 | // data_local[size - 1 - idx_f], data_local + size - 1 - idx_s, size - 1 - idx_s, data_local[size - 1 - idx_s]); 320 | // data_type tmps = data_local[size - 1 - idx_s]; 321 | // data_type tmpf = data_local[size - 1 - idx_f]; 322 | data_type tmp = data_local[size - 1 - idx_s]; 323 | // if (data_local + size - 1 - idx_f == addr) 324 | // printf("%d/%d D1 (%d,%d) -> data_local[%d] = %d, data_local[%d] = %d, %d\n", i, num, blockIdx.x, threadIdx.x, size - 1 - idx_f, data_local[size - 1 - idx_f], size - 1 - idx_s, 325 | // 
data_local[size - 1 - idx_s], tmp); 326 | data_local[size - 1 - idx_s] = data_local[size - 1 - idx_f]; 327 | // data_local[size - 1 - idx_f] = tmpf + tmps; 328 | // if (data_local + size - 1 - idx_f == addr) 329 | // printf("%d/%d D2 (%d,%d) -> data_local[%d] = %d, data_local[%d] = %d, %d\n", i, num, blockIdx.x, threadIdx.x, size - 1 - idx_f, data_local[size - 1 - idx_f], size - 1 - idx_s, 330 | // data_local[size - 1 - idx_s], tmp); 331 | data_local[size - 1 - idx_f] = tmp + data_local[size - 1 - idx_s]; 332 | // data_local[size - 1 - idx_s] = tmpf; 333 | // if (data_local + size - 1 - idx_f == addr) 334 | // printf("%d/%d D (%d,%d) -> data_local[%d] = %d, data_local[%d] = %d, %d\n", i, num, blockIdx.x, threadIdx.x, size - 1 - idx_f, data_local[size - 1 - idx_f], size - 1 - idx_s, 335 | // data_local[size - 1 - idx_s], tmp); 336 | } 337 | __syncthreads(); 338 | } 339 | 340 | __syncthreads(); 341 | } 342 | } 343 | 344 | /*Processes at most threadsNum elements of type uint64_t. Write the total to the sum_dev*/ 345 | __device__ void prefixSum_before_device(maxSize_type *data, const unsigned int size) { 346 | unsigned int idx = threadIdx.x; 347 | unsigned int idx_f, idx_s; 348 | unsigned int stride; 349 | 350 | // if(threadIdx.x == 0) printf("(%d,%d) : %u\n", blockIdx.x, threadIdx.x, size); 351 | 352 | // printf("B: %u data[%u] = %lu (%u,%u)\n", blockIdx.x, idx, data[idx], stride, size); 353 | 354 | /*iterate until the final result is computed */ 355 | for (stride = 1; stride < size; stride <<= 1) { 356 | idx_f = stride * (2 * idx); 357 | idx_s = stride * (2 * idx + 1); 358 | 359 | // if (idx_s < size) printf("%u (%d,%d) : %u, %u\n", stride, blockIdx.x, threadIdx.x, size - 1 - idx_f, size - 1 - idx_s); 360 | 361 | if (idx_s < size) data[size - 1 - idx_f] += data[size - 1 - idx_s]; 362 | 363 | /*wait for all the threads in the block to finish before going to the next iteration*/ 364 | __syncthreads(); 365 | } 366 | 367 | // printf("A: %u data[%u] = %lu\n", blockIdx.x, idx, data[idx]); 368 | 369 | /*store the final results*/ 370 | if (threadIdx.x == 0) { 371 | atomicAdd(&sum_dev, data[size-1]); 372 | data[size - 1] = 0; 373 | } 374 | 375 | __syncthreads(); 376 | 377 | /*now go the other direction*/ 378 | for (stride >>= 1; stride > 0; stride >>= 1) { 379 | idx_f = stride * (2 * idx); 380 | idx_s = stride * (2 * idx + 1); 381 | 382 | if (idx_s < size) { 383 | maxSize_type tmp = data[size - 1 - idx_s]; 384 | data[size - 1 - idx_s] = data[size - 1 - idx_f]; 385 | data[size - 1 - idx_f] = tmp + data[size - 1 - idx_s]; 386 | } 387 | __syncthreads(); 388 | } 389 | } 390 | 391 | __device__ void prefixSum_before_multipleSeq(data_type *const data, const unsigned int size, data_type *const borders, const unsigned int bordersNum, data_type *totalPerSeq) { 392 | /*set the ids and the data according to the ranges*/ 393 | unsigned int lidx = threadIdx.x; 394 | data_type range = data[lidx]; 395 | data_type border = borders[range]; 396 | 397 | lidx -= border; 398 | 399 | data_type *data_local = data+border; 400 | data_type *totalPerSeq_local = totalPerSeq + range; 401 | 402 | unsigned int size_local = 0; 403 | if(range == 0) size_local = borders[range+1]; 404 | else if(range == bordersNum-1) size_local = size-border; 405 | else size_local = borders[range+1]-border; 406 | 407 | if(threadIdx.x < bordersNum) 408 | totalPerSeq[threadIdx.x] = 0; //initialise 409 | 410 | __syncthreads(); 411 | 412 | // printf("(%d,%d) : range = %d, border = %d -> idx = %d, size = %d, data = %d\n", blockIdx.x, threadIdx.x, range, 
border, lidx, size_local, data_local[lidx]); 413 | /*continue with prefixSum as usual*/ 414 | unsigned int idx_f, idx_s; 415 | unsigned int stride; 416 | 417 | /*iterate until the final result is computed */ 418 | for (stride = 1; stride < size_local; stride <<= 1) { 419 | idx_f = stride * (2 * lidx); 420 | idx_s = stride * (2 * lidx + 1); 421 | 422 | if (idx_s < size_local) data_local[size_local - 1 - idx_f] += data_local[size_local - 1 - idx_s]; 423 | 424 | /*wait for all the threads in the block to finish before going to the next iteration*/ 425 | __syncthreads(); 426 | } 427 | 428 | *totalPerSeq_local = data_local[size_local-1]; 429 | // printf("(%d,%d) : total = %d (%d)\n", blockIdx.x, threadIdx.x, *totalPerSeq_local, range); 430 | 431 | __syncthreads(); 432 | 433 | /*store the final results*/ 434 | if (lidx == 0) { 435 | data_local[size_local - 1] = 0; 436 | 437 | } 438 | 439 | __syncthreads(); 440 | 441 | /*now go the other direction*/ 442 | for (stride >>= 1; stride > 0; stride >>= 1) { 443 | idx_f = stride * (2 * lidx); 444 | idx_s = stride * (2 * lidx + 1); 445 | 446 | if (idx_s < size_local) { 447 | maxSize_type tmp = data_local[size_local - 1 - idx_s]; 448 | data_local[size_local - 1 - idx_s] = data_local[size_local - 1 - idx_f]; 449 | data_local[size_local - 1 - idx_f] = tmp + data_local[size_local - 1 - idx_s]; 450 | } 451 | __syncthreads(); 452 | } 453 | 454 | // printf("(%d,%d) : data = %d\n", blockIdx.x, threadIdx.x, data_local[lidx]); 455 | } 456 | 457 | 458 | __global__ void prefixSum_before(maxSize_type *data, const maxSize_type size, maxSize_type *total) { 459 | if(size <= blockDim.x*blockIdx.x) return; /*nothing for this block*/ 460 | 461 | maxSize_type gidx = blockIdx.x*blockDim.x+threadIdx.x; 462 | maxSize_type blockTotal; 463 | 464 | /*write data to shared memory*/ 465 | if(gidx < size) uint64_shared[threadIdx.x] = data[gidx]; 466 | 467 | __syncthreads(); /*wait for all threads to finish writing in shared memory*/ 468 | 469 | unsigned int blockSize = blockDim.x; 470 | if(blockSize > size - blockDim.x*blockIdx.x) blockSize = size - blockDim.x*blockIdx.x; 471 | 472 | prefixSum_before(uint64_shared, blockSize, &blockTotal); 473 | 474 | /*write the results back from shared memory*/ 475 | if(gidx < size) data[gidx] = uint64_shared[threadIdx.x]; 476 | 477 | /*all threads in the block get the blockTotal but only one writes it in the output*/ 478 | if(threadIdx.x == 0) *(total+blockIdx.x) = blockTotal; 479 | } 480 | 481 | /*there is no total per block, there is only the sum of all totals of all blocks 482 | * If there is only one block then the sum of all totals is the total sum of the block. 483 | * The kernel does not return the total. 
it just computes it and stores it in case another 484 | * kernel wants to use it later*/ 485 | __global__ void prefixSum_before_device(maxSize_type *data, const maxSize_type size) { 486 | 487 | if(size <= blockDim.x*blockIdx.x) return; /*nothing for this block*/ 488 | 489 | maxSize_type gidx = blockIdx.x*blockDim.x+threadIdx.x; 490 | 491 | if(threadIdx.x == 0) sum_dev = 0; //initialise the sum to clean from any old result 492 | 493 | /*write data to shared memory*/ 494 | if(gidx < size) uint64_shared[threadIdx.x] = data[gidx]; 495 | 496 | __syncthreads(); /*wait for all threads to finish writing in shared memory*/ 497 | 498 | unsigned int blockSize = blockDim.x; 499 | if(blockSize > size - blockDim.x*blockIdx.x) blockSize = size - blockDim.x*blockIdx.x; 500 | 501 | prefixSum_before_device(uint64_shared, blockSize); 502 | 503 | /*write the results back from shared memory*/ 504 | if(gidx < size) data[gidx] = uint64_shared[threadIdx.x]; 505 | 506 | // if(threadIdx.x == 0) printf("%d : blockSum = %lu\n", blockIdx.x, sum_dev); 507 | } 508 | 509 | __device__ void sum(maxSize_type *data, const unsigned int size, maxSize_type *res) { 510 | /*iterate until the final result is computed */ 511 | maxSize_type stride = 1; 512 | for (stride = 1; stride < size; stride <<= 1) { 513 | maxSize_type idx_f = stride * (2 * threadIdx.x); 514 | maxSize_type idx_s = stride * (2 * threadIdx.x + 1); 515 | 516 | // if(idx_f < size) 517 | // printf("(%d, %d) : data[%lu] = %lu\n", blockIdx.x, threadIdx.x, idx_f, data[idx_f]); 518 | // if(idx_s < size) 519 | // printf("(%d, %d) : data[%lu] = %lu\n", blockIdx.x, threadIdx.x, idx_s, data[idx_s]); 520 | 521 | if (idx_s < size) data[idx_f] += data[idx_s]; 522 | 523 | /*wait for all the threads in the block to finish before going to the next iteration*/ 524 | __syncthreads(); 525 | } 526 | 527 | /*store the final results*/ 528 | *res = data[0]; 529 | // if (threadIdx.x == 0) printf("(%d,%d): %u %lu - %lu\n", blockIdx.x, threadIdx.x, size, *res, data[0]); 530 | } 531 | 532 | __device__ void sum_device(maxSize_type *data, const unsigned int size) { 533 | /*iterate until the final result is computed */ 534 | maxSize_type stride = 1; 535 | for (stride = 1; stride < size; stride <<= 1) { 536 | maxSize_type idx_f = stride * (2 * threadIdx.x); 537 | maxSize_type idx_s = stride * (2 * threadIdx.x + 1); 538 | 539 | // if(idx_f < size) 540 | // printf("(%d, %d) : data[%lu] = %lu\n", blockIdx.x, threadIdx.x, idx_f, data[idx_f]); 541 | // if(idx_s < size) 542 | // printf("(%d, %d) : data[%lu] = %lu\n", blockIdx.x, threadIdx.x, idx_s, data[idx_s]); 543 | 544 | if (idx_s < size) data[idx_f] += data[idx_s]; 545 | 546 | /*wait for all the threads in the block to finish before going to the next iteration*/ 547 | __syncthreads(); 548 | } 549 | 550 | /*store the final results*/ 551 | if(threadIdx.x == 0) sum_dev = data[0]; 552 | // if (threadIdx.x == 0) printf("(%d,%d): %u %lu - %lu\n", blockIdx.x, threadIdx.x, size, *res, data[0]); 553 | } 554 | 555 | __global__ void sum(maxSize_type *data, const maxSize_type size, maxSize_type *res) { 556 | if(size <= blockDim.x*blockIdx.x) return; /*nothing for this block*/ 557 | 558 | maxSize_type gidx = blockIdx.x*blockDim.x + threadIdx.x; 559 | 560 | maxSize_type total; 561 | 562 | /*write data to shared memory*/ 563 | if(gidx < size) uint64_shared[threadIdx.x] = data[gidx]; 564 | 565 | __syncthreads(); 566 | 567 | unsigned int blockSize = blockDim.x; 568 | if (blockSize > size - blockDim.x * blockIdx.x) blockSize = size - blockDim.x * blockIdx.x; 569 | 
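/* (The call below reduces this block's slice, which was staged into shared memory above; the device-level sum() leaves the block total in uint64_shared[0] and copies it into each thread's local 'total'. Note that thread 0 of every block then writes its block total to the same *res, so *res equals the grand total only when the kernel is launched with a single block.) */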
sum(uint64_shared, blockSize, &total); 570 | 571 | /*I want to write to output only once. For that I use a 572 | * local variable for the local and then have only thread 0 573 | * write to output 574 | */ 575 | if (threadIdx.x == 0) 576 | *res = total; 577 | } 578 | 579 | __global__ void sum_device(maxSize_type *data, const maxSize_type size) { 580 | if(size <= blockDim.x*blockIdx.x) return; /*nothing for this block*/ 581 | 582 | maxSize_type gidx = blockIdx.x*blockDim.x + threadIdx.x; 583 | 584 | /*write data to shared memory*/ 585 | if(gidx < size) uint64_shared[threadIdx.x] = data[gidx]; 586 | // printf("(%d,%d) : data[%lu] = %lu\n", blockIdx.x, threadIdx.x, gidx, data[gidx]); 587 | 588 | __syncthreads(); 589 | 590 | unsigned int blockSize = blockDim.x; 591 | if (blockSize > size - blockDim.x * blockIdx.x) blockSize = size - blockDim.x * blockIdx.x; 592 | sum_device(uint64_shared, blockSize); 593 | } 594 | 595 | 596 | /*I assume threads equal to the total number of elements per range. Data are already at shared memory*/ 597 | __device__ void prefixSum_sharedMem_before_multipleRanges(maxSize_type *data, maxSize_type data_els, maxSize_type *total1, 598 | maxSize_type *total2, maxSize_type *total3, unsigned int partitionsNum) { 599 | maxSize_type idx_f, idx_s; 600 | maxSize_type gidx = blockIdx.x * blockDim.x + threadIdx.x; 601 | unsigned int lidx = threadIdx.x; 602 | unsigned int size = blockDim.x; 603 | maxSize_type stride; 604 | unsigned int i; 605 | 606 | // if(gidx >= data_els) return; /* I want all threads to terminate so that all of them can have the total value*/ 607 | 608 | /*find exactly how many elements this block has in shared memory*/ 609 | if (size > data_els - blockIdx.x * blockDim.x) size = data_els - blockIdx.x * blockDim.x; 610 | 611 | // printf("\n(%d,%d) : shared[%u] = data[%lu] = %lu", blockIdx.x, lidx, lidx, gidx, uint64_shared[lidx]); 612 | 613 | maxSize_type *data_ptr; 614 | 615 | /*iterate until the final result is computed */ 616 | for (stride = 1; stride < size; stride <<= 1) { 617 | idx_f = stride * (2 * lidx); 618 | idx_s = stride * (2 * lidx + 1); 619 | 620 | data_ptr = data; 621 | for (i = 0; i < partitionsNum; i++) { 622 | 623 | // printf("\n%lu (%d,%d) : %lu, %lu -> %lu, %lu", stride, blockIdx.x, lidx, idx_f, idx_s, size - 1 - idx_f,size - 1 - idx_s); 624 | 625 | if (idx_s < size) { 626 | // uint64_t tmp = uint64_shared[size - 1 - idx_f]; 627 | data_ptr[size - 1 - idx_f] += data_ptr[size - 1 - idx_s]; 628 | // printf("\nUP %lu (%d,%d) : shared[%u] = %lu+%lu=%lu", stride, blockIdx.x, lidx, size - 1 - idx_f, tmp, uint64_shared[size - 1 - idx_s], uint64_shared[size - 1 - idx_f]); 629 | } 630 | data_ptr += blockDim.x; 631 | } 632 | 633 | /*wait for all the threads in the block to finish before going to the next iteration*/ 634 | __syncthreads(); 635 | } 636 | 637 | /*store the final results*/ 638 | // if (threadIdx.x == 0) { 639 | *total1 = data[size - 1]; 640 | *total2 = data[blockDim.x + size - 1]; 641 | *total3 = data[2 * blockDim.x + size - 1]; 642 | 643 | __syncthreads(); 644 | //printf("\n-> (%d,%d) : %d", blockIdx.x, threadIdx.x, *total); 645 | if (threadIdx.x == 0) { 646 | for (i = 0; i < partitionsNum; i++) 647 | data[i * blockDim.x + size - 1] = 0; 648 | } 649 | 650 | __syncthreads(); 651 | 652 | /*now go the other direction*/ 653 | for (stride >>= 1; stride > 0; stride >>= 1) { 654 | idx_f = stride * (2 * lidx); 655 | idx_s = stride * (2 * lidx + 1); 656 | 657 | data_ptr = data; 658 | for (i = 0; i < partitionsNum; i++) { 659 | 660 | if (idx_s < 
size) { 661 | maxSize_type tmp = data_ptr[size - 1 - idx_s]; 662 | data_ptr[size - 1 - idx_s] = data_ptr[size - 1 - idx_f]; 663 | data_ptr[size - 1 - idx_f] = tmp + data_ptr[size - 1 - idx_s]; 664 | // printf("\nDOWN %lu (%d,%d) : shared[%u] = %lu - shared[%u] = %lu", stride, blockIdx.x, lidx, size - 1 - idx_f, uint64_shared[size - 1 - idx_f], size - 1 - idx_s, uint64_shared[size - 1 - idx_s]); 665 | } 666 | data_ptr += blockDim.x; 667 | } 668 | __syncthreads(); 669 | } 670 | } 671 | 672 | __device__ void sum(int *data, unsigned int size, int *res) { 673 | unsigned int idx, idx_f, idx_s; 674 | 675 | /*iterate until the final result is computed */ 676 | int stride = 1; 677 | for (stride = 1; stride < size; stride <<= 1) { 678 | idx = threadIdx.x; 679 | 680 | idx_f = stride * (2 * idx); 681 | idx_s = stride * (2 * idx + 1); 682 | 683 | if (idx_s < size) data[idx_f] += data[idx_s]; 684 | 685 | /*wait for all the threads in the block to finish before going to the next iteration*/ 686 | __syncthreads(); 687 | } 688 | 689 | // /*store the final results*/ 690 | // if (threadIdx.x == 0) { 691 | *res = data[0]; 692 | // } 693 | } 694 | 695 | 696 | 697 | __device__ void max(int *data, unsigned int size, int *res) { 698 | unsigned int idx, idx_f, idx_s; 699 | 700 | /*iterate until the final result is computed */ 701 | int stride = 1; 702 | for (stride = 1; stride < size; stride <<= 1) { 703 | idx = threadIdx.x; 704 | 705 | idx_f = stride * (2 * idx); 706 | idx_s = stride * (2 * idx + 1); 707 | 708 | while (idx_s < size) { 709 | // int tmp = data[idx_f]; 710 | if (data[idx_f] < data[idx_s]) data[idx_f] = data[idx_s]; 711 | // printf("UP %d (%d,%d): %d [%d]=[%d]+[%d]=%d+%d=%d\n", size, blockIdx.x, threadIdx.x, stride, idx_f, idx_f, idx_s, tmp, data[idx_s], data[idx_f]); 712 | 713 | idx += blockDim.x; 714 | 715 | idx_f = stride * (2 * idx); 716 | idx_s = stride * (2 * idx + 1); 717 | } 718 | 719 | /*wait for all the threads in the block to finish before going to the next iteration*/ 720 | __syncthreads(); 721 | } 722 | 723 | /*store the final results*/ 724 | if (threadIdx.x == 0) { 725 | *res = data[0]; 726 | // printf("%d: Max = %d\n", blockIdx.x, *res); 727 | } 728 | } 729 | 730 | __device__ void min(int *data, unsigned int size, int *res) { 731 | unsigned int idx, idx_f, idx_s; 732 | 733 | /*iterate until the final result is computed */ 734 | int stride = 1; 735 | for (stride = 1; stride < size; stride <<= 1) { 736 | idx = threadIdx.x; 737 | 738 | idx_f = stride * (2 * idx); 739 | idx_s = stride * (2 * idx + 1); 740 | 741 | while (idx_s < size) { 742 | // int tmp = data[idx_f]; 743 | if (data[idx_f] > data[idx_s]) data[idx_f] = data[idx_s]; 744 | // printf("UP %d (%d,%d): %d [%d]=[%d]+[%d]=%d+%d=%d\n", size, blockIdx.x, threadIdx.x, stride, idx_f, idx_f, idx_s, tmp, data[idx_s], data[idx_f]); 745 | 746 | idx += blockDim.x; 747 | 748 | idx_f = stride * (2 * idx); 749 | idx_s = stride * (2 * idx + 1); 750 | } 751 | 752 | /*wait for all the threads in the block to finish before going to the next iteration*/ 753 | __syncthreads(); 754 | } 755 | 756 | /*store the final results*/ 757 | if (threadIdx.x == 0) { 758 | *res = data[0]; 759 | // printf("%d: Min = %d\n", blockIdx.x, *res); 760 | } 761 | } 762 | 763 | __global__ void aggregate(int *data, unsigned int size, int *res, int funcId) { 764 | /*all processing should be done in the same block*/ 765 | if (blockIdx.x > 0) return; 766 | 767 | unsigned int idx; 768 | for (idx = threadIdx.x; idx < size; idx += blockDim.x) 769 | int_shared[idx] = data[idx]; 770 
| 771 | __syncthreads(); 772 | 773 | if (funcId == 1) 774 | min(int_shared, size, res); 775 | else if (funcId == 2) 776 | max(int_shared, size, res); 777 | else if (funcId == 3) sum(int_shared, size, res); 778 | } 779 | 780 | void* test(size_t sz) { 781 | //CUDA UVA 782 | 783 | void* mem; 784 | cudaHostAlloc((void **) &mem, sz, cudaHostAllocMapped); 785 | 786 | if (mem) 787 | return mem; 788 | else 789 | return NULL; 790 | } 791 | 792 | #define PREFIX_SUM2 793 | #define SUM 794 | 795 | static __global__ void addWithStepANDsum_device(maxSize_type *data, maxSize_type size, maxSize_type *total, maxSize_type total_els, maxSize_type group) { 796 | maxSize_type gidx = blockDim.x * blockIdx.x + threadIdx.x; 797 | 798 | if (blockIdx.x <= group) return; 799 | 800 | // if(threadIdx.x == 0) printf("(%d,%d) : blockSum = %lu\n", blockIdx.x, threadIdx.x, sum_dev); 801 | 802 | unsigned int blockIdx_normalised = blockIdx.x - group; 803 | #if defined(SUM) || defined(PREFIX_SUM) 804 | if (blockIdx_normalised >= total_els && gidx < size) { 805 | /*use the pre-computed sum*/ 806 | // if(gidx == 20) printf("(%d,%d) : blockSum = %lu\n", blockIdx.x, threadIdx.x, sum_dev); 807 | data[gidx] += sum_dev; 808 | } else { 809 | #endif 810 | /*use the pre-computed prefixSum*/ 811 | // if(gidx == 20) printf("(%d,%d) : blockSum = %lu\n", blockIdx.x, threadIdx.x, total[blockIdx_normalised]); 812 | #if defined(PREFIX_SUM) 813 | if(gidx < size) data[gidx] += total[blockIdx_normalised]; 814 | #else 815 | if (threadIdx.x < blockIdx_normalised && threadIdx.x < total_els) 816 | uint64_shared[threadIdx.x] = total[threadIdx.x]; 817 | 818 | // if(threadIdx.x == 0) printf("(%d,%d) : shared memory\n", blockIdx.x, threadIdx.x); 819 | 820 | __syncthreads(); 821 | 822 | maxSize_type sumOfAll; 823 | unsigned int elsNum = blockDim.x; 824 | if (blockIdx.x - group < blockDim.x) elsNum = blockIdx.x - group; 825 | 826 | sum(uint64_shared, elsNum, &sumOfAll); 827 | 828 | // if(gidx == 4) 829 | // printf("(%d,%d) : %lu elsNum = %lu, sum = %lu\n", blockIdx.x, threadIdx.x, gidx, elsNum, sumOfAll); 830 | 831 | if(gidx < size) data[gidx] += sumOfAll; 832 | #endif 833 | #if defined(SUM) || defined(PREFIX_SUM) 834 | } 835 | #endif 836 | } 837 | 838 | void totalPrefixSum(maxSize_type *data, maxSize_type size, maxSize_type *total, maxSize_type *sumOfAll, uint threadsNum, unsigned int iterNum) { 839 | unsigned int blocksNum = (size + threadsNum - 1) / threadsNum; 840 | dim3 block(threadsNum); 841 | dim3 grid(blocksNum); 842 | unsigned int sharedMemSize = threadsNum * sizeof(maxSize_type); /*one positions for each thread*/ 843 | 844 | unsigned int i = 0; 845 | maxSize_type s = data[size - 1]; 846 | 847 | //printf("%u --> s: %u d: %u - %u t: %u - %u\n", iterNum, sumOfAll, data, data+size, total, total+block.x); 848 | /*compute first prefix sum*/ 849 | prefixSum_before<<>>(data, size, total); 850 | 851 | 852 | if (blocksNum > threadsNum) { 853 | for (i = 0; i < blocksNum - threadsNum; i += threadsNum) { 854 | // printf("Adding %lu (%d, %d)\n", i, blocksNum, threadsNum); 855 | /*compute the sum of as many elements as the number of threads*/ 856 | #if defined(PREFIX_SUM) 857 | prefixSum_before_device<<>>(total+i, (maxSize_type)threadsNum); 858 | #elif defined(SUM) 859 | sum_device<<>>(total+i, (maxSize_type)threadsNum); 860 | #endif 861 | // CHK_ERROR(cudaDeviceSynchronize()); 862 | addWithStepANDsum_device<<>>(data, size, total+i, threadsNum, i); 863 | CHK_ERROR(cudaDeviceSynchronize()); 864 | } 865 | } 866 | 867 | /*last iteration*/ 868 | // printf("Last 
adding %lu (%d, %d) remaining %lu\n", i, blocksNum, threadsNum, blocksNum-i); 869 | #if defined(PREFIX_SUM) 870 | prefixSum_before_device<<>>(total+i, (maxSize_type)blocksNum-i); 871 | #endif 872 | addWithStepANDsum_device<<>>(data, size, total+i, blocksNum-i, i); 873 | CHK_ERROR(cudaDeviceSynchronize()); 874 | 875 | s += data[size - 1]; 876 | *sumOfAll = s; 877 | 878 | // printf("R sumOfAll = %lu \n", *sumOfAll); 879 | 880 | } 881 | ///* Globally added mem allocators */ 882 | //__host__ void* operator new(size_t sz) throw (std::bad_alloc) 883 | //{ 884 | //// cerr << "allocating " << sz << " bytes\n"; 885 | //// void* mem = malloc(sz); 886 | //// if (mem) 887 | //// return mem; 888 | //// else 889 | //// return NULL; 890 | //// //throw std::bad_alloc(); 891 | // 892 | //// //CUDA UVA 893 | // cerr << "[UVA: ] allocating " << sz << " bytes\n"; 894 | // void* mem; 895 | // cudaHostAlloc((void **) &mem, sz, cudaHostAllocMapped); 896 | // 897 | // if (mem) 898 | // return mem; 899 | // else { 900 | // cerr << "[UVA: ] error during allocation!" << endl; 901 | // return NULL; 902 | // //throw std::bad_alloc(); 903 | // } 904 | // 905 | // //throw std::bad_alloc(); 906 | //} 907 | // 908 | //__host__ void operator delete(void* ptr) throw() 909 | //{ 910 | // cerr << "deallocating at " << ptr << endl; 911 | // //free(ptr); 912 | // 913 | // cudaFreeHost(ptr); 914 | // 915 | //} 916 | -------------------------------------------------------------------------------- /src/common.h: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #ifndef COMMON_H_ 23 | #define COMMON_H_ 24 | 25 | #include /*uint8_t, uint16_t, uint32_t, uint64_t*/ 26 | #include 27 | #include 28 | 29 | /* Constants */ 30 | #define WARP_SZ 32 31 | #define NSTREAM 16//32 32 | #define BDIM 1024 33 | //Must be equal to BDIM, no? 
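// (Presumably yes: the per-block kernels in common.cu stage one element per thread into the extern __shared__ buffers, and the launch in totalPrefixSum sizes the dynamic shared memory as threadsNum * sizeof(maxSize_type), one slot per thread, so the shared-memory element count is expected to track the block dimension, i.e. SHMEMDIM == BDIM.)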
34 | #define SHMEMDIM 1024 35 | 36 | #define DELIM ',' 37 | 38 | #define COMPUTE_CAPABILITY_5 39 | 40 | #define BANKSNUM 4 41 | #define BANKSIZE 8 42 | #define PADSTEP BANKSNUM*BANKSIZE/sizeof(int); 43 | #define SHIFT log2((double)BANKSNUM*BANKSIZE/sizeof(int)) 44 | 45 | __host__ __device__ __forceinline__ uint32_t hasht(uint32_t x) { 46 | return x; 47 | } 48 | 49 | #define CHUNK_SIZE ((uint64_t) (1 << 31)) 50 | 51 | constexpr uint32_t log_parts1 = 8;//9; //< 12 2^(log_parts1 + log_parts2 + p_d + 5) ~= 'hash table size" ~= 2 * input size 52 | constexpr uint32_t log_parts2 = 5;//6;//8; //< 12 53 | 54 | constexpr int32_t g_d = log_parts1 + log_parts2; 55 | constexpr int32_t p_d = 3; 56 | 57 | constexpr int32_t max_chain = (32 - 1) * 1 - 1; //(32 - 1) * 2 - 1; 58 | 59 | #define hj_d (5 + p_d + g_d) 60 | 61 | constexpr uint32_t hj_mask = ((1 << hj_d) - 1); 62 | 63 | constexpr int32_t partitions = 1 << p_d; 64 | constexpr int32_t partitions_mask = partitions - 1; 65 | 66 | constexpr int32_t grid_parts = 1 << g_d; 67 | constexpr int32_t grid_parts_mask = grid_parts - 1; 68 | 69 | constexpr uint32_t log2_bucket_size = 12; 70 | constexpr uint32_t bucket_size = 1 << log2_bucket_size; 71 | constexpr uint32_t bucket_size_mask = bucket_size - 1; 72 | 73 | 74 | #define MEM_TYPE 0 75 | 76 | #if MEM_TYPE == 0 77 | #define MEM_HOST 78 | #elif MEM_TYPE == 1 79 | #define MEM_DEVICE 80 | #elif MEM_TYPE == 2 81 | #define MEM_MANAGED 82 | #elif MEM_TYPE == 3 83 | #define MEM_S_DEVICE 84 | #else 85 | #define MEM_HOST 86 | #endif 87 | 88 | #define data_type int 89 | #define maxSize_type unsigned long long int 90 | #define data_min INT_MIN 91 | 92 | extern __shared__ data_type int_shared[]; 93 | extern __shared__ maxSize_type uint64_shared[]; 94 | 95 | extern __constant__ unsigned int valuesToProcess; 96 | 97 | extern __device__ maxSize_type sum_dev; 98 | 99 | typedef struct timeval time_st; 100 | 101 | typedef struct timingInfo { 102 | unsigned int n = 5; 103 | time_st start[5]; 104 | time_st end[5]; 105 | double greaterTime = 0; 106 | double reduce_usecs = 0; 107 | double fixPositions_usecs = 0; 108 | double scatter_usecs = 0; 109 | double copy_usecs = 0; 110 | double bitonic_usecs = 0; 111 | double total_usecs = 0; 112 | 113 | double greaterEventTime = 0; 114 | 115 | unsigned int greaterCallsNum = 0; 116 | unsigned int bitonicCallsNum = 0; 117 | unsigned int reduceCallsNum = 0; 118 | unsigned int fixPositionsCallsNum = 0; 119 | } timingInfo; 120 | 121 | union vec4{ 122 | int4 vec ; 123 | int32_t i[4]; 124 | }; 125 | 126 | union vec2{ 127 | long2 vec ; 128 | int64_t i[4]; 129 | }; 130 | 131 | /* Error Checking*/ 132 | #define CHK_ERROR(call) \ 133 | { \ 134 | const cudaError_t error = call; \ 135 | if (error != cudaSuccess) \ 136 | { \ 137 | fprintf(stderr, "GPU Error: %s:%d, ", __FILE__, __LINE__); \ 138 | fprintf(stderr, "code:%d, reason: %s\n", error, cudaGetErrorString(error)); \ 139 | exit(-10*error); \ 140 | } \ 141 | } 142 | 143 | __device__ __forceinline__ uint32_t get_laneid(){ 144 | uint32_t laneid; 145 | asm("mov.u32 %0, %%laneid;" : "=r"(laneid)); 146 | return laneid; 147 | } 148 | 149 | //__host__ void* operator new(size_t sz) throw (std::bad_alloc); 150 | //__host__ void operator delete(void* ptr) throw(); 151 | 152 | #define USECS(start, end) (((end)->tv_sec * 1000000L + (end)->tv_usec) - ((start)->tv_sec * 1000000L + (start)->tv_usec)) 153 | #define MSECS(start, end) (((end)->tv_sec * 1000000L + (end)->tv_usec) - ((start)->tv_sec * 1000000L + (start)->tv_usec))/1000.0 154 | 155 | void 
recordTime(time_st *t); 156 | 157 | unsigned int smallestGreaterPowerOf2(const unsigned int num); 158 | unsigned int greatestLowerPowerOf2(const unsigned int num); 159 | 160 | void initialise_float(float *A, int N); 161 | void initialise_int(int *A, const int N); 162 | void printArray_int(int *A, const size_t N); 163 | void printArray_uint(unsigned int *A, const size_t N); 164 | void printArray_char(char *A, const maxSize_type N); 165 | void printArray_maxSize_type(maxSize_type *A, const maxSize_type N); 166 | 167 | void totalPrefixSum(maxSize_type *data, maxSize_type size, maxSize_type *total, maxSize_type *sumOfAll, unsigned int threadsNum, unsigned int iterNum); 168 | 169 | /*per block*/ 170 | static __device__ void prefixSum_after(maxSize_type *data, const unsigned int size, maxSize_type *total); 171 | __device__ void prefixSum_before(maxSize_type *data, const unsigned int size, maxSize_type *total); 172 | __device__ void prefixSum_before(data_type *const data, const unsigned int size, data_type *total); 173 | __device__ void prefixSum_before_multiple(data_type *const data, const unsigned int size, data_type *total, unsigned int num); 174 | static __device__ void prefixSum_before_device(maxSize_type *data, const unsigned int size); 175 | __device__ void prefixSum_before_multipleSeq(data_type *const data, const unsigned int sise, data_type *const borders, const unsigned int bordersNum, data_type *totalPerSeq); 176 | 177 | static __device__ void sum(maxSize_type *data, const unsigned int size, maxSize_type *res); 178 | static __device__ void sum_device(maxSize_type *data, const unsigned int size); 179 | 180 | /*the whole dataset*/ 181 | __global__ void prefixSum_before(maxSize_type *data, const maxSize_type size, maxSize_type *total); 182 | __global__ void prefixSum_before_device(maxSize_type *data, const maxSize_type size); 183 | 184 | __global__ void sum(maxSize_type *data, const maxSize_type size, maxSize_type *res); 185 | __global__ void sum_device(maxSize_type *data, const maxSize_type size); 186 | 187 | __global__ void copy(data_type *dataTO, data_type *dataFROM, const maxSize_type size); 188 | __global__ void copy(maxSize_type *dataTO, maxSize_type *dataFROM, const maxSize_type size); 189 | 190 | __global__ void scatter(data_type *dataIN, data_type *dataOUT, maxSize_type size, maxSize_type *pos); 191 | 192 | 193 | __device__ void prefixSum_sharedMem_before_multipleRanges(maxSize_type *data, maxSize_type data_els, maxSize_type *total1, 194 | maxSize_type *total2, maxSize_type *total3, unsigned int partitionsNum); 195 | 196 | 197 | __device__ void sum(int *data, unsigned int size, int *res); 198 | 199 | __device__ void max(int *data, unsigned int size, int *res); 200 | __device__ void min(int *data, unsigned int size, int *res); 201 | 202 | __global__ void aggregate(int *data, unsigned int size, int *res, int funcId); 203 | 204 | // Handle missmatch of atomics for (u)int64/32_t with cuda's definitions 205 | template::type = 0> 208 | __device__ __forceinline__ T atomicExch(T *address, T val){ 209 | return (T) atomicExch((unsigned long long int*) address, (unsigned long long int) val); 210 | } 211 | 212 | template::value, 214 | int>::type = 0> 215 | __device__ __forceinline__ T atomicExch(T *address, T val){ 216 | return (T) atomicExch((unsigned int*) address, (unsigned int) val); 217 | } 218 | 219 | template::type = 0> 222 | __device__ __forceinline__ T atomicExch_block(T *address, T val){ 223 | return (T) atomicExch_block((unsigned long long int*) address, (unsigned long long int) 
val); 224 | } 225 | 226 | template::value, 228 | int>::type = 0> 229 | __device__ __forceinline__ T atomicExch_block(T *address, T val){ 230 | return (T) atomicExch_block((unsigned int*) address, (unsigned int) val); 231 | } 232 | 233 | 234 | template::value, 236 | int>::type = 0> 237 | __device__ __forceinline__ T atomicExch(T *address, T val){ 238 | return (T) atomicExch((int*) address, (int) val); 239 | } 240 | 241 | template::type = 0> 244 | __device__ __forceinline__ T atomicOr(T *address, T val){ 245 | return (T) atomicOr((unsigned long long int*) address, (unsigned long long int) val); 246 | } 247 | 248 | template::value, 250 | int>::type = 0> 251 | __device__ __forceinline__ T atomicOr(T *address, T val){ 252 | return (T) atomicOr((unsigned int*) address, (unsigned int) val); 253 | } 254 | 255 | template::type = 0> 258 | __device__ __forceinline__ T atomicOr_block(T *address, T val){ 259 | return (T) atomicOr_block((unsigned long long int*) address, (unsigned long long int) val); 260 | } 261 | 262 | template::value, 264 | int>::type = 0> 265 | __device__ __forceinline__ T atomicOr_block(T *address, T val){ 266 | return (T) atomicOr_block((unsigned int*) address, (unsigned int) val); 267 | } 268 | 269 | 270 | template::value, 272 | int>::type = 0> 273 | __device__ __forceinline__ T atomicOr(T *address, T val){ 274 | return (T) atomicOr((int*) address, (int) val); 275 | } 276 | 277 | template::value && !std::is_signed::value, 279 | int>::type = 0> 280 | __device__ __forceinline__ T atomicMin(T *address, T val){ 281 | return (T) atomicMin((unsigned long long int*) address, (unsigned long long int) val); 282 | } 283 | 284 | template::value && !std::is_signed::value, 286 | int>::type = 0> 287 | __device__ __forceinline__ T atomicMin(T *address, T val){ 288 | return (T) atomicMin((unsigned int*) address, (unsigned int) val); 289 | } 290 | 291 | template::value && std::is_signed::value, 293 | int>::type = 0> 294 | __device__ __forceinline__ T atomicMin(T *address, T val){ 295 | return (T) atomicMin((int*) address, (int) val); 296 | } 297 | 298 | template::value && !std::is_signed::value, 300 | int>::type = 0> 301 | __device__ __forceinline__ T atomicMin_block(T *address, T val){ 302 | return (T) atomicMin_block((unsigned long long int*) address, (unsigned long long int) val); 303 | } 304 | 305 | template::value && !std::is_signed::value, 307 | int>::type = 0> 308 | __device__ __forceinline__ T atomicMin_block(T *address, T val){ 309 | return (T) atomicMin_block((unsigned int*) address, (unsigned int) val); 310 | } 311 | 312 | template::value && std::is_signed::value, 314 | int>::type = 0> 315 | __device__ __forceinline__ T atomicMin_block(T *address, T val){ 316 | return (T) atomicMin_block((int*) address, (int) val); 317 | } 318 | 319 | template::value && !std::is_signed::value, 321 | int>::type = 0> 322 | __device__ __forceinline__ T atomicAdd(T *address, T val){ 323 | return (T) atomicAdd((unsigned long long int*) address, (unsigned long long int) val); 324 | } 325 | 326 | template::value && !std::is_signed::value, 328 | int>::type = 0> 329 | __device__ __forceinline__ T atomicAdd(T *address, T val){ 330 | return (T) atomicAdd((unsigned int*) address, (unsigned int) val); 331 | } 332 | 333 | template::value && std::is_signed::value, 335 | int>::type = 0> 336 | __device__ __forceinline__ T atomicAdd(T *address, T val){ 337 | return (T) atomicAdd((int*) address, (int) val); 338 | } 339 | 340 | template::value && !std::is_signed::value, 342 | int>::type = 0> 343 | __device__ 
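/* Illustrative note: CUDA's built-in atomics are declared only for a fixed set of types (for the integer widths used here: int, unsigned int and unsigned long long int), so calls on values typed as uint64_t, int64_t or size_t would not resolve to any overload on their own. The enable_if-guarded wrappers in this section dispatch on the size and signedness of T and forward to the matching built-in through a cast. A rough usage sketch (the counter name is hypothetical):

       uint64_t* matches = ...;            // device-side counter
       atomicAdd(matches, (uint64_t) 1);   // picks the 8-byte unsigned wrapper, which calls
                                           // atomicAdd((unsigned long long int*) matches, 1ULL)

   The exact enable_if conditions mirror the cast target of each overload: unsigned long long int for 8-byte types, unsigned int or int for 4-byte unsigned/signed types. */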
__forceinline__ T atomicAdd_block(T *address, T val){ 344 | return (T) atomicAdd_block((unsigned long long int*) address, (unsigned long long int) val); 345 | } 346 | 347 | template::value && !std::is_signed::value, 349 | int>::type = 0> 350 | __device__ __forceinline__ T atomicAdd_block(T *address, T val){ 351 | return (T) atomicAdd_block((unsigned int*) address, (unsigned int) val); 352 | } 353 | 354 | template::value && std::is_signed::value, 356 | int>::type = 0> 357 | __device__ __forceinline__ T atomicAdd_block(T *address, T val){ 358 | return (T) atomicAdd_block((int*) address, (int) val); 359 | } 360 | 361 | #endif /* COMMON_H_ */ 362 | -------------------------------------------------------------------------------- /src/generator_ETHZ.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Code adapted from multicore-hashjoins-0.2@https://www.systems.ethz.ch/node/334 3 | All credit to the original author: Cagri Balkesen 4 | */ 5 | 6 | #include "generator_ETHZ.cuh" 7 | 8 | #include 9 | #include 10 | #include /*printf*/ 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #define RAND_RANGE(N) ((double)rand() / ((double)RAND_MAX + 1) * (N)) 17 | #define RAND_RANGE48(N,STATE) ((double)nrand48(STATE)/((double)RAND_MAX+1)*(N)) 18 | 19 | 20 | static int seeded = 0; 21 | static unsigned int seedValue; 22 | 23 | void seed_generator(unsigned int seed) { 24 | srand(seed); 25 | seedValue = seed; 26 | seeded = 1; 27 | } 28 | 29 | /** Check whether seeded, if not seed the generator with current time */ 30 | static void check_seed() { 31 | if (!seeded) { 32 | seedValue = time(NULL); 33 | srand(seedValue); 34 | seeded = 1; 35 | } 36 | } 37 | 38 | int readFromFile(const char * filename, int *relation, uint64_t num_tuples) { 39 | char path[100]; 40 | sprintf(path, "%s", filename); 41 | FILE *fp = fopen(path, "rb"); 42 | 43 | if (!fp) return 1; 44 | 45 | printf("Reading file %s ", path); 46 | fflush(stdout); 47 | 48 | fread(relation, sizeof(int), num_tuples, fp); 49 | 50 | /*for (int i = 0; i < num_tuples; i++) { 51 | int k = rand() % num_tuples; 52 | int tmp = relation[k]; 53 | relation[k] = relation[i]; 54 | relation[i] = tmp; 55 | }*/ 56 | 57 | fclose(fp); 58 | return 0; 59 | } 60 | 61 | static int writeToFile(const char * filename, int *relation, uint64_t num_tuples) { 62 | FILE *fp = fopen(filename, "wb"); 63 | if (!fp) return 1; 64 | 65 | fwrite(relation, sizeof(int), num_tuples, fp); 66 | fclose(fp); 67 | 68 | char path[100]; 69 | sprintf(path, "%s", filename); 70 | rename(filename, path); 71 | return 0; 72 | } 73 | 74 | int create_relation_nonunique(const char *filename, int *relation, uint64_t num_tuples, const int64_t maxid) { 75 | /*first try to read from a file*/ 76 | if (readFromFile(filename, relation, num_tuples)) { 77 | check_seed(); 78 | random_gen(relation, num_tuples, maxid); 79 | 80 | return writeToFile(filename, relation, num_tuples); 81 | } 82 | return 0; 83 | } 84 | 85 | 86 | int create_relation_unique(const char *filename, int *relation, uint64_t num_tuples, const int64_t maxid) { 87 | /*first try to read from a file*/ 88 | if (readFromFile(filename, relation, num_tuples)) { 89 | random_unique_gen(relation, num_tuples, maxid); 90 | return writeToFile(filename, relation, num_tuples); 91 | } 92 | 93 | return 0; 94 | } 95 | 96 | 97 | int create_relation_n(int* in_relation, int* out_relation, uint64_t num_tuples, uint64_t n) { 98 | for (int i = 0; i < n; i++) { 99 | memcpy (out_relation + i * num_tuples, in_relation, 
num_tuples * sizeof(int)); 100 | } 101 | 102 | /*unsigned short state[3] = {0, 0, 0}; 103 | unsigned int seed = time(NULL); 104 | memcpy(state, &seed, sizeof(seed)); 105 | 106 | knuth_shuffle48(out_relation, num_tuples * n, state);*/ 107 | 108 | return 0; 109 | } 110 | 111 | /** 112 | * Generate tuple IDs -> random distribution 113 | * relation must have been allocated 114 | */ 115 | void random_gen(int *rel, uint64_t elsNum, const int64_t maxid) { 116 | uint64_t i; 117 | 118 | for (i = 0; i < elsNum; i++) { 119 | rel[i] = RAND_RANGE(maxid); 120 | // printf("%d: rel[%d] = %d\n", maxid, i, rel[i]); 121 | } 122 | } 123 | 124 | /** 125 | * Create random unique keys starting from firstkey 126 | */ 127 | void random_unique_gen(int *rel, uint64_t elsNum, const int64_t maxid) { 128 | uint64_t i; 129 | 130 | uint64_t firstkey = 0; 131 | 132 | /* for randomly seeding nrand48() */ 133 | unsigned short state[3] = {0, 0, 0}; 134 | unsigned int seed = time(NULL); 135 | memcpy(state, &seed, sizeof(seed)); 136 | 137 | for (i = 0; i < elsNum; i++) { 138 | rel[i] = firstkey; 139 | 140 | if(firstkey == maxid) 141 | firstkey = 0; 142 | 143 | firstkey ++; 144 | } 145 | 146 | /* randomly shuffle elements */ 147 | knuth_shuffle48(rel, elsNum, state); 148 | 149 | } 150 | 151 | /** 152 | * Create a foreign-key relation using the given primary-key relation and 153 | * foreign-key relation size. Keys in pkrel is randomly distributed in the full 154 | * integer range. 155 | * 156 | * @param fkrel [output] foreign-key relation 157 | * @param pkrel [input] primary-key relation 158 | * @param num_tuples 159 | * 160 | * @return 161 | */ 162 | int create_relation_fk_from_pk(const char *fkrelFilename, int *fkrel, uint64_t fkrelElsNum, int *pkrel, 163 | uint64_t pkrelElsNum) { 164 | /*first try to read from a file*/ 165 | if (readFromFile(fkrelFilename, fkrel, fkrelElsNum)) { 166 | int i, iters; 167 | int64_t remainder; 168 | 169 | /* alternative generation method */ 170 | iters = fkrelElsNum / pkrelElsNum; 171 | for (i = 0; i < iters; i++) { 172 | memcpy(fkrel + i * pkrelElsNum, pkrel, pkrelElsNum * sizeof(int)); 173 | } 174 | 175 | /* if num_tuples is not an exact multiple of pkrel->num_tuples */ 176 | remainder = fkrelElsNum % pkrelElsNum; 177 | if (remainder > 0) { 178 | memcpy(fkrel + i * pkrelElsNum, pkrel, remainder * sizeof(int)); 179 | } 180 | 181 | knuth_shuffle(fkrel, fkrelElsNum); 182 | 183 | return writeToFile(fkrelFilename, fkrel, fkrelElsNum); 184 | } 185 | 186 | return 0; 187 | } 188 | 189 | /** 190 | * Shuffle tuples of the relation using Knuth shuffle. 
191 | * 192 | * @param relation 193 | */ 194 | void knuth_shuffle(int *relation, uint64_t elsNum) { 195 | int64_t i; 196 | for (i = elsNum - 1; i > 0; i--) { 197 | int64_t j = RAND_RANGE(i); 198 | int tmp = relation[i]; 199 | relation[i] = relation[j]; 200 | relation[j] = tmp; 201 | } 202 | } 203 | 204 | void knuth_shuffle48(int *relation, uint64_t elsNum, unsigned short * state) { 205 | int64_t i; 206 | for (i = elsNum - 1; i > 0; i--) { 207 | int64_t j = RAND_RANGE48(i, state); 208 | int tmp = relation[i]; 209 | relation[i] = relation[j]; 210 | relation[j] = tmp; 211 | } 212 | } 213 | 214 | int create_relation_zipf(const char *filename, int *relation, uint64_t elsNum, const int64_t maxid, 215 | const double zipf_param) { 216 | /*first try to read from a file*/ 217 | if (readFromFile(filename, relation, elsNum)) { 218 | check_seed(); 219 | 220 | gen_zipf(elsNum, maxid, zipf_param, relation); 221 | 222 | return writeToFile(filename, relation, elsNum); 223 | } 224 | return 0; 225 | } 226 | 227 | /** 228 | * Create an alphabet, an array of size @a size with randomly 229 | * permuted values 0..size-1. 230 | * 231 | * @param size alphabet size 232 | * @return an item_t array with @a size elements; 233 | * contains values 0..size-1 in a random permutation; the 234 | * return value is malloc'ed, don't forget to free it afterward. 235 | */ 236 | static uint32_t *gen_alphabet(unsigned int size) { 237 | uint32_t *alphabet; 238 | 239 | /* allocate */ 240 | alphabet = (uint32_t *) malloc(size * sizeof(*alphabet)); 241 | assert(alphabet); 242 | 243 | /* populate */ 244 | for (unsigned int i = 0; i < size; i++) 245 | alphabet[i] = i + 1; /* don't let 0 be in the alphabet */ 246 | 247 | /* permute */ 248 | for (unsigned int i = size - 1; i > 0; i--) { 249 | unsigned int k = (unsigned long) i * rand() / RAND_MAX; 250 | unsigned int tmp; 251 | 252 | tmp = alphabet[i]; 253 | alphabet[i] = alphabet[k]; 254 | alphabet[k] = tmp; 255 | } 256 | 257 | return alphabet; 258 | } 259 | 260 | /** 261 | * Generate a lookup table with the cumulated density function 262 | * 263 | * (This is derived from code originally written by Rene Mueller.) 264 | */ 265 | static double *gen_zipf_lut(double zipf_factor, unsigned int alphabet_size) { 266 | double scaling_factor; 267 | double sum; 268 | 269 | double *lut; /**< return value */ 270 | 271 | lut = (double *) malloc(alphabet_size * sizeof(*lut)); 272 | assert(lut); 273 | 274 | /* 275 | * Compute scaling factor such that 276 | * 277 | * sum (lut[i], i=1..alphabet_size) = 1.0 278 | * 279 | */ 280 | scaling_factor = 0.0; 281 | for (unsigned int i = 1; i <= alphabet_size; i++) 282 | scaling_factor += 1.0 / pow(i, zipf_factor); 283 | 284 | /* 285 | * Generate the lookup table 286 | */ 287 | sum = 0.0; 288 | for (unsigned int i = 1; i <= alphabet_size; i++) { 289 | sum += 1.0 / pow(i, zipf_factor); 290 | lut[i - 1] = sum / scaling_factor; 291 | } 292 | 293 | return lut; 294 | } 295 | 296 | /** 297 | * Generate a stream with Zipf-distributed content. 
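 *
 * Example (illustrative numbers): with alphabet_size = 4 and zipf_factor = 1.0 the raw
 * weights are 1, 1/2, 1/3, 1/4, so gen_zipf_lut produces the cumulative table
 * lut = {0.48, 0.72, 0.88, 1.00}. A uniform draw r = 0.6 binary-searches to the first
 * entry >= r, i.e. pos = 1, and alphabet[1] is emitted; low positions are selected most
 * often, which produces the skew.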
298 | */ 299 | void gen_zipf(uint64_t stream_size, unsigned int alphabet_size, double zipf_factor, int *ret) { 300 | //uint64_t i; 301 | /* prepare stuff for Zipf generation */ 302 | uint32_t *alphabet = gen_alphabet(alphabet_size); 303 | assert(alphabet); 304 | 305 | double *lut = gen_zipf_lut(zipf_factor, alphabet_size); 306 | assert(lut); 307 | 308 | uint32_t seeds[64]; 309 | 310 | for (int i = 0; i < 64; i++) 311 | seeds[i] = rand(); 312 | 313 | for (uint64_t i = 0; i < stream_size; i++) { 314 | if (i % 1000000 == 0) 315 | printf("live %llu\n", (unsigned long long) (i / 1000000)); 316 | 317 | /* take random number */ 318 | double r; 319 | 320 | r = ((double) (rand())) / RAND_MAX; 321 | 322 | /* binary search in lookup table to determine item */ 323 | unsigned int left = 0; 324 | unsigned int right = alphabet_size - 1; 325 | unsigned int m; /* middle between left and right */ 326 | unsigned int pos; /* position to take */ 327 | 328 | if (lut[0] >= r) 329 | pos = 0; 330 | else { 331 | while (right - left > 1) { 332 | m = (left + right) / 2; 333 | 334 | if (lut[m] < r) 335 | left = m; 336 | else 337 | right = m; 338 | } 339 | 340 | pos = right; 341 | } 342 | 343 | ret[i] = alphabet[pos]; 344 | } 345 | 346 | free(lut); 347 | free(alphabet); 348 | } 349 | 350 | -------------------------------------------------------------------------------- /src/generator_ETHZ.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Code adapted from multicore-hashjoins-0.2@https://www.systems.ethz.ch/node/334 3 | All credit to the original author: Cagri Balkesen 4 | */ 5 | 6 | #ifndef GENERATOR_ETHZ_CUH_ 7 | #define GENERATOR_ETHZ_CUH_ 8 | 9 | #include <stdint.h> /*uint64_t*/ 10 | 11 | void seed_generator(unsigned int seed); 12 | 13 | int readFromFile(const char * filename, int *relation, uint64_t num_tuples); 14 | int create_relation_nonunique(const char *filename, int *relation, uint64_t num_tuples, const int64_t maxid); 15 | int create_relation_unique(const char *filename, int *relation, uint64_t num_tuples, const int64_t maxid); 16 | void random_gen(int *rel, uint64_t elsNum, const int64_t maxid); 17 | void random_unique_gen(int *rel, uint64_t elsNum, const int64_t maxid); 18 | int create_relation_fk_from_pk(const char *filename, int *fkrel, uint64_t fkrelElsNum, int *pkrel, uint64_t pkrelElsNum); 19 | void knuth_shuffle(int *relation, uint64_t elsNum); 20 | void knuth_shuffle48(int *relation, uint64_t elsNum, unsigned short * state); 21 | int create_relation_zipf(const char *filename, int *relation, uint64_t elsNum, const int64_t maxid, const double zipf_param); 22 | void gen_zipf(uint64_t stream_size, unsigned int alphabet_size, double zipf_factor, int *ret); 23 | int create_relation_n(int* in_relation, int* out_relation, uint64_t num_tuples, uint64_t n); 24 | 25 | #endif /* GENERATOR_ETHZ_CUH_ */ 26 | -------------------------------------------------------------------------------- /src/join-primitives.cu: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the 
Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | 28 | #include "join-primitives.cuh" 29 | 30 | __global__ void init_payload (int* R, int n) { 31 | for (int i = threadIdx.x + blockIdx.x*blockDim.x; i < n; i += blockDim.x*gridDim.x) 32 | R[i] = i; 33 | } 34 | 35 | /* 36 | S= keys of data to be partitioned 37 | P= payloads of data to be partitioned 38 | heads= keeps information on first bucket per partition and number of elements in it, packet in one 64-bit integer (only used here) 39 | chains= the successor of a bucket in the bucket list 40 | out_cnts= number of elements per partition 41 | buckets_used= how many buckets are reserved by the partitioning already 42 | offsets= describe the segments that occur due to partitioning 43 | note: multithreaded partitioning creates partitions that consist of contiguous segments 44 | => iterate over these segments to avoid handling empty slots 45 | 46 | output_S= bucketized partitions of data keys 47 | output_P= bucketized partitions of data payloads 48 | cnt= number of elements to partition on total 49 | log_parts- log of number of partitions 50 | first_bit= shift the keys before "hashing" 51 | num_threads= number of threads used in CPU side, used together with offsets 52 | 53 | preconditions: 54 | heads: current bucket (1 << 18) [special value for no bucket] and -1 elements (first write allocates bucket) 55 | out_cnts: 0 56 | buckets_used= number of partitions (first num_parts buckets are reserved) 57 | */ 58 | __global__ void partition_pass_one ( 59 | const int32_t * __restrict__ S, 60 | const int32_t * __restrict__ P, 61 | const size_t * __restrict__ offsets, 62 | uint64_t * __restrict__ heads, 63 | uint32_t * __restrict__ buckets_used, 64 | uint32_t * __restrict__ chains, 65 | uint32_t * __restrict__ out_cnts, 66 | int32_t * __restrict__ output_S, 67 | int32_t * __restrict__ output_P, 68 | size_t cnt, 69 | uint32_t log_parts, 70 | uint32_t first_bit, 71 | uint32_t num_threads) { 72 | assert((((size_t) bucket_size) + ((size_t) blockDim.x) * gridDim.x) < (((size_t) 1) << 32)); 73 | const uint32_t parts = 1 << log_parts; 74 | const int32_t parts_mask = parts - 1; 75 | 76 | uint32_t * router = (uint32_t *) int_shared; 77 | 78 | uint32_t segment = 0; 79 | size_t segment_limit = offsets[1]; 80 | size_t segment_next = offsets[2]; 81 | 82 | size_t* shared_offsets = (size_t*) (int_shared + 1024*4 + 4*parts); 83 | 84 | /*if no segmentation in input use one segment with all data, else copy the segment info*/ 85 | if (offsets != NULL) { 86 | for (int i = threadIdx.x; i < 4*num_threads; i += blockDim.x) { 87 | shared_offsets[i] = offsets[i]; 88 | } 89 | } else { 90 | for (int i = threadIdx.x; i < 4*num_threads; i += blockDim.x) { 91 | if (i == 1) 92 | shared_offsets[i] = cnt; 93 | else 94 | shared_offsets[i] = 0; 95 | } 96 | } 97 | 98 
| shared_offsets[4*num_threads] = cnt+4096; 99 | shared_offsets[4*num_threads+1] = cnt+4096; 100 | 101 | /*partition element counter starts at 0*/ 102 | for (size_t j = threadIdx.x ; j < parts ; j += blockDim.x ) 103 | router[1024*4 + parts + j] = 0; 104 | 105 | if (threadIdx.x == 0) 106 | router[0] = 0; 107 | 108 | __syncthreads(); 109 | 110 | 111 | /*iterate over the segments*/ 112 | for (int u = 0; u < 2*num_threads; u++) { 113 | size_t segment_start = shared_offsets[2*u]; 114 | size_t segment_limit = shared_offsets[2*u + 1]; 115 | size_t segment_end = segment_start + ((segment_limit - segment_start + 4096 - 1)/4096)*4096; 116 | 117 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x) + segment_start; i < segment_end ; i += 4 * blockDim.x * gridDim.x) { 118 | vec4 thread_vals = *(reinterpret_cast(S + i)); 119 | 120 | uint32_t thread_keys[4]; 121 | 122 | /*compute local histogram for a chunk of 4*blockDim.x elements*/ 123 | #pragma unroll 124 | for (int k = 0 ; k < 4 ; ++k){ 125 | if (i + k < segment_limit){ 126 | uint32_t partition = (hasht(thread_vals.i[k]) >> first_bit) & parts_mask; 127 | 128 | atomicAdd(router + (1024 * 4 + parts + partition), 1); 129 | 130 | thread_keys[k] = partition; 131 | } else { 132 | thread_keys[k] = 0; 133 | } 134 | } 135 | 136 | __syncthreads(); 137 | 138 | for (size_t j = threadIdx.x; j < parts ; j += blockDim.x ) { 139 | uint32_t cnt = router[1024 * 4 + parts + j]; 140 | 141 | if (cnt > 0){ 142 | atomicAdd(out_cnts + j, cnt); 143 | 144 | uint32_t pcnt ; 145 | uint32_t bucket ; 146 | uint32_t next_buck; 147 | 148 | bool repeat = true; 149 | 150 | while (__any(repeat)){ 151 | if (repeat){ 152 | /*check if any of the output bucket is filling up*/ 153 | uint64_t old_heads = atomicAdd(heads + j, ((uint64_t) cnt) << 32); 154 | 155 | atomicMin(heads + j, ((uint64_t) (2*bucket_size)) << 32); 156 | 157 | pcnt = ((uint32_t) (old_heads >> 32)); 158 | bucket = (uint32_t) old_heads ; 159 | 160 | /*now there are two cases: 161 | // 2) old_heads.cnt > bucket_size ( => locked => retry) 162 | // if (pcnt >= bucket_size) continue;*/ 163 | 164 | if (pcnt < bucket_size){ 165 | /* 1) old_heads.cnt <= bucket_size*/ 166 | 167 | /*check if the bucket was filled*/ 168 | if (pcnt + cnt >= bucket_size){ 169 | if (bucket < (1 << 18)) { 170 | next_buck = atomicAdd(buckets_used, 1); 171 | chains[bucket] = next_buck; 172 | } else { 173 | next_buck = j; 174 | } 175 | uint64_t tmp = next_buck + (((uint64_t) (pcnt + cnt - bucket_size)) << 32); 176 | 177 | atomicExch(heads + j, tmp); 178 | } else { 179 | next_buck = bucket; 180 | } 181 | 182 | repeat = false; 183 | } 184 | } 185 | } 186 | 187 | router[1024 * 4 + j] = atomicAdd(router, cnt); 188 | router[1024 * 4 + parts + j] = 0;//cnt;//pcnt ; 189 | router[1024 * 4 + 2 * parts + j] = (bucket << log2_bucket_size) + pcnt; 190 | router[1024 * 4 + 3 * parts + j] = next_buck << log2_bucket_size ; 191 | } 192 | } 193 | 194 | __syncthreads(); 195 | 196 | 197 | uint32_t total_cnt = router[0]; 198 | 199 | __syncthreads(); 200 | 201 | /*calculate write positions for block-wise shuffle => atomicAdd on start of partition*/ 202 | #pragma unroll 203 | for (int k = 0 ; k < 4 ; ++k){ 204 | if (i + k < segment_limit) 205 | thread_keys[k] = atomicAdd(router + (1024 * 4 + thread_keys[k]), 1); 206 | } 207 | 208 | /*write the keys in shared memory*/ 209 | #pragma unroll 210 | for (int k = 0 ; k < 4 ; ++k) 211 | if (i + k < segment_limit) 212 | router[thread_keys[k]] = thread_vals.i[k]; 213 | 214 | __syncthreads(); 215 | 216 | int32_t thread_parts[4]; 
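/* Note on the heads[] bookkeeping used above (an illustrative walkthrough; the concrete numbers are assumed): each 64-bit heads[j] packs the number of elements already placed in partition j's open bucket in its high 32 bits and that bucket's id in its low 32 bits, with a bucket id of (1 << 18) acting as the "no bucket yet" sentinel set at initialization. atomicAdd reserves cnt slots for the calling block, and the following atomicMin clamps the packed word at ((uint64_t) (2*bucket_size)) << 32 so the counter cannot overflow while blocks that observed a full bucket (pcnt >= bucket_size) retry. The block whose reservation crosses the bucket boundary allocates the next bucket, links it via chains[], and publishes it with atomicExch. For example, with bucket_size = 4096: if heads[j] holds count 4000 and a block reserves cnt = 200, it sees pcnt = 4000 < 4096 and 4000 + 200 >= 4096, so it grabs a fresh bucket b, sets chains[old_bucket] = b, and stores ((uint64_t) 104 << 32) | b as the new head. */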
217 | 218 | /*read shuffled keys and write them to output partitions "somewhat" coalesced*/ 219 | #pragma unroll 220 | for (int k = 0 ; k < 4 ; ++k){ 221 | if (threadIdx.x + 1024 * k < total_cnt) { 222 | int32_t val = router[threadIdx.x + 1024 * k]; 223 | uint32_t partition = (hasht(val) >> first_bit) & parts_mask; 224 | 225 | uint32_t cnt = router[1024 * 4 + partition] - (threadIdx.x + 1024 * k); 226 | 227 | uint32_t bucket = router[1024 * 4 + 2 * parts + partition]; 228 | 229 | if (((bucket + cnt) ^ bucket) & ~bucket_size_mask){ 230 | uint32_t next_buck = router[1024 * 4 + 3 * parts + partition]; 231 | cnt = ((bucket + cnt) & bucket_size_mask); 232 | bucket = next_buck; 233 | } 234 | 235 | bucket += cnt; 236 | 237 | output_S[bucket] = val; 238 | 239 | thread_parts[k] = partition; 240 | } 241 | } 242 | 243 | __syncthreads(); 244 | 245 | /*read payloads of original data*/ 246 | thread_vals = *(reinterpret_cast(P + i)); 247 | 248 | /*shuffle payloads in shared memory, in the same offsets that we used for their corresponding keys*/ 249 | #pragma unroll 250 | for (int k = 0 ; k < 4 ; ++k) 251 | if (i + k < segment_limit) { 252 | router[thread_keys[k]] = thread_vals.i[k]; 253 | } 254 | 255 | __syncthreads(); 256 | 257 | /*write payloads to partition buckets in "somewhat coalesced manner"*/ 258 | #pragma unroll 259 | for (int k = 0 ; k < 4 ; ++k){ 260 | if (threadIdx.x + 1024 * k < total_cnt) { 261 | int32_t val = router[threadIdx.x + 1024 * k]; 262 | 263 | int32_t partition = thread_parts[k]; 264 | 265 | uint32_t cnt = router[1024 * 4 + partition] - (threadIdx.x + 1024 * k); 266 | 267 | uint32_t bucket = router[1024 * 4 + 2 * parts + partition]; 268 | 269 | if (((bucket + cnt) ^ bucket) & ~bucket_size_mask){ 270 | uint32_t next_buck = router[1024 * 4 + 3 * parts + partition]; 271 | cnt = ((bucket + cnt) & bucket_size_mask); 272 | bucket = next_buck; 273 | } 274 | bucket += cnt; 275 | 276 | output_P[bucket] = val; 277 | } 278 | } 279 | 280 | if (threadIdx.x == 0) router[0] = 0; 281 | } 282 | } 283 | } 284 | 285 | /* 286 | compute information for the second partitioning pass 287 | 288 | input: 289 | chains=points to the successor in the bucket list for each bucket (hint: we append new buckets to the end) 290 | out_cnts=count of elements per partition 291 | output: 292 | chains=packed value of element count in bucket and the partition the bucket belongs to 293 | */ 294 | __global__ void compute_bucket_info (uint32_t* chains, uint32_t* out_cnts, uint32_t log_parts) { 295 | uint32_t parts = 1 << log_parts; 296 | 297 | for (int p = threadIdx.x + blockIdx.x*blockDim.x; p < parts; p += gridDim.x*blockDim.x) { 298 | uint32_t cur = p; 299 | int32_t cnt = out_cnts[p]; 300 | 301 | while (cnt > 0) { 302 | uint32_t local_cnt = (cnt >= 4096)? 
4096 : cnt; 303 | uint32_t val = (p << 13) + local_cnt; 304 | 305 | uint32_t next = chains[cur]; 306 | chains[cur] = val; 307 | 308 | cur = next; 309 | cnt -= 4096; 310 | } 311 | } 312 | } 313 | 314 | /* 315 | S= keys of data to be re-partitioned 316 | P= payloads of data to be re-partitioned 317 | heads= keeps information on first bucket per partition and number of elements in it, packet in one 64-bit integer (only used here) 318 | chains= the successor of a bucket in the bucket list 319 | out_cnts= number of elements per partition 320 | buckets_used= how many buckets are reserved by the partitioning already 321 | offsets= describe the segments that occur due to partitioning 322 | note: multithreaded partitioning creates partitions that consist of contiguous segments 323 | => iterate over these segments to avoid handling empty slots 324 | 325 | output_S= bucketized partitions of data keys (results) 326 | output_P= bucketized partitions of data payloads (results) 327 | 328 | S_log_parts- log of number of partitions for previous pass 329 | log_parts- log of number of partitions for this pass 330 | first_bit= shift the keys before "hashing" 331 | bucket_num_ptr: number of input buckets 332 | 333 | preconditions: 334 | heads: current bucket (1 << 18) [special value for no bucket] and -1 elements (first write allocates bucket) 335 | out_cnts: 0 336 | buckets_used= number of partitions (first num_parts buckets are reserved) 337 | */ 338 | __global__ void partition_pass_two ( 339 | const int32_t * __restrict__ S, 340 | const int32_t * __restrict__ P, 341 | const uint32_t * __restrict__ bucket_info, 342 | uint32_t * __restrict__ buckets_used, 343 | uint64_t * heads, 344 | uint32_t * __restrict__ chains, 345 | uint32_t * __restrict__ out_cnts, 346 | int32_t * __restrict__ output_S, 347 | int32_t * __restrict__ output_P, 348 | uint32_t S_log_parts, 349 | uint32_t log_parts, 350 | uint32_t first_bit, 351 | uint32_t * bucket_num_ptr) { 352 | assert((((size_t) bucket_size) + ((size_t) blockDim.x) * gridDim.x) < (((size_t) 1) << 32)); 353 | const uint32_t S_parts = 1 << S_log_parts; 354 | const uint32_t parts = 1 << log_parts; 355 | const int32_t parts_mask = parts - 1; 356 | 357 | uint32_t buckets_num = *bucket_num_ptr; 358 | 359 | uint32_t * router = (uint32_t *) int_shared; //[1024*4 + parts]; 360 | 361 | for (size_t j = threadIdx.x ; j < parts ; j += blockDim.x ) 362 | router[1024*4 + parts + j] = 0; 363 | 364 | if (threadIdx.x == 0) 365 | router[0] = 0; 366 | 367 | __syncthreads(); 368 | 369 | 370 | /*each CUDA block processes a bucket at a time*/ 371 | for (size_t i = blockIdx.x; i < buckets_num; i += gridDim.x) { 372 | uint32_t info = bucket_info[i]; 373 | /*number of elements per bucket*/ 374 | uint32_t cnt = info & ((1 << 13) - 1); 375 | /*id of original partition*/ 376 | uint32_t pid = info >> 13; 377 | 378 | vec4 thread_vals = *(reinterpret_cast(S + bucket_size * i + 4*threadIdx.x)); 379 | 380 | uint32_t thread_keys[4]; 381 | 382 | /*compute local histogram for the bucket*/ 383 | #pragma unroll 384 | for (int k = 0 ; k < 4 ; ++k){ 385 | if (4*threadIdx.x + k < cnt){ 386 | uint32_t partition = (hasht(thread_vals.i[k]) >> first_bit) & parts_mask; 387 | 388 | atomicAdd(router + (1024 * 4 + parts + partition), 1); 389 | 390 | thread_keys[k] = partition; 391 | } else { 392 | thread_keys[k] = 0; 393 | } 394 | } 395 | 396 | __syncthreads(); 397 | 398 | for (size_t j = threadIdx.x; j < parts ; j += blockDim.x ) { 399 | uint32_t cnt = router[1024 * 4 + parts + j]; 400 | 401 | if (cnt > 0){ 402 | 
atomicAdd(out_cnts + (pid << log_parts) + j, cnt); 403 | 404 | uint32_t pcnt ; 405 | uint32_t bucket ; 406 | uint32_t next_buck; 407 | 408 | bool repeat = true; 409 | 410 | while (__any(repeat)){ 411 | if (repeat){ 412 | uint64_t old_heads = atomicAdd(heads + (pid << log_parts) + j, ((uint64_t) cnt) << 32); 413 | 414 | atomicMin(heads + (pid << log_parts) + j, ((uint64_t) (2*bucket_size)) << 32); 415 | 416 | pcnt = ((uint32_t) (old_heads >> 32)); 417 | bucket = (uint32_t) old_heads ; 418 | 419 | if (pcnt < bucket_size){ 420 | if (pcnt + cnt >= bucket_size){ 421 | if (bucket < (1 << 18)) { 422 | next_buck = atomicAdd(buckets_used, 1); 423 | chains[bucket] = next_buck; 424 | } else { 425 | next_buck = (pid << log_parts) + j; 426 | } 427 | 428 | uint64_t tmp = next_buck + (((uint64_t) (pcnt + cnt - bucket_size)) << 32); 429 | 430 | atomicExch(heads + (pid << log_parts) + j, tmp); 431 | } else { 432 | next_buck = bucket; 433 | } 434 | 435 | repeat = false; 436 | } 437 | } 438 | } 439 | 440 | router[1024 * 4 + j] = atomicAdd(router, cnt); 441 | router[1024 * 4 + parts + j] = 0; 442 | router[1024 * 4 + 2 * parts + j] = (bucket << log2_bucket_size) + pcnt; 443 | router[1024 * 4 + 3 * parts + j] = next_buck << log2_bucket_size ; 444 | } 445 | } 446 | 447 | __syncthreads(); 448 | 449 | 450 | uint32_t total_cnt = router[0]; 451 | 452 | __syncthreads(); 453 | 454 | /*calculate write positions for block-wise shuffle => atomicAdd on start of partition*/ 455 | #pragma unroll 456 | for (int k = 0 ; k < 4 ; ++k){ 457 | if (4*threadIdx.x + k < cnt) 458 | thread_keys[k] = atomicAdd(router + (1024 * 4 + thread_keys[k]), 1); 459 | } 460 | 461 | /*write the keys in shared memory*/ 462 | #pragma unroll 463 | for (int k = 0 ; k < 4 ; ++k) 464 | if (4*threadIdx.x + k < cnt) 465 | router[thread_keys[k]] = thread_vals.i[k]; 466 | 467 | __syncthreads(); 468 | 469 | int32_t thread_parts[4]; 470 | 471 | /*read shuffled keys and write them to output partitions "somewhat" coalesced*/ 472 | #pragma unroll 473 | for (int k = 0 ; k < 4 ; ++k){ 474 | if (threadIdx.x + 1024 * k < total_cnt) { 475 | int32_t val = router[threadIdx.x + 1024 * k]; 476 | uint32_t partition = (hasht(val) >> first_bit) & parts_mask; 477 | 478 | uint32_t cnt = router[1024 * 4 + partition] - (threadIdx.x + 1024 * k); 479 | 480 | uint32_t bucket = router[1024 * 4 + 2 * parts + partition]; 481 | 482 | if (((bucket + cnt) ^ bucket) & ~bucket_size_mask){ 483 | uint32_t next_buck = router[1024 * 4 + 3 * parts + partition]; 484 | cnt = ((bucket + cnt) & bucket_size_mask); 485 | bucket = next_buck; 486 | } 487 | 488 | bucket += cnt; 489 | 490 | output_S[bucket] = val; 491 | 492 | thread_parts[k] = partition; 493 | } 494 | } 495 | 496 | __syncthreads(); 497 | 498 | /*read payloads of original data*/ 499 | thread_vals = *(reinterpret_cast(P + i*bucket_size + 4*threadIdx.x)); 500 | 501 | /*shuffle payloads in shared memory, in the same offsets that we used for their corresponding keys*/ 502 | #pragma unroll 503 | for (int k = 0 ; k < 4 ; ++k) 504 | if (4*threadIdx.x + k < cnt) { 505 | router[thread_keys[k]] = thread_vals.i[k]; 506 | } 507 | 508 | __syncthreads(); 509 | 510 | /*write payloads to partition buckets in "somewhat coalesced manner"*/ 511 | #pragma unroll 512 | for (int k = 0 ; k < 4 ; ++k){ 513 | if (threadIdx.x + 1024 * k < total_cnt) { 514 | int32_t val = router[threadIdx.x + 1024 * k]; 515 | 516 | int32_t partition = thread_parts[k]; 517 | 518 | uint32_t cnt = router[1024 * 4 + partition] - (threadIdx.x + 1024 * k); 519 | 520 | uint32_t bucket = 
router[1024 * 4 + 2 * parts + partition]; 521 | 522 | if (((bucket + cnt) ^ bucket) & ~bucket_size_mask){ 523 | uint32_t next_buck = router[1024 * 4 + 3 * parts + partition]; 524 | cnt = ((bucket + cnt) & bucket_size_mask); 525 | bucket = next_buck; 526 | } 527 | bucket += cnt; 528 | 529 | output_P[bucket] = val; 530 | } 531 | } 532 | 533 | if (threadIdx.x == 0) router[0] = 0; 534 | } 535 | } 536 | 537 | #define LOCAL_BUCKETS_BITS 10 538 | #define LOCAL_BUCKETS ((1 << LOCAL_BUCKETS_BITS)) 539 | 540 | #define MAX_BIT 32 541 | 542 | __device__ int ctzd (int x) { 543 | if (x == 0) 544 | return 32; 545 | 546 | int n = 0; 547 | 548 | if ((n & 0x0000FFFF) == 0) { 549 | n += 16; 550 | x >>= 16; 551 | } 552 | 553 | if ((n & 0x000000FF) == 0) { 554 | n += 8; 555 | x >>= 8; 556 | } 557 | 558 | if ((n & 0x0000000F) == 0) { 559 | n += 4; 560 | x >>= 4; 561 | } 562 | 563 | if ((n & 0x00000003) == 0) { 564 | n += 2; 565 | x >>= 2; 566 | } 567 | 568 | if ((n & 0x00000001) == 0) { 569 | n += 1; 570 | x >>= 1; 571 | } 572 | 573 | return n; 574 | } 575 | 576 | 577 | __global__ void init_metadata_double ( 578 | uint64_t * __restrict__ heads1, 579 | uint32_t * __restrict__ buckets_used1, 580 | uint32_t * __restrict__ chains1, 581 | uint32_t * __restrict__ out_cnts1, 582 | uint32_t parts1, 583 | uint32_t buckets_num1, 584 | uint64_t * __restrict__ heads2, 585 | uint32_t * __restrict__ buckets_used2, 586 | uint32_t * __restrict__ chains2, 587 | uint32_t * __restrict__ out_cnts2, 588 | uint32_t parts2, 589 | uint32_t buckets_num2 590 | ) { 591 | int tid = threadIdx.x + blockIdx.x * blockDim.x; 592 | 593 | for (int i = tid; i < buckets_num1; i += blockDim.x*gridDim.x) 594 | chains1[i] = 0; 595 | 596 | for (int i = tid; i < parts1; i += blockDim.x*gridDim.x) 597 | out_cnts1[i] = 0; 598 | 599 | for (int i = tid; i < parts1; i += blockDim.x*gridDim.x) 600 | heads1[i] = (1 << 18) + (((uint64_t) bucket_size_mask) << 32); 601 | 602 | if (tid == 0) { 603 | *buckets_used1 = parts1; 604 | } 605 | 606 | for (int i = tid; i < buckets_num2; i += blockDim.x*gridDim.x) 607 | chains2[i] = 0; 608 | 609 | for (int i = tid; i < parts2; i += blockDim.x*gridDim.x) 610 | out_cnts2[i] = 0; 611 | 612 | for (int i = tid; i < parts2; i += blockDim.x*gridDim.x) 613 | heads2[i] = (1 << 18) + (((uint64_t) bucket_size_mask) << 32); 614 | 615 | if (tid == 0) { 616 | *buckets_used2 = parts2; 617 | } 618 | } 619 | 620 | /* 621 | Building phase for non-partitioned hash join with perfect hashing (so this property is reflected in the code, we don't follow chains), it is the best case for non-partitioned 622 | 623 | data=array of the keys 624 | payload=array of payloads 625 | n=number of tuples 626 | lookup=lookup table/hashtable that we build => we store the payload at position lookup[key] 627 | */ 628 | __global__ void build_perfect_array (int32_t* data, int32_t* payload, int n, int32_t* lookup) { 629 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x); i < n ; i += 4 * blockDim.x * gridDim.x){ 630 | vec4 thread_vals = *(reinterpret_cast(data + i)); 631 | vec4 thread_payloads = *(reinterpret_cast(payload + i)); 632 | 633 | #pragma unroll 634 | for (int k = 0; k < 4; ++k) { 635 | int32_t val = thread_vals.i[k]; 636 | int32_t payload = thread_payloads.i[k]; 637 | lookup[val] = payload + 1; 638 | } 639 | } 640 | } 641 | 642 | /*Probing phase for non-partitioned hash join with perfect hashing 643 | 644 | data=keys for probe side 645 | payload=payloads for probe side 646 | n=number of elements 647 | lookup=hashtable 648 | aggr=the memory 
location in which we aggregate with atomics at the end*/ 649 | __global__ void probe_perfect_array (int32_t* data, int32_t* payload, int n, int32_t* lookup, int* aggr) { 650 | int count = 0; 651 | 652 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x); i < n ; i += 4 * blockDim.x * gridDim.x){ 653 | vec4 thread_vals = *(reinterpret_cast(data + i)); 654 | vec4 thread_payloads = *(reinterpret_cast(payload + i)); 655 | 656 | #pragma unroll 657 | for (int k = 0; k < 4; ++k) { 658 | int val = thread_vals.i[k]; 659 | int payload = thread_payloads.i[k]; 660 | int res = lookup[val]; 661 | 662 | if (res) 663 | count += (payload * (res - 1)); 664 | } 665 | } 666 | 667 | atomicAdd(aggr, count); 668 | } 669 | 670 | 671 | /* 672 | Building phase for non-partitioned hash join with chaining 673 | 674 | data=array of the keys 675 | payload=array of payloads 676 | n=number of tuples 677 | log_parts=log size of hashtable/chains 678 | output=the chains [the rest of the array stays in place] 679 | head=the first element of each chain 680 | */ 681 | __global__ void build_ht_chains (int32_t* data, int n, uint32_t log_parts, int32_t* output, int* head) { 682 | int parts = 1 << log_parts; 683 | int parts_mask = parts-1; 684 | 685 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x); i < n ; i += 4 * blockDim.x * gridDim.x){ 686 | vec4 thread_vals = *(reinterpret_cast(data + i)); 687 | 688 | #pragma unroll 689 | for (int k = 0; k < 4; ++k) { 690 | int val = thread_vals.i[k]; 691 | int hval = val & parts_mask; 692 | 693 | int last = atomicExch(head + hval, i+k+1); 694 | //int64_t wr = (((int64_t) last) << 32) + val; 695 | output[i + k] = last; 696 | } 697 | } 698 | } 699 | 700 | /* 701 | Probing phase for non-partitioned hash join with chaining 702 | 703 | data=array of the keys 704 | payload=array of payloads 705 | n=number of tuples 706 | log_parts=log size of hashtable/chains 707 | ht=the chains that show the successor for each build element 708 | head=the first element of each chain 709 | ht_key=the keys of the hashtable as an array 710 | ht_pay=the payloads of the hashtable as an array 711 | aggr=the memory location in which we aggregate with atomics at the end 712 | */ 713 | __global__ void chains_probing (int32_t* data, int32_t* payload, int n, uint32_t log_parts, int32_t* ht, int32_t* ht_key, int32_t* ht_pay, int* head, int* aggr) { 714 | int parts = 1 << log_parts; 715 | int parts_mask = parts-1; 716 | int count = 0; 717 | 718 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x); i < n ; i += 4 * blockDim.x * gridDim.x){ 719 | vec4 thread_vals = *(reinterpret_cast(data + i)); 720 | vec4 thread_payloads = *(reinterpret_cast(payload + i)); 721 | 722 | #pragma unroll 723 | for (int k = 0; k < 4; ++k) { 724 | int val = thread_vals.i[k]; 725 | int payload = thread_payloads.i[k]; 726 | int hval = val & parts_mask; 727 | 728 | int next = head[hval]; 729 | 730 | while (next != 0) { 731 | int ht_val = ht_key[next-1]; 732 | 733 | if (ht_val == val) 734 | count += (payload * ht_pay[next-1]); 735 | 736 | next = ht[next-1]; 737 | } 738 | } 739 | } 740 | 741 | atomicAdd(aggr, count); 742 | } 743 | 744 | 745 | /*functions for linear probing 746 | 747 | FIXME: there is a bug so it is not operational yet [was not in paper so this is not urgent] 748 | */ 749 | 750 | __global__ void ht_hist (int* data, int n, int log_parts, int* hist) { 751 | int parts = 1 << log_parts; 752 | int parts_mask = parts-1; 753 | 754 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x); i < n ; i += 4 * blockDim.x 
* gridDim.x){ 755 | vec4 thread_vals = *(reinterpret_cast(data + i)); 756 | 757 | #pragma unroll 758 | for (int k = 0; k < 4; ++k) { 759 | int val = thread_vals.i[k]; 760 | int hval = val & parts_mask; 761 | 762 | int off = atomicAdd(hist + hval, 1); 763 | } 764 | } 765 | } 766 | 767 | __global__ void ht_offsets (int log_parts, int* hist, int* offset, int* aggr) { 768 | int parts = 1 << log_parts; 769 | int parts_mask = parts-1; 770 | 771 | for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < parts; i += blockDim.x * gridDim.x) { 772 | int cur = hist[i]; 773 | int off = atomicAdd(aggr, cur); 774 | hist[i] = off; 775 | offset[i] = off; 776 | } 777 | } 778 | 779 | __global__ void build_ht_linear (int* data, int* payload, size_t n, int log_parts, int* offset, int* ht, int* htp) { 780 | int parts = 1 << log_parts; 781 | int parts_mask = parts-1; 782 | 783 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x); i < n ; i += 4 * blockDim.x * gridDim.x){ 784 | vec4 thread_vals = *(reinterpret_cast(data + i)); 785 | vec4 thread_payloads = *(reinterpret_cast(payload + i)); 786 | 787 | #pragma unroll 788 | for (int k = 0; k < 4; ++k) { 789 | int val = thread_vals.i[k]; 790 | int hval = val & parts_mask; 791 | 792 | int off = atomicAdd(offset + hval, 1); 793 | 794 | ht[off] = val; 795 | htp[off] = thread_payloads.i[k]; 796 | 797 | } 798 | } 799 | } 800 | 801 | __global__ void linear_probing (int* data, int* payload, int* ht, int* htp, int* offset_s, int* offset_e, size_t n, int log_parts, int* aggr) { 802 | int parts = 1 << log_parts; 803 | int parts_mask = parts-1; 804 | int count = 0; 805 | 806 | for (size_t i = 4 *(threadIdx.x + blockIdx.x * blockDim.x); i < n ; i += 4 * blockDim.x * gridDim.x){ 807 | vec4 thread_vals = *(reinterpret_cast(data + i)); 808 | vec4 thread_payloads = *(reinterpret_cast(payload + i)); 809 | 810 | #pragma unroll 811 | for (int k = 0; k < 4; ++k) { 812 | int val = thread_vals.i[k]; 813 | 814 | for (int j = 0; j < 32; j++) { 815 | int probe = __shfl(val, j); 816 | int pay = __shfl(thread_payloads.i[k], j); 817 | int hval = probe & parts_mask; 818 | 819 | int start = offset_s[hval]; 820 | int end = offset_e[hval]; 821 | 822 | for (int p = start + threadIdx.x % 32; p < end; p += 32) { 823 | if (ht[p] == probe) { 824 | count += pay*htp[p]; 825 | } 826 | } 827 | } 828 | } 829 | } 830 | 831 | atomicAdd(aggr, count); 832 | } 833 | 834 | /*break "long" bucket chains to smaller chains 835 | this helps load balancing because we can allocate work at sub-chain granularity 836 | and effectively solve the skew problem 837 | 838 | bucket_info=we store the packed (partition, element count) value for each bucket 839 | chains=successor in partition's bucket list 840 | out_cnts=count of elements in this partition 841 | log_parts= log of number of partitions 842 | threshold=the maximum number of elements per subchain*/ 843 | __global__ void decompose_chains (uint32_t* bucket_info, uint32_t* chains, uint32_t* out_cnts, uint32_t log_parts, int threshold) { 844 | uint32_t parts = 1 << log_parts; 845 | 846 | for (int p = threadIdx.x + blockIdx.x*blockDim.x; p < parts; p += gridDim.x*blockDim.x) { 847 | uint32_t cur = p; 848 | int32_t cnt = out_cnts[p]; 849 | uint32_t first_cnt = (cnt >= threshold)? threshold : cnt; 850 | int32_t cutoff = 0; 851 | 852 | while (cnt > 0) { 853 | cutoff += bucket_size; 854 | cnt -= bucket_size; 855 | 856 | uint32_t next = chains[cur]; 857 | 858 | if (cutoff >= threshold && cnt > 0) { 859 | uint32_t local_cnt = (cnt >= threshold)? 
threshold : cnt; 860 | 861 | bucket_info[next] = (p << 15) + local_cnt; 862 | chains[cur] = 0; 863 | cutoff = 0; 864 | } else if (next != 0) { 865 | bucket_info[next] = 0; 866 | } 867 | 868 | 869 | cur = next; 870 | } 871 | 872 | bucket_info[p] = (p << 15) + first_cnt; 873 | } 874 | } 875 | 876 | /*kernel for performing the join between the partitioned relations 877 | 878 | R,Pr= bucketized keys and payloads for relation R (probe side) 879 | S,Ps= bucketized keys and payloads for relation S (build side) 880 | bucket_info=the info that tells us which partition each bucket belongs to, the number of elements (or whether it belongs to a chain) 881 | S_cnts, S_chain= for build-side we don't pack the info since we operate under the assumption that it is usually one bucket per partition (we don't load balance) 882 | buckets_num=number of buckets for R 883 | results=the memory address where we aggregate 884 | */ 885 | __global__ void join_partitioned_aggregate ( 886 | const int32_t* R, 887 | const int32_t* Pr, 888 | const uint32_t* R_chain, 889 | const uint32_t* bucket_info, 890 | const int32_t* S, 891 | const int32_t* Ps, 892 | const uint32_t* S_cnts, 893 | const uint32_t* S_chain, 894 | int32_t log_parts, 895 | uint32_t* buckets_num, 896 | int32_t* results) { 897 | 898 | /*in order to save space, we discard the partitioning bits, then we can try fitting keys in int16_t [HACK]*/ 899 | __shared__ int16_t elem[4096 + 512]; 900 | __shared__ int32_t payload[4096 + 512]; 901 | __shared__ int16_t next[4096 + 512]; 902 | __shared__ int32_t head[LOCAL_BUCKETS]; 903 | 904 | 905 | int tid = threadIdx.x; 906 | int block = blockIdx.x; 907 | int width = blockDim.x; 908 | int pwidth = gridDim.x; 909 | int parts = 1 << log_parts; 910 | 911 | int lid = tid % 32; 912 | int gnum = blockDim.x/32; 913 | 914 | int count = 0; 915 | 916 | int buckets_cnt = *buckets_num; 917 | 918 | for (uint32_t bucket_r = block; bucket_r < buckets_cnt; bucket_r += pwidth) { 919 | int info = bucket_info[bucket_r]; 920 | 921 | if (info != 0) { 922 | /*unpack information on the subchain*/ 923 | int p = info >> 15; 924 | int len_R = info & ((1 << 15) - 1); 925 | 926 | int len_S = S_cnts[p]; 927 | 928 | /*S partition doesn't fit in shared memory*/ 929 | if (len_S > 4096+512) { 930 | int bucket_r_loop = bucket_r; 931 | 932 | /*now we will build a bucket of R side in the shared memory at a time and then probe it with S-side 933 | sensible because 934 | 1) we have guarantees on size of R from the chain decomposition 935 | 2) this is a skewed scenario so size of S can be arbitrary*/ 936 | for (int offset_r = 0; offset_r < len_R; offset_r += bucket_size) { 937 | for (int i = tid; i < LOCAL_BUCKETS; i += blockDim.x) 938 | head[i] = -1; 939 | __syncthreads(); 940 | 941 | /*build a hashtable from an R bucket*/ 942 | for (int base_r = 0; base_r < bucket_size; base_r += 4*blockDim.x) { 943 | vec4 data_R = *(reinterpret_cast<const vec4 *>(R + bucket_size * bucket_r_loop + base_r + 4*threadIdx.x)); 944 | vec4 data_Pr = *(reinterpret_cast<const vec4 *>(Pr + bucket_size * bucket_r_loop + base_r + 4*threadIdx.x)); 945 | int l_cnt_R = len_R - offset_r - base_r - 4 * threadIdx.x; 946 | 947 | int cnt = 0; 948 | 949 | #pragma unroll 950 | for (int k = 0; k < 4; k++) { 951 | if (k < l_cnt_R) { 952 | int val = data_R.i[k]; 953 | elem[base_r + k*blockDim.x + tid] = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 954 | payload[base_r + k*blockDim.x + tid] = data_Pr.i[k]; 955 | int hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 956 | 957 | int32_t last = atomicExch(&head[hval], 
base_r + k*blockDim.x + tid); 958 | next[base_r + k*blockDim.x + tid] = last; 959 | } 960 | } 961 | } 962 | 963 | bucket_r_loop = R_chain[bucket_r_loop]; 964 | 965 | __syncthreads(); 966 | 967 | int bucket_s_loop = p; 968 | int base_s = 0; 969 | 970 | /*probe hashtable from an S bucket*/ 971 | for (int offset_s = 0; offset_s < len_S; offset_s += 4*blockDim.x) { 972 | vec4 data_S = *(reinterpret_cast(S + bucket_size * bucket_s_loop + base_s + 4*threadIdx.x)); 973 | vec4 data_Ps = *(reinterpret_cast(Ps + bucket_size * bucket_s_loop + base_s + 4*threadIdx.x)); 974 | int l_cnt_S = len_S - offset_s - 4 * threadIdx.x; 975 | 976 | #pragma unroll 977 | for (int k = 0; k < 4; k++) { 978 | int32_t val = data_S.i[k]; 979 | int32_t pval = data_Ps.i[k]; 980 | int16_t tval = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 981 | int32_t hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 982 | 983 | if (k < l_cnt_S) { 984 | int32_t pos = head[hval]; 985 | while (pos >= 0) { 986 | if (elem[pos] == tval) { 987 | count += pval*payload[pos]; 988 | } 989 | 990 | pos = next[pos]; 991 | } 992 | } 993 | } 994 | 995 | base_s += 4*blockDim.x; 996 | if (base_s >= bucket_size) { 997 | bucket_s_loop = S_chain[bucket_s_loop]; 998 | base_s = 0; 999 | } 1000 | } 1001 | 1002 | __syncthreads(); 1003 | } 1004 | } else { 1005 | for (int i = tid; i < LOCAL_BUCKETS; i += blockDim.x) 1006 | head[i] = -1; 1007 | 1008 | int rem_s = len_S % 4096; 1009 | rem_s = (rem_s + 4 - 1)/4; 1010 | 1011 | __syncthreads(); 1012 | 1013 | int off; 1014 | int it; 1015 | int base = 0; 1016 | 1017 | it = p; 1018 | off = 0; 1019 | 1020 | /*build hashtable for S-side*/ 1021 | for (off = 0; off < len_S;) { 1022 | vec4 data_S = *(reinterpret_cast(S + bucket_size * it + base + 4*threadIdx.x)); 1023 | vec4 data_Ps = *(reinterpret_cast(Ps + bucket_size * it + base +4*threadIdx.x)); 1024 | int l_cnt_S = len_S - off - 4 * threadIdx.x; 1025 | 1026 | #pragma unroll 1027 | for (int k = 0; k < 4; k++) { 1028 | if (k < l_cnt_S) { 1029 | int val = data_S.i[k]; 1030 | elem[off + tid] = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1031 | payload[off + tid] = data_Ps.i[k]; 1032 | int hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1033 | 1034 | int32_t last = atomicExch(&head[hval], off + tid); 1035 | next[off + tid] = last; 1036 | } 1037 | 1038 | off += (off < bucket_size)? 
blockDim.x : rem_s; 1039 | base += blockDim.x; 1040 | } 1041 | 1042 | if (base >= bucket_size) { 1043 | it = S_chain[it]; 1044 | base = 0; 1045 | } 1046 | 1047 | 1048 | } 1049 | 1050 | __syncthreads(); 1051 | 1052 | it = bucket_r; 1053 | off = 0; 1054 | 1055 | /*probe from R-side*/ 1056 | for (; 0 < len_R; off += 4*blockDim.x, len_R -= 4*blockDim.x) { 1057 | vec4 data_R = *(reinterpret_cast(R + bucket_size * it + off + 4*threadIdx.x)); 1058 | vec4 data_Pr = *(reinterpret_cast(Pr + bucket_size * it + off + 4*threadIdx.x)); 1059 | int l_cnt_R = len_R - 4 * threadIdx.x; 1060 | 1061 | #pragma unroll 1062 | for (int k = 0; k < 4; k++) { 1063 | int32_t val = data_R.i[k]; 1064 | int32_t pval = data_Pr.i[k]; 1065 | /*hack to fit more data in shared memory*/ 1066 | int16_t tval = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1067 | int32_t hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1068 | 1069 | if (k < l_cnt_R) { 1070 | int32_t pos = head[hval]; 1071 | while (pos >= 0) { 1072 | if (elem[pos] == tval) { 1073 | count += pval*payload[pos]; 1074 | } 1075 | 1076 | pos = next[pos]; 1077 | } 1078 | } 1079 | } 1080 | 1081 | if (off >= bucket_size) { 1082 | it = R_chain[it]; 1083 | off = 0; 1084 | } 1085 | } 1086 | 1087 | __syncthreads(); 1088 | } 1089 | } 1090 | } 1091 | 1092 | atomicAdd(results, count); 1093 | 1094 | __syncthreads(); 1095 | } 1096 | 1097 | /*maximum size of output, we always write at *write_offset MOD (FOLD+1)* 1098 | we use it in order to simulate the cases that output size explodes. we do the actual writes then overwrite them*/ 1099 | #define FOLD ((1 << 24) - 1) 1100 | /*the number of elements that can be stored in a warp-level buffer during the join materialization*/ 1101 | #define SHUFFLE_SIZE 16 1102 | 1103 | 1104 | /*practically the same as join_partitioned_aggregate 1105 | 1106 | i add extra comments for the materialization technique*/ 1107 | __global__ void join_partitioned_results ( 1108 | const int32_t* R, 1109 | const int32_t* Pr, 1110 | const uint32_t* R_chain, 1111 | const uint32_t* bucket_info, 1112 | const int32_t* S, 1113 | const int32_t* Ps, 1114 | const uint32_t* S_cnts, 1115 | const uint32_t* S_chain, 1116 | int32_t log_parts, 1117 | uint32_t* buckets_num, 1118 | int32_t* results, 1119 | int32_t* output) { 1120 | __shared__ int16_t elem[4096 + 512]; 1121 | __shared__ int32_t payload[4096 + 512]; 1122 | __shared__ int16_t next[4096 + 512]; 1123 | __shared__ int32_t head[LOCAL_BUCKETS]; 1124 | __shared__ int32_t shuffle[2*SHUFFLE_SIZE*32]; 1125 | 1126 | 1127 | int tid = threadIdx.x; 1128 | int block = blockIdx.x; 1129 | int width = blockDim.x; 1130 | int pwidth = gridDim.x; 1131 | int parts = 1 << log_parts; 1132 | 1133 | int lid = tid % 32; 1134 | int gid = tid / 32; 1135 | int gnum = blockDim.x/32; 1136 | 1137 | int count = 0; 1138 | 1139 | int ptr; 1140 | 1141 | int threadmask = (lid < 31)? 
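/* Illustrative note on the warp-level materialization buffer (a sketch of the mechanism, using the names defined here): threadmask for lane lid keeps only the ballot bits of lanes with a higher lane id, so after mask = __ballot(wr_intention) each matching lane claims the distinct slot wr_offset = shuffle_ptr + __popc(mask & threadmask), and shuffle_ptr then advances by __popc(mask), the number of matches the warp produced in this step. Once shuffle_ptr reaches SHUFFLE_SIZE, lane 0 reserves room in the global output with atomicAdd(results, 2*SHUFFLE_SIZE) (masked with FOLD so writes wrap, simulating a very large output as described above) and the whole warp flushes the 2*SHUFFLE_SIZE buffered values of the matched pairs. */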
~((1 << (lid+1)) - 1) : 0; 1142 | 1143 | int shuffle_ptr = 0; 1144 | 1145 | int32_t* warp_shuffle = shuffle + gid * 2 * SHUFFLE_SIZE; 1146 | 1147 | int buckets_cnt = *buckets_num; 1148 | 1149 | 1150 | for (uint32_t bucket_r = block; bucket_r < buckets_cnt; bucket_r += pwidth) { 1151 | int info = bucket_info[bucket_r]; 1152 | 1153 | if (info != 0) { 1154 | int p = info >> 15; 1155 | int len_R = info & ((1 << 15) - 1); 1156 | int len_S = S_cnts[p]; 1157 | 1158 | if (len_S > 4096+512) { 1159 | int bucket_r_loop = bucket_r; 1160 | 1161 | for (int offset_r = 0; offset_r < len_R; offset_r += bucket_size) { 1162 | for (int i = tid; i < LOCAL_BUCKETS; i += blockDim.x) 1163 | head[i] = -1; 1164 | __syncthreads(); 1165 | 1166 | for (int base_r = 0; base_r < bucket_size; base_r += 4*blockDim.x) { 1167 | vec4 data_R = *(reinterpret_cast(R + bucket_size * bucket_r_loop + base_r + 4*threadIdx.x)); 1168 | vec4 data_Pr = *(reinterpret_cast(Pr + bucket_size * bucket_r_loop + base_r + 4*threadIdx.x)); 1169 | int l_cnt_R = len_R - offset_r - base_r - 4 * threadIdx.x; 1170 | 1171 | int cnt = 0; 1172 | 1173 | #pragma unroll 1174 | for (int k = 0; k < 4; k++) { 1175 | if (k < l_cnt_R) { 1176 | int val = data_R.i[k]; 1177 | elem[base_r + k*blockDim.x + tid] = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1178 | payload[base_r + k*blockDim.x + tid] = data_Pr.i[k]; 1179 | int hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1180 | 1181 | int32_t last = atomicExch(&head[hval], base_r + k*blockDim.x + tid); 1182 | next[base_r + k*blockDim.x + tid] = last; 1183 | } 1184 | } 1185 | } 1186 | 1187 | bucket_r_loop = R_chain[bucket_r_loop]; 1188 | 1189 | __syncthreads(); 1190 | 1191 | int bucket_s_loop = p; 1192 | int base_s = 0; 1193 | 1194 | for (int offset_s = 0; offset_s < len_S; offset_s += 4*blockDim.x) { 1195 | vec4 data_S = *(reinterpret_cast(S + bucket_size * bucket_s_loop + base_s + 4*threadIdx.x)); 1196 | vec4 data_Ps = *(reinterpret_cast(Ps + bucket_size * bucket_s_loop + base_s + 4*threadIdx.x)); 1197 | int l_cnt_S = len_S - offset_s - 4 * threadIdx.x; 1198 | 1199 | #pragma unroll 1200 | for (int k = 0; k < 4; k++) { 1201 | int32_t val = data_S.i[k]; 1202 | int32_t pval = data_Ps.i[k]; 1203 | int16_t tval = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1204 | int32_t hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1205 | int32_t pay; 1206 | 1207 | int32_t pos = (k < l_cnt_S)? 
head[hval] : -1; 1208 | 1209 | /*check at warp level whether someone is still following chain => this way we can shuffle without risk*/ 1210 | int pred = (pos >= 0); 1211 | 1212 | while (__any(pred)) { 1213 | int wr_intention = 0; 1214 | 1215 | /*we have a match, fetch the data to be written*/ 1216 | if (pred) { 1217 | if (elem[pos] == tval) { 1218 | pay = payload[pos]; 1219 | wr_intention = 1; 1220 | count++; 1221 | } 1222 | 1223 | pos = next[pos]; 1224 | pred = (pos >= 0); 1225 | } 1226 | 1227 | /*find out who had a match in this execution step*/ 1228 | int mask = __ballot(wr_intention); 1229 | 1230 | /*our software managed buffer will overflow, flush it*/ 1231 | int wr_offset = shuffle_ptr + __popc(mask & threadmask); 1232 | shuffle_ptr = shuffle_ptr + __popc(mask); 1233 | 1234 | /*while it overflows, flush 1235 | we flush 16 keys and then the 16 corresponding payloads consecutively, of course other formats might be friendlier*/ 1236 | while (shuffle_ptr >= SHUFFLE_SIZE) { 1237 | if (wr_intention && (wr_offset < SHUFFLE_SIZE)) { 1238 | warp_shuffle[wr_offset] = pay; 1239 | warp_shuffle[wr_offset+SHUFFLE_SIZE] = pval; 1240 | wr_intention = 0; 1241 | } 1242 | 1243 | if (lid == 0) { 1244 | ptr = atomicAdd(results, 2*SHUFFLE_SIZE); 1245 | ptr = ptr & FOLD; 1246 | } 1247 | 1248 | ptr = __shfl(ptr, 0); 1249 | 1250 | output[ptr + lid] = warp_shuffle[lid]; 1251 | 1252 | wr_offset -= SHUFFLE_SIZE; 1253 | shuffle_ptr -= SHUFFLE_SIZE; 1254 | } 1255 | 1256 | /*now the fit, write them in buffer*/ 1257 | if (wr_intention && (wr_offset >= 0)) { 1258 | warp_shuffle[wr_offset] = pay; 1259 | warp_shuffle[wr_offset+SHUFFLE_SIZE] = pval; 1260 | wr_intention = 0; 1261 | } 1262 | } 1263 | } 1264 | 1265 | base_s += 4*blockDim.x; 1266 | if (base_s >= bucket_size) { 1267 | bucket_s_loop = S_chain[bucket_s_loop]; 1268 | base_s = 0; 1269 | } 1270 | } 1271 | 1272 | __syncthreads(); 1273 | } 1274 | } else { 1275 | for (int i = tid; i < LOCAL_BUCKETS; i += blockDim.x) 1276 | head[i] = -1; 1277 | 1278 | int rem_s = len_S % 4096; 1279 | rem_s = (rem_s + 4 - 1)/4; 1280 | 1281 | __syncthreads(); 1282 | 1283 | int off; 1284 | int it; 1285 | int base = 0; 1286 | 1287 | it = p; 1288 | off = 0; 1289 | 1290 | 1291 | for (off = 0; off < len_S;) { 1292 | vec4 data_S = *(reinterpret_cast(S + bucket_size * it + base + 4*threadIdx.x)); 1293 | vec4 data_Ps = *(reinterpret_cast(Ps + bucket_size * it + base +4*threadIdx.x)); 1294 | int l_cnt_S = len_S - off - 4 * threadIdx.x; 1295 | 1296 | #pragma unroll 1297 | for (int k = 0; k < 4; k++) { 1298 | if (k < l_cnt_S) { 1299 | int val = data_S.i[k]; 1300 | elem[off + tid] = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1301 | payload[off + tid] = data_Ps.i[k]; 1302 | int hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1303 | 1304 | int32_t last = atomicExch(&head[hval], off + tid); 1305 | next[off + tid] = last; 1306 | } 1307 | 1308 | off += (off < bucket_size)? 
blockDim.x : rem_s; 1309 | base += blockDim.x; 1310 | } 1311 | 1312 | if (base >= bucket_size) { 1313 | it = S_chain[it]; 1314 | base = 0; 1315 | } 1316 | } 1317 | 1318 | __syncthreads(); 1319 | 1320 | it = bucket_r; 1321 | off = 0; 1322 | 1323 | for (; 0 < len_R; off += 4*blockDim.x, len_R -= 4*blockDim.x) { 1324 | int l_cnt_R = len_R - 4 * threadIdx.x; 1325 | vec4 data_R; 1326 | vec4 data_Pr; 1327 | 1328 | data_R = *(reinterpret_cast(R + bucket_size * it + off + 4*threadIdx.x)); 1329 | data_Pr = *(reinterpret_cast(Pr + bucket_size * it + off + 4*threadIdx.x)); 1330 | 1331 | #pragma unroll 1332 | for (int k = 0; k < 4; k++) { 1333 | int32_t val = data_R.i[k]; 1334 | int32_t pval = data_Pr.i[k]; 1335 | int16_t tval = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1336 | int32_t hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1337 | int32_t pay; 1338 | 1339 | int32_t pos = (k < l_cnt_R)? head[hval] : -1; 1340 | 1341 | /*same as previous code block*/ 1342 | int pred = (pos >= 0); 1343 | 1344 | while (__any(pred)) { 1345 | int wr_intention = 0; 1346 | 1347 | if (pred) { 1348 | if (elem[pos] == tval) { 1349 | pay = payload[pos]; 1350 | wr_intention = 1; 1351 | count++; 1352 | } 1353 | 1354 | pos = next[pos]; 1355 | pred = (pos >= 0); 1356 | } 1357 | 1358 | int mask = __ballot(wr_intention); 1359 | 1360 | int wr_offset = shuffle_ptr + __popc(mask & threadmask); 1361 | shuffle_ptr = shuffle_ptr + __popc(mask); 1362 | 1363 | while (shuffle_ptr >= SHUFFLE_SIZE) { 1364 | if (wr_intention && (wr_offset < SHUFFLE_SIZE)) { 1365 | warp_shuffle[wr_offset] = pval; 1366 | warp_shuffle[wr_offset+SHUFFLE_SIZE] = pay; 1367 | wr_intention = 0; 1368 | } 1369 | 1370 | if (lid == 0) { 1371 | ptr = atomicAdd(results, 2*SHUFFLE_SIZE); 1372 | 1373 | ptr = ptr & FOLD; 1374 | } 1375 | 1376 | ptr = __shfl(ptr, 0); 1377 | 1378 | output[ptr + lid] = warp_shuffle[lid]; 1379 | 1380 | wr_offset -= SHUFFLE_SIZE; 1381 | shuffle_ptr -= SHUFFLE_SIZE; 1382 | } 1383 | 1384 | if (wr_intention && (wr_offset >= 0)) { 1385 | warp_shuffle[wr_offset] = pval; 1386 | warp_shuffle[wr_offset+SHUFFLE_SIZE] = pay; 1387 | wr_intention = 0; 1388 | } 1389 | } 1390 | } 1391 | 1392 | if (off >= bucket_size) { 1393 | it = R_chain[it]; 1394 | off = 0; 1395 | } 1396 | } 1397 | 1398 | __syncthreads(); 1399 | } 1400 | } 1401 | } 1402 | 1403 | if (lid == 0) { 1404 | ptr = atomicAdd(results, 2*shuffle_ptr); 1405 | ptr = ptr & FOLD; 1406 | } 1407 | 1408 | ptr = __shfl(ptr, 0); 1409 | 1410 | if (lid < shuffle_ptr) { 1411 | output[ptr + lid] = warp_shuffle[lid]; 1412 | output[ptr + lid + shuffle_ptr] = warp_shuffle[lid + SHUFFLE_SIZE]; 1413 | } 1414 | 1415 | __syncthreads(); 1416 | } 1417 | 1418 | /*again the same but payload is the virtual tuple id and we late materialize from Dx arrays which store the actual columns that we need 1419 | also here we have no overflows because if we did, we wouldn't fit the data/extra columns :) */ 1420 | __global__ void join_partitioned_varpayload ( 1421 | const int32_t* R, 1422 | const int32_t* Pr, 1423 | const int32_t* Dr, 1424 | const uint32_t* R_chain, 1425 | const uint32_t* bucket_info, 1426 | const int32_t* S, 1427 | const int32_t* Ps, 1428 | const int32_t* Ds, 1429 | const uint32_t* S_cnts, 1430 | const uint32_t* S_chain, 1431 | int32_t log_parts, 1432 | int32_t col_num1, 1433 | int32_t col_num2, 1434 | int32_t rel_size, 1435 | uint32_t* buckets_num, 1436 | int32_t* results) { 1437 | __shared__ int16_t elem[4096 + 512]; 1438 | __shared__ int32_t payload[4096 + 512]; 1439 | __shared__ int16_t next[4096 + 
512]; 1440 | __shared__ int32_t head[LOCAL_BUCKETS]; 1441 | 1442 | 1443 | int tid = threadIdx.x; 1444 | int block = blockIdx.x; 1445 | int width = blockDim.x; 1446 | int pwidth = gridDim.x; 1447 | int parts = 1 << log_parts; 1448 | 1449 | int lid = tid % 32; 1450 | int gnum = blockDim.x/32; 1451 | 1452 | int count = 0; 1453 | 1454 | int buckets_cnt = *buckets_num; 1455 | 1456 | for (uint32_t bucket_r = block; bucket_r < buckets_cnt; bucket_r += pwidth) { 1457 | int info = bucket_info[bucket_r]; 1458 | 1459 | if (info != 0) { 1460 | int p = info >> 15; 1461 | int len_R = info & ((1 << 15) - 1); 1462 | 1463 | int len_S = S_cnts[p]; 1464 | 1465 | for (int i = tid; i < LOCAL_BUCKETS; i += blockDim.x) 1466 | head[i] = -1; 1467 | 1468 | int rem_s = len_S % 4096; 1469 | rem_s = (rem_s + 4 - 1)/4; 1470 | 1471 | __syncthreads(); 1472 | 1473 | int off; 1474 | int it; 1475 | int base = 0; 1476 | 1477 | it = p; 1478 | off = 0; 1479 | 1480 | for (off = 0; off < len_S;) { 1481 | vec4 data_S = *(reinterpret_cast(S + bucket_size * it + base + 4*threadIdx.x)); 1482 | vec4 data_Ps = *(reinterpret_cast(Ps + bucket_size * it + base +4*threadIdx.x)); 1483 | int l_cnt_S = len_S - off - 4 * threadIdx.x; 1484 | 1485 | #pragma unroll 1486 | for (int k = 0; k < 4; k++) { 1487 | if (k < l_cnt_S) { 1488 | int val = data_S.i[k]; 1489 | elem[off + tid] = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1490 | payload[off + tid] = data_Ps.i[k]; 1491 | int hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1492 | 1493 | int32_t last = atomicExch(&head[hval], off + tid); 1494 | next[off + tid] = last; 1495 | } 1496 | 1497 | off += (off < bucket_size)? blockDim.x : rem_s; 1498 | } 1499 | 1500 | if (base >= bucket_size) { 1501 | it = S_chain[it]; 1502 | base = 0; 1503 | } 1504 | 1505 | 1506 | } 1507 | 1508 | __syncthreads(); 1509 | 1510 | it = bucket_r; 1511 | off = 0; 1512 | 1513 | for (; 0 < len_R; off += 4*blockDim.x, len_R -= 4*blockDim.x) { 1514 | vec4 data_R = *(reinterpret_cast(R + bucket_size * it + off + 4*threadIdx.x)); 1515 | vec4 data_Pr = *(reinterpret_cast(Pr + bucket_size * it + off + 4*threadIdx.x)); 1516 | int l_cnt_R = len_R - 4 * threadIdx.x; 1517 | 1518 | #pragma unroll 1519 | for (int k = 0; k < 4; k++) { 1520 | int32_t val = data_R.i[k]; 1521 | int32_t pval = data_Pr.i[k]; 1522 | int16_t tval = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 1523 | int32_t hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 1524 | 1525 | if (k < l_cnt_R) { 1526 | int32_t pos = head[hval]; 1527 | while (pos >= 0) { 1528 | if (elem[pos] == tval) { 1529 | int32_t bval = payload[pos]; 1530 | 1531 | for (int z = 0; z < col_num1; z++) 1532 | count += Dr[pval + z*rel_size]; 1533 | 1534 | for (int z = 0; z < col_num2; z++) 1535 | count += Ds[bval + z*rel_size]; 1536 | } 1537 | 1538 | pos = next[pos]; 1539 | } 1540 | } 1541 | } 1542 | 1543 | if (off >= bucket_size) { 1544 | it = R_chain[it]; 1545 | off = 0; 1546 | } 1547 | } 1548 | 1549 | __syncthreads(); 1550 | 1551 | } 1552 | } 1553 | 1554 | atomicAdd(results, count); 1555 | 1556 | __syncthreads(); 1557 | } 1558 | 1559 | /*late materialization and perfect hashing*/ 1560 | __global__ void probe_perfect_array_varpay (int32_t* data, int32_t* Dr, int n, int32_t* lookup, int32_t* Ds, int col_num1, int col_num2, int rel_size, int* aggr) { 1561 | int count = 0; 1562 | 1563 | for (size_t i = threadIdx.x + blockIdx.x * blockDim.x; i < n ; i += blockDim.x * gridDim.x) { 1564 | int val = data[i]; 1565 | int payload = i; 1566 | int res = lookup[val]; 1567 | 1568 | if (res > 0) { 1569 
| res--; 1570 | 1571 | for (int z = 0; z < col_num1; z++) 1572 | count += Dr[payload + z*rel_size]; 1573 | for (int z = 0; z < col_num2; z++) 1574 | count += Ds[res + z*rel_size]; 1575 | } 1576 | } 1577 | 1578 | atomicAdd(aggr, count); 1579 | } 1580 | 1581 | /*partition and compute metadata for relation with key+payload*/ 1582 | void prepare_Relation_payload (int* R, int* R_temp, int* P, int* P_temp, size_t RelsNum, uint32_t buckets_num, uint64_t* heads[2], uint32_t* cnts[2], uint32_t* chains[2], uint32_t* buckets_used[2], uint32_t log_parts1, uint32_t log_parts2, uint32_t first_bit, cudaStream_t streams, size_t* offsets_GPU, uint32_t num_threads) { 1583 | init_metadata_double<<<64, 1024, 0, streams>>> ( 1584 | heads[0], buckets_used[0], chains[0], cnts[0], 1 << log_parts1, buckets_num, 1585 | heads[1], buckets_used[1], chains[1], cnts[1], 1 << (log_parts1 + log_parts2), buckets_num 1586 | ); 1587 | 1588 | partition_pass_one <<<64, 1024, (1024*4 + 4*(1 << log_parts1)) * sizeof(int32_t) + (4*num_threads+2)*sizeof(size_t), streams>>>( 1589 | R, P, 1590 | offsets_GPU, 1591 | heads[0], 1592 | buckets_used[0], 1593 | chains[0], 1594 | cnts[0], 1595 | R_temp, P_temp, 1596 | RelsNum, 1597 | log_parts1, 1598 | first_bit + log_parts2, 1599 | num_threads 1600 | ); 1601 | 1602 | 1603 | compute_bucket_info <<<64, 1024, 0, streams>>> (chains[0], cnts[0], log_parts1); 1604 | 1605 | partition_pass_two <<<64, 1024, (1024*4 + 4*(1 << log_parts2)) * sizeof(int32_t) + ((2 * (1 << log_parts2) + 1)* sizeof(int32_t)), streams>>>( 1606 | R_temp, P_temp, 1607 | chains[0], 1608 | buckets_used[1], heads[1], chains[1], cnts[1], 1609 | R, P, 1610 | log_parts1, log_parts2, first_bit, 1611 | buckets_used[0]); 1612 | 1613 | } 1614 | 1615 | /*partition and compute metadata for relation with key+payload. 
We use different buffers at the end (it makes sense for UVA based techniques)*/ 1616 | void prepare_Relation_payload_triple (int* R, int* R_temp, int* R_final, int* P, int* P_temp, int* P_final, size_t RelsNum, uint32_t buckets_num, uint64_t* heads[2], uint32_t* cnts[2], uint32_t* chains[2], uint32_t* buckets_used[2], uint32_t log_parts1, uint32_t log_parts2, uint32_t first_bit, cudaStream_t streams, size_t* offsets_GPU, uint32_t num_threads) { 1617 | init_metadata_double<<<64, 1024, 0, streams>>> ( 1618 | heads[0], buckets_used[0], chains[0], cnts[0], 1 << log_parts1, buckets_num, 1619 | heads[1], buckets_used[1], chains[1], cnts[1], 1 << (log_parts1 + log_parts2), buckets_num 1620 | ); 1621 | 1622 | partition_pass_one <<<64, 1024, (1024*4 + 4*(1 << log_parts1)) * sizeof(int32_t) + (4*num_threads+2)*sizeof(size_t), streams>>>( 1623 | R, P, 1624 | offsets_GPU, 1625 | heads[0], 1626 | buckets_used[0], 1627 | chains[0], 1628 | cnts[0], 1629 | R_temp, P_temp, 1630 | RelsNum, 1631 | log_parts1, 1632 | first_bit + log_parts2, 1633 | num_threads 1634 | ); 1635 | 1636 | CHK_ERROR(cudaDeviceSynchronize()); 1637 | 1638 | 1639 | compute_bucket_info <<<64, 1024, 0, streams>>> (chains[0], cnts[0], log_parts1); 1640 | 1641 | partition_pass_two <<<64, 1024, (1024*4 + 4*(1 << log_parts2)) * sizeof(int32_t) + ((2 * (1 << log_parts2) + 1)* sizeof(int32_t)), streams>>>( 1642 | R_temp, P_temp, 1643 | chains[0], 1644 | buckets_used[1], heads[1], chains[1], cnts[1], 1645 | R_final, P_final, 1646 | log_parts1, log_parts2, first_bit, 1647 | buckets_used[0]); 1648 | 1649 | 1650 | 1651 | } 1652 | 1653 | template 1654 | struct chain_iterator_ref_generic{ 1655 | Tv x ; 1656 | int cnt; 1657 | }; 1658 | 1659 | template 1660 | class chain_iterator_generic{ 1661 | private: 1662 | const T * __restrict__ S_parts ; 1663 | const uint32_t * __restrict__ S_chains ; 1664 | const uint32_t cnt ; 1665 | 1666 | const T * __restrict__ ptr ; 1667 | 1668 | uint32_t current_bucket ; 1669 | uint32_t next_bucket ; 1670 | uint32_t i ; 1671 | public: 1672 | __device__ __forceinline__ chain_iterator_generic( 1673 | const T * __restrict__ S_parts , 1674 | const uint32_t * __restrict__ S_cnts , 1675 | const uint32_t * __restrict__ S_chains , 1676 | uint32_t current_partition): 1677 | S_parts(S_parts + (16/sizeof(T)) * threadIdx.x), S_chains(S_chains), 1678 | cnt((S_cnts[current_partition]/((16/sizeof(T)) * blockDim.x))*(16/sizeof(T)) + max(((int32_t) (S_cnts[current_partition] % ((16/sizeof(T)) * blockDim.x))) - ((int32_t) ((16/sizeof(T)) * threadIdx.x)), 0)), 1679 | ptr(S_parts + ((size_t) current_partition << log2_bucket_size) + (16/sizeof(T)) * threadIdx.x), 1680 | current_bucket(current_partition), 1681 | next_bucket(S_chains[current_partition]), 1682 | i(0){} 1683 | 1684 | __device__ __forceinline__ chain_iterator_generic( 1685 | const uint32_t * __restrict__ S_cnts, 1686 | uint32_t current_partition): 1687 | cnt(0), 1688 | i(((S_cnts[current_partition] + (16/sizeof(T)) * blockDim.x - 1)/((16/sizeof(T)) * blockDim.x))*(16/sizeof(T))){} 1689 | 1690 | __device__ __forceinline__ chain_iterator_generic& operator++(){ 1691 | i += (16/sizeof(T));// * blockDim.x; 1692 | ptr += (16/sizeof(T)) * blockDim.x; 1693 | 1694 | if ((i * blockDim.x) & bucket_size_mask) return *this; 1695 | 1696 | current_bucket = next_bucket;//int_shared[0]; 1697 | 1698 | ptr = S_parts + (current_bucket << log2_bucket_size); 1699 | 1700 | next_bucket = S_chains[next_bucket]; 1701 | 1702 | return *this; 1703 | } 1704 | 1705 | __device__ __forceinline__ 
chain_iterator_ref_generic operator*() const { 1706 | chain_iterator_ref_generic tmp; 1707 | tmp.x = *reinterpret_cast(ptr); 1708 | tmp.cnt = cnt - i; 1709 | return tmp; 1710 | } 1711 | 1712 | __device__ __forceinline__ bool operator!=(const chain_iterator_generic& o){ 1713 | return i != o.i; 1714 | } 1715 | }; 1716 | 1717 | template 1718 | class chain_generic{ 1719 | private: 1720 | const T * __restrict__ S_parts ; 1721 | const uint32_t * __restrict__ S_cnts ; 1722 | const uint32_t * __restrict__ S_chains ; 1723 | const uint32_t partition; 1724 | public: 1725 | __device__ __host__ __forceinline__ chain_generic( 1726 | const T * __restrict__ S_parts , 1727 | const uint32_t * __restrict__ S_cnts , 1728 | const uint32_t * __restrict__ S_chains , 1729 | uint32_t partition): 1730 | S_parts(S_parts), S_cnts(S_cnts), S_chains(S_chains), partition(partition){} 1731 | 1732 | __device__ __forceinline__ chain_iterator_generic begin() const { 1733 | return chain_iterator_generic(S_parts, S_cnts, S_chains, partition); 1734 | } 1735 | 1736 | __device__ __forceinline__ chain_iterator_generic end() const { 1737 | return chain_iterator_generic(S_cnts, partition); 1738 | } 1739 | }; 1740 | 1741 | template 1742 | class chains_generic { 1743 | private: 1744 | const T * __restrict__ S_parts ; 1745 | const uint32_t * __restrict__ S_cnts ; 1746 | const uint32_t * __restrict__ S_chains ; 1747 | public: 1748 | __device__ __host__ __forceinline__ chains_generic( 1749 | const T * __restrict__ S_parts , 1750 | const uint32_t * __restrict__ S_cnts , 1751 | const uint32_t * __restrict__ S_chains ): 1752 | S_parts(S_parts), S_cnts(S_cnts), S_chains(S_chains){} 1753 | 1754 | __device__ __host__ __forceinline__ chain_generic get_chain(uint32_t partition) const{ 1755 | return chain_generic(S_parts, S_cnts, S_chains, partition); 1756 | } 1757 | 1758 | __device__ __forceinline__ uint32_t get_chain_size(uint32_t partition) const{ 1759 | return S_cnts[partition]; 1760 | } 1761 | }; 1762 | 1763 | struct chain_iterator_ref{ 1764 | vec4 x ; 1765 | int cnt; 1766 | }; 1767 | 1768 | struct chain_iterator_i_ref{ 1769 | int32_t x; 1770 | bool v; 1771 | }; 1772 | 1773 | class chain_iterator{ 1774 | private: 1775 | const int32_t * __restrict__ S_parts ; 1776 | const uint32_t * __restrict__ S_chains ; 1777 | const uint32_t cnt ; 1778 | 1779 | const int32_t * __restrict__ ptr ; 1780 | 1781 | uint32_t current_bucket ; 1782 | uint32_t next_bucket ; 1783 | uint32_t i ; 1784 | public: 1785 | // __device__ __forceinline__ chain_iterator( 1786 | // const int32_t * __restrict__ S_parts , 1787 | // const uint32_t * __restrict__ S_cnts , 1788 | // const uint32_t * __restrict__ S_chains ): 1789 | // S_parts(S_parts), S_chains(S_chains), cnt(S_cnts[blockIdx.x]), current_bucket(blockIdx.x), i(0){} 1790 | 1791 | __device__ __forceinline__ chain_iterator( 1792 | const int32_t * __restrict__ S_parts , 1793 | const uint32_t * __restrict__ S_cnts , 1794 | const uint32_t * __restrict__ S_chains , 1795 | uint32_t current_partition): 1796 | S_parts(S_parts + 4 * threadIdx.x), S_chains(S_chains), cnt((S_cnts[current_partition]/(4 * blockDim.x))*4 + max(((int32_t) (S_cnts[current_partition] % (4 * blockDim.x))) - ((int32_t) (4 * threadIdx.x)), 0)), ptr(S_parts + ((size_t) current_partition << log2_bucket_size) + 4 * threadIdx.x), current_bucket(current_partition), next_bucket(S_chains[current_partition]), i(0){} 1797 | 1798 | // __device__ __forceinline__ chain_iterator( 1799 | // const uint32_t * __restrict__ S_cnts): 1800 | // cnt(0), i(((S_cnts[blockIdx.x] 
+ 4 * blockDim.x - 1)/(4 * blockDim.x)) * 4 * blockDim.x){} 1801 | 1802 | __device__ __forceinline__ chain_iterator( 1803 | const uint32_t * __restrict__ S_cnts, 1804 | uint32_t current_partition): 1805 | cnt(0), i(((S_cnts[current_partition] + 4 * blockDim.x - 1)/(4 * blockDim.x))*4){} 1806 | 1807 | __device__ __forceinline__ chain_iterator& operator++(){ 1808 | i += 4;// * blockDim.x; 1809 | ptr += 4 * blockDim.x; 1810 | 1811 | if ((i * blockDim.x) & bucket_size_mask) return *this; 1812 | 1813 | current_bucket = next_bucket;//int_shared[0]; 1814 | 1815 | ptr = S_parts + (current_bucket << log2_bucket_size); 1816 | 1817 | next_bucket = S_chains[next_bucket]; 1818 | 1819 | return *this; 1820 | } 1821 | 1822 | __device__ __forceinline__ chain_iterator_ref operator*() const { 1823 | chain_iterator_ref tmp; 1824 | tmp.x = *reinterpret_cast(ptr); 1825 | tmp.cnt = cnt - i; 1826 | return tmp; 1827 | } 1828 | 1829 | __device__ __forceinline__ bool operator!=(const chain_iterator& o){ 1830 | return i != o.i; 1831 | } 1832 | }; 1833 | 1834 | class chain_iterator_i{ 1835 | private: 1836 | const int32_t * __restrict__ S_parts ; 1837 | const uint32_t * __restrict__ S_chains ; 1838 | const uint32_t cnt ; 1839 | 1840 | const int32_t * __restrict__ ptr ; 1841 | 1842 | uint32_t current_bucket ; 1843 | uint32_t next_bucket ; 1844 | uint32_t i ; 1845 | public: 1846 | // __device__ __forceinline__ chain_iterator_i( 1847 | // const int32_t * __restrict__ S_parts , 1848 | // const uint32_t * __restrict__ S_cnts , 1849 | // const uint32_t * __restrict__ S_chains ): 1850 | // S_parts(S_parts), S_chains(S_chains), cnt(S_cnts[blockIdx.x]), current_bucket(blockIdx.x), i(0){} 1851 | 1852 | __device__ __forceinline__ chain_iterator_i( 1853 | const int32_t * __restrict__ S_parts , 1854 | const uint32_t * __restrict__ S_cnts , 1855 | const uint32_t * __restrict__ S_chains , 1856 | uint32_t current_partition): 1857 | S_parts(S_parts + threadIdx.x), S_chains(S_chains), cnt((S_cnts[current_partition]/blockDim.x) + max(((int32_t) (S_cnts[current_partition] % (blockDim.x))) - ((int32_t) (threadIdx.x)), 0)), ptr(S_parts + ((size_t) current_partition << log2_bucket_size) + threadIdx.x), current_bucket(current_partition), next_bucket(S_chains[current_partition]), i(0){} 1858 | 1859 | // __device__ __forceinline__ chain_iterator_i( 1860 | // const uint32_t * __restrict__ S_cnts): 1861 | // cnt(0), i(((S_cnts[blockIdx.x] + 4 * blockDim.x - 1)/(4 * blockDim.x)) * 4 * blockDim.x){} 1862 | 1863 | __device__ __forceinline__ chain_iterator_i( 1864 | const uint32_t * __restrict__ S_cnts, 1865 | uint32_t current_partition): 1866 | cnt(0), i(((S_cnts[current_partition] + blockDim.x - 1)/(blockDim.x))){} 1867 | 1868 | __device__ __forceinline__ chain_iterator_i& operator++(){ 1869 | ++i;// * blockDim.x; 1870 | ptr += blockDim.x; 1871 | 1872 | if ((i * blockDim.x) & bucket_size_mask) return *this; 1873 | 1874 | current_bucket = next_bucket;//int_shared[0]; 1875 | 1876 | ptr = S_parts + (current_bucket << log2_bucket_size); 1877 | 1878 | next_bucket = S_chains[next_bucket]; 1879 | 1880 | return *this; 1881 | } 1882 | 1883 | __device__ __forceinline__ chain_iterator_i_ref operator*() const { 1884 | chain_iterator_i_ref tmp; 1885 | tmp.x = *ptr; 1886 | tmp.v = i < cnt; 1887 | return tmp; 1888 | } 1889 | 1890 | __device__ __forceinline__ bool operator!=(const chain_iterator_i& o){ 1891 | return i != o.i; 1892 | } 1893 | }; 1894 | 1895 | class chain_i{ 1896 | private: 1897 | const int32_t * __restrict__ S_parts ; 1898 | const uint32_t * 
__restrict__ S_cnts ; 1899 | const uint32_t * __restrict__ S_chains ; 1900 | const uint32_t partition; 1901 | public: 1902 | __device__ __host__ __forceinline__ chain_i( 1903 | const int32_t * __restrict__ S_parts , 1904 | const uint32_t * __restrict__ S_cnts , 1905 | const uint32_t * __restrict__ S_chains , 1906 | uint32_t partition): 1907 | S_parts(S_parts), S_cnts(S_cnts), S_chains(S_chains), partition(partition){} 1908 | 1909 | __device__ __forceinline__ chain_iterator_i begin() const { 1910 | return chain_iterator_i(S_parts, S_cnts, S_chains, partition); 1911 | } 1912 | 1913 | __device__ __forceinline__ chain_iterator_i end() const { 1914 | return chain_iterator_i(S_cnts, partition); 1915 | } 1916 | }; 1917 | 1918 | class chain{ 1919 | private: 1920 | const int32_t * __restrict__ S_parts ; 1921 | const uint32_t * __restrict__ S_cnts ; 1922 | const uint32_t * __restrict__ S_chains ; 1923 | const uint32_t partition; 1924 | public: 1925 | __device__ __host__ __forceinline__ chain( 1926 | const int32_t * __restrict__ S_parts , 1927 | const uint32_t * __restrict__ S_cnts , 1928 | const uint32_t * __restrict__ S_chains , 1929 | uint32_t partition): 1930 | S_parts(S_parts), S_cnts(S_cnts), S_chains(S_chains), partition(partition){} 1931 | 1932 | __device__ __forceinline__ chain_iterator begin() const { 1933 | return chain_iterator(S_parts, S_cnts, S_chains, partition); 1934 | } 1935 | 1936 | __device__ __forceinline__ chain_iterator end() const { 1937 | return chain_iterator(S_cnts, partition); 1938 | } 1939 | }; 1940 | 1941 | 1942 | class chains{ 1943 | private: 1944 | const int32_t * __restrict__ S_parts ; 1945 | const uint32_t * __restrict__ S_cnts ; 1946 | const uint32_t * __restrict__ S_chains ; 1947 | public: 1948 | __device__ __host__ __forceinline__ chains( 1949 | const int32_t * __restrict__ S_parts , 1950 | const uint32_t * __restrict__ S_cnts , 1951 | const uint32_t * __restrict__ S_chains ): 1952 | S_parts(S_parts), S_cnts(S_cnts), S_chains(S_chains){} 1953 | 1954 | __device__ __host__ __forceinline__ chain get_chain(uint32_t partition) const{ 1955 | return chain(S_parts, S_cnts, S_chains, partition); 1956 | } 1957 | 1958 | __device__ __host__ __forceinline__ chain_i get_chain_i(uint32_t partition) const{ 1959 | return chain_i(S_parts, S_cnts, S_chains, partition); 1960 | } 1961 | 1962 | __device__ __forceinline__ uint32_t get_chain_size(uint32_t partition) const{ 1963 | return S_cnts[partition]; 1964 | } 1965 | }; 1966 | 1967 | /*essentially the join_partitioned_aggregate*/ 1968 | __global__ void join_partitioned_shared ( 1969 | const int32_t* R, 1970 | const int32_t* Pr, 1971 | const uint32_t* R_cnts, 1972 | const uint32_t* R_chain, 1973 | const int32_t* S, 1974 | const int32_t* Ps, 1975 | const uint32_t* S_cnts, 1976 | const uint32_t* S_chain, 1977 | int32_t log_parts, 1978 | int32_t* results) { 1979 | __shared__ int16_t elem[4096 + 512]; 1980 | __shared__ int32_t payload[4096 + 512]; 1981 | __shared__ int16_t next[4096 + 512]; 1982 | __shared__ int32_t head[LOCAL_BUCKETS]; 1983 | 1984 | 1985 | int tid = threadIdx.x; 1986 | int block = blockIdx.x; 1987 | int width = blockDim.x; 1988 | int pwidth = gridDim.x; 1989 | int parts = 1 << log_parts; 1990 | 1991 | int lid = tid % 32; 1992 | int gnum = blockDim.x/32; 1993 | 1994 | int count = 0; 1995 | 1996 | int pr = -1; 1997 | int ps = -1; 1998 | 1999 | 2000 | for (uint32_t p = block; p < parts; p += pwidth) { 2001 | int len_R = R_cnts[p]; 2002 | int len_S = S_cnts[p]; 2003 | 2004 | if (len_S > 4096 + 512) { 2005 | /*it was a 
microbenchmark so I didn't code this part*/ 2006 | continue; 2007 | } else { 2008 | chain R_chains(R, R_cnts, R_chain, p); 2009 | chain Pr_chains(Pr, R_cnts, R_chain, p); 2010 | 2011 | chain S_chains(S, S_cnts, S_chain, p); 2012 | chain Ps_chains(Ps, S_cnts, S_chain, p); 2013 | 2014 | int off = 0; 2015 | 2016 | for (int i = tid; i < LOCAL_BUCKETS; i += blockDim.x) 2017 | head[i] = -1; 2018 | 2019 | int rem_s = len_S % 4096; 2020 | rem_s = (rem_s + 4 - 1)/4; 2021 | 2022 | __syncthreads(); 2023 | 2024 | chain_iterator it_S = S_chains.begin(); 2025 | chain_iterator it_Ps = Ps_chains.begin(); 2026 | 2027 | for (;it_S != S_chains.end(); ++it_S, ++it_Ps) { 2028 | vec4 data_S = (*it_S).x; 2029 | vec4 data_Ps = (*it_Ps).x; 2030 | int l_cnt_S = (*it_S).cnt; 2031 | 2032 | #pragma unroll 2033 | for (int k = 0; k < 4; k++) { 2034 | if (k < l_cnt_S) { 2035 | int val = data_S.i[k]; 2036 | elem[off + tid] = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 2037 | payload[off + tid] = data_Ps.i[k]; 2038 | int hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 2039 | 2040 | int32_t last = atomicExch(&head[hval], off + tid); 2041 | next[off + tid] = last; 2042 | } 2043 | 2044 | off += (off < 4096)? blockDim.x : rem_s; 2045 | } 2046 | } 2047 | 2048 | __syncthreads(); 2049 | 2050 | 2051 | chain_iterator it_R = R_chains.begin(); 2052 | chain_iterator it_Pr = Pr_chains.begin(); 2053 | 2054 | for (;it_R != R_chains.end(); ++it_R, ++it_Pr) { 2055 | vec4 data_R = (*it_R).x; 2056 | vec4 data_Pr = (*it_Pr).x; 2057 | int l_cnt_R = (*it_R).cnt; 2058 | 2059 | #pragma unroll 2060 | for (int k = 0; k < 4; k++) { 2061 | int32_t val = data_R.i[k]; 2062 | int32_t pval = data_Pr.i[k]; 2063 | int16_t tval = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 2064 | int32_t hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 2065 | 2066 | if (k < l_cnt_R) { 2067 | int32_t pos = head[hval]; 2068 | while (pos >= 0) { 2069 | if (elem[pos] == tval) { 2070 | count += pval*payload[pos]; 2071 | } 2072 | 2073 | pos = next[pos]; 2074 | } 2075 | } 2076 | } 2077 | } 2078 | 2079 | 2080 | __syncthreads(); 2081 | } 2082 | } 2083 | 2084 | atomicAdd(results, count); 2085 | 2086 | __syncthreads(); 2087 | } 2088 | 2089 | /*essentially the join_partitioned_aggregate but builds hashtable in GPU memory*/ 2090 | __global__ void join_partitioned_global ( 2091 | const int32_t* R, 2092 | const int32_t* Pr, 2093 | const uint32_t* R_cnts, 2094 | const uint32_t* R_chain, 2095 | const int32_t* S, 2096 | const int32_t* Ps, 2097 | const uint32_t* S_cnts, 2098 | const uint32_t* S_chain, 2099 | int32_t log_parts, 2100 | int32_t* results, 2101 | int32_t* buffer) { 2102 | 2103 | int tid = threadIdx.x; 2104 | int block = blockIdx.x; 2105 | int width = blockDim.x; 2106 | int pwidth = gridDim.x; 2107 | int parts = 1 << log_parts; 2108 | 2109 | buffer += block*8*4096; 2110 | 2111 | int16_t* elem = (int16_t*) buffer; 2112 | int32_t* payload = buffer + 4096 + 512;; 2113 | int16_t* next = (int16_t*) (buffer + 2*(4096 + 512)); 2114 | int32_t* head = buffer + 3*(4096+512); 2115 | 2116 | 2117 | 2118 | int lid = tid % 32; 2119 | int gnum = blockDim.x/32; 2120 | 2121 | int count = 0; 2122 | 2123 | int pr = -1; 2124 | int ps = -1; 2125 | 2126 | 2127 | for (uint32_t p = block; p < parts; p += pwidth) { 2128 | chain R_chains(R, R_cnts, R_chain, p); 2129 | chain Pr_chains(Pr, R_cnts, R_chain, p); 2130 | 2131 | chain S_chains(S, S_cnts, S_chain, p); 2132 | chain Ps_chains(Ps, S_cnts, S_chain, p); 2133 | 2134 | int len_R = R_cnts[p]; 2135 | int len_S = S_cnts[p]; 2136 | 2137 | if 
(len_S > 4096 + 512) { 2138 | /*it was a microbenchmark so I didn't code this part*/ 2139 | continue; 2140 | } else { 2141 | int off = 0; 2142 | 2143 | for (int i = tid; i < LOCAL_BUCKETS; i += blockDim.x) 2144 | head[i] = -1; 2145 | 2146 | int rem_s = len_S % 4096; 2147 | rem_s = (rem_s + 4 - 1)/4; 2148 | 2149 | __syncthreads(); 2150 | 2151 | chain_iterator it_S = S_chains.begin(); 2152 | chain_iterator it_Ps = Ps_chains.begin(); 2153 | 2154 | for (;it_S != S_chains.end(); ++it_S, ++it_Ps) { 2155 | vec4 data_S = (*it_S).x; 2156 | vec4 data_Ps = (*it_Ps).x; 2157 | int l_cnt_S = (*it_S).cnt; 2158 | 2159 | #pragma unroll 2160 | for (int k = 0; k < 4; k++) { 2161 | if (k < l_cnt_S) { 2162 | int val = data_S.i[k]; 2163 | elem[off + tid] = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 2164 | payload[off + tid] = data_Ps.i[k]; 2165 | int hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 2166 | 2167 | int32_t last = atomicExch(&head[hval], off + tid); 2168 | next[off + tid] = last; 2169 | } 2170 | 2171 | off += (off < 4096)? blockDim.x : rem_s; 2172 | } 2173 | } 2174 | 2175 | __syncthreads(); 2176 | 2177 | chain_iterator it_R = R_chains.begin(); 2178 | chain_iterator it_Pr = Pr_chains.begin(); 2179 | 2180 | for (;it_R != R_chains.end(); ++it_R, ++it_Pr) { 2181 | vec4 data_R = (*it_R).x; 2182 | vec4 data_Pr = (*it_Pr).x; 2183 | int l_cnt_R = (*it_R).cnt; 2184 | 2185 | #pragma unroll 2186 | for (int k = 0; k < 4; k++) { 2187 | int32_t val = data_R.i[k]; 2188 | int32_t pval = data_Pr.i[k]; 2189 | int16_t tval = (int16_t) (val >> (LOCAL_BUCKETS_BITS + log_parts)); 2190 | int32_t hval = (val >> log_parts) & (LOCAL_BUCKETS - 1); 2191 | 2192 | if (k < l_cnt_R) { 2193 | int32_t pos = head[hval]; 2194 | while (pos >= 0) { 2195 | if (elem[pos] == tval) { 2196 | count += pval*payload[pos]; 2197 | } 2198 | 2199 | pos = next[pos]; 2200 | } 2201 | } 2202 | } 2203 | } 2204 | 2205 | __syncthreads(); 2206 | } 2207 | } 2208 | 2209 | atomicAdd(results, count); 2210 | 2211 | __syncthreads(); 2212 | } -------------------------------------------------------------------------------- /src/join-primitives.cuh: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #ifndef JOIN_PRIMITIVES_HPP_ 23 | #define JOIN_PRIMITIVES_HPP_ 24 | 25 | #include 26 | #include "common.h" 27 | #include "common-host.h" 28 | 29 | #define CLUSTERING_FACTOR 64 30 | 31 | struct alignas(alignof(int64_t)) hj_bucket_2{ 32 | int32_t next; 33 | int32_t val ; 34 | 35 | constexpr __host__ __device__ hj_bucket_2(int32_t next, int32_t value): next(next), val(value){} 36 | }; 37 | 38 | __global__ void init_payload (int* R, int n); 39 | 40 | __global__ void partition_pass_one ( 41 | const int32_t * __restrict__ S, 42 | const int32_t * __restrict__ P, 43 | const size_t * __restrict__ offsets, 44 | uint64_t * __restrict__ heads, 45 | uint32_t * __restrict__ buckets_used, 46 | uint32_t * __restrict__ chains, 47 | uint32_t * __restrict__ out_cnts, 48 | int32_t * __restrict__ output_S, 49 | int32_t * __restrict__ output_P, 50 | size_t cnt, 51 | uint32_t log_parts, 52 | uint32_t first_bit, 53 | uint32_t num_threads); 54 | 55 | __global__ void compute_bucket_info (uint32_t* chains, uint32_t* out_cnts, uint32_t log_parts); 56 | 57 | __global__ void partition_pass_two ( 58 | const int32_t * __restrict__ S, 59 | const int32_t * __restrict__ P, 60 | const uint32_t * __restrict__ bucket_info, 61 | uint32_t * __restrict__ buckets_used, 62 | uint64_t * heads, 63 | uint32_t * __restrict__ chains, 64 | uint32_t * __restrict__ out_cnts, 65 | int32_t * __restrict__ output_S, 66 | int32_t * __restrict__ output_P, 67 | uint32_t S_log_parts, 68 | uint32_t log_parts, 69 | uint32_t first_bit, 70 | uint32_t * bucket_num_ptr); 71 | 72 | __global__ void join_partitioned_shared ( 73 | const int32_t* R, 74 | const int32_t* Pr, 75 | const uint32_t* R_cnts, 76 | const uint32_t* R_chain, 77 | const int32_t* S, 78 | const int32_t* Ps, 79 | const uint32_t* S_cnts, 80 | const uint32_t* S_chain, 81 | int32_t log_parts, 82 | int32_t* results); 83 | 84 | __global__ void join_partitioned_global ( 85 | const int32_t* R, 86 | const int32_t* Pr, 87 | const uint32_t* R_cnts, 88 | const uint32_t* R_chain, 89 | const int32_t* S, 90 | const int32_t* Ps, 91 | const uint32_t* S_cnts, 92 | const uint32_t* S_chain, 93 | int32_t log_parts, 94 | int32_t* results, 95 | int32_t* buffer); 96 | 97 | __global__ void init_metadata_double ( 98 | uint64_t * __restrict__ heads1, 99 | uint32_t * __restrict__ buckets_used1, 100 | uint32_t * __restrict__ chains1, 101 | uint32_t * __restrict__ out_cnts1, 102 | uint32_t parts1, 103 | uint32_t buckets_num1, 104 | uint64_t * __restrict__ heads2, 105 | uint32_t * __restrict__ buckets_used2, 106 | uint32_t * __restrict__ chains2, 107 | uint32_t * __restrict__ out_cnts2, 108 | uint32_t parts2, 109 | uint32_t buckets_num2 110 | ); 111 | 112 | __global__ void build_perfect_array (int32_t* data, int32_t* payload, int n, int32_t* lookup); 113 | 114 | __global__ void probe_perfect_array (int32_t* data, int32_t* payload, int n, int32_t* lookup, int* aggr); 115 | 116 | __global__ void build_ht_chains (int32_t* data, int n, uint32_t log_parts, int32_t* output, int* head); 117 | 118 | __global__ void chains_probing (int32_t* data, int32_t* payload, int n, uint32_t log_parts, int32_t* ht, int32_t* ht_key, int32_t* ht_pay, int* head, int* aggr); 119 | 120 | __global__ void ht_hist (int* data, int n, int log_parts, int* hist); 
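/* Hedged usage sketch (not part of the original header): the kernels declared
   around this point appear to form a two-pass linear hash-table build and probe.
   One plausible host-side call sequence, with illustrative grid/block sizes and
   device buffers assumed to be allocated elsewhere:

       ht_hist        <<<64, 1024>>>(d_R, r_n, log_parts, d_hist);                      // per-bucket key counts
       ht_offsets     <<<64, 1024>>>(log_parts, d_hist, d_offset, d_aggr);              // prefix sum -> bucket offsets
       build_ht_linear<<<64, 1024>>>(d_R, d_Pr, r_n, log_parts, d_offset, d_ht, d_htp); // scatter keys/payloads
       linear_probing <<<64, 1024>>>(d_S, d_Ps, d_ht, d_htp, d_off_s, d_off_e, s_n, log_parts, d_aggr);

   The actual launch configuration and the exact role of offset_s/offset_e are
   defined by the implementations in join-primitives.cu, not by this header. */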
121 | 122 | __global__ void ht_offsets (int log_parts, int* hist, int* offset, int* aggr); 123 | 124 | __global__ void build_ht_linear (int* data, int* payload, size_t n, int log_parts, int* offset, int* ht, int* htp); 125 | 126 | __global__ void linear_probing (int* data, int* payload, int* ht, int* htp, int* offset_s, int* offset_e, size_t n, int log_parts, int* aggr); 127 | 128 | __global__ void decompose_chains (uint32_t* bucket_info, uint32_t* chains, uint32_t* out_cnts, uint32_t log_parts, int threshold); 129 | 130 | __global__ void join_partitioned_aggregate ( 131 | const int32_t* R, 132 | const int32_t* Pr, 133 | const uint32_t* R_chain, 134 | const uint32_t* bucket_info, 135 | const int32_t* S, 136 | const int32_t* Ps, 137 | const uint32_t* S_cnts, 138 | const uint32_t* S_chain, 139 | int32_t log_parts, 140 | uint32_t* buckets_num, 141 | int32_t* results); 142 | 143 | __global__ void join_partitioned_results ( 144 | const int32_t* R, 145 | const int32_t* Pr, 146 | const uint32_t* R_chain, 147 | const uint32_t* bucket_info, 148 | const int32_t* S, 149 | const int32_t* Ps, 150 | const uint32_t* S_cnts, 151 | const uint32_t* S_chain, 152 | int32_t log_parts, 153 | uint32_t* buckets_num, 154 | int32_t* results, 155 | int32_t* output); 156 | 157 | 158 | __global__ void join_partitioned_varpayload ( 159 | const int32_t* R, 160 | const int32_t* Pr, 161 | const int32_t* Dr, 162 | const uint32_t* R_chain, 163 | const uint32_t* bucket_info, 164 | const int32_t* S, 165 | const int32_t* Ps, 166 | const int32_t* Ds, 167 | const uint32_t* S_cnts, 168 | const uint32_t* S_chain, 169 | int32_t log_parts, 170 | int32_t col_num1, 171 | int32_t col_num2, 172 | int32_t rel_size, 173 | uint32_t* buckets_num, 174 | int32_t* results); 175 | 176 | __global__ void probe_perfect_array_varpay (int32_t* data, int32_t* Dr, int n, int32_t* lookup, int32_t* Ds, int col_num1, int col_num2, int res_size, int* aggr); 177 | 178 | void prepare_Relation_payload (int* R, int* R_temp, int* P, int* P_temp, size_t RelsNum, uint32_t buckets_num, uint64_t* heads[2], uint32_t* cnts[2], uint32_t* chains[2], uint32_t* buckets_used[2], uint32_t log_parts1, uint32_t log_parts2, uint32_t first_bit, cudaStream_t streams, size_t* offsets_GPU, uint32_t num_threads); 179 | 180 | void prepare_Relation_payload_triple (int* R, int* R_temp, int* R_final, int* P, int* P_temp, int* P_final, size_t RelsNum, uint32_t buckets_num, uint64_t* heads[2], uint32_t* cnts[2], uint32_t* chains[2], uint32_t* buckets_used[2], uint32_t log_parts1, uint32_t log_parts2, uint32_t first_bit, cudaStream_t streams, size_t* offsets_GPU, uint32_t num_threads); 181 | 182 | 183 | 184 | 185 | #endif -------------------------------------------------------------------------------- /src/main.cu: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial 
portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #include 23 | #include 24 | #include /*INT_MAX*/ 25 | #include 26 | 27 | #include "generator_ETHZ.cuh" 28 | #include "common.h" 29 | #include "common-host.h" 30 | 31 | unsigned int hashJoinClusteredProbe(args *inputAttrs, timingInfo *time); 32 | 33 | typedef struct joinAlg { 34 | char name[4]; 35 | unsigned int (*joinAlg)(args*, timingInfo*); 36 | } joinAlg; 37 | 38 | typedef struct inputArgs { 39 | short option = 0; 40 | joinAlg alg 41 | #ifndef __CUDACC__ 42 | = { "NLJ", nestedLoopsJoin }; // does not play well along --expt-relaxed-constexpr 43 | #else 44 | ; 45 | #endif 46 | uint64_t SelsNum = 0; 47 | uint64_t RelsNum = 0; 48 | int uniqueKeys = 1; 49 | int fullRange = 0; 50 | float skew = 0.0; 51 | int threadsNum = 32; 52 | // int selectivity = 1; 53 | int valuesPerThread = 2; 54 | int sharedMem = 30 << 10; 55 | unsigned int pivotsNum = 1; 56 | int one_to_many = 0; 57 | int RelsMultiplier = 1; 58 | int SelsMultiplier = 1; 59 | const char* R_filename = NULL; 60 | const char* S_filename = NULL; 61 | int fileInput = 0; 62 | } inputArgs; 63 | 64 | static joinAlg algs[] { {"HJC", hashJoinClusteredProbe} 65 | // {"HJ", hashIndexJoin} 66 | }; 67 | 68 | void usage_exit(int op) { 69 | if (op == 0) 70 | printf( 71 | "./benchmark -b \n"); 72 | exit(1); 73 | } 74 | 75 | void print_timing_join(args *input, timingInfo *time, joinAlg *alg); 76 | void parseInputArgs(int argc, char ** argv, inputArgs *input); 77 | int createSingleRelation_filename(inputArgs *input, args *attrs); 78 | void createSingleRelation_data(inputArgs *input, args *attrs, uint64_t bytes); 79 | 80 | int main(int argc, char **argv) { 81 | timingInfo time; 82 | inputArgs input; 83 | parseInputArgs(argc, argv, &input); 84 | 85 | int dev = 0; 86 | 87 | switch (input.option) { 88 | case 7: 89 | case 8: { 90 | //set up device 91 | cudaDeviceProp deviceProp; 92 | CHK_ERROR(cudaGetDeviceProperties(&deviceProp, dev)); 93 | CHK_ERROR(cudaSetDevice(dev)); 94 | 95 | int* Q_r = NULL; 96 | size_t Q_els_r = input.RelsNum; 97 | size_t Q_bytes_r = Q_els_r * sizeof(int); 98 | 99 | int* Q_s = NULL; 100 | size_t Q_els_s = input.SelsNum; 101 | size_t Q_bytes_s = Q_els_s * sizeof(int); 102 | 103 | if (input.SelsMultiplier > 1 || input.RelsMultiplier > 1) { 104 | input.SelsNum = input.SelsNum * input.SelsMultiplier; 105 | input.RelsNum = input.RelsNum * input.RelsMultiplier; 106 | 107 | Q_r = (int*) malloc(Q_bytes_r); 108 | Q_s = (int*) malloc(Q_bytes_s); 109 | } 110 | 111 | args joinArgs; 112 | joinArgs.S_els = input.SelsNum; 113 | joinArgs.R_els = input.RelsNum; 114 | uint64_t S_bytes = joinArgs.S_els * sizeof(int); 115 | uint64_t R_bytes = joinArgs.R_els * sizeof(int); 116 | 117 | 118 | /*fix filenames*/ 119 | if (input.fileInput) { 120 | 121 | } else if (input.fullRange) { 122 | int n = 0; 123 | if ((n = sprintf(joinArgs.S_filename, "fk_S%lu_pk_R%lu.bin", joinArgs.S_els, joinArgs.R_els)) >= 50) { 124 | fprintf(stderr, "ERROR: S_filename is %d characters long\n", n); 125 | return 1; 126 | } 127 | if ((n = 
sprintf(joinArgs.R_filename, "pk_R%lu.bin", joinArgs.R_els)) >= 50) { 128 | fprintf(stderr, "ERROR: R_filename is %d characters long\n", n); 129 | return 1; 130 | } 131 | 132 | } else if (input.uniqueKeys) { 133 | int n = 0; 134 | 135 | if ((n = sprintf(joinArgs.R_filename, "unique_%lu.bin", (input.RelsMultiplier > 1) ? Q_els_r : joinArgs.R_els)) >= 50) { 136 | fprintf(stderr, "ERROR: R_filename is %d characters long\n", n); 137 | return 1; 138 | } 139 | 140 | if (input.skew > 0) 141 | n = sprintf(joinArgs.S_filename, "unique_skew%.2f_S%lu.bin", joinArgs.S_els); 142 | else 143 | n = sprintf(joinArgs.S_filename, "unique_%lu.bin", (input.SelsMultiplier > 1) ? Q_els_s : joinArgs.S_els); 144 | 145 | if (n >= 50) { 146 | fprintf(stderr, "ERROR: S_filename is %d characters long\n", n); 147 | return 1; 148 | } 149 | } else { 150 | int n = 0; 151 | if ((n = sprintf(joinArgs.S_filename, "nonUnique_S%lu.bin", joinArgs.S_els)) >= 50) { 152 | fprintf(stderr, "ERROR: S_filename is %d characters long\n", n); 153 | return 1; 154 | } 155 | if ((n = sprintf(joinArgs.R_filename, "nonUnique_R%lu.bin", joinArgs.R_els)) >= 50) { 156 | fprintf(stderr, "ERROR: R_filename is %d characters long\n", n); 157 | return 1; 158 | } 159 | } 160 | 161 | /*create relations*/ 162 | #if defined(MEM_DEVICE) 163 | joinArgs.S = (int *) malloc(S_bytes); 164 | joinArgs.R = (int *) malloc(R_bytes); 165 | if (!joinArgs.S || !joinArgs.R) { 166 | fprintf(stderr, "Problem allocating space for the relations\n"); 167 | if (joinArgs.S) free(joinArgs.S); 168 | if (joinArgs.R) free(joinArgs.R); 169 | return 0; 170 | } 171 | #elif defined(MEM_S_DEVICE) 172 | joinArgs.S = (int *) malloc(S_bytes); 173 | if (!joinArgs.S) { 174 | fprintf(stderr, "Problem allocating space for the relations\n"); 175 | return 0; 176 | } 177 | CHK_ERROR(cudaHostAlloc((void** )&joinArgs.R, R_bytes, cudaHostAllocMapped)); 178 | #elif defined(MEM_MANAGED) 179 | CHK_ERROR(cudaMallocManaged((void** )&joinArgs.S, S_bytes)); 180 | CHK_ERROR(cudaMallocManaged((void** )&joinArgs.R, R_bytes)); 181 | #elif defined(MEM_HOST) 182 | CHK_ERROR(cudaHostAlloc((void** )&joinArgs.S, S_bytes, cudaHostAllocMapped)); 183 | CHK_ERROR(cudaHostAlloc((void** )&joinArgs.R, R_bytes, cudaHostAllocMapped)); 184 | #endif 185 | 186 | if (input.fileInput) { 187 | printf("Reading from files\n"); 188 | readFromFile(input.R_filename, joinArgs.R, joinArgs.R_els); 189 | readFromFile(input.S_filename, joinArgs.S, joinArgs.S_els); 190 | } else if (input.fullRange) { 191 | printf("Creating relation R with %lu tuples (%d MB) using non-unique keys and full range : ", 192 | joinArgs.R_els, R_bytes / 1024 / 1024); 193 | fflush(stdout); 194 | create_relation_nonunique(joinArgs.R_filename, joinArgs.R, joinArgs.R_els, INT_MAX); 195 | 196 | printf("Creating relation S with %lu tuples (%d MB) using non-unique keys and full range : ", 197 | joinArgs.S_els, S_bytes / 1024 / 1024); 198 | fflush(stdout); 199 | create_relation_fk_from_pk(joinArgs.S_filename, joinArgs.S, joinArgs.S_els, joinArgs.R, 200 | joinArgs.R_els); 201 | fflush(stdout); 202 | 203 | } else if (input.uniqueKeys) { 204 | printf("Creating relation R with %lu tuples (%d MB) using unique keys : ", joinArgs.R_els, 205 | R_bytes / 1024 / 1024); 206 | fflush(stdout); 207 | 208 | if (Q_r == NULL) { 209 | create_relation_unique(joinArgs.R_filename, joinArgs.R, joinArgs.R_els, joinArgs.R_els); 210 | } else { 211 | create_relation_unique(joinArgs.R_filename, Q_r, Q_els_r, Q_els_r); 212 | create_relation_n(Q_r, joinArgs.R, Q_els_r, input.RelsMultiplier); 213 | 
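/* Multiplier path (annotation, not original source): when RelsMultiplier > 1 the
   unique base relation is generated once into Q_r at its original, un-multiplied
   size, and create_relation_n then appears to expand it to the full joinArgs.R_els
   tuples; the same scheme is applied to S via Q_s below. */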
} 214 | 215 | if (Q_s == NULL) { 216 | if (input.skew > 0) { 217 | /* S is skewed */ 218 | printf("Creating relation S with %lu tuples (%d MB) using unique keys and skew %f : ", 219 | joinArgs.S_els, S_bytes / 1024 / 1024, input.skew); 220 | fflush(stdout); 221 | create_relation_zipf(joinArgs.S_filename, joinArgs.S, joinArgs.S_els, joinArgs.R_els, 222 | input.skew); 223 | } else { 224 | /* S is uniform foreign key */ 225 | printf("Creating relation S with %lu tuples (%d MB) using unique keys : ", joinArgs.S_els, 226 | S_bytes / 1024 / 1024); 227 | fflush(stdout); 228 | create_relation_unique(joinArgs.S_filename, joinArgs.S, joinArgs.S_els, joinArgs.R_els); 229 | } 230 | } else { 231 | if (input.skew > 0) { 232 | /* S is skewed */ 233 | printf("Creating relation S with %lu tuples (%d MB) using unique keys and skew %f : ", 234 | joinArgs.S_els, S_bytes / 1024 / 1024, input.skew); 235 | fflush(stdout); 236 | create_relation_zipf(joinArgs.S_filename, Q_s, Q_els_s, Q_els_s, input.skew); 237 | } else { 238 | /* S is uniform foreign key */ 239 | printf("Creating relation S with %lu tuples (%d MB) using unique keys : ", joinArgs.S_els, 240 | S_bytes / 1024 / 1024); 241 | fflush(stdout); 242 | create_relation_unique(joinArgs.S_filename, Q_s, Q_els_s, Q_els_s); 243 | } 244 | 245 | create_relation_n(Q_s, joinArgs.S, Q_els_s, input.SelsMultiplier); 246 | 247 | fflush(stdout); 248 | } 249 | 250 | fflush(stdout); 251 | } else { 252 | printf("Creating relation R with %lu tuples (%d MB) using non-unique keys : ", joinArgs.R_els, 253 | R_bytes / 1024 / 1024); 254 | fflush(stdout); 255 | create_relation_nonunique(joinArgs.R_filename, joinArgs.R, joinArgs.R_els, joinArgs.R_els/2); // |R|/2 to get on average 2entries/value 256 | 257 | printf("Creating relation S with %lu tuples (%d MB) using non-unique keys : ", joinArgs.S_els, 258 | S_bytes / 1024 / 1024); 259 | fflush(stdout); 260 | create_relation_nonunique(joinArgs.S_filename, joinArgs.S, joinArgs.S_els, joinArgs.R_els/2); // |R|/2 and not |S|/2 to get the same range 261 | fflush(stdout); 262 | } 263 | 264 | if (input.option == 7) { 265 | joinArgs.sharedMem = input.sharedMem; 266 | joinArgs.threadsNum = input.threadsNum; 267 | printf("%s : shareMemory = %ld\t#threads = %d\n", input.alg.name, joinArgs.sharedMem, 268 | joinArgs.threadsNum); 269 | fflush(stdout); 270 | 271 | #if defined(MEM_DEVICE) 272 | printf ("memory alloc done\n"); 273 | int *S_host = joinArgs.S; 274 | int *R_host = joinArgs.R; 275 | 276 | cudaDeviceSynchronize(); 277 | 278 | CHK_ERROR(cudaMalloc((int** )&joinArgs.S, S_bytes)); 279 | CHK_ERROR(cudaMalloc((int** )&joinArgs.R, R_bytes)); 280 | CHK_ERROR(cudaMemcpy(joinArgs.S, S_host, S_bytes, cudaMemcpyHostToDevice)); 281 | CHK_ERROR(cudaMemcpy(joinArgs.R, R_host, R_bytes, cudaMemcpyHostToDevice)); 282 | 283 | /*free(S_host); free(R_host);*/ 284 | #elif defined(MEM_S_DEVICE) 285 | int *S_host = joinArgs.S; 286 | CHK_ERROR(cudaMalloc((int** )&joinArgs.S, S_bytes)); 287 | CHK_ERROR(cudaMemcpy(joinArgs.S, S_host, S_bytes, cudaMemcpyHostToDevice)); 288 | free(S_host); 289 | #endif 290 | recordTime(&time.start[time.n - 1]); 291 | uint64_t joinsNum = input.alg.joinAlg(&joinArgs, &time); 292 | recordTime(&time.end[time.n - 1]); 293 | 294 | cudaDeviceReset(); 295 | #if defined(MEM_HOST) 296 | cudaFreeHost(joinArgs.S); 297 | cudaFreeHost(joinArgs.R); 298 | #else 299 | cudaFree(joinArgs.S); cudaFree(joinArgs.R); 300 | #endif 301 | } 302 | 303 | } 304 | 305 | break; 306 | default: 307 | usage_exit(0); 308 | break; 309 | } 310 | } 311 | 312 | 313 | 
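/* Hedged usage example (reconstructed from parseInputArgs further below; the
   concrete values are illustrative only):

       ./benchmark -b 7 -a HJC -R 134217728 -S 134217728 -t 512 -m 30720

   runs option 7 with the clustered-probe hash join ("HJC") on generated relations
   of |R| = |S| = 2^27 unique keys, threadsNum = 512 and sharedMem = 30720 bytes.
   Add -s <zipf factor> for a skewed S, --non-unique or --full-range to change key
   generation, or --file -k <R file> -l <S file> to read the relations from disk. */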
314 | int createSingleRelation_filename(inputArgs *input, args *attrs) { 315 | /*fix filename (no matter which relation store everything in S, name only needed to re-use a file)*/ 316 | int n = 0; 317 | if (input->fullRange) { 318 | if (input->SelsNum) 319 | n = sprintf(attrs->S_filename, "pk_S%lu.bin", attrs->S_els); 320 | else 321 | n = sprintf(attrs->S_filename, "pk_R%lu.bin", attrs->S_els); 322 | } else if (input->uniqueKeys) { 323 | if (input->SelsNum) 324 | n = sprintf(attrs->S_filename, "unique_S%lu.bin", attrs->S_els); 325 | else 326 | n = sprintf(attrs->S_filename, "unique_R%lu.bin", attrs->S_els); 327 | } else { 328 | if (input->SelsNum) 329 | n = sprintf(attrs->S_filename, "nonUnique_S%lu.bin", attrs->S_els); 330 | else 331 | n = sprintf(attrs->S_filename, "nonUnique_R%lu.bin", attrs->S_els); 332 | } 333 | 334 | if (n >= 50) { 335 | fprintf(stderr, "ERROR: filename is %d characters long\n", n); 336 | return 1; 337 | } 338 | 339 | return 0; 340 | } 341 | 342 | void createSingleRelation_data(inputArgs *input, args *attrs, uint64_t bytes) { 343 | if (input->fullRange) { 344 | printf("Creating relation with %lu tuples (%d MB) using non-unique keys and full range : ", 345 | attrs->S_els, bytes / 1024 / 1024); 346 | fflush(stdout); 347 | create_relation_nonunique(attrs->S_filename, attrs->S, attrs->S_els, INT_MAX); 348 | } else if (input->uniqueKeys) { 349 | printf("Creating relation with %lu tuples (%d MB) using unique keys : ", attrs->S_els, 350 | bytes / 1024 / 1024); 351 | fflush(stdout); 352 | create_relation_unique(attrs->S_filename, attrs->S, attrs->S_els, attrs->S_els); 353 | } else { 354 | printf("Creating relation with %lu tuples (%d MB) using non-unique keys : ", attrs->S_els, 355 | bytes / 1024 / 1024); 356 | fflush(stdout); 357 | create_relation_nonunique(attrs->S_filename, attrs->S, attrs->S_els, attrs->S_els); 358 | } 359 | printf("DONE\n"); 360 | fflush(stdout); 361 | } 362 | 363 | void printTimeInfo(uint64_t tuplesNum, time_st *start, time_st *end) { 364 | double diff_usec = (((*end).tv_sec * 1000000L + (*end).tv_usec) 365 | - ((*start).tv_sec * 1000000L + (*start).tv_usec)); 366 | 367 | double tuplesPerSec = tuplesNum / (diff_usec / 1000000.0); 368 | // printf("%10.3f\n", tuplesPerSec); 369 | 370 | printf("total tuples = %10lu time = %.3f msecs = %.3f secs\t", tuplesNum, diff_usec / 1000.0, 371 | diff_usec / 1000000.0); 372 | if (tuplesPerSec < 1024 / sizeof(int)) 373 | printf("throughput = %8.3lf B/sec\n", tuplesPerSec * sizeof(int)); 374 | else if (tuplesPerSec < 1024 * 1024 / sizeof(int)) 375 | printf("throughput = %8.3lf KB/sec\n", tuplesPerSec * sizeof(int) / 1024); 376 | else if (tuplesPerSec < 1024 * 1024 * 1024 / sizeof(int)) 377 | printf("throughput = %8.3lf MB/sec\n", ((tuplesPerSec / 1024) * sizeof(int)) / 1024); 378 | else 379 | printf("throughput = %8.3lf GB/sec\n", ((tuplesPerSec / 1024 / 1024) * sizeof(int)) / 1024); 380 | } 381 | 382 | 383 | void print_timing_join(args *input, timingInfo *time, joinAlg *alg) { 384 | unsigned int blocksNum = (input->R_els + input->threadsNum - 1) / input->threadsNum; 385 | unsigned int memElsNum = (input->sharedMem + sizeof(int) - 1) / sizeof(int); 386 | unsigned int shareMemoryBlocksNum = (input->S_els + memElsNum - 1) / memElsNum; 387 | uint64_t tuplesNum = input->S_els + input->R_els; //if alg==0 388 | 389 | if (strcmp(alg->name, algs[2].name) != 0 && strcmp(alg->name, algs[3].name) != 0 && strcmp(alg->name, algs[4].name) != 0) { 390 | 391 | #if defined(NLJ_SIMPLE) 392 | tuplesNum = input->R_els + 
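/* simple nested-loops accounting: |R| tuples read plus |R|*|S| probe comparisons */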
input->R_els*input->S_els; 393 | #elif defined(SHAREDMEM_LOOPIN) 394 | #if defined(MEM_S_DEVICE) 395 | tuplesNum = input->R_els + input->S_els; 396 | #else 397 | tuplesNum = input->R_els + input->S_els * blocksNum; 398 | #endif 399 | #endif 400 | 401 | } 402 | 403 | printf("blocksNum=%lu\tmemElsNum=%lu\tshareMemBlocksNum=%lu\ttuplesNum=%lu\n", blocksNum, memElsNum, 404 | shareMemoryBlocksNum, tuplesNum); 405 | 406 | if (strcmp(alg->name, algs[1].name) == 0) { 407 | /*SMJ*/ 408 | printf("SORT:\t"); 409 | printTimeInfo(tuplesNum, &(time->start[1]), &(time->end[1])); 410 | // } else if (strcmp(alg->name, algs[4].name) == 0) { 411 | // /*HJ*/ 412 | // printf("INDEX:\t"); 413 | // printTimeInfo(tuplesNum, &(time->start[1]), &(time->end[1])); 414 | } 415 | 416 | 417 | if (strcmp(alg->name, algs[2].name) == 0 || strcmp(alg->name, algs[3].name) == 0 || strcmp(alg->name, algs[4].name) == 0) { 418 | printf("BUILD:\t"); 419 | printTimeInfo(input->R_els, &(time->start[1]), &(time->end[1])); 420 | printf("PROBE:\t"); 421 | printTimeInfo(input->S_els, &(time->start[0]), &(time->end[0])); 422 | } else { 423 | printf("JOIN:\t"); 424 | printTimeInfo(tuplesNum, &(time->start[0]), &(time->end[0])); 425 | } 426 | 427 | printf("AGGR:\t"); 428 | printTimeInfo(tuplesNum, &(time->start[time->n - 2]), &(time->end[time->n - 2])); 429 | printf("TOTAL:\t"); 430 | printTimeInfo(tuplesNum, &(time->start[time->n - 1]), &(time->end[time->n - 1])); 431 | 432 | } 433 | 434 | void parseInputArgs(int argc, char ** argv, inputArgs *input) { 435 | /* flags */ 436 | int uniqueKeys_flag = input->uniqueKeys; 437 | int fullRange_flag = input->fullRange; 438 | int file_flag = input->fileInput; 439 | 440 | int c; 441 | int option_index = 0; 442 | 443 | printf("INPUT: "); 444 | 445 | static struct option long_options[] = { 446 | /*These options set a flag.*/ 447 | { "file", no_argument, &file_flag, 1 }, { "non-unique", no_argument, &uniqueKeys_flag, 0 }, { "full-range", no_argument, &fullRange_flag, 1 }, 448 | /* These options don't set a flag. We distinguish them by their indices. */ 449 | { "benchmark", required_argument, 0, 'b' }, { "alg", required_argument, 0, 'a' }, { "SelsNum", 450 | required_argument, 0, 'S' }, { "RelsNum", required_argument, 0, 'R' }, { "skew", 451 | required_argument, 0, 's' }, { "threadsNum", required_argument, 0, 't' }, { "values", 452 | required_argument, 0, 'v' }, { "memory", required_argument, 0, 'm' }, { "pivotsNum", 453 | required_argument, 0, 'p' }, { "OneToMany", required_argument, 0, 'w' }, { "XSelsMultiplier", 454 | required_argument, 0, 'x' }, { "YRelsMultiplier", required_argument, 0, 'y' }, { "R_filename", 455 | required_argument, 0, 'k' }, { "S_filename", required_argument, 0, 'l' }, { 0, 0, 0, 0 } }; 456 | 457 | while ((c = getopt_long(argc, argv, "b:a:S:R:s:t:v:m:p:x:y:k:l:", long_options, &option_index)) != -1) { 458 | switch (c) { 459 | case 0: 460 | printf("%s\t", long_options[option_index].name); 461 | /* If this option set a flag, do nothing else now. 
*/ 462 | if (long_options[option_index].flag != 0) break; 463 | if (optarg) printf(" with arg %s", optarg); 464 | printf("\n"); 465 | break; 466 | case 'b': 467 | input->option = atoi(optarg); 468 | printf("option = %d\t", input->option); 469 | break; 470 | case 'a': { 471 | int i = 0; 472 | while (algs[i].joinAlg) { 473 | if (strcmp(optarg, algs[i].name) == 0) { 474 | strcpy(input->alg.name, algs[i].name); 475 | input->alg.joinAlg = algs[i].joinAlg; 476 | break; 477 | } 478 | i++; 479 | } 480 | printf("joinAlg = %s\t", input->alg.name); 481 | } 482 | break; 483 | case 'k': 484 | input->R_filename = optarg; 485 | printf("R filename = %s\t", input->R_filename); 486 | break; 487 | case 'l': 488 | input->S_filename = optarg; 489 | printf("S filename = %s\t", input->S_filename); 490 | break; 491 | case 'S': { 492 | uint64_t p = atol(optarg); 493 | if (p > ULONG_MAX / sizeof(int)) { 494 | fprintf(stderr, 495 | "WARNING: SelsNun is too big (%lu). Setting SelsNum to maximum supported value %lu\n", 496 | p, ULONG_MAX / sizeof(int)); 497 | input->SelsNum = ULONG_MAX / sizeof(int); 498 | } else 499 | input->SelsNum = p; 500 | } 501 | printf("||S|| = %lu\t", input->SelsNum); 502 | break; 503 | case 'R': { 504 | uint64_t p = atol(optarg); 505 | if (p > ULONG_MAX / sizeof(int)) { 506 | fprintf(stderr, 507 | "WARNING: RelsNun is too big (%lu). Setting RelsNum to maximum supported value %lu\n", 508 | p, ULONG_MAX / sizeof(int)); 509 | input->RelsNum = ULONG_MAX / sizeof(int); 510 | } else 511 | input->RelsNum = p; 512 | } 513 | printf("||R|| = %lu\t", input->RelsNum); 514 | break; 515 | case 's': 516 | input->skew = atof(optarg); 517 | printf("skew = %f\t", input->skew); 518 | break; 519 | case 't': 520 | input->threadsNum = atoi(optarg); 521 | printf("#threads = %d\t", input->threadsNum); 522 | break; 523 | case 'v': 524 | input->valuesPerThread = atoi(optarg); 525 | printf("values per thread= %d\t", input->valuesPerThread); 526 | break; 527 | case 'm': 528 | input->sharedMem = atoi(optarg); // << 10; 529 | printf("sharedMem = %d\t", input->sharedMem); 530 | break; 531 | case 'p': 532 | input->pivotsNum = atoi(optarg); 533 | printf("pivotsNum = %d\t", input->pivotsNum); 534 | break; 535 | case 'w' : 536 | input->one_to_many = atoi(optarg); 537 | printf("OneToMany = %d\t", input->one_to_many); 538 | break; 539 | case 'x' : 540 | input->SelsMultiplier = atol(optarg); 541 | printf("SelsMultiplier = %d\t", input->SelsMultiplier); 542 | break; 543 | case 'y' : 544 | input->RelsMultiplier = atol(optarg); 545 | printf("RelsMultiplier = %d\t", input->RelsMultiplier); 546 | break; 547 | } 548 | } 549 | 550 | input->uniqueKeys = uniqueKeys_flag; 551 | input->fullRange = fullRange_flag; 552 | input->fileInput = file_flag; 553 | 554 | printf("\n"); 555 | 556 | if (input->option < 1 || (input->option > 9 && input->option < 100) || input->option > 101) usage_exit(0); 557 | } 558 | -------------------------------------------------------------------------------- /src/partition-primitives.cu: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 
| copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #include "partition-primitives.cuh" 23 | 24 | #define LOG_BATCH 8 25 | #define PARTS_CAPACITY 16 26 | 27 | /*CPU-side partitioning 28 | we assume that we already have histograms for M-way partitioning, where M is small like 16 (DBs keep statistics anyway so we do it in single pass) 29 | 30 | S=keys of relation S 31 | P=payload of relation S 32 | out_cnts=count of elements for each partition 33 | output_S=that's where we write partitioned keys 34 | output_P=that's where we write partitioned payloads 35 | cnt=total number of elements in relation 36 | log_parts=number of partitions, logarithmic 37 | first_bit=we shift right before taking bits for radix partitioning 38 | nthreads=the number of threads running this, it helps find out what each thread reads 39 | */ 40 | void partitions_host_omp_nontemporal_payload( 41 | const int32_t * __restrict__ S, 42 | const int32_t * __restrict__ P, 43 | size_t * __restrict__ out_cnts, 44 | int32_t * __restrict__ output_S, 45 | int32_t * __restrict__ output_P, 46 | const size_t cnt, 47 | const uint32_t log_parts, 48 | const uint32_t first_bit, 49 | const uint32_t threadIdx, 50 | const uint32_t nthreads) { 51 | const uint32_t parts = 1 << log_parts; 52 | const int32_t parts_mask = parts - 1; 53 | const int32_t bucket_mask = (1 << log2_bucket_size) - 1; 54 | 55 | size_t out_cnts_local[PARTS_CAPACITY]; 56 | for (int i = 0; i < parts; i++) { 57 | out_cnts_local[i] = out_cnts[threadIdx*OMP_MEMORY_STEP + i]; 58 | } 59 | 60 | /*software-managed caches, they have to be aligned for AVX2*/ 61 | int32_t* cache_S = (int32_t*) aligned_alloc(4096, parts*(1 << LOG_BATCH)*sizeof(int32_t)); 62 | int32_t* cache_P = (int32_t*) aligned_alloc(4096, parts*(1 << LOG_BATCH)*sizeof(int32_t)); 63 | 64 | uint32_t regptr[PARTS_CAPACITY]; 65 | for (int i = 0; i < parts; i++) 66 | regptr[i] = i << LOG_BATCH; 67 | 68 | for (size_t t = threadIdx; t < (cnt + OMP_MEMORY_STEP - 1)/OMP_MEMORY_STEP; t += nthreads) { 69 | const int32_t* chunk_S = S + t*OMP_MEMORY_STEP; 70 | const int32_t* chunk_P = P + t*OMP_MEMORY_STEP; 71 | 72 | int end = ((t+1)*OMP_MEMORY_STEP < cnt) ? 
73 | OMP_MEMORY_STEP : 74 | cnt - t*OMP_MEMORY_STEP; 75 | //#pragma loop unroll 76 | for (int i = 0; i < end; i++) { 77 | int32_t key = chunk_S[i]; 78 | int32_t payload = chunk_P[i]; 79 | uint32_t partition = (hasht(key) >> first_bit) & parts_mask; 80 | 81 | /*write element to cache*/ 82 | uint32_t offset = (regptr[partition])++; 83 | cache_S[offset] = key; 84 | cache_P[offset] = payload; 85 | 86 | /*cache for partition is full, flush it to memory 87 | do it with non-temporal writes in order to avoid reading the output locations first*/ 88 | if ((offset & ((1 << LOG_BATCH) - 1)) == ((1 << LOG_BATCH) - 1)) { 89 | for (int k = 0; k < (1 << (LOG_BATCH - 3)); k++) { 90 | __m256i data = *((__m256i*) &cache_S[(partition << LOG_BATCH) + k*8]); 91 | _mm256_stream_si256 ((__m256i*) &output_S[out_cnts_local[partition] + k*8], data); 92 | } 93 | 94 | for (int k = 0; k < (1 << (LOG_BATCH - 3)); k++) { 95 | __m256i data = *((__m256i*) &cache_P[(partition << LOG_BATCH) + k*8]); 96 | _mm256_stream_si256 ((__m256i*) &output_P[out_cnts_local[partition] + k*8], data); 97 | } 98 | 99 | out_cnts_local[partition] += (1 << LOG_BATCH); 100 | regptr[partition] = partition << LOG_BATCH; 101 | } 102 | } 103 | } 104 | 105 | /*flush half-full caches*/ 106 | for (int p = 0; p < parts; p++) { 107 | for (int k = 0; k < (1 << (LOG_BATCH - 3)); k++) { 108 | if (8*k < regptr[p] - (p << LOG_BATCH)) { 109 | __m256i data = *((__m256i*) &cache_S[(p << LOG_BATCH) + k*8]); 110 | _mm256_stream_si256 ((__m256i*) &output_S[out_cnts_local[p] + k*8], data); 111 | } 112 | } 113 | 114 | for (int k = 0; k < (1 << (LOG_BATCH - 3)); k++) { 115 | if (8*k < regptr[p] - (p << LOG_BATCH)) { 116 | __m256i data = *((__m256i*) &cache_P[(p << LOG_BATCH) + k*8]); 117 | _mm256_stream_si256 ((__m256i*) &output_P[out_cnts_local[p] + k*8], data); 118 | } 119 | } 120 | 121 | out_cnts_local[p] += regptr[p] - (p << LOG_BATCH); 122 | } 123 | 124 | #pragma omp barrier 125 | } 126 | 127 | /*compute the offsets at which each thread writes by doing the count and then doing prefix sum 128 | this is not part of runtime measurements*/ 129 | void partition_prepare_payload (int* R, int* P, size_t n, uint32_t log_parts, uint32_t first_bit, 130 | int* R_sock[2], int* out_sock[2], 131 | int* P_sock[2], int* pout_sock[2], 132 | size_t* out_offsets[2], size_t total[2], size_t* offsets_GPU, uint32_t num_threads) { 133 | uint32_t parts = (1 << log_parts); 134 | uint32_t parts_mask = parts - 1; 135 | 136 | #pragma omp parallel num_threads(num_threads) 137 | { 138 | uint32_t threadIdx = omp_get_thread_num(); 139 | uint32_t socket = sched_getcpu() % 2; 140 | 141 | for (size_t t = threadIdx; t < (n + OMP_MEMORY_STEP - 1)/OMP_MEMORY_STEP; t += num_threads) { 142 | int end = ((t+1)*OMP_MEMORY_STEP < n) ? 
143 | OMP_MEMORY_STEP : 144 | n - t*OMP_MEMORY_STEP; 145 | 146 | for (int i = 0; i < end; i++) { 147 | R_sock[socket][t*OMP_MEMORY_STEP + i] = R[t*OMP_MEMORY_STEP + i]; 148 | P_sock[socket][t*OMP_MEMORY_STEP + i] = P[t*OMP_MEMORY_STEP + i]; 149 | 150 | uint32_t partition = (hasht(R[t*OMP_MEMORY_STEP + i]) >> first_bit) & parts_mask; 151 | out_offsets[socket][partition + threadIdx*OMP_MEMORY_STEP] += 1; 152 | } 153 | } 154 | } 155 | 156 | size_t prefix1 = 0; 157 | 158 | for (int i = 0; i < parts; i++) { 159 | size_t base = prefix1; 160 | 161 | for (int j = 0; j < num_threads; j++) { 162 | size_t temp = out_offsets[0][i + j*OMP_MEMORY_STEP]; 163 | out_offsets[0][i + j*OMP_MEMORY_STEP] = prefix1; 164 | 165 | offsets_GPU[i*num_threads*4 + 2*j] = prefix1 - base; 166 | 167 | prefix1 += temp; 168 | 169 | offsets_GPU[i*num_threads*4 + 2*j + 1] = prefix1 - base; 170 | 171 | prefix1 = ((prefix1 + 31)/32)*32; 172 | } 173 | 174 | for (int j = 0; j < num_threads; j++) { 175 | size_t temp = out_offsets[1][i + j*OMP_MEMORY_STEP]; 176 | out_offsets[1][i + j*OMP_MEMORY_STEP] = prefix1; 177 | 178 | offsets_GPU[i*num_threads*4 + num_threads*2 + 2*j] = prefix1 - base; 179 | 180 | prefix1 += temp; 181 | 182 | offsets_GPU[i*num_threads*4 + num_threads*2 + 2*j + 1] = prefix1 - base; 183 | 184 | 185 | prefix1 = ((prefix1 + 31)/32)*32; 186 | } 187 | 188 | double fraction = ((double) (prefix1 - base))/n; 189 | } 190 | 191 | total[0] = prefix1; 192 | total[1] = prefix1; 193 | 194 | #pragma omp parallel num_threads(num_threads) 195 | { 196 | uint32_t threadIdx = omp_get_thread_num(); 197 | uint32_t socket = sched_getcpu() % 2; 198 | 199 | /*test run, I use it to warm up the memory (make sure it is allocated by the time I access it)*/ 200 | partitions_host_omp_nontemporal_payload(R_sock[socket], P_sock[socket], out_offsets[socket], out_sock[socket], pout_sock[socket], n, log_parts, first_bit, threadIdx, num_threads); 201 | #pragma omp barrier 202 | } 203 | 204 | 205 | double t1 = cpuSeconds(); 206 | 207 | #pragma omp parallel num_threads(num_threads) 208 | { 209 | uint32_t threadIdx = omp_get_thread_num(); 210 | uint32_t socket = sched_getcpu() % 2; 211 | 212 | partitions_host_omp_nontemporal_payload(R_sock[socket], P_sock[socket], out_offsets[socket], out_sock[socket], pout_sock[socket], n, log_parts, first_bit, threadIdx, num_threads); 213 | #pragma omp barrier 214 | } 215 | 216 | double t2 = cpuSeconds(); 217 | 218 | printf("bw %f MB/s\n", (n * sizeof(int)) / 1000000 / (t2 - t1)); 219 | 220 | } 221 | 222 | /*this function handles the multithreaded partitioning*/ 223 | void partition_do_payload (int* R_sock[2], int* out_sock[2], int* P_sock[2], int* pout_sock[2], size_t* out_offsets[2], size_t n, uint32_t log_parts, uint32_t first_bit, uint32_t num_threads) { 224 | #pragma omp parallel num_threads(num_threads) 225 | { 226 | uint32_t threadIdx = omp_get_thread_num(); 227 | uint32_t socket = sched_getcpu() % 2; 228 | 229 | partitions_host_omp_nontemporal_payload(R_sock[socket], P_sock[socket], out_offsets[socket], out_sock[socket], pout_sock[socket], n, log_parts, first_bit, threadIdx, num_threads); 230 | #pragma omp barrier 231 | } 232 | } 233 | 234 | /*this function handles the multithreaded numa copy (useful for staging between sockets before transfer). 
I use only some thread to avoid eating away bandwidth from PCIe*/ 235 | void numa_copy_multithread (int* __restrict__ dest, int* __restrict__ src, int n) { 236 | #pragma omp parallel num_threads(OMP_PARALLELISM2) 237 | { 238 | uint32_t threadIdx = omp_get_thread_num() % (OMP_PARALLELISM2/2); 239 | uint32_t socket = sched_getcpu() % 2; 240 | 241 | if (socket == 1) 242 | for (size_t t = threadIdx; t < (n + OMP_MEMORY_STEP - 1)/OMP_MEMORY_STEP; t += OMP_PARALLELISM2/2) { 243 | int end = ((t+1)*OMP_MEMORY_STEP < n) ? 244 | OMP_MEMORY_STEP : 245 | n - t*OMP_MEMORY_STEP; 246 | 247 | for (int i = 0; i < end; i += 8) { 248 | __m256i data = _mm256_load_si256 ((__m256i*) &src[t*OMP_MEMORY_STEP+i]); 249 | _mm256_stream_si256 ((__m256i*) &dest[t*OMP_MEMORY_STEP+i], data); 250 | } 251 | } 252 | } 253 | } 254 | 255 | 256 | /*functions used to find which partitions to batch together*/ 257 | 258 | 259 | void sort (int* key, int* val, int n) { 260 | if (n <= 1) 261 | return; 262 | 263 | int k = 1; 264 | int pivot = key[0]; 265 | 266 | for (int i = 1; i < n; i++) { 267 | if (key[i] >= pivot) { 268 | int temp = key[i]; 269 | key[i] = key[k]; 270 | key[k] = temp; 271 | k++; 272 | } 273 | } 274 | 275 | key[0] = key[k-1]; 276 | key[k-1] = pivot; 277 | 278 | sort (key, val, k-1); 279 | sort (key + k, val + k, n-k); 280 | } 281 | 282 | void shuffle (std::list& chosen, int* weight_global, int maxw, std::list& output) { 283 | int n = chosen.size(); 284 | int cnt = 0; 285 | int totalw = 0; 286 | 287 | int* alias = new int[n]; 288 | int* weight = new int[n]; 289 | 290 | for (std::list::iterator it = chosen.begin(); it != chosen.end(); ++it) { 291 | alias[cnt] = *it; 292 | weight[cnt] = weight_global[*it]; 293 | totalw = totalw + weight[cnt]; 294 | cnt++; 295 | } 296 | 297 | sort (weight, alias, n); 298 | 299 | for (int i = 0; i < n; i++) 300 | output.push_back(alias[i]); 301 | 302 | delete[] alias; 303 | delete[] weight; 304 | } 305 | 306 | 307 | void knapSack (std::list& candidates, int* weight_global, double* gain_global, std::list& output, std::list& remainder) { 308 | int n = candidates.size(); 309 | int w = PARTS_RESIDENT+1; 310 | int cnt = 0; 311 | 312 | int* weight = new int[n]; 313 | double* gain = new double[n]; 314 | int* alias = new int[n]; 315 | 316 | for (std::list::iterator it = candidates.begin(); it != candidates.end(); ++it) { 317 | alias[cnt] = *it; 318 | gain[cnt] = gain_global[*it]; 319 | weight[cnt] = weight_global[*it]; 320 | cnt++; 321 | } 322 | 323 | 324 | double** matrix = new double*[n+1]; 325 | 326 | for (int i = 0; i < n+1; i++) { 327 | matrix[i] = new double[w+1]; 328 | 329 | for (int j = 0; j < w+1; j++) 330 | matrix[i][j] = 0.0; 331 | } 332 | 333 | for (int i = 0; i < n+1; i++) { 334 | int wt = (i > 0)? weight[i-1] : 0; 335 | double g = (i > 0)? gain[i-1] : 0.0; 336 | 337 | for (int j = 0; j < w+1; j++) { 338 | if (i == 0 || j == 0) 339 | matrix[i][j] = 0.0; 340 | else if (wt <= j) 341 | matrix[i][j] = (matrix[i-1][j] + 0.000001 < matrix[i-1][j-wt] + g)? 
matrix[i-1][j-wt] + g : matrix[i-1][j]; 342 | else 343 | matrix[i][j] = matrix[i-1][j]; 344 | } 345 | } 346 | 347 | int t = PARTS_RESIDENT; 348 | int m = n; 349 | std::list pr_output; 350 | 351 | while (t > 0 && m != 0) { 352 | for (int i = m; i > 0; i--) 353 | if (matrix[i][t] > matrix[i-1][t] + 0.000001) { 354 | pr_output.push_back(alias[i-1]); 355 | t -= weight[i-1]; 356 | m = i-1; 357 | break; 358 | } else { 359 | remainder.push_back(alias[i-1]); 360 | } 361 | } 362 | 363 | shuffle (pr_output, weight_global, PARTS_RESIDENT, output); 364 | 365 | for (int i = m; i > 0; i--) { 366 | remainder.push_back(alias[i-1]); 367 | } 368 | 369 | delete[] weight; 370 | delete[] gain; 371 | delete[] alias; 372 | 373 | for (int i = 0; i < n+1; i++) 374 | delete[] matrix[i]; 375 | 376 | delete[] matrix; 377 | } 378 | 379 | #include 380 | 381 | void groupOptimal (double* gain, int n, std::list >& output) { 382 | std::list candidates; 383 | int* weight = new int[n]; 384 | 385 | for (int i = 0; i < n; i++) { 386 | candidates.push_back(i); 387 | weight[i] = ceil(gain[i]); 388 | } 389 | 390 | while (candidates.empty() == false) { 391 | std::list out; 392 | std::list remainder; 393 | 394 | knapSack (candidates, weight, gain, out, remainder); 395 | 396 | output.push_back(out); 397 | 398 | candidates = remainder; 399 | } 400 | 401 | 402 | delete[] weight; 403 | } 404 | 405 | void groupOptimal2 (double* gain, int n, std::list >& output) { 406 | std::list candidates; 407 | std::list* buckets = new std::list [2]; 408 | int* weight = new int[n]; 409 | 410 | for (int i = 0; i < n; i++) { 411 | candidates.push_back(i); 412 | weight[i] = ceil(gain[i]); 413 | } 414 | 415 | std::list out; 416 | std::list remainder; 417 | knapSack (candidates, weight, gain, out, remainder); 418 | 419 | output.push_back(out); 420 | 421 | for (std::list::iterator it = remainder.begin(); it != remainder.end(); ++it) { 422 | buckets[weight[*it] - 1].push_back(*it); 423 | } 424 | 425 | std::list > out2; 426 | for (std::list::iterator it = buckets[1].begin(); it != buckets[1].end(); ++it) { 427 | std::list new_out; 428 | new_out.push_back(*it); 429 | out2.push_back(new_out); 430 | } 431 | 432 | for (std::list >::iterator it = out2.begin(); it != out2.end(); ++it) { 433 | for (int i = 0; i < 3; i++) { 434 | int next = buckets[0].front(); 435 | (*it).push_back(next); 436 | buckets[0].pop_front(); 437 | 438 | if (buckets[0].empty()) 439 | break; 440 | } 441 | 442 | output.push_back(*it); 443 | 444 | if (buckets[0].empty()) { 445 | ++it; 446 | 447 | while (it != out2.end()) { 448 | output.push_back(*it); 449 | ++it; 450 | } 451 | 452 | break; 453 | } 454 | } 455 | 456 | while (buckets[0].empty() == false) { 457 | std::list last; 458 | 459 | while (last.size() < PARTS_RESIDENT && buckets[0].empty() == false) { 460 | int next = buckets[0].front(); 461 | last.push_back(next); 462 | buckets[0].pop_front(); 463 | } 464 | 465 | output.push_back(last); 466 | } 467 | 468 | delete[] weight; 469 | } -------------------------------------------------------------------------------- /src/partition-primitives.cuh: -------------------------------------------------------------------------------- 1 | /*Copyright (c) 2018 Data Intensive Applications and Systems Laboratory (DIAS) 2 | Ecole Polytechnique Federale de Lausanne 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 
7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 20 | SOFTWARE.*/ 21 | 22 | #ifndef PARTITION_PRIMITIVES_HPP_ 23 | #define PARTITION_PRIMITIVES_HPP_ 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | 30 | #include 31 | 32 | #include 33 | #include 34 | 35 | #include "common.h" 36 | #include "common-host.h" 37 | 38 | #define OMP_PARALLELISM1 16 39 | #define OMP_PARALLELISM2 16 40 | #define OMP_MEMORY_STEP 4096 41 | #define LOG_PARTS_OUTER 4 42 | #define PARTS_RESIDENT 5 43 | 44 | void partitions_host_omp_nontemporal_payload( 45 | const int32_t * __restrict__ S, 46 | const int32_t * __restrict__ P, 47 | size_t * __restrict__ out_cnts, 48 | int32_t * __restrict__ output_S, 49 | int32_t * __restrict__ output_P, 50 | const size_t cnt, 51 | const uint32_t log_parts, 52 | const uint32_t first_bit, 53 | const uint32_t threadIdx, 54 | const uint32_t nthreads); 55 | 56 | void partition_prepare_payload (int* R, int* P, size_t n, uint32_t log_parts, uint32_t first_bit, 57 | int* R_sock[2], int* out_sock[2], 58 | int* P_sock[2], int* pout_sock[2], 59 | size_t* out_offsets[2], size_t total[2], size_t* offsets_GPU, uint32_t num_threads); 60 | 61 | void partition_do_payload (int* R_sock[2], int* out_sock[2], int* P_sock[2], int* pout_sock[2], size_t* out_offsets[2], size_t n, uint32_t log_parts, uint32_t first_bit, uint32_t num_threads); 62 | 63 | void numa_copy_multithread (int* __restrict__ dest, int* __restrict__ src, int n); 64 | 65 | void sort (int* key, int* val, int n); 66 | 67 | void shuffle (std::list& chosen, int* weight_global, int maxw, std::list& output); 68 | 69 | void knapSack (std::list& candidates, int* weight_global, double* gain_global, std::list& output, std::list& remainder); 70 | 71 | void groupOptimal (double* gain, int n, std::list >& output); 72 | 73 | void groupOptimal2 (double* gain, int n, std::list >& output); 74 | 75 | #endif --------------------------------------------------------------------------------
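
The software-managed cache in partitions_host_omp_nontemporal_payload interleaves three things: computing the radix partition of each tuple, buffering tuples per partition, and flushing full buffers with AVX2 streaming stores. The stripped-down sketch below shows the same control flow for a single thread and a key column only. It is an illustration, not code from this repository: the hash is a hypothetical multiplicative stand-in for the project's hasht(), the batch size is assumed, and plain memcpy replaces the non-temporal _mm256_stream_si256 writes.

#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <vector>

// Hypothetical stand-in for the project's hasht(); any reasonable 32-bit mixer would do.
static inline uint32_t mix_hash(int32_t k) { return ((uint32_t) k) * 2654435761u; }

// Minimal single-threaded radix partitioning with a small per-partition staging buffer.
// out_off[p] must already hold the first output slot of partition p (produced by the
// histogram + prefix-sum pass); it is advanced as batches are flushed.
void partition_sketch(const int32_t* keys, size_t cnt,
                      uint32_t log_parts, uint32_t first_bit,
                      int32_t* out, size_t* out_off) {
    const uint32_t parts = 1u << log_parts;
    const uint32_t mask  = parts - 1;
    const int      batch = 64;                      // assumed; the real code uses 1 << LOG_BATCH

    std::vector<int32_t> cache(parts * batch);      // one staging lane per partition
    std::vector<int>     fill(parts, 0);            // write cursor inside each lane

    for (size_t i = 0; i < cnt; i++) {
        int32_t  key = keys[i];
        uint32_t p   = (mix_hash(key) >> first_bit) & mask;  // radix bits above first_bit pick the partition

        cache[p * batch + fill[p]++] = key;

        if (fill[p] == batch) {                     // lane full: flush one contiguous batch;
            std::memcpy(&out[out_off[p]],           // the real kernel streams it with
                        &cache[p * batch],          // _mm256_stream_si256 to avoid reading
                        batch * sizeof(int32_t));   // the output locations first
            out_off[p] += batch;
            fill[p] = 0;
        }
    }

    for (uint32_t p = 0; p < parts; p++) {          // flush the half-full lanes at the end
        std::memcpy(&out[out_off[p]], &cache[p * batch], fill[p] * sizeof(int32_t));
        out_off[p] += fill[p];
    }
}

Buffering a full batch before writing is what makes the streaming stores in the real kernel effective: every flush touches one contiguous, aligned run in a single partition instead of scattering individual tuples across all partition outputs.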
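
Before that kernel can run, partition_prepare_payload has to decide where each thread writes: it counts, per thread and per partition, how many tuples the thread will emit, then takes an exclusive prefix sum over (partition, socket, thread), rounding every thread's region up to a multiple of 32 elements so that each region starts on a vector-friendly boundary. The following is a compact, single-socket sketch of that counting-to-offsets step; it is an assumption-laden simplification that drops the two-socket split and the offsets_GPU bookkeeping.

#include <cstddef>
#include <cstdint>
#include <vector>

// Exclusive prefix sum over per-thread, per-partition counts, padded to 32 elements,
// mirroring the layout the partitioning pass expects.
// counts[t][p] = number of tuples thread t will emit into partition p (from a first pass).
std::vector<std::vector<size_t>> compute_offsets(const std::vector<std::vector<size_t>>& counts,
                                                 uint32_t parts, uint32_t num_threads) {
    std::vector<std::vector<size_t>> offsets(num_threads, std::vector<size_t>(parts, 0));
    size_t prefix = 0;

    for (uint32_t p = 0; p < parts; p++) {            // partitions are laid out one after another
        for (uint32_t t = 0; t < num_threads; t++) {  // threads get disjoint ranges inside a partition
            offsets[t][p] = prefix;                   // first slot thread t writes for partition p
            prefix += counts[t][p];
            prefix = ((prefix + 31) / 32) * 32;       // pad to 32 ints, as in the original prefix loop
        }
    }
    return offsets;                                   // prefix now also gives the padded total size
}

Because the offsets are computed up front, the partitioning pass is write-only with respect to the output arrays, which is what permits the non-temporal stores and what the bandwidth figure printed at the end of partition_prepare_payload measures.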
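
The helpers at the end of partition-primitives.cu (sort, shuffle, knapSack, groupOptimal, groupOptimal2) decide which outer partitions are batched together: each partition gets an integer weight (the ceiling of its gain), knapSack packs a group whose total weight stays within PARTS_RESIDENT using the classic 0/1 dynamic program, and groupOptimal keeps re-running it on the leftovers until every partition belongs to a group. A hedged usage sketch follows; it assumes the std::list template arguments that the listing above lost to extraction are int and std::list<int>, which is what the function bodies imply, and the gain values are made up for illustration.

#include <cstdio>
#include <list>

#include "partition-primitives.cuh"        // groupOptimal(double*, int, std::list<std::list<int>>&), LOG_PARTS_OUTER

int main() {
    const int parts = 1 << LOG_PARTS_OUTER;   // 16 outer partitions in this configuration
    double gain[parts];
    for (int i = 0; i < parts; i++)
        gain[i] = 1.0 + 0.2 * (i % 5);         // illustrative per-partition gains; ceil() of these becomes the weight

    std::list<std::list<int>> groups;
    groupOptimal(gain, parts, groups);         // pack partitions into groups that respect the residency budget

    int g = 0;
    for (const std::list<int>& group : groups) {
        printf("group %d:", g++);
        for (int p : group) printf(" %d", p);  // these partitions are processed together
        printf("\n");
    }
    return 0;
}

Each emitted group has total weight at most PARTS_RESIDENT, the budget of partitions that can be kept resident at once, so the groups can be processed one after another without exceeding that limit.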