├── INSTALL.md ├── .gitignore ├── fineTree ├── CMakeLists.txt ├── Makefile ├── main.cpp └── fineTree.h ├── TODO.md ├── stx_bench ├── Makefile └── stx_bench.cpp ├── Makefile ├── map_test.cpp ├── barrier.h ├── CycleTimer.h ├── main.cpp ├── README.md └── palmtree.h /INSTALL.md: -------------------------------------------------------------------------------- 1 | ### Begin development palmtree 2 | 3 | #### Dependencies 4 | * Boost -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | palmtree 3 | palmtree_test 4 | configure 5 | *.log 6 | build/ 7 | autom4te.cache/ 8 | cscope.out 9 | tags 10 | palmtree_test.dSYM/ 11 | *.data 12 | *.gch 13 | -------------------------------------------------------------------------------- /fineTree/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(fineTree) 3 | 4 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") 5 | 6 | set(SOURCE_FILES main.cpp) 7 | add_executable(fineTree ${SOURCE_FILES}) -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | ### TODOs 2 | 3 | - [ ] Support multi-thread no SIMD 4 | - [ ] Multi thread benchmark 5 | - [ ] Support multi-thread with SIMD on searching and pre-sorting of batch 6 | - [ ] Support multiple values for the same key, and remove of k,v pairs 7 | - [ ] Support scan (or STL like iterator interfaces) 8 | -------------------------------------------------------------------------------- /stx_bench/Makefile: -------------------------------------------------------------------------------- 1 | # CFLAGS=-Werror -Wall -std=c++11 -g -ggdb -I/usr/local/include 2 | CFLAGS=-Werror -Wall -std=c++11 -O3 -I/usr/local/include 3 | LDFLAGS=-L/usr/local/lib -lglog 4 | CC=g++ 5 | 6 | all: stx_bench 7 | 8 | stx_bench: stx_bench.cpp 9 | $(CC) $(CFLAGS) -o stx_bench stx_bench.cpp $(LDFLAGS) 10 | 11 | 12 | clean: 13 | rm -rf stx_bench *.o 14 | -------------------------------------------------------------------------------- /fineTree/Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS=-Werror -Wall -std=c++11 -mavx2 -pedantic -O3 -I/usr/local/include 2 | 3 | LDFLAGS=-L/usr/local/lib -lboost_atomic -lboost_system -lboost_thread -lglog #-ljemalloc 4 | CC=g++ 5 | 6 | all: fineTree_test 7 | 8 | fineTree_test: main.cpp fineTree.h 9 | $(CC) $(CFLAGS) -o fineTree_test main.cpp $(LDFLAGS) 10 | 11 | clean: 12 | rm -rf fineTree_test *.o -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # CFLAGS=-Werror -Wall -std=c++11 -mavx2 -g -ggdb -I/usr/local/include 2 | # CFLAGS=-Werror -Wall -std=c++11 -msse4.1 -pedantic -O3 -I/usr/local/include 3 | # CFLAGS=-Werror -Wall -std=c++11 -mavx2 -g -ggdb -I/usr/local/include 4 | CFLAGS=-Werror -Wall -std=c++11 -mavx2 -O3 -I/usr/local/include 5 | 6 | LDFLAGS=-L/usr/local/lib -lboost_atomic -lboost_system -lboost_thread -lglog -ljemalloc -lpthread 7 | CC=g++ 8 | 9 | all: palmtree_test 10 | 11 | palmtree_test: main.cpp palmtree.h barrier.h 12 | $(CC) $(CFLAGS) -o palmtree_test main.cpp $(LDFLAGS) 13 | 14 | map_test: map_test.cpp 15 | $(CC) -std=c++11 -O3 -o map_test map_test.cpp 16 | time ./map_test 17 | 
18 | clean: 19 | rm -rf palmtree_test map_test *.o 20 | -------------------------------------------------------------------------------- /map_test.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Zrs_y on 4/28/16. 3 | // 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "CycleTimer.h" 10 | 11 | #define TEST_SIZE 10240000 12 | 13 | int main() { 14 | 15 | int *buff = new int[TEST_SIZE]; 16 | for(int i = 0; i < TEST_SIZE; i++) { 17 | buff[i] = i; 18 | } 19 | 20 | //std::random_shuffle(buff, buff + TEST_SIZE); 21 | 22 | auto begin_time = CycleTimer::currentSeconds(); 23 | std::map t; 24 | 25 | for(int i = 0; i < TEST_SIZE; i++) { 26 | //auto kv = buff[i]; 27 | auto kv = i; 28 | t[kv] = kv; 29 | if (t[kv] != kv) { 30 | return 0; 31 | } 32 | } 33 | 34 | auto end_time = CycleTimer::currentSeconds(); 35 | std::cout << "dict's size is " << t.size() << std::endl; 36 | std::cout << "running time is " << end_time - begin_time << " seconds" << std::endl; 37 | delete buff; 38 | return 0; 39 | } 40 | -------------------------------------------------------------------------------- /barrier.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Runshen Zhu on 4/28/16. 3 | // 4 | 5 | #pragma once 6 | #include 7 | 8 | // a re-useable barrier for sync-ing among multiple working threads 9 | class Barrier { 10 | public: 11 | Barrier() = delete; 12 | Barrier(int n): P(n) { 13 | m_generation = 0; 14 | m_count = n; 15 | } 16 | 17 | // wait blocks until all P threads arrive the barrier and call it. 18 | bool wait() { 19 | lock.lock(); 20 | auto gen = m_generation.load(); 21 | 22 | if (--m_count == 0) { 23 | m_generation++; 24 | m_count = P; 25 | lock.unlock(); 26 | return true; 27 | } 28 | 29 | lock.unlock(); 30 | 31 | while (gen == m_generation); 32 | return false; 33 | } 34 | 35 | private: 36 | class spinlock { 37 | // a tick based spinlock 38 | // traditional CAS spin lock has lots of bus traffic 39 | // this implementation is aimed to ease the bus traffic 40 | // 41 | // traditional CAS lock: 42 | // let const int unlock = 1, lock = 0; 43 | // lock() { while( CAS(LOCK_, unlock, lock) == false ) {} } 44 | // 45 | // each CAS call is equal to a bus write, which invalids cache line 46 | // and bring bus traffic. So it's not a good idea to keep CAS-ing on 47 | // the value LOCK_ 48 | // 49 | // a simple way to ease it is to use `test and CAS` approach: 50 | // lock() { 51 | // for(;;) { 52 | // if (LOCK_ == unlock && CAS(LOCK_, unlock, lock) == true) { 53 | // return; 54 | // } 55 | // } 56 | // } 57 | // 58 | // compared with `test and CAS` approach, tick based spin lock takes a further 59 | // step and reduces more traffic than `test and CAS` spin lock. 
(because there are 60 | // still confilicts in `test and CAS` approach) 61 | public: 62 | spinlock() { 63 | next_ticket = 0; 64 | now_serving = 0; 65 | } 66 | 67 | void lock() { 68 | auto my_ticket = next_ticket++; 69 | while(my_ticket != now_serving) ; 70 | } 71 | 72 | void unlock() { 73 | now_serving++; 74 | } 75 | 76 | private: 77 | std::atomic next_ticket; 78 | std::atomic now_serving; 79 | }; 80 | spinlock lock; 81 | 82 | // # of threads that haven't arrived this barrier 83 | std::atomic m_count; 84 | // generation of this barrier 85 | std::atomic m_generation; 86 | // # of threads that are using this barrier 87 | int P; 88 | }; -------------------------------------------------------------------------------- /stx_bench/stx_bench.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Zrs_y on 5/4/16. 3 | // 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "../CycleTimer.h" 12 | 13 | class fast_random { 14 | public: 15 | fast_random(unsigned long seed) : seed(0) { set_seed0(seed); } 16 | 17 | inline unsigned long next() { 18 | return ((unsigned long)next(32) << 32) + next(32); 19 | } 20 | 21 | inline uint32_t next_u32() { return next(32); } 22 | 23 | inline uint16_t next_u16() { return (uint16_t)next(16); } 24 | 25 | /** [0.0, 1.0) */ 26 | inline double next_uniform() { 27 | return (((unsigned long)next(26) << 27) + next(27)) / (double)(1L << 53); 28 | } 29 | 30 | inline char next_char() { return next(8) % 256; } 31 | 32 | inline char next_readable_char() { 33 | static const char readables[] = 34 | "0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"; 35 | return readables[next(6)]; 36 | } 37 | 38 | inline std::string next_string(size_t len) { 39 | std::string s(len, 0); 40 | for (size_t i = 0; i < len; i++) s[i] = next_char(); 41 | return s; 42 | } 43 | 44 | inline std::string next_readable_string(size_t len) { 45 | std::string s(len, 0); 46 | for (size_t i = 0; i < len; i++) s[i] = next_readable_char(); 47 | return s; 48 | } 49 | 50 | inline unsigned long get_seed() { return seed; } 51 | 52 | inline void set_seed(unsigned long seed) { this->seed = seed; } 53 | 54 | private: 55 | inline void set_seed0(unsigned long seed) { 56 | this->seed = (seed ^ 0x5DEECE66DL) & ((1L << 48) - 1); 57 | } 58 | 59 | inline unsigned long next(unsigned int bits) { 60 | seed = (seed * 0x5DEECE66DL + 0xBL) & ((1L << 48) - 1); 61 | return (unsigned long)(seed >> (48 - bits)); 62 | } 63 | 64 | unsigned long seed; 65 | }; 66 | 67 | 68 | void readonly_bench(size_t entry_count, size_t read_count) { 69 | 70 | LOG(INFO) << "Running std map"; 71 | stx::btree_map map; 72 | for (size_t i = 0; i < entry_count; i++) 73 | map.insert(std::make_pair(i, i)); 74 | 75 | fast_random rng(time(0)); 76 | auto start = CycleTimer::currentSeconds(); 77 | for (size_t i = 0; i < read_count; i++) { 78 | int rand_key = rng.next_u32() % entry_count; 79 | map.find(rand_key); 80 | } 81 | auto end = CycleTimer::currentSeconds(); 82 | LOG(INFO) << "stx map run for " << end-start << "s, " << "thput:" << std::fixed << read_count/(end-start)/1000 << " K rps"; 83 | 84 | } 85 | 86 | int main() { 87 | readonly_bench(1024*512, 1024*1024*10); 88 | return 0; 89 | } 90 | -------------------------------------------------------------------------------- /fineTree/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "fineTree.h" 
6 | #include "../CycleTimer.h" 7 | using namespace std; 8 | 9 | class fast_random { 10 | public: 11 | fast_random(unsigned long seed) : seed(0) { set_seed0(seed); } 12 | 13 | inline unsigned long next() { 14 | return ((unsigned long)next(32) << 32) + next(32); 15 | } 16 | 17 | inline uint32_t next_u32() { return next(32); } 18 | 19 | inline uint16_t next_u16() { return (uint16_t)next(16); } 20 | 21 | /** [0.0, 1.0) */ 22 | inline double next_uniform() { 23 | return (((unsigned long)next(26) << 27) + next(27)) / (double)(1L << 53); 24 | } 25 | 26 | inline char next_char() { return next(8) % 256; } 27 | 28 | inline char next_readable_char() { 29 | static const char readables[] = 30 | "0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"; 31 | return readables[next(6)]; 32 | } 33 | 34 | inline std::string next_string(size_t len) { 35 | std::string s(len, 0); 36 | for (size_t i = 0; i < len; i++) s[i] = next_char(); 37 | return s; 38 | } 39 | 40 | inline std::string next_readable_string(size_t len) { 41 | std::string s(len, 0); 42 | for (size_t i = 0; i < len; i++) s[i] = next_readable_char(); 43 | return s; 44 | } 45 | 46 | inline unsigned long get_seed() { return seed; } 47 | 48 | inline void set_seed(unsigned long seed) { this->seed = seed; } 49 | 50 | private: 51 | inline void set_seed0(unsigned long seed) { 52 | this->seed = (seed ^ 0x5DEECE66DL) & ((1L << 48) - 1); 53 | } 54 | 55 | inline unsigned long next(unsigned int bits) { 56 | seed = (seed * 0x5DEECE66DL + 0xBL) & ((1L << 48) - 1); 57 | return (unsigned long)(seed >> (48 - bits)); 58 | } 59 | 60 | unsigned long seed; 61 | }; 62 | 63 | 64 | int main() { 65 | cout << "Hello, World!" << endl; 66 | 67 | fineTree fTree(0xffffffff); 68 | 69 | auto entry_count = 1024 * 51200; 70 | auto read_count = 1024*1024*1; 71 | int *buff = new int[entry_count]; 72 | for(int i = 0; i < entry_count; i++) { 73 | buff[i] = i; 74 | } 75 | 76 | std::random_shuffle(buff, buff + entry_count); 77 | 78 | for(int j = 0; j < entry_count; j++) { 79 | auto kv = buff[j]; 80 | fTree.insert(kv, kv); 81 | } 82 | auto fp = &fTree; 83 | 84 | 85 | auto start = CycleTimer::currentSeconds(); 86 | std::vector threads; 87 | int w_n = 4; 88 | for(int j = 0; j < w_n; j ++) { 89 | threads.push_back(std::thread([fp, w_n, read_count, entry_count]() { 90 | fast_random rng(time(0)); 91 | for (int i = 0; i < read_count / w_n; i++) { 92 | int rand_key = rng.next_u32() % entry_count; 93 | int val; 94 | auto res = fp->search(rand_key, val); 95 | if (res != 0 || val != rand_key) { 96 | LOG(FATAL) << "search fail"; 97 | } 98 | } 99 | })); 100 | } 101 | 102 | for (auto &t : threads) { 103 | t.join(); 104 | } 105 | auto end = CycleTimer::currentSeconds(); 106 | LOG(INFO) << "fineTree run for " << end-start << "s, " << "thput:" << std::fixed << read_count/(end-start)/1000 << " K rps"; 107 | 108 | delete buff; 109 | } -------------------------------------------------------------------------------- /CycleTimer.h: -------------------------------------------------------------------------------- 1 | #ifndef _SYRAH_CYCLE_TIMER_H_ 2 | #define _SYRAH_CYCLE_TIMER_H_ 3 | 4 | #if defined(__APPLE__) 5 | #if defined(__x86_64__) 6 | #include 7 | #else 8 | #include 9 | #include 10 | #endif // __x86_64__ or not 11 | 12 | #include // fprintf 13 | #include // exit 14 | 15 | #elif _WIN32 16 | # include 17 | # include 18 | #else 19 | # include 20 | # include 21 | # include 22 | # include 23 | #endif 24 | 25 | 26 | // This uses the cycle counter of the processor. 
Different 27 | // processors in the system will have different values for this. If 28 | // you process moves across processors, then the delta time you 29 | // measure will likely be incorrect. This is mostly for fine 30 | // grained measurements where the process is likely to be on the 31 | // same processor. For more global things you should use the 32 | // Time interface. 33 | 34 | // Also note that if you processors' speeds change (i.e. processors 35 | // scaling) or if you are in a heterogenous environment, you will 36 | // likely get spurious results. 37 | class CycleTimer { 38 | public: 39 | typedef unsigned long long SysClock; 40 | 41 | ////////// 42 | // Return the current CPU time, in terms of clock ticks. 43 | // Time zero is at some arbitrary point in the past. 44 | static SysClock currentTicks() { 45 | #if defined(__APPLE__) && !defined(__x86_64__) 46 | return mach_absolute_time(); 47 | #elif defined(_WIN32) 48 | LARGE_INTEGER qwTime; 49 | QueryPerformanceCounter(&qwTime); 50 | return qwTime.QuadPart; 51 | #elif defined(__x86_64__) 52 | unsigned int a, d; 53 | asm volatile("rdtsc" : "=a" (a), "=d" (d)); 54 | return static_cast(a) | 55 | (static_cast(d) << 32); 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser. 57 | unsigned int val; 58 | asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val)); 59 | return val; 60 | #else 61 | timespec spec; 62 | clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec); 63 | return CycleTimer::SysClock(static_cast(spec.tv_sec) * 1e9 + static_cast(spec.tv_nsec)); 64 | #endif 65 | } 66 | 67 | ////////// 68 | // Return the current CPU time, in terms of seconds. 69 | // This is slower than currentTicks(). Time zero is at 70 | // some arbitrary point in the past. 71 | static double currentSeconds() { 72 | return currentTicks() * secondsPerTick(); 73 | } 74 | 75 | ////////// 76 | // Return the conversion from seconds to ticks. 77 | static double ticksPerSecond() { 78 | return 1.0/secondsPerTick(); 79 | } 80 | 81 | static const char* tickUnits() { 82 | #if defined(__APPLE__) && !defined(__x86_64__) 83 | return "ns"; 84 | #elif defined(__WIN32__) || defined(__x86_64__) 85 | return "cycles"; 86 | #else 87 | return "ns"; // clock_gettime 88 | #endif 89 | } 90 | 91 | ////////// 92 | // Return the conversion from ticks to seconds. 93 | static double secondsPerTick() { 94 | static bool initialized = false; 95 | static double secondsPerTick_val; 96 | if (initialized) return secondsPerTick_val; 97 | #if defined(__APPLE__) 98 | #ifdef __x86_64__ 99 | int args[] = {CTL_HW, HW_CPU_FREQ}; 100 | unsigned int Hz; 101 | size_t len = sizeof(Hz); 102 | if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) { 103 | fprintf(stderr, "Failed to initialize secondsPerTick_val!\n"); 104 | exit(-1); 105 | } 106 | secondsPerTick_val = 1.0 / (double) Hz; 107 | #else 108 | mach_timebase_info_data_t time_info; 109 | mach_timebase_info(&time_info); 110 | 111 | // Scales to nanoseconds without 1e-9f 112 | secondsPerTick_val = (1e-9*static_cast(time_info.numer))/ 113 | static_cast(time_info.denom); 114 | #endif // x86_64 or not 115 | #elif defined(_WIN32) 116 | LARGE_INTEGER qwTicksPerSec; 117 | QueryPerformanceFrequency(&qwTicksPerSec); 118 | secondsPerTick_val = 1.0/static_cast(qwTicksPerSec.QuadPart); 119 | #else 120 | FILE *fp = fopen("/proc/cpuinfo","r"); 121 | char input[1024]; 122 | if (!fp) { 123 | fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo."); 124 | exit(-1); 125 | } 126 | // In case we don't find it, e.g. 
on the N900 127 | secondsPerTick_val = 1e-9; 128 | while (!feof(fp) && fgets(input, 1024, fp)) { 129 | // NOTE(boulos): Because reading cpuinfo depends on dynamic 130 | // frequency scaling it's better to read the @ sign first 131 | float GHz, MHz; 132 | if (strstr(input, "model name")) { 133 | char* at_sign = strstr(input, "@"); 134 | if (at_sign) { 135 | char* after_at = at_sign + 1; 136 | char* GHz_str = strstr(after_at, "GHz"); 137 | char* MHz_str = strstr(after_at, "MHz"); 138 | if (GHz_str) { 139 | *GHz_str = '\0'; 140 | if (1 == sscanf(after_at, "%f", &GHz)) { 141 | //printf("GHz = %f\n", GHz); 142 | secondsPerTick_val = 1e-9f / GHz; 143 | break; 144 | } 145 | } else if (MHz_str) { 146 | *MHz_str = '\0'; 147 | if (1 == sscanf(after_at, "%f", &MHz)) { 148 | //printf("MHz = %f\n", MHz); 149 | secondsPerTick_val = 1e-6f / GHz; 150 | break; 151 | } 152 | } 153 | } 154 | } else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) { 155 | //printf("MHz = %f\n", MHz); 156 | secondsPerTick_val = 1e-6f / MHz; 157 | break; 158 | } 159 | } 160 | fclose(fp); 161 | #endif 162 | 163 | initialized = true; 164 | return secondsPerTick_val; 165 | } 166 | 167 | ////////// 168 | // Return the conversion from ticks to milliseconds. 169 | static double msPerTick() { 170 | return secondsPerTick() * 1000.0; 171 | } 172 | 173 | private: 174 | CycleTimer(); 175 | }; 176 | 177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_ 178 | -------------------------------------------------------------------------------- /fineTree/fineTree.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Zrs_y on 5/5/16. 3 | // 4 | 5 | #ifndef FINETREE_FINETREE_H 6 | #define FINETREE_FINETREE_H 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "immintrin.h" 13 | 14 | #define UNUSED __attribute__((unused)) 15 | 16 | enum NodeType { 17 | INNERNODE = 0, 18 | LEAFNODE 19 | }; 20 | 21 | static std::atomic NODE_NUM(0); 22 | 23 | template , 26 | typename KeyComparator = std::less > 27 | class fineTree { 28 | 29 | public: 30 | fineTree(KeyType min_key) { 31 | this->min_key = min_key; 32 | root = new InnerNode(nullptr, 1); 33 | auto child = new LeafNode(root, 0); 34 | 35 | add_item_inner(root, min_key, child); 36 | 37 | } 38 | 39 | int search(KeyType UNUSED key, ValueType &res) { 40 | auto ptr = (InnerNode *)root; 41 | ptr->lock_shared(); 42 | for (;;) { 43 | CHECK(ptr->slot_used > 0) << "Search empty inner node"; 44 | 45 | auto idx = this->search_inner(ptr->keys, ptr->slot_used, key); 46 | CHECK(idx != -1) << "search innerNode fail"; 47 | CHECK(key_less(ptr->keys[idx], key) || key_eq(ptr->keys[idx], key)); 48 | if(idx + 1 < ptr->slot_used) { 49 | CHECK(key_less(key, ptr->keys[idx + 1])); 50 | } 51 | auto child = ptr->values[idx]; 52 | child->lock_shared(); 53 | ptr->unlock_shared(); 54 | if (child->type() == LEAFNODE) { 55 | auto leaf = (LeafNode *)child; 56 | idx = search_leaf(leaf->keys, leaf->slot_used, key); 57 | if (idx < 0) { 58 | child->unlock_shared(); 59 | return -1; 60 | }else{ 61 | res = leaf->values[idx]; 62 | child->unlock_shared(); 63 | return 0; 64 | } 65 | } else { 66 | ptr = (InnerNode *)child; 67 | } 68 | } 69 | 70 | 71 | return 0; 72 | } 73 | 74 | void insert(KeyType UNUSED key, ValueType UNUSED val) { 75 | root->lock_exclusive(); 76 | 77 | auto new_child = insert_inner((InnerNode *)root, key, val); 78 | 79 | if(new_child == nullptr) { 80 | root->unlock_execlusive(); 81 | return; 82 | } 83 | 84 | auto new_root = new InnerNode(nullptr, root->level + 1); 85 | 
root->parent = new_root; 86 | new_child->parent = new_root; 87 | 88 | add_item_inner(new_root, root->keys[0], root); 89 | add_item_inner(new_root, ((InnerNode *)new_child)->keys[0], new_child); 90 | 91 | root = new_root; 92 | root->values[0]->unlock_execlusive(); 93 | } 94 | 95 | void test() { 96 | root->upgrade_lock(); 97 | std::cout << "lock shared" << std::endl; 98 | root->upgrade_lock(); 99 | std::cout << "lock exclusive" << std::endl; 100 | } 101 | 102 | private: 103 | KeyType min_key; 104 | // Max number of slots per inner node 105 | static const int INNER_MAX_SLOT = 64; 106 | // Max number of slots per leaf node 107 | static const int LEAF_MAX_SLOT = 128; 108 | 109 | class spinlock { 110 | public: 111 | spinlock() { 112 | next_ticket = 0; 113 | now_serving = 0; 114 | } 115 | 116 | void lock() { 117 | auto my_ticket = next_ticket++; 118 | while(my_ticket != now_serving) ; 119 | } 120 | 121 | void unlock() { 122 | now_serving++; 123 | } 124 | 125 | private: 126 | std::atomic next_ticket; 127 | std::atomic now_serving; 128 | }; 129 | 130 | 131 | struct Node { 132 | // Number of actually used slots 133 | int slot_used; 134 | int id; 135 | int level; 136 | KeyType lower_bound; 137 | Node *parent; 138 | 139 | 140 | Node() = delete; 141 | Node(Node *p, int lvl): slot_used(0), level(lvl), parent(p) { 142 | id = NODE_NUM++; 143 | }; 144 | 145 | void lock_shared() { 146 | lock.lock(); 147 | } 148 | 149 | void unlock_shared() { 150 | lock.unlock(); 151 | } 152 | 153 | void lock_exclusive() { 154 | lock.lock(); 155 | } 156 | 157 | void unlock_execlusive() { 158 | lock.unlock(); 159 | } 160 | 161 | // upgrade to exclusive lock 162 | void upgrade_lock() { 163 | lock.lock(); 164 | } 165 | 166 | // downgrade to shared lock 167 | void downgrade_lock() { 168 | lock.unlock(); 169 | } 170 | 171 | 172 | // boost::upgrade_mutex lock; 173 | spinlock lock; 174 | virtual ~Node() {}; 175 | virtual std::string to_string() = 0; 176 | virtual NodeType type() const = 0; 177 | virtual bool is_few() = 0; 178 | }; 179 | 180 | struct InnerNode : public Node { 181 | InnerNode() = delete; 182 | InnerNode(Node *parent, int level): Node(parent, level){}; 183 | virtual ~InnerNode() {}; 184 | // Keys for values 185 | KeyType keys[LEAF_MAX_SLOT]; 186 | // Pointers for child nodes 187 | Node *values[LEAF_MAX_SLOT]; 188 | 189 | virtual NodeType type() const { 190 | return INNERNODE; 191 | } 192 | 193 | virtual std::string to_string() { 194 | std::string res; 195 | res += "InnerNode[" + std::to_string(Node::id) + " @ " + std::to_string(Node::level) + "] "; 196 | // res += std::to_string(Node::slot_used); 197 | for (int i = 0 ; i < Node::slot_used ; i++) { 198 | res += " " + std::to_string(keys[i]) + ":" + std::to_string(values[i]->id); 199 | } 200 | return res; 201 | } 202 | 203 | inline bool is_full() const { 204 | return Node::slot_used == MAX_SLOT(); 205 | } 206 | 207 | 208 | inline size_t MAX_SLOT() const { 209 | return LEAF_MAX_SLOT; 210 | } 211 | 212 | virtual inline bool is_few() { 213 | return Node::slot_used < MAX_SLOT()/4 || Node::slot_used == 0; 214 | } 215 | 216 | }; 217 | 218 | struct LeafNode : public Node { 219 | LeafNode() = delete; 220 | LeafNode(Node *parent, int level): Node(parent, level){}; 221 | virtual ~LeafNode() {}; 222 | 223 | // Keys and values for leaf node 224 | KeyType keys[INNER_MAX_SLOT]; 225 | ValueType values[INNER_MAX_SLOT]; 226 | 227 | virtual NodeType type() const { 228 | return LEAFNODE; 229 | } 230 | 231 | virtual std::string to_string() { 232 | std::string res; 233 | res += "LeafNode[" 
+ std::to_string(Node::id) + " @ " + std::to_string(Node::level) + "] "; 234 | 235 | for (int i = 0 ; i < Node::slot_used ; i++) { 236 | res += " " + std::to_string(keys[i]) + ":" + std::to_string(values[i]); 237 | } 238 | return res; 239 | } 240 | 241 | inline bool is_full() const { 242 | return Node::slot_used == MAX_SLOT(); 243 | } 244 | 245 | inline size_t MAX_SLOT() const { 246 | return INNER_MAX_SLOT; 247 | } 248 | 249 | virtual inline bool is_few() { 250 | return Node::slot_used < MAX_SLOT()/4 || Node::slot_used == 0; 251 | } 252 | }; 253 | 254 | // Return true if k1 < k2 255 | inline bool key_less(const KeyType &k1, const KeyType &k2) { 256 | return k1 < k2; 257 | } 258 | // Return true if k1 == k2 259 | inline bool key_eq(const KeyType &k1, const KeyType &k2) { 260 | return k1 == k2; 261 | } 262 | 263 | // Return the index of the largest slot whose key <= @target 264 | // assume there is no duplicated element 265 | int search_inner(const KeyType *input, int size, const KeyType &target) { 266 | // auto bt = CycleTimer::currentTicks(); 267 | int low = 0, high = size - 1; 268 | while (low != high) { 269 | int mid = (low + high) / 2 + 1; 270 | if (key_less(target, input[mid])) { 271 | // target < input[mid] 272 | high = mid - 1; 273 | } 274 | else { 275 | // target >= input[mid]; 276 | low = mid; 277 | } 278 | } 279 | // STAT.add_stat(0, "search_inner", CycleTimer::currentTicks() - bt); 280 | 281 | if (low == size) { 282 | return -1; 283 | } 284 | return low; 285 | } 286 | 287 | int search_leaf(const KeyType *data, int size, const KeyType &target) { 288 | // auto bt = CycleTimer::currentTicks(); 289 | const __m256i keys = _mm256_set1_epi32(target); 290 | 291 | const auto n = size; 292 | const auto rounded = 8 * (n/8); 293 | 294 | for (int i=0; i < rounded; i += 8) { 295 | 296 | const __m256i vec1 = _mm256_loadu_si256(reinterpret_cast(&data[i])); 297 | 298 | const __m256i cmp1 = _mm256_cmpeq_epi32(vec1, keys); 299 | 300 | const uint32_t mask = _mm256_movemask_epi8(cmp1); 301 | 302 | if (mask != 0) { 303 | // STAT.add_stat(0, "search_leaf", CycleTimer::currentTicks() - bt); 304 | return i + __builtin_ctz(mask)/4; 305 | } 306 | } 307 | 308 | for (int i = rounded; i < n; i++) { 309 | if (data[i] == target) { 310 | // STAT.add_stat(0, "search_leaf", CycleTimer::currentTicks() - bt); 311 | return i; 312 | } 313 | } 314 | 315 | // STAT.add_stat(0, "search_leaf", CycleTimer::currentTicks() - bt); 316 | return -1; 317 | } 318 | 319 | 320 | 321 | void add_item_leaf(LeafNode *node, KeyType key, ValueType value) { 322 | auto idx = node->slot_used++; 323 | node->keys[idx] = key; 324 | node->values[idx] = value; 325 | return; 326 | } 327 | 328 | void add_item_inner(InnerNode *node, KeyType key, Node *value) { 329 | // add item to inner node 330 | // ensure it's order 331 | 332 | if(node->slot_used == 0) { 333 | node->keys[0] = key; 334 | node->values[0] = value; 335 | node->slot_used++; 336 | return; 337 | } 338 | 339 | auto idx = search_inner(node->keys, node->slot_used, key); 340 | 341 | CHECK(idx != -1) << "search innerNode fail" << key <<" " <keys[0]; 342 | CHECK(key_less(node->keys[idx], key) || key_eq(node->keys[idx], key)); 343 | if(idx + 1 < node->slot_used) { 344 | CHECK(key_less(key, node->keys[idx + 1])) << "search inner fail"; 345 | } 346 | 347 | auto k = key; 348 | auto v = value; 349 | 350 | for(int i = idx + 1; i < node->slot_used; i++) { 351 | std::swap(node->keys[i], k); 352 | std::swap(node->values[i], v); 353 | } 354 | 355 | node->keys[node->slot_used] = k; 356 | 
node->values[node->slot_used] = v; 357 | node->slot_used++; 358 | } 359 | 360 | Node *insert_leaf(LeafNode *node, KeyType key, ValueType value) { 361 | // assume we hold the exclusive lock of leaf 362 | 363 | 364 | // node not full 365 | // simple add item to leaf 366 | if(!node->is_full()) { 367 | add_item_leaf(node, key, value); 368 | return nullptr; 369 | } 370 | 371 | // otherwise, firstly buff all elements 372 | std::vector> buff; 373 | for(int i = 0; i < node->slot_used; i++) { 374 | buff.push_back(std::make_pair(node->keys[i], node->values[i])); 375 | } 376 | buff.push_back(std::make_pair(key, value)); 377 | 378 | 379 | // sort 380 | std::sort(buff.begin(), buff.end(), [this](const std::pair &p1, const std::pair &p2) { 381 | return key_less(p1.first, p2.first); 382 | }); 383 | 384 | 385 | // split into 2 parts 386 | // store the second half to new node 387 | auto half = buff.size() / 2; 388 | auto itr = buff.begin(); 389 | node->slot_used = 0; 390 | for(int i = 0; i < half; i++) { 391 | add_item_leaf(node, itr->first, itr->second); 392 | itr++; 393 | } 394 | 395 | auto new_child = new LeafNode(node->parent, 0); 396 | 397 | while(itr != buff.end()) { 398 | add_item_leaf(new_child, itr->first, itr->second); 399 | itr++; 400 | } 401 | 402 | 403 | // return the new node to upper layer 404 | return new_child; 405 | } 406 | 407 | Node *insert_inner(InnerNode *node, KeyType key, ValueType value) { 408 | // assume we hold the exclusive lock before entering this function 409 | 410 | // firstly, find the child to insert 411 | auto idx = search_inner(node->keys, node->slot_used, key); 412 | CHECK(idx != -1) << "search fail"; 413 | auto child = node->values[idx]; 414 | Node *new_child = nullptr; 415 | if(child->type() == LEAFNODE) { 416 | child->lock_exclusive(); 417 | new_child = insert_leaf((LeafNode *)child, key, value); 418 | }else { 419 | // child->lock_shared(); 420 | child->lock_exclusive(); 421 | new_child = insert_inner((InnerNode *)child, key, value); 422 | } 423 | 424 | // child not split 425 | if(new_child == nullptr) { 426 | child->unlock_execlusive(); 427 | return nullptr; 428 | } 429 | 430 | // child split 431 | KeyType new_key; 432 | if(new_child->type() == LEAFNODE) { 433 | new_key = ((LeafNode *)new_child)->keys[0]; 434 | }else{ 435 | new_key = ((InnerNode *)new_child)->keys[0]; 436 | } 437 | 438 | // node not split 439 | if(!node->is_full()) { 440 | add_item_inner(node, new_key, new_child); 441 | child->unlock_execlusive(); 442 | return nullptr; 443 | } 444 | 445 | // node also need split 446 | 447 | // lock all children 448 | for(int i = 0; i < node->slot_used; i++) { 449 | if(node->values[i] != child) { 450 | node->values[i]->lock_exclusive(); 451 | } 452 | } 453 | 454 | 455 | // buff all elements 456 | std::vector> buff; 457 | for(int i = 0; i < node->slot_used; i++) { 458 | buff.push_back(std::make_pair(node->keys[i], node->values[i])); 459 | } 460 | buff.push_back(std::make_pair(new_key, new_child)); 461 | 462 | 463 | // sort 464 | std::sort(buff.begin(), buff.end(), [this](const std::pair &p1, const std::pair &p2) { 465 | return key_less(p1.first, p2.first); 466 | }); 467 | 468 | 469 | // store half 470 | auto half = buff.size() / 2; 471 | auto itr = buff.begin(); 472 | node->slot_used = 0; 473 | for(int i = 0; i < half; i++) { 474 | node->keys[i] = itr->first; 475 | node->values[i] = itr->second; 476 | node->slot_used++; 477 | itr++; 478 | } 479 | 480 | // new node store another half 481 | auto new_inner = new InnerNode(node->parent, node->level); 482 | 483 | int i = 
0; 484 | while(itr != buff.end()) { 485 | new_inner->keys[i] = itr->first; 486 | new_inner->values[i] = itr->second; 487 | new_inner->slot_used++; 488 | itr->second->parent = new_inner; 489 | itr++; 490 | i++; 491 | } 492 | 493 | 494 | // unlock children 495 | for(int i = 0; i < node->slot_used; i++) { 496 | if(node->values[i] != new_child) { 497 | node->values[i]->unlock_execlusive(); 498 | } 499 | } 500 | 501 | for(int i = 0; i < new_inner->slot_used; i++) { 502 | if(new_inner->values[i] != new_child) { 503 | new_inner->values[i]->unlock_execlusive(); 504 | } 505 | } 506 | return new_inner; 507 | } 508 | 509 | InnerNode *root; 510 | 511 | }; 512 | 513 | 514 | #endif //FINETREE_FINETREE_H 515 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #define NDEBUG 2 | 3 | #include 4 | #include 5 | #include 6 | #include "palmtree.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "CycleTimer.h" 17 | 18 | #define TEST_SIZE 10240000 19 | using namespace std; 20 | 21 | int worker_num; 22 | 23 | class fast_random { 24 | public: 25 | fast_random(unsigned long seed) : seed(0) { set_seed0(seed); } 26 | 27 | inline unsigned long next() { 28 | return ((unsigned long)next(32) << 32) + next(32); 29 | } 30 | 31 | inline uint32_t next_u32() { return next(32); } 32 | 33 | inline uint16_t next_u16() { return (uint16_t)next(16); } 34 | 35 | /** [0.0, 1.0) */ 36 | inline double next_uniform() { 37 | return (((unsigned long)next(26) << 27) + next(27)) / (double)(1L << 53); 38 | } 39 | 40 | inline char next_char() { return next(8) % 256; } 41 | 42 | inline char next_readable_char() { 43 | static const char readables[] = 44 | "0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"; 45 | return readables[next(6)]; 46 | } 47 | 48 | inline std::string next_string(size_t len) { 49 | std::string s(len, 0); 50 | for (size_t i = 0; i < len; i++) s[i] = next_char(); 51 | return s; 52 | } 53 | 54 | inline std::string next_readable_string(size_t len) { 55 | std::string s(len, 0); 56 | for (size_t i = 0; i < len; i++) s[i] = next_readable_char(); 57 | return s; 58 | } 59 | 60 | inline unsigned long get_seed() { return seed; } 61 | 62 | inline void set_seed(unsigned long seed) { this->seed = seed; } 63 | 64 | private: 65 | inline void set_seed0(unsigned long seed) { 66 | this->seed = (seed ^ 0x5DEECE66DL) & ((1L << 48) - 1); 67 | } 68 | 69 | inline unsigned long next(unsigned int bits) { 70 | seed = (seed * 0x5DEECE66DL + 0xBL) & ((1L << 48) - 1); 71 | return (unsigned long)(seed >> (48 - bits)); 72 | } 73 | 74 | unsigned long seed; 75 | }; 76 | 77 | void test() { 78 | palmtree::PalmTree palmtree(std::numeric_limits::min(), worker_num); 79 | palmtree::PalmTree *palmtreep = &palmtree; 80 | 81 | for (int i = 0; i < 32; i++) { 82 | palmtreep->insert(i, i); 83 | } 84 | 85 | for (int i = 16; i <= 30; i++) { 86 | palmtreep->remove(i); 87 | } 88 | 89 | for (int i = 0; i <= 15; i++) { 90 | palmtreep->remove(i); 91 | } 92 | 93 | palmtreep->remove(31); 94 | 95 | for (int i = 0; i < 32; i++) { 96 | DLOG(INFO) << "Remove " << i; 97 | palmtreep->remove(i); 98 | int res; 99 | DLOG(INFO) << "Find " << i; 100 | bool success = palmtreep->find(i, res); 101 | if (success) { 102 | assert(false); 103 | } else { 104 | DLOG(INFO) << "Thread " << i << " get nothing"; 105 | } 106 | } 107 | 108 | srand(15618); 109 | 110 | 
std::map reference; 111 | for (int i = 10; i < 256; i++) { 112 | int key1 = i; 113 | int value1 = rand() % 10; 114 | int key2 = i - 10; 115 | 116 | palmtreep->insert(key1, value1); 117 | palmtreep->remove(key2); 118 | 119 | reference.emplace(key1, value1); 120 | reference.erase(key2); 121 | } 122 | 123 | for (auto itr = reference.begin(); itr != reference.end(); itr++) { 124 | DLOG(INFO) << itr->first << " " << itr->second; 125 | } 126 | 127 | for (int i = 246; i < 256; i++) { 128 | int res; 129 | bool suc = palmtreep->find(i, res); 130 | CHECK(suc == true && res == reference[i]) << "Should find " << i << " " << reference[i]; 131 | } 132 | 133 | while(palmtree.task_nums > 0) 134 | ; 135 | } 136 | 137 | void bench() { 138 | int *buff = new int[TEST_SIZE]; 139 | for(int i = 0; i < TEST_SIZE; i++) { 140 | buff[i] = i; 141 | } 142 | 143 | std::random_shuffle(buff, buff + TEST_SIZE); 144 | 145 | palmtree::PalmTree palmtree(std::numeric_limits::min(), worker_num); 146 | palmtree::PalmTree *palmtreep = &palmtree; 147 | 148 | std::vector threads; 149 | 150 | double start = CycleTimer::currentSeconds(); 151 | 152 | for (int i = 0; i < 1; i++) { 153 | threads.push_back(std::thread([palmtreep, i, buff]() { 154 | for(int j = 0; j < TEST_SIZE; j++) { 155 | auto kv = buff[j]; 156 | int res; 157 | palmtreep->insert(kv, kv); 158 | palmtreep->find(kv, res); 159 | } 160 | })); 161 | } 162 | 163 | for (auto &thread : threads) 164 | thread.join(); 165 | 166 | delete buff; 167 | LOG(INFO) << "task_nums: " << palmtree.task_nums; 168 | while(palmtree.task_nums > 0) 169 | ; 170 | 171 | double end = CycleTimer::currentSeconds(); 172 | cout << "run for " << end-start << "s"; 173 | } 174 | 175 | // Populate a palm tree with @entry_count entries 176 | void populate_palm_tree(palmtree::PalmTree *palmtreep, size_t entry_count) { 177 | int *buff = new int[entry_count]; 178 | for(size_t i = 0; i < entry_count; i++) { 179 | buff[i] = i; 180 | } 181 | 182 | std::random_shuffle(buff, buff + entry_count); 183 | 184 | for(size_t j = 0; j < entry_count; j++) { 185 | // auto kv = buff[j]; 186 | palmtreep->insert(2 * j, 2 * j); 187 | } 188 | 189 | delete buff; 190 | 191 | // Wait for task finished 192 | palmtreep->wait_finish(); 193 | } 194 | 195 | 196 | void readonly_skew(size_t entry_count, size_t op_count, float contention_ratio, bool run_std_map = false) { 197 | LOG(INFO) << "Begin palmtree readonly skew benchmark, contention ratio: " << contention_ratio; 198 | palmtree::PalmTree palmtree(std::numeric_limits::min(), worker_num); 199 | palmtree::PalmTree *palmtreep = &palmtree; 200 | 201 | populate_palm_tree(palmtreep, entry_count); 202 | // Reset the metrics 203 | palmtreep->reset_metric(); 204 | 205 | // Wait for insertion finished 206 | LOG(INFO) << entry_count << " entries inserted"; 207 | 208 | fast_random rng(time(0)); 209 | 210 | double start = CycleTimer::currentSeconds(); 211 | LOG(INFO) << "Benchmark started"; 212 | 213 | int one_step = entry_count / (palmtreep->batch_size()+1); 214 | int last_key = 0; 215 | int batch_task_count = 0; 216 | for (size_t i = 0; i < op_count; i++) { 217 | last_key += rng.next_u32() % one_step; 218 | last_key %= entry_count; 219 | batch_task_count++; 220 | 221 | auto id = rng.next_uniform(); 222 | auto k = last_key; 223 | if(id < contention_ratio) { 224 | k = (int) (k * 0.2); 225 | } 226 | int res; 227 | palmtreep->find(2 * k, res); 228 | if (batch_task_count >= palmtreep->batch_size()) { 229 | batch_task_count = 0; 230 | last_key = 0; 231 | } 232 | } 233 | 234 | LOG(INFO) << 
palmtreep->task_nums << " left"; 235 | palmtreep->wait_finish(); 236 | double end = CycleTimer::currentSeconds(); 237 | LOG(INFO) << "Palmtree run for " << end-start << "s, " << "thput: " << std::fixed << 2 * op_count/(end-start)/1000 << " K rps"; 238 | double runtime = (end-start) / 2; 239 | 240 | if (run_std_map) { 241 | LOG(INFO) << "Running std map"; 242 | std::map map; 243 | for (size_t i = 0; i < entry_count; i++) 244 | map.insert(std::make_pair(i, i)); 245 | 246 | pthread_rwlock_t lock_rw = PTHREAD_RWLOCK_INITIALIZER; 247 | pthread_rwlock_t *l = &lock_rw; 248 | 249 | auto map_p = ↦ 250 | start = CycleTimer::currentSeconds(); 251 | std::vector threads; 252 | 253 | 254 | auto w_n = worker_num; 255 | for(int i = 0; i < w_n; i++) { 256 | threads.push_back(std::thread([map_p, op_count, entry_count, l, w_n, contention_ratio]() { 257 | fast_random rng(time(0)); 258 | for (size_t i = 0; i < op_count / w_n; i++) { 259 | int rand_key = rng.next_u32() % entry_count; 260 | auto id = rng.next_uniform(); 261 | if(id < contention_ratio) { 262 | rand_key = (int) (rand_key * 0.2); 263 | } 264 | pthread_rwlock_rdlock(l); 265 | map_p->find(rand_key); 266 | pthread_rwlock_unlock(l); 267 | } 268 | })); 269 | } 270 | 271 | for(auto &t : threads) { 272 | t.join(); 273 | } 274 | end = CycleTimer::currentSeconds(); 275 | LOG(INFO) << "std::map run for " << end-start << "s, " << "thput:" << std::fixed << op_count/(end-start)/1000 << " K rps"; 276 | double runtime_ref = end-start; 277 | LOG(INFO) << "SPEEDUP over std map: " << runtime_ref / runtime << " X"; 278 | 279 | threads.clear(); 280 | 281 | // stx 282 | LOG(INFO) << "Running stx map"; 283 | stx::btree_map stx_map; 284 | for (size_t i = 0; i < entry_count; i++) 285 | stx_map.insert(std::make_pair(i, i)); 286 | 287 | start = CycleTimer::currentSeconds(); 288 | auto stx_p = &stx_map; 289 | for(int i = 0; i < w_n; i++) { 290 | threads.push_back(std::thread([stx_p, op_count, entry_count, l, w_n, contention_ratio]() { 291 | fast_random rng(time(0)); 292 | for (size_t i = 0; i < op_count / w_n; i++) { 293 | int rand_key = rng.next_u32() % entry_count; 294 | auto id = rng.next_uniform(); 295 | if(id < contention_ratio) { 296 | rand_key = (int) (rand_key * 0.2); 297 | } 298 | pthread_rwlock_rdlock(l); 299 | stx_p->find(rand_key); 300 | pthread_rwlock_unlock(l); 301 | } 302 | })); 303 | } 304 | 305 | for(auto &t : threads) { 306 | t.join(); 307 | } 308 | 309 | end = CycleTimer::currentSeconds(); 310 | LOG(INFO) << "stx map run for " << end-start << "s, " << "thput:" << std::fixed << op_count/(end-start)/1000 << " K rps"; 311 | 312 | runtime_ref = end-start; 313 | LOG(INFO) << "SPEEDUP over PalmTree: " << runtime_ref / runtime << " X"; 314 | } 315 | } 316 | 317 | 318 | 319 | 320 | 321 | void update_skew(size_t entry_count, size_t op_count, float contention_ratio, bool run_std_map = false) { 322 | LOG(INFO) << "Begin palmtree update skew benchmark, contention ratio: " << contention_ratio; 323 | // palmtree::PalmTree palmtree(std::numeric_limits::min(), worker_num); 324 | palmtree::PalmTree *palmtreep = new palmtree::PalmTree (std::numeric_limits::min(), worker_num);; 325 | 326 | populate_palm_tree(palmtreep, entry_count); 327 | // Reset the metrics 328 | palmtreep->reset_metric(); 329 | 330 | // Wait for insertion finished 331 | LOG(INFO) << entry_count << " entries inserted"; 332 | 333 | fast_random rng(time(0)); 334 | 335 | double start = CycleTimer::currentSeconds(); 336 | LOG(INFO) << "Benchmark started"; 337 | 338 | int one_step = 2 * entry_count / 
(palmtreep->batch_size()+1); 339 | int last_key = 0; 340 | int batch_task_count = 0; 341 | for (size_t i = 0; i < op_count; i++) { 342 | last_key += rng.next_u32() % one_step; 343 | last_key %= entry_count; 344 | batch_task_count++; 345 | auto id = rng.next_uniform(); 346 | int k = last_key; 347 | if(id < contention_ratio) { 348 | k = (int) (k * 0.2); 349 | } 350 | 351 | id = rng.next_uniform(); 352 | 353 | if(id < 0.1) { 354 | palmtreep->insert(last_key, last_key); 355 | } else if(id < 0.2) { 356 | palmtreep->remove(last_key); 357 | }else { 358 | int res; 359 | palmtreep->find(last_key, res); 360 | } 361 | 362 | if (batch_task_count >= palmtreep->batch_size()) { 363 | batch_task_count = 0; 364 | last_key = 0; 365 | } 366 | } 367 | 368 | LOG(INFO) << palmtreep->task_nums << " left"; 369 | palmtreep->wait_finish(); 370 | double end = CycleTimer::currentSeconds(); 371 | LOG(INFO) << "Palmtree run for " << end-start << "s, " << "thput: " << std::fixed << 2 * op_count/(end-start)/1000 << " K rps"; 372 | double runtime = (end-start) / 2; 373 | 374 | delete palmtreep; 375 | 376 | if (run_std_map) { 377 | LOG(INFO) << "Running std map"; 378 | std::map map; 379 | for (size_t i = 0; i < entry_count; i++) 380 | map.insert(std::make_pair(i, i)); 381 | 382 | 383 | pthread_rwlock_t lock_rw = PTHREAD_RWLOCK_INITIALIZER; 384 | pthread_rwlock_t *l = &lock_rw; 385 | 386 | 387 | start = CycleTimer::currentSeconds(); 388 | auto map_p = ↦ 389 | start = CycleTimer::currentSeconds(); 390 | std::vector threads; 391 | 392 | auto w_n = worker_num; 393 | for(int i = 0; i < w_n; i++) { 394 | threads.push_back(std::thread([map_p, op_count, entry_count, l, w_n, contention_ratio]() { 395 | fast_random rng(time(0)); 396 | 397 | auto map = *map_p; 398 | for (size_t i = 0; i < op_count / w_n; i++) { 399 | int k = rng.next_u32() % entry_count; 400 | auto id = rng.next_uniform(); 401 | 402 | auto rand_key = k; 403 | if(id < contention_ratio) { 404 | rand_key = (int) rand_key * 0.2; 405 | } 406 | id = rng.next_uniform(); 407 | if(id < 0.1) { 408 | pthread_rwlock_wrlock(l); 409 | map[rand_key] = rand_key; 410 | }else if (id < 0.2) { 411 | pthread_rwlock_wrlock(l); 412 | map.erase(rand_key); 413 | }else { 414 | pthread_rwlock_rdlock(l); 415 | map.find(rand_key); 416 | } 417 | pthread_rwlock_unlock(l); 418 | } 419 | })); 420 | } 421 | 422 | for(auto &t : threads) { 423 | t.join(); 424 | } 425 | 426 | threads.clear(); 427 | 428 | end = CycleTimer::currentSeconds(); 429 | LOG(INFO) << "std::map run for " << end-start << "s, " << "thput:" << std::fixed << op_count/(end-start)/1000 << " K rps"; 430 | 431 | double runtime_ref = end-start; 432 | LOG(INFO) << "SPEEDUP over PalmTree: " << runtime_ref / runtime << " X"; 433 | 434 | // stx 435 | LOG(INFO) << "Running stx map"; 436 | stx::btree_map stx_map; 437 | for (size_t i = 0; i < entry_count; i++) 438 | stx_map.insert(std::make_pair(i, i)); 439 | 440 | start = CycleTimer::currentSeconds(); 441 | auto stx_p = &stx_map; 442 | for(int i = 0; i < w_n; i++) { 443 | threads.push_back(std::thread([stx_p, op_count, entry_count, l, w_n, contention_ratio]() { 444 | fast_random rng(time(0)); 445 | auto stx = *stx_p; 446 | for (size_t i = 0; i < op_count / w_n; i++) { 447 | int k = rng.next_u32() % entry_count; 448 | auto id = rng.next_uniform(); 449 | 450 | auto rand_key = k; 451 | if(id < contention_ratio) { 452 | rand_key = (int) rand_key * 0.2; 453 | } 454 | 455 | id = rng.next_uniform(); 456 | if(id < 0.1) { 457 | pthread_rwlock_wrlock(l); 458 | stx.insert(rand_key, rand_key); 459 | }else 
if (id < 0.2) { 460 | pthread_rwlock_wrlock(l); 461 | stx.erase(rand_key); 462 | }else { 463 | pthread_rwlock_rdlock(l); 464 | stx.find(rand_key); 465 | } 466 | 467 | pthread_rwlock_unlock(l); 468 | } 469 | })); 470 | } 471 | 472 | for(auto &t : threads) { 473 | t.join(); 474 | } 475 | 476 | end = CycleTimer::currentSeconds(); 477 | LOG(INFO) << "stx map run for " << end-start << "s, " << "thput:" << std::fixed << op_count/(end-start)/1000 << " K rps"; 478 | 479 | runtime_ref = end-start; 480 | LOG(INFO) << "SPEEDUP over PalmTree: " << runtime_ref / runtime << " X"; 481 | 482 | } 483 | } 484 | 485 | 486 | int main(int argc, char *argv[]) { 487 | // Google logging 488 | FLAGS_logtostderr = 1; 489 | google::InitGoogleLogging(argv[0]); 490 | 491 | if(argc < 5) { 492 | // print usage 493 | cout << "usage example: 8 true r 0.8" << endl; 494 | cout << "\trunning 8 workers, running map to compare performance, readonly, contention ratio 0.8" << endl; 495 | exit(0); 496 | } 497 | 498 | worker_num = atoi(argv[1]); 499 | bool c; 500 | if(strcmp(argv[2], "true") == 0) { 501 | c = true; 502 | }else{ 503 | c = false; 504 | } 505 | 506 | bool r; 507 | if(strcmp(argv[3], "r") == 0) { 508 | r = true; 509 | }else{ 510 | r = false; 511 | } 512 | 513 | float contention_ratio; 514 | 515 | contention_ratio = atof(argv[4]); 516 | 517 | 518 | auto insert = 1024 * 512 * 10; 519 | auto op_num = 1024 * 1024 * 10; 520 | if(r) { 521 | readonly_skew(insert, op_num, contention_ratio, c); 522 | }else { 523 | update_skew(insert, op_num, contention_ratio, c); 524 | } 525 | 526 | return 0; 527 | } 528 | 529 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Summary: 2 | 3 | We have implemented a concurrent lock free B+Tree (called Palm Tree) that scales to 16 cores, with 60M queries per second (QPS) on read only and R/W mixed workload, which is 15.5x speed up comparing to our single thread implementation. Our implementation can also maintain a nearly linear speed up even for skewed workload. 4 | 5 | ### Backgroud: 6 | B+Tree is intensivily used in database management systems (DBMS). All most all relational database system uses B+Tree as the primary data structure for index. Hence the performance of B+Tree index is critical to fast query performance. On the other hand, there are two hardware trends in recent years for DBMS, systems with high core counts and large memory capacity], which results in the rising of in memory database systems. 7 | 8 | The design of B+Tree index data structure of traditional DBMS is far different than that of an in memory database today. Traditional DBMSs assume that the primary storage is on disk (maganetic disk or SSD), and it is fine in most of the case to acquire a latch to provide concurrent accesses to the index because disk IO is anyway slow. However for in main memory DBMS, fetching data from memory is so much faster than from disk, such that the overhead of locking would easily doom the power of underlying hardwares. 9 | 10 | In this sense, a high performance concurrent B+Tree is demanded for next generation main memory DBMS. This project is an effort to explore the pallelisim of B+Tree data structures and make it scalable to higher core counts. 11 | 12 | A B+Tree is an self balancing tree struture that allows searches, scan, insertions and deletions on key/value pairs. 
It is a generalization of the binary search tree, with similar concepts of internal nodes and leaf nodes. Each internal node contains a set of key ranges, and each range points to a subtree holding the data within that range. Each leaf node contains the actual key/value pairs.
13 | 
14 | A B+Tree keeps itself balanced by splitting a leaf or internal node when it becomes too large, and merging nodes when they become too small. In particular, when the root of a B+Tree splits, a new root is allocated and the tree depth increases by one; when an entire layer of the tree merges away, the root descends and the tree depth decreases by one. The split and merge operations are critical for maintaining a balanced tree with similarly sized nodes.
15 | 
16 | To implement a B+Tree, the following (or similar) operations need to be provided:
17 | 
18 | * **search(key)**: searches from the root down for the leaf node that contains *key* and returns that leaf node.
19 | * **add_item(node, key, value)**: adds an item to a node for a given key; the value is either a child pointer (for an internal node) or the actual value (for a leaf node). This operation may cause the node to split.
20 | * **del_item(node, key)**: deletes the item for a given key from a node. As opposed to add_item(), this operation may cause the node to merge.
21 | * **split(node)**: splits a node into multiple nodes; the ranges of the resulting nodes are contiguous and the items are sorted within each node. Returns the new nodes that were split out. The parent of the split node then inserts the newly created child nodes.
22 | * **merge(node)**: if a node contains too few keys, it is merged away. The parent of the merged node re-inserts the merged key/values into other nodes and reclaims the space of the merged child node.
23 | * **handle_root()**: a special handler for the root node, because a split or merge of the root changes the tree depth and a new root may need to be assigned.
24 | 
25 | For our prototype system, we implemented 3 public APIs in C++:
26 | 
27 | * **bool find(const KeyType &key, ValueType &value)**. find() will search for *key* and fill in the corresponding *value*. It returns true if the key/value pair is found, and false otherwise.
28 | * **void delete(const KeyType &key)**. delete() will delete the entry from the tree if *key* is present.
29 | * **void insert(const KeyType &key, const ValueType &value)**. insert() will insert an entry into the tree.
30 | 
31 | ### Approach:
32 | #### Approach #1: Coarse Grained Locking
33 | There are several ways to implement a concurrent B+Tree. The easiest is to use a coarse-grained lock to protect the whole tree; for example, we can use a shared (reader/writer) lock to support `find()`, `delete()` and `insert()`. The strategy is simple: `find()` takes a read lock, as it does not change the structure of the tree, while `delete()` and `insert()` take a write lock, because they modify the tree. The advantage of coarse-grained locking is its simplicity, but it is rarely the optimal solution, since `find()` blocks `delete()` and `insert()`, while `delete()` and `insert()` block all other operations on the tree. A minimal sketch of this strategy is shown below.
34 | 
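To make the coarse-grained strategy concrete, here is a minimal sketch, not code from this repository: `CoarseLockedTree` is a hypothetical name, `std::map` stands in for the underlying B+Tree, and a single pthread reader/writer lock (the same primitive the benchmarks in `main.cpp` use) protects every operation.

```cpp
// Coarse-grained locking sketch: one reader/writer lock guards the whole tree.
#include <pthread.h>
#include <map>

template <typename KeyType, typename ValueType>
class CoarseLockedTree {
 public:
  CoarseLockedTree() { pthread_rwlock_init(&lock_, nullptr); }
  ~CoarseLockedTree() { pthread_rwlock_destroy(&lock_); }

  // Readers share the lock, so concurrent find()s do not block each other.
  bool find(const KeyType &key, ValueType &value) {
    pthread_rwlock_rdlock(&lock_);
    auto it = tree_.find(key);
    bool found = (it != tree_.end());
    if (found) value = it->second;
    pthread_rwlock_unlock(&lock_);
    return found;
  }

  // Writers hold the lock exclusively and block every other operation.
  void insert(const KeyType &key, const ValueType &value) {
    pthread_rwlock_wrlock(&lock_);
    tree_[key] = value;
    pthread_rwlock_unlock(&lock_);
  }

  void remove(const KeyType &key) {
    pthread_rwlock_wrlock(&lock_);
    tree_.erase(key);
    pthread_rwlock_unlock(&lock_);
  }

 private:
  pthread_rwlock_t lock_;
  std::map<KeyType, ValueType> tree_;
};
```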
35 | #### Approach #2: Fine Grained Locking
36 | The second approach is to use fine-grained locking to protect the tree data structure. One viable way is to use hand-over-hand locking while searching down the tree, and to lock the affected nodes before the tree structure is modified. In this project, to compare against our lock-free implementation, we also designed and implemented a fine-grained locking B+Tree:
37 | 
38 | * For `find()`, we first acquire a lock on the root node, find the corresponding child node, acquire the lock on that child, and only then release the lock held on the root. At every internal node we follow the same rule: acquire the lock on the target child before releasing the lock on the current node.
39 | * For `delete()` and `insert()`, because they may modify the parent node (by splitting or merging) and possibly propagate modifications all the way up to the root, we acquire a lock on every node along the path as we search down the tree, so we are sure that no other thread can be searching or modifying that path.
40 | 
41 | The advantage of this approach is that readers do not block readers, and writers are blocked only in a fine-grained way (unlike the first approach: because `search()` uses a hand-over-hand locking scheme, a writer may still be able to proceed behind an unfinished reader). It is also reasonably simple to implement.
42 | 
43 | The disadvantage of this approach is that writers still block readers. A writer takes an exclusive path down the tree, meaning that no other operation can happen on that path at the same time. A sketch of the hand-over-hand `find()` is shown below.
44 | 
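Below is a minimal sketch of hand-over-hand (lock-coupling) search. It assumes a simplified node layout with one mutex per node and is not the code in `fineTree.h`; the point is only the locking order: lock the child, then release the parent, so no other thread can slip in between.

```cpp
// Hand-over-hand (lock coupling) search on a simplified node type.
#include <mutex>

struct Node {
  std::mutex lock;
  bool is_leaf;
  int num_keys;
  int keys[64];
  Node *children[64];  // valid when !is_leaf
  int values[64];      // valid when is_leaf
};

// Returns true and fills `value` if `key` is found.
bool find(Node *root, int key, int &value) {
  root->lock.lock();
  Node *cur = root;
  while (!cur->is_leaf) {
    // Pick the child whose range covers `key` (largest key <= target).
    int idx = 0;
    while (idx + 1 < cur->num_keys && cur->keys[idx + 1] <= key) idx++;
    Node *child = cur->children[idx];
    child->lock.lock();  // lock the child first...
    cur->lock.unlock();  // ...then release the parent
    cur = child;
  }
  for (int i = 0; i < cur->num_keys; i++) {  // linear scan in the leaf
    if (cur->keys[i] == key) {
      value = cur->values[i];
      cur->lock.unlock();
      return true;
    }
  }
  cur->lock.unlock();
  return false;
}
```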
45 | #### Approach #3: Lock Free
46 | Approaches #1 and #2 both use locks to protect the data structure, and in both cases writers block readers and other writers. It is more appealing to implement a lock-free B+Tree in which readers and writers can proceed without blocking each other. One such example is Palm Tree, a lock-free concurrent B+Tree proposed by Intel in [1]; it uses a Bulk Synchronous Parallel (BSP) approach to process B+Tree operations in bulk and resolve hazards gracefully. The main contribution of this project is an efficient implementation of Palm Tree.
47 | 
48 | The first idea of Palm Tree is to group queries into batches; the batches are then processed one at a time, cooperatively, by a pool of worker threads. The rationale is that performing more queries at a time amortizes the communication and scheduling overhead.
49 | 
50 | Second, to resolve conflicting accesses to the tree, Palm Tree processes a batch stage by stage in a bulk-synchronous fashion: the batch is processed in different stages on different layers of the tree. Between stages there is a synchronization point to make sure that every worker has finished the previous stage and is ready for the next one (it behaves like a barrier, although the real implementation is not necessarily one).
51 | 
52 | 
53 | 1. Stage 0: The queries in a batch are evenly assigned to the workers.
54 | 
55 | 2. Stage 1: Every query first has to search down the tree to locate its leaf; in stage 1 the workers perform this search and record the target leaf node for each query.
56 | 
57 | 3. Stage 2: At this stage, `insert()` and `delete()` may modify leaf nodes. To prevent race conditions, these operations are partitioned by node and redistributed to the worker threads on a node-by-node basis. This redistribution guarantees that each node is accessed by exactly one worker, so conflicting accesses are avoided inherently.
58 | 
59 | After the redistribution, the workers execute the insert()s and delete()s. During this process, a worker may generate split and merge requests for the parent node. These requests are registered at the upper layer but are **not** executed immediately, because sibling nodes may also need to split or merge, which would otherwise update the parent node concurrently without protection.
60 | 
61 | 4. Stage 3: During this stage, each node gathers the split and merge requests from its children. These requests are again grouped by node (here the node is the parent of the nodes touched in stage 2) and assigned to workers. Stage 3 may in turn generate split and merge requests for the layer above. We repeat stage 3 on each layer up to the root, at which point all necessary tree modifications have been carried out in this manner except those on the root itself.
62 | 
63 | 5. Stage 4: This is the final stage. A single thread handles the special cases of a root split and a root merge. For a root split, a new root is allocated that points to the old root and the newly split node. For a root merge, we use a small trick: the root is merged only when it has a single child, in which case we descend the root and use that single child as the new root. At the end of stage 4 all queries in the batch are fulfilled, and the results of the batch are delivered back to the clients.
64 | 
65 | During the upward passes, the tasks within each layer need to be redistributed to ensure correctness while still exploiting parallelism. Palm Tree's partition algorithm is as follows: each worker thread records all the nodes it has accessed in the lower level, then discards every node that has also been accessed by a worker with a lower worker id (each worker is assigned an id from 0 to `WORKER_NUM` - 1). One drawback of this approach is workload imbalance, as workers with lower ids take precedence over the others. A sketch of this ownership rule is shown below.
66 | 
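As an illustration of that ownership rule, here is a minimal sketch under simplifying assumptions: each worker's set of accessed nodes is published in a shared vector before a synchronization point, and the name `nodes_owned_by` is illustrative and does not appear in `palmtree.h`.

```cpp
// A worker keeps a lower-level node only if no worker with a smaller id
// also accessed it; that worker then executes all tasks for the node.
#include <set>
#include <vector>

using NodeId = int;

std::vector<NodeId> nodes_owned_by(int my_id,
                                   const std::vector<std::set<NodeId>> &accessed) {
  std::vector<NodeId> mine;
  for (NodeId node : accessed[my_id]) {
    bool claimed_by_lower = false;
    for (int w = 0; w < my_id; w++) {
      if (accessed[w].count(node)) { claimed_by_lower = true; break; }
    }
    if (!claimed_by_lower) mine.push_back(node);
  }
  return mine;
}
```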
67 | ### Optimizations: 68 | * The first optimization we made is to pre-sort the queries in a batch and assign them to threads in an ordered way. We will see below that pre-sorting also benefits the task distribution process; here the main benefit is load balancing. If the batch is pre-sorted, each worker thread is assigned a contiguous range of queries, and the workers reach the leaf layer with properly ordered leaf nodes. With a random assignment of tasks, the 0th worker may end up with many leaf nodes and potentially more work than the others, due to the task distribution policy. 69 | 70 | * Next, we soon found out that memory allocation was a bottleneck. When we measured with perf, the time spent in malloc and free grew longer and longer with higher thread counts, so we suspected the memory allocator was not scalable. We searched online and found a good scalable memory allocator called jemalloc; it required nearly zero changes to our code to use it. 71 | 72 | * SIMD acceleration for key lookup. There are two ways to look up a key during tree search. If the keys in the node are sorted, we can use binary search; if the node is not sorted, we can linearly scan and match the keys. While binary search has better asymptotic complexity than linear scan, it suffers from branch mispredictions and requires the keys in the node to be kept sorted. Linear scan, on the other hand, has roughly the same overhead as binary search when the node size is small, can further exploit SIMD acceleration, and allows fast insert and delete because the node is not required to stay sorted. Using SIMD to linearly search for a key turns out to be a counter-intuitive but efficient way to do key lookups. 73 | 74 | * Reduce communication overhead. 75 | * Pre-sort the batch, this time for a different purpose. Task distribution is essentially a probing process: as described in the previous section, a worker determines which tasks belong to it by probing other threads' tasks. If the queries are sorted beforehand, a worker can potentially determine its tasks by looking only at its neighbours. 76 | * Previously, the 0th worker was a special worker responsible for distributing the queries in a batch to all the other workers. This portion of code was sequential; we improved it by letting each thread calculate the range of queries it is responsible for and collect its own tasks cooperatively. 77 | 78 | ### Results: 79 | The platform we ran our evaluation on: 80 | 81 | * 18 cores, 36 hardware threads 82 | * 2.9 GHz CPU, 32K L1 cache, 256K L2 cache, 26M L3 cache 83 | * 2 NUMA nodes, 60GB memory 84 | 85 | First, look at our final evaluation with all optimizations implemented. We evaluated a read-only benchmark and a mixed benchmark with 20% updates and 80% reads. We pre-populate the tree with different numbers of items before generating the workloads. 86 | 87 | Below is a graph showing the different optimizations we made on the way to the final scalable algorithm. The workload used in this graph is a read-only workload with a uniform access pattern on a tree with 0.5M keys. 88 | 89 | 90 | 91 | The baseline version has a throughput of about 2000 KQPS. We didn't see a huge speedup from adopting the pre-sort optimization mentioned in the paper, mainly because the system was bottlenecked by the memory allocator. We then replaced the default libc `malloc` with jemalloc and the performance went up greatly; however, beyond 6 cores there was no further throughput gain, with the B+Tree throughput at about 10 MQPS. At this point, applying SIMD to the data structure provides another 10%-20% speedup. 92 | 93 | The next huge performance gain came from reducing the communication overhead. We first implemented a customized profiler to collect the running time of the different stages of the system. As can be seen from the log output when profiling Palm Tree with 4 and 8 workers, batch collection (Stage 0) and result distribution (Stage 4) were not scalable, mainly because by design they were done only by the 0th worker. 94 | 95 | ``` 96 | I0505 01:02:58.919889 70461 palmtree.h:63] [collect_batch] 97 | I0505 01:02:58.919924 70461 palmtree.h:68] 0: 1.06791 <= 98 | I0505 01:02:58.919939 70461 palmtree.h:68] 1: 0 99 | I0505 01:02:58.919947 70461 palmtree.h:68] 2: 0 100 | ... 101 | I0505 01:02:58.920054 70461 palmtree.h:63] [end_stage] 102 | I0505 01:02:58.920061 70461 palmtree.h:68] 0: 1.09612 <= 103 | I0505 01:02:58.920070 70461 palmtree.h:68] 1: 0 104 | I0505 01:02:58.920078 70461 palmtree.h:68] 2: 0 105 | ... 106 | I0505 01:02:58.920110 70461 palmtree.h:63] [total_time] 107 | I0505 01:02:58.920117 70461 palmtree.h:68] 0: 3.12207 108 | I0505 01:02:58.920125 70461 palmtree.h:68] 1: 3.12296 109 | I0505 01:02:58.920133 70461 palmtree.h:68] 2: 3.1128 110 | ... 111 | ``` 112 | 113 | 114 | To fix this problem, we let each thread calculate its own task range in the batch and fetch those tasks without communicating with the others, so it is no longer the 0th worker's responsibility to distribute the batch tasks (a sketch of the range computation is given below). When the tasks are finished, the worker threads also return the results cooperatively, instead of everything being done by the 0th worker. 115 | 116 | Another communication overhead is in stage 2's redistribution of node modification tasks, shown in the following screenshot. By pre-sorting the batch, a worker may be able to probe only its neighbours' tasks to determine its own.
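As a concrete illustration, the coordinator-free range computation looks like the following minimal sketch (the same arithmetic as in `collect_batch()` in palmtree.h; the standalone helper name is ours). Each worker derives its own [lower, upper) slice of the batch purely from its id, the number of workers and the batch size, so no thread has to hand out work.

```
#include <algorithm>
#include <utility>

// Each worker computes its own [lower, upper) slice of the batch from its id.
// The first `task_residue` workers take one extra task each, so the slices
// cover the whole batch without gaps or overlap.
std::pair<int, int> my_task_range(int worker_id, int num_workers, int batch_size) {
  int task_per_thread = batch_size / num_workers;
  int task_residue = batch_size - task_per_thread * num_workers;
  int lower = task_per_thread * worker_id + std::min(task_residue, worker_id);
  int upper = lower + task_per_thread + (worker_id < task_residue ? 1 : 0);
  return {lower, upper};
}
```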
117 | 118 | As shown in the graph, the final speedup is promising: we achieved 60M QPS on a 16-core system, and the algorithm scales very well! 119 | 120 | The following graph shows the scalability of our implementation as we vary the number of workers in the worker pool as well as the size of the pre-populated tree. When the tree is of small or medium size, the speedup is close to linear. When the tree is large, we believe the system becomes memory bound, so the speedup is not as good (it is still 10x, however). This workload is a 20% update, 80% read workload with uniform access to the keys in the tree. 121 | 122 | 123 | 124 | 125 | Our implementation is also resilient to skewed data access patterns. The following graph compares the throughput under uniform access and contended access. The contended workload is generated by having 80% of the operations access 20% of the entries in the tree. For small, medium and large trees alike, the throughput drops slightly but not by much under skewed access, showing that our implementation resists skewness quite well. 126 | 127 | 128 | 129 | We have also compared the performance of Palm Tree with single-threaded `std::map`, single-threaded `stx::btree` (an efficient open-source implementation of a B+Tree), and our not-so-efficient fine-grained locking B+Tree that uses hand-over-hand locking. As can be seen, `std::map` is generally not performant even for a single thread; `stx::btree` is performant for a single thread but is not a concurrent data structure. We tried adding a shared lock to both `std::map` and `stx::btree`, and it turns out they perform even worse in a many-core setting. The hand-over-hand B+Tree can't scale beyond 4 threads. We wish we had a better implementation of a fine-grained locking B+Tree, but it turned out to be even harder to get right than Palm Tree, with many corner cases, and given the limited time we were not able to pursue that. 130 | 131 | 132 | 133 | The final graph shows the decomposition of the time spent in each stage for one workload. The workload is 20% update, 80% read, with 0.5M keys in the tree and uniform access. We generated 1B operations on the tree. 134 | 135 | From the runtime decomposition we can see that the time spent in stage 2 becomes less and less significant as more threads are used. Recall that stage 2 does the actual key matching, insertion and removal on the leaf nodes, which is one of the most expensive and frequent operations in Palm Tree. In the beginning, when there is just one thread, most of the time is spent in stage 2. However, as the number of workers increases, the communication overhead becomes more and more significant: it grows from nearly 0% for 1 thread to around 33% for 16 threads. This is not surprising: the more threads we have, the more likely it is that they can't keep up with each other, so waiting becomes common. One way to overcome this problem would be to focus on eliminating this all-to-all communication. 136 | 137 | 138 | 139 | #### References: 140 | [1] J. Sewall, J. Chhugani, C. Kim, N. Satish, and P. Dubey. PALM: Parallel architecture-friendly latch-free modifications to B+ trees on many-core processors. Proc. VLDB Endowment, 4(11):795--806, August 2011. 141 | 142 | [2] David B. Lomet, Sudipta Sengupta, and Justin J. Levandoski. 2013. The Bw-Tree: A B-tree for new hardware platforms.
In Proceedings of the 2013 IEEE International Conference on Data Engineering (ICDE 2013) (ICDE '13). IEEE Computer Society, Washington, DC, USA, 302-313. DOI=http://dx.doi.org/10.1109/ICDE.2013.6544834 143 | 144 | #### Work Partition: 145 | Equal work was performed by both project members (@Ran Xian and @Runshen Zhu). 146 | -------------------------------------------------------------------------------- /palmtree.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include "immintrin.h" 18 | // #include "smmintrin.h" 19 | #include "CycleTimer.h" 20 | #include "barrier.h" 21 | #include 22 | 23 | using std::cout; 24 | using std::endl; 25 | 26 | #define UNUSED __attribute__((unused)) 27 | 28 | #define PROFILE 29 | 30 | namespace palmtree { 31 | 32 | static std::atomic NODE_NUM(0); 33 | unsigned int batch_id = 0; 34 | /** 35 | * Tree operation types 36 | */ 37 | enum TreeOpType { 38 | TREE_OP_FIND = 0, 39 | TREE_OP_INSERT, 40 | TREE_OP_REMOVE 41 | }; 42 | 43 | enum NodeType { 44 | INNERNODE = 0, 45 | LEAFNODE 46 | }; 47 | 48 | class Stats { 49 | public: 50 | Stats(int worker_num): worker_num_(worker_num) {} 51 | Stats() {} 52 | /** 53 | * add stat for one metric of one worker 54 | */ 55 | void add_stat(int worker_id, std::string metric_name, double metric_value) { 56 | stats_[metric_name][worker_id] += metric_value; 57 | } 58 | 59 | void init_metric(std::string metric_name) { 60 | stats_[metric_name] = std::vector(worker_num_); 61 | for (int i = 0; i < worker_num_; i++) 62 | stats_[metric_name][i] = 0; 63 | metric_names_.push_back(metric_name); 64 | } 65 | 66 | /** 67 | * Print the stats out 68 | */ 69 | void print_stat() { 70 | for (auto &metric_name : metric_names_) { 71 | LOG(INFO) << "\033[1m[" << metric_name << "]\033[0m "; 72 | auto &values = stats_[metric_name]; 73 | std::string line = ""; 74 | for (int i = 0; i < worker_num_; i++) { 75 | if (metric_name == "leaf_task") { 76 | line += "\t" + std::to_string(i) + ": " + std::to_string(values[i]); 77 | 78 | } else { 79 | line += "\t" + std::to_string(i) + ": " + std::to_string(values[i] * CycleTimer::secondsPerTick()); 80 | } 81 | } 82 | LOG(INFO) << line; 83 | } 84 | } 85 | 86 | void reset_metric() { 87 | for (auto itr = stats_.begin(); itr != stats_.end(); itr++) { 88 | for (int i = 0; i < worker_num_; i++) { 89 | itr->second[i] = 0; 90 | } 91 | } 92 | } 93 | private: 94 | std::unordered_map> stats_; 95 | std::vector metric_names_; 96 | int worker_num_; 97 | } STAT; 98 | 99 | template , 102 | typename KeyComparator = std::less > 103 | class PalmTree { 104 | public: 105 | // Number of working threads 106 | int NUM_WORKER; 107 | int BATCH_SIZE; 108 | 109 | private: 110 | // Max number of slots per inner node 111 | static const int INNER_MAX_SLOT = 256; 112 | // Max number of slots per leaf node 113 | static const int LEAF_MAX_SLOT = 64; 114 | // Threshold to control bsearch or linear search 115 | static const int BIN_SEARCH_THRESHOLD = 32; 116 | // Number of working threads 117 | static const int BATCH_SIZE_PER_WORKER = 4096; 118 | 119 | private: 120 | /** 121 | * Tree node base class 122 | */ 123 | struct InnerNode; 124 | struct Node { 125 | // Number of actually used slots 126 | int slot_used; 127 | int id; 128 | int level; 129 | KeyType lower_bound; 130 | Node *parent; 131 | 132 | 133 | Node() 
= delete; 134 | Node(Node *p, int lvl): slot_used(0), level(lvl), parent(p) { 135 | id = NODE_NUM++; 136 | }; 137 | virtual ~Node() {}; 138 | virtual std::string to_string() = 0; 139 | virtual NodeType type() const = 0; 140 | virtual bool is_few() = 0; 141 | }; 142 | 143 | struct InnerNode : public Node { 144 | InnerNode() = delete; 145 | InnerNode(Node *parent, int level): Node(parent, level){}; 146 | virtual ~InnerNode() {}; 147 | // Keys for values 148 | KeyType keys[LEAF_MAX_SLOT]; 149 | // Pointers for child nodes 150 | Node *values[LEAF_MAX_SLOT]; 151 | 152 | virtual NodeType type() const { 153 | return INNERNODE; 154 | } 155 | 156 | virtual std::string to_string() { 157 | std::string res; 158 | res += "InnerNode[" + std::to_string(Node::id) + " @ " + std::to_string(Node::level) + "] "; 159 | for (int i = 0 ; i < Node::slot_used ; i++) { 160 | res += " " + std::to_string(keys[i]) + ":" + std::to_string(values[i]->id); 161 | } 162 | return res; 163 | } 164 | 165 | inline bool is_full() const { 166 | return Node::slot_used == MAX_SLOT(); 167 | } 168 | 169 | 170 | inline size_t MAX_SLOT() const { 171 | return LEAF_MAX_SLOT; 172 | } 173 | 174 | virtual inline bool is_few() { 175 | return Node::slot_used < MAX_SLOT()/4 || Node::slot_used == 0; 176 | } 177 | 178 | }; 179 | 180 | struct LeafNode : public Node { 181 | LeafNode() = delete; 182 | LeafNode(Node *parent, int level): Node(parent, level){}; 183 | virtual ~LeafNode() {}; 184 | 185 | // Keys and values for leaf node 186 | KeyType keys[INNER_MAX_SLOT]; 187 | ValueType values[INNER_MAX_SLOT]; 188 | 189 | virtual NodeType type() const { 190 | return LEAFNODE; 191 | } 192 | 193 | virtual std::string to_string() { 194 | std::string res; 195 | res += "LeafNode[" + std::to_string(Node::id) + " @ " + std::to_string(Node::level) + "] "; 196 | 197 | for (int i = 0 ; i < Node::slot_used ; i++) { 198 | res += " " + std::to_string(keys[i]) + ":" + std::to_string(values[i]); 199 | } 200 | return res; 201 | } 202 | 203 | inline bool is_full() const { 204 | return Node::slot_used == MAX_SLOT(); 205 | } 206 | 207 | inline size_t MAX_SLOT() const { 208 | return INNER_MAX_SLOT; 209 | } 210 | 211 | virtual inline bool is_few() { 212 | return Node::slot_used < MAX_SLOT()/4 || Node::slot_used == 0; 213 | } 214 | }; 215 | /** 216 | * Tree operation wrappers 217 | */ 218 | struct TreeOp { 219 | // Op can either be none, add or delete 220 | TreeOp(TreeOpType op_type, const KeyType &key, const ValueType &value): 221 | op_type_(op_type), key_(key), value_(value), target_node_(nullptr), 222 | boolean_result_(false), done_(false) {}; 223 | 224 | 225 | TreeOp(TreeOpType op_type, const KeyType &key): 226 | op_type_(op_type), key_(key), target_node_(nullptr), 227 | boolean_result_(false), done_(false) {}; 228 | 229 | TreeOpType op_type_; 230 | KeyType key_; 231 | ValueType value_; 232 | 233 | LeafNode *target_node_; 234 | ValueType result_; 235 | bool boolean_result_; 236 | bool done_; 237 | 238 | // Wait until this operation is done 239 | // Now use busy waiting, should use something more smart. 
But be careful 240 | // that conditional variable could be very expensive 241 | inline void wait() { 242 | while (!done_) { 243 | boost::this_thread::sleep_for(boost::chrono::milliseconds(1)); 244 | } 245 | } 246 | }; 247 | 248 | /** 249 | * A batch of tree operations, this data structure is not thread safe 250 | * The major goal of this class is to amortize memory allocation of 251 | * tree operations 252 | */ 253 | class TaskBatch { 254 | public: 255 | TaskBatch(size_t capacity): capacity_(capacity), ntask_(0) { 256 | ops_ = (TreeOp *)malloc(sizeof(TreeOp) * capacity_); 257 | } 258 | 259 | void destroy() { 260 | free(ops_); 261 | ops_ = nullptr; 262 | } 263 | 264 | // Add a tree operation to the batch 265 | inline void add_op(TreeOpType op_type, const KeyType *keyp, const ValueType *valp) { 266 | assert(ntask_ != capacity_); 267 | 268 | if (op_type == TREE_OP_INSERT) { 269 | assert(valp != nullptr); 270 | ops_[ntask_++] = TreeOp(op_type, *keyp, *valp); 271 | } else { 272 | ops_[ntask_++] = TreeOp(op_type, *keyp); 273 | } 274 | } 275 | 276 | // Whether the tree is full or not 277 | inline bool is_full() { return ntask_ == capacity_; } 278 | // The size of the batch 279 | inline size_t size() { return ntask_; } 280 | // Overloading [] to return the ith operation in the batch 281 | TreeOp * get_op(int i) { 282 | assert(i < ntask_); 283 | return ops_ + i; 284 | } 285 | 286 | // Capacity of the batch 287 | size_t capacity_; 288 | // Number of tasks currently in the batch 289 | size_t ntask_; 290 | // Tree opearations 291 | TreeOp *ops_; 292 | }; 293 | 294 | enum ModType { 295 | MOD_TYPE_ADD, 296 | MOD_TYPE_DEC, 297 | MOD_TYPE_NONE 298 | }; 299 | 300 | /** 301 | * Wrapper for node modification 302 | */ 303 | struct NodeMod { 304 | NodeMod(ModType type): type_(type) {} 305 | NodeMod(const TreeOp &op) { 306 | CHECK(op.op_type_ != TREE_OP_FIND) << "NodeMod can't convert from a find operation" << endl; 307 | if (op.op_type_ == TREE_OP_REMOVE) { 308 | this->type_ = MOD_TYPE_DEC; 309 | this->value_items.emplace_back(std::make_pair(op.key_, ValueType())); 310 | } else { 311 | this->type_ = MOD_TYPE_ADD; 312 | this->value_items.emplace_back(std::make_pair(op.key_, op.value_)); 313 | } 314 | } 315 | ModType type_; 316 | // For leaf modification 317 | std::vector> value_items; 318 | // For inner node modification 319 | std::vector> node_items; 320 | // For removed keys 321 | std::vector> orphaned_kv; 322 | }; 323 | 324 | /******************** 325 | * PalmTree private 326 | * ******************/ 327 | private: 328 | // Root of the palm tree 329 | Node *tree_root; 330 | // Height of the tree 331 | int tree_depth_; 332 | // Number of nodes on each layer 333 | std::vector *> layer_width_; 334 | // Is the tree being destroyed or not 335 | bool destroyed_; 336 | // Minimal key 337 | KeyType min_key_; 338 | // Key comparator 339 | KeyComparator kcmp; 340 | // Current batch of the tree 341 | TaskBatch *tree_current_batch_; 342 | 343 | // Push a task into the current batch, if the batch is full, push the batch 344 | // into the batch queue. 
345 | void push_task(TreeOpType op_type, const KeyType *keyp, const ValueType *valp) { 346 | tree_current_batch_->add_op(op_type, keyp, valp); 347 | task_nums += 2; 348 | 349 | if (tree_current_batch_->is_full()) { 350 | task_batch_queue_.push(tree_current_batch_); 351 | tree_current_batch_ = (TaskBatch *)malloc(sizeof(TaskBatch)); 352 | new (tree_current_batch_) TaskBatch(BATCH_SIZE); 353 | DLOG(INFO) << "Push one batch into the queue "; 354 | } 355 | } 356 | 357 | // Return true if k1 < k2 358 | inline bool key_less(const KeyType &k1, const KeyType &k2) { 359 | return kcmp(k1, k2); 360 | } 361 | // Return true if k1 == k2 362 | inline bool key_eq(const KeyType &k1, const KeyType &k2) { 363 | return !kcmp(k1, k2) && !kcmp(k2, k1); 364 | } 365 | 366 | 367 | // Return the index of the largest slot whose key <= @target 368 | // assume there is no duplicated element 369 | int search_helper(const KeyType *input, int size, const KeyType &target) { 370 | int res = -1; 371 | // loop all element 372 | for (int i = 0; i < size; i++) { 373 | if(key_less(target, input[i])){ 374 | // target < input 375 | // ignore 376 | continue; 377 | 378 | } 379 | if (res == -1 || key_less(input[res], input[i])) { 380 | res = i; 381 | } 382 | } 383 | 384 | return res; 385 | } 386 | 387 | // liner search in leaf 388 | // assume there is no duplicated element 389 | // int search_leaf(const KeyType *data, int size, const KeyType &target) { 390 | // const __m128i keys = _mm_set1_epi32(target); 391 | // 392 | // const auto n = size; 393 | // const auto rounded = 8 * (n / 8); 394 | // 395 | // for (int i = 0; i < rounded; i += 8) { 396 | // 397 | // const __m128i vec1 = _mm_loadu_si128(reinterpret_cast(&data[i])); 398 | // const __m128i vec2 = _mm_loadu_si128(reinterpret_cast(&data[i + 4])); 399 | // 400 | // const __m128i cmp1 = _mm_cmpeq_epi32(vec1, keys); 401 | // const __m128i cmp2 = _mm_cmpeq_epi32(vec2, keys); 402 | // 403 | // const __m128i tmp = _mm_packs_epi32(cmp1, cmp2); 404 | // const uint32_t mask = _mm_movemask_epi8(tmp); 405 | // 406 | // if (mask != 0) { 407 | // return i + __builtin_ctz(mask) / 2; 408 | // } 409 | // } 410 | // 411 | // for (int i = rounded; i < n; i++) { 412 | // if (data[i] == target) { 413 | // return i; 414 | // } 415 | // } 416 | // 417 | // return -1; 418 | // } 419 | 420 | 421 | 422 | 423 | int search_leaf(const KeyType *data, int size, const KeyType &target) { 424 | // #ifdef PROFILE 425 | // auto bt = CycleTimer::currentTicks(); 426 | // #endif 427 | const __m256i keys = _mm256_set1_epi32(target); 428 | 429 | const auto n = size; 430 | const auto rounded = 8 * (n/8); 431 | 432 | for (int i=0; i < rounded; i += 8) { 433 | 434 | const __m256i vec1 = _mm256_loadu_si256(reinterpret_cast(&data[i])); 435 | 436 | const __m256i cmp1 = _mm256_cmpeq_epi32(vec1, keys); 437 | 438 | const uint32_t mask = _mm256_movemask_epi8(cmp1); 439 | 440 | if (mask != 0) { 441 | // #ifdef PROFILE 442 | // STAT.add_stat(0, "search_leaf", CycleTimer::currentTicks() - bt); 443 | // #endif 444 | return i + __builtin_ctz(mask)/4; 445 | } 446 | } 447 | 448 | for (int i = rounded; i < n; i++) { 449 | if (data[i] == target) { 450 | // #ifdef PROFILE 451 | // STAT.add_stat(0, "search_leaf", CycleTimer::currentTicks() - bt); 452 | // #endif 453 | return i; 454 | } 455 | } 456 | 457 | // #ifdef PROFILE 458 | // STAT.add_stat(0, "search_leaf", CycleTimer::currentTicks() - bt); 459 | // #endif 460 | return -1; 461 | } 462 | 463 | 464 | 465 | // Return the index of the largest slot whose key <= @target 466 | // assume 
there is no duplicated element 467 | int search_inner(const KeyType *input, int size, const KeyType &target) { 468 | // #ifdef PROFILE 469 | // auto bt = CycleTimer::currentTicks(); 470 | // #endif 471 | int low = 0, high = size - 1; 472 | while (low != high) { 473 | int mid = (low + high) / 2 + 1; 474 | if (key_less(target, input[mid])) { 475 | // target < input[mid] 476 | high = mid - 1; 477 | } 478 | else { 479 | // target >= input[mid]; 480 | low = mid; 481 | } 482 | } 483 | // #ifdef PROFILE 484 | // STAT.add_stat(0, "search_inner", CycleTimer::currentTicks() - bt); 485 | // #endif 486 | 487 | if (low == size) { 488 | return -1; 489 | } 490 | return low; 491 | } 492 | 493 | /** 494 | * @brief Return the leaf node that contains the @key 495 | */ 496 | LeafNode *search(const KeyType &key UNUSED) { 497 | 498 | auto ptr = (InnerNode *)tree_root; 499 | for (;;) { 500 | CHECK(ptr->slot_used > 0) << "Search empty inner node"; 501 | auto idx = this->search_inner(ptr->keys, ptr->slot_used, key); 502 | CHECK(idx != -1) << "search innerNode fail" << endl; 503 | CHECK(key_less(ptr->keys[idx], key) || key_eq(ptr->keys[idx], key)); 504 | if(idx + 1 < ptr->slot_used) { 505 | CHECK(key_less(key, ptr->keys[idx + 1])); 506 | } 507 | Node *child = ptr->values[idx]; 508 | if (child->type() == LEAFNODE) { 509 | return (LeafNode *)child; 510 | } else { 511 | ptr = (InnerNode *)child; 512 | } 513 | } 514 | // we shouldn't reach here 515 | assert(0); 516 | } 517 | 518 | /** 519 | * @brief big_split will split the kv pair vector into multiple tree nodes 520 | * that is within the threshold. The actual type of value is templated as V. 521 | * The splited nodes should be stored in Node, respect to appropriate 522 | * node types 523 | */ 524 | template 525 | void big_split(std::vector> &input, NodeType *node, std::vector> &new_nodes) { 526 | std::sort(input.begin(), input.end(), [this](const std::pair &p1, const std::pair &p2) { 527 | return key_less(p1.first, p2.first); 528 | }); 529 | 530 | auto itr = input.begin(); 531 | 532 | auto item_per_node = node->MAX_SLOT() / 2; 533 | auto node_num = input.size() / (item_per_node); 534 | // save first half items (small part) in old node 535 | node->slot_used = 0; 536 | for (int i = 0; i < item_per_node; i++) { 537 | // add_item(node, itr->first, itr->second); 538 | node->keys[i] = itr->first; 539 | node->values[i] = itr->second; 540 | node->slot_used++; 541 | itr++; 542 | } 543 | 544 | // Add a new node 545 | int node_create_num = 1; 546 | while(node_create_num < node_num) { 547 | 548 | NodeType *new_node = new NodeType(node->parent, node->Node::level); 549 | layer_width_[node->Node::level]->fetch_add(1); 550 | 551 | // save the second-half in new node 552 | auto new_key = (*itr).first; 553 | int i = 0; 554 | while (itr != input.end() && new_node->slot_used < item_per_node) { 555 | // add_item(new_node, itr->first, itr->second); 556 | new_node->keys[i] = itr->first; 557 | new_node->values[i] = itr->second; 558 | new_node->slot_used++; 559 | itr++; 560 | i++; 561 | } 562 | if(node_create_num == node_num - 1) { 563 | while(itr != input.end()) { 564 | new_node->keys[i] = itr->first; 565 | new_node->values[i] = itr->second; 566 | new_node->slot_used++; 567 | itr++; 568 | i++; 569 | } 570 | } 571 | 572 | new_nodes.push_back(std::make_pair(new_key, new_node)); 573 | node_create_num++; 574 | } 575 | } 576 | 577 | // Warning: if this function return true, the width of the layer will be 578 | // decreased by 1, so the caller must actually merge the node 579 | bool 
must_merge(Node *node) { 580 | if (!node->is_few()) 581 | return false; 582 | 583 | int old_width = layer_width_[node->level]->fetch_add(-1); 584 | if (old_width == 1) { 585 | // Can't merge 586 | layer_width_[node->level]->fetch_add(1); 587 | return false; 588 | } 589 | 590 | return true; 591 | } 592 | 593 | template 594 | void add_item(NodeType *node, const KeyType &key, V value) { 595 | // add item to leaf node 596 | // just append it to the end of the slot 597 | if (node->type() == LEAFNODE) { 598 | // auto idx = node->slot_used++; 599 | auto idx = search_leaf(node->keys, node->slot_used, key); 600 | if(idx != -1) { 601 | return; 602 | } 603 | idx = node->slot_used++; 604 | node->keys[idx] = key; 605 | node->values[idx] = value; 606 | return; 607 | } 608 | 609 | if(node->slot_used == 0) { 610 | node->keys[0] = key; 611 | node->values[0] = value; 612 | node->slot_used++; 613 | return; 614 | } 615 | 616 | // add item to inner node 617 | // ensure it's order 618 | DLOG(INFO) << "search inner begin"; 619 | auto idx = search_inner(node->keys, node->slot_used, key); 620 | 621 | CHECK(idx != -1) << "search innerNode fail" << key <<" " <keys[0]; 622 | CHECK(key_less(node->keys[idx], key) || key_eq(node->keys[idx], key)); 623 | if(idx + 1 < node->slot_used) { 624 | CHECK(key_less(key, node->keys[idx + 1])) << "search inner fail"; 625 | } 626 | 627 | DLOG(INFO) << "search inner end"; 628 | auto k = key; 629 | auto v = value; 630 | 631 | for(int i = idx + 1; i < node->slot_used; i++) { 632 | std::swap(node->keys[i], k); 633 | std::swap(node->values[i], v); 634 | } 635 | 636 | node->keys[node->slot_used] = k; 637 | node->values[node->slot_used] = v; 638 | node->slot_used++; 639 | } 640 | 641 | 642 | template 643 | void del_item(NodeType *node, const KeyType &key) { 644 | auto lastIdx = node->slot_used - 1; 645 | auto idx = search_helper(node->keys, node->slot_used, key); 646 | DLOG(INFO) << "search in del, idx: " << idx; 647 | if (idx == -1) { 648 | DLOG(WARNING) << "del fail, can't find key in node"; 649 | return; 650 | } 651 | 652 | if (!key_eq(key, node->keys[idx])) { 653 | DLOG(WARNING) << "del in inner, del idx: " << idx << " key != del_key" << endl; 654 | if (node->type() == LEAFNODE) 655 | return; 656 | } 657 | 658 | if (node->type() == INNERNODE) { 659 | Node *child_node = reinterpret_cast(&node->values[idx]); 660 | DLOG(INFO) << "Delete node " << child_node->id; 661 | free_recursive(child_node); 662 | 663 | KeyType del_key = node->keys[idx]; 664 | 665 | // auto k = node->keys[idx]; 666 | // auto v = node->value[idx]; 667 | for(int i = idx; i < node->slot_used - 1; i++) { 668 | std::swap(node->keys[i], node->keys[i + 1]); 669 | std::swap(node->values[i], node->values[i + 1]); 670 | } 671 | 672 | if(idx == 0) { 673 | node->keys[0] = del_key; 674 | } 675 | 676 | node->slot_used--; 677 | 678 | 679 | }else { 680 | // del in leaf 681 | if (idx == lastIdx) { 682 | // if it's the last element, just pop it 683 | node->slot_used--; 684 | } else { 685 | // otherwise, swap 686 | node->keys[idx] = node->keys[lastIdx]; 687 | node->values[idx] = node->values[lastIdx]; 688 | node->slot_used--; 689 | } 690 | } 691 | 692 | return; 693 | } 694 | 695 | // collect kv pairs in (or under) this node 696 | // used for merge 697 | void collect_leaf(Node *node, std::vector> &container) { 698 | if (node->type() == LEAFNODE) { 699 | auto ptr = (LeafNode *)node; 700 | for(int i = 0; i < node->slot_used; i++) { 701 | container.push_back(std::make_pair(ptr->keys[i], ptr->values[i])); 702 | } 703 | } else if 
(node->type() == INNERNODE) { 704 | auto ptr = (InnerNode *)node; 705 | for(int i = 0; i < node->slot_used; i++) { 706 | collect_leaf(ptr->values[i], container); 707 | } 708 | layer_width_[node->level-1]->fetch_add(-node->slot_used); 709 | } else { 710 | assert(0); 711 | } 712 | 713 | return; 714 | } 715 | 716 | /** 717 | * @brief Modify @node by applying node modifications in @modes. If @node 718 | * is a leaf node, @mods will be a list of add kv and del kv. If @node is 719 | * a inner node, @mods will be a list of add range and del range. If new 720 | * node modifications are triggered, record them in @new_mods. 721 | */ 722 | NodeMod modify_node(Node *node, const std::vector &mods) { 723 | DLOG(INFO) << "Modifying node " << node->id << " with " << mods.size() << " operations"; 724 | if(node->type() == LEAFNODE) { 725 | return modify_node_leaf((LeafNode *)node, mods); 726 | }else{ 727 | CHECK(node->type() == INNERNODE) << "unKnown node" << endl; 728 | return modify_node_inner((InnerNode *)node, mods); 729 | } 730 | } 731 | 732 | NodeMod modify_node_leaf(LeafNode *node, const std::vector &mods) { 733 | NodeMod ret(MOD_TYPE_NONE); 734 | auto& kv = ret.orphaned_kv; 735 | 736 | // randomly pick up a key, used for merge 737 | auto node_key = node->keys[0]; 738 | 739 | // firstly, we loop all items to save orphaned and count nodes 740 | int num = node->slot_used; 741 | for (auto& item : mods) { 742 | // save all orphaned_* 743 | kv.insert(kv.end(), item.orphaned_kv.begin(), item.orphaned_kv.end()); 744 | 745 | auto item_size = (int)item.value_items.size(); 746 | if (item.type_ == MOD_TYPE_ADD) { 747 | num += item_size; 748 | } else if (item.type_ == MOD_TYPE_DEC) { 749 | num -= item_size; 750 | } else { 751 | assert(item_size == 0); 752 | } 753 | } 754 | 755 | DLOG(INFO) << "Result node size " << num; 756 | if (num > node->MAX_SLOT()) { 757 | DLOG(INFO) << "Going to split"; 758 | auto comp = [this](const std::pair &p1, const std::pair &p2) { 759 | return key_less(p1.first, p2.first); 760 | }; 761 | 762 | std::set, decltype(comp)> buf(comp); 763 | 764 | // execute add/del 765 | for (auto& item : mods) { 766 | if (item.type_ == MOD_TYPE_ADD) { 767 | for (auto& kv : item.value_items) { 768 | buf.insert(kv); 769 | } 770 | } else if(item.type_ == MOD_TYPE_DEC) { 771 | for (auto& kv : item.value_items) { 772 | if(buf.count(kv)) { 773 | buf.erase(kv); 774 | }else{ 775 | del_item(node, kv.first); 776 | } 777 | } 778 | } 779 | } 780 | 781 | // construct input for split 782 | std::vector> split_input; 783 | for(auto itr = buf.begin(); itr != buf.end(); itr++) { 784 | split_input.push_back(*itr); 785 | } 786 | 787 | for(auto i = 0; i < node->slot_used; i++) { 788 | split_input.push_back(std::make_pair(node->keys[i], node->values[i])); 789 | } 790 | // do split based on this buf 791 | big_split(split_input, node, ret.node_items); 792 | ret.type_ = MOD_TYPE_ADD; 793 | return ret; 794 | } else { 795 | DLOG(INFO) << "don't split"; 796 | for (auto& item : mods) { 797 | if (item.type_ == MOD_TYPE_ADD) { 798 | for (auto& kv : item.value_items) { 799 | add_item(node, kv.first, kv.second); 800 | } 801 | } else if(item.type_ == MOD_TYPE_DEC) { 802 | for (auto& kv : item.value_items) { 803 | del_item(node, kv.first); 804 | } 805 | } 806 | } 807 | } 808 | 809 | // merge 810 | // fixme: never merge the first leafnode 811 | // because the min_key is in this node 812 | // we can't delete min_key 813 | if (must_merge(node)) { 814 | DLOG(INFO) << "Merge leaf node " << node->id; 815 | collect_leaf(node, ret.orphaned_kv); 
816 | ret.node_items.push_back(std::make_pair(node_key, node)); 817 | ret.type_ = MOD_TYPE_DEC; 818 | } 819 | 820 | return ret; 821 | } 822 | 823 | NodeMod modify_node_inner(InnerNode *node UNUSED, const std::vector &mods UNUSED) { 824 | NodeMod ret(MOD_TYPE_NONE); 825 | auto& kv = ret.orphaned_kv; 826 | 827 | // randomly pick up a key, used for merge 828 | auto node_key = node->keys[0]; 829 | 830 | // firstly, we loop all items to save orphaned and count nodes 831 | int num = node->slot_used; 832 | for (auto& item : mods) { 833 | // save all orphaned_* 834 | kv.insert(kv.end(), item.orphaned_kv.begin(), item.orphaned_kv.end()); 835 | 836 | auto item_size = (int)item.node_items.size(); 837 | if (item.type_ == MOD_TYPE_ADD) { 838 | num += item_size; 839 | } else if (item.type_ == MOD_TYPE_DEC) { 840 | num -= item_size; 841 | } else { 842 | assert(item_size == 0); 843 | } 844 | } 845 | 846 | if (num > node->MAX_SLOT()) { 847 | DLOG(INFO) << "inner will split"; 848 | auto comp = [this](const std::pair &p1, const std::pair &p2) { 849 | return key_less(p1.first, p2.first); 850 | }; 851 | 852 | std::set, decltype(comp)> buf(comp); 853 | 854 | // execute add/del 855 | for (auto& item : mods) { 856 | if (item.type_ == MOD_TYPE_ADD) { 857 | for (auto& kv : item.node_items) { 858 | buf.insert(kv); 859 | } 860 | } else if(item.type_ == MOD_TYPE_DEC) { 861 | for (auto& kv : item.node_items) { 862 | if(buf.count(kv)) { 863 | buf.erase(kv); 864 | // TODO: memleak 865 | }else{ 866 | // cout << "del " << kv.first<(node, kv.first); 868 | 869 | } 870 | } 871 | } 872 | } 873 | 874 | // construct input for split 875 | std::vector> split_input; 876 | for(auto itr = buf.begin(); itr != buf.end(); itr++) { 877 | split_input.push_back(*itr); 878 | } 879 | 880 | for(auto i = 0; i < node->slot_used; i++) { 881 | split_input.push_back(std::make_pair(node->keys[i], node->values[i])); 882 | } 883 | // do split based on this buf 884 | big_split(split_input, node, ret.node_items); 885 | for (auto itr = ret.node_items.begin(); itr != ret.node_items.end(); itr++) { 886 | // Reset parent, the children of the newly splited node should point 887 | // to the new parent 888 | auto new_node = itr->second; 889 | for (int i = 0; i < new_node->slot_used; i++) { 890 | CHECK(new_node->type() == INNERNODE) << " split leaf node in modify_node_inner"; 891 | ((InnerNode *)new_node)->values[i]->parent = new_node; 892 | } 893 | } 894 | ret.type_ = MOD_TYPE_ADD; 895 | return ret; 896 | } else { 897 | DLOG(INFO) << "inner not split"; 898 | for (auto& item : mods) { 899 | if (item.type_ == MOD_TYPE_ADD) { 900 | for (auto& kv : item.node_items) { 901 | DLOG(INFO) << "Add item " << kv.first; 902 | add_item(node, kv.first, kv.second); 903 | } 904 | } else if(item.type_ == MOD_TYPE_DEC) { 905 | for (auto& kv : item.node_items) { 906 | DLOG(INFO) << "Del item " << kv.first; 907 | del_item(node, kv.first); 908 | } 909 | } else { 910 | DLOG(INFO) << "A NOOP has propagated"; 911 | } 912 | } 913 | } 914 | 915 | // merge 916 | if (must_merge(node)) { 917 | collect_leaf(node, ret.orphaned_kv); 918 | ret.node_items.push_back(std::make_pair(node_key, node)); 919 | ret.type_ = MOD_TYPE_DEC; 920 | 921 | } else { 922 | DLOG(INFO) << "Don't merge " << layer_width_[node->level]->load() << " " << node->is_few() << " " << node->slot_used; 923 | } 924 | 925 | return ret; 926 | } 927 | 928 | // set the smallest key in node to min_key 929 | void ensure_min_range(InnerNode *node UNUSED, const KeyType &min) { 930 | if (node->slot_used <= 1) { 931 | return; 932 | } 
933 | // find the second smallest 934 | int idx = 0; 935 | for(int i = 1; i < node->slot_used; i++) { 936 | if(key_less(node->keys[i], node->keys[idx])) { 937 | idx = i; 938 | } 939 | } 940 | 941 | CHECK(key_less(min, node->keys[idx])); 942 | 943 | if(idx == 0) { 944 | return; 945 | } 946 | 947 | // swap idx with slot 0 948 | 949 | std::swap(node->keys[0], node->keys[idx]); 950 | std::swap(node->values[0], node->values[idx]); 951 | 952 | } 953 | 954 | void ensure_min_key() { 955 | auto ptr = (Node *)tree_root; 956 | while(ptr->type() == INNERNODE) { 957 | auto inner = (InnerNode *)ptr; 958 | inner->keys[0] = min_key_; 959 | ptr = inner->values[0]; 960 | } 961 | } 962 | 963 | void ensure_tree_structure(Node *node, int indent) { 964 | std::map recorder; 965 | ensure_tree_structure_helper(node, indent, recorder); 966 | 967 | CHECK(layer_width_.size() == recorder.size()) << "mismatch layer"; 968 | for(auto itr = recorder.begin(); itr != recorder.end(); itr++) { 969 | CHECK(layer_width_[itr->first]->load() == itr->second) << "mismatch layer size in "<< itr->first <<" , expect: " << layer_width_[itr->first]->load()<< " actual "<second; 970 | } 971 | 972 | } 973 | void ensure_tree_structure_helper(Node *node, int indent, std::map& layer_size_recorder) { 974 | if(layer_size_recorder.count(node->level)) { 975 | layer_size_recorder[node->level]++; 976 | } else { 977 | layer_size_recorder[node->level] = 1; 978 | } 979 | std::string space; 980 | for (int i = 0; i < indent; i++) 981 | space += " "; 982 | DLOG(INFO) << space << node->to_string() << " | Layer size " << layer_width_[node->level]->load();; 983 | 984 | if (node->type() == INNERNODE) { 985 | InnerNode *inode = (InnerNode *)node; 986 | for (int i = 0; i < inode->slot_used; i++) { 987 | auto child = inode->values[i]; 988 | CHECK(child->parent == node) << "My child " << i << " does not point to me"; 989 | } 990 | } 991 | if (node->type() == INNERNODE) { 992 | InnerNode *inode = (InnerNode *)node; 993 | for (int i = 0; i < inode->slot_used; i++) { 994 | auto child = inode->values[i]; 995 | KeyType *key_set; 996 | if (child->type() == LEAFNODE) 997 | key_set = ((LeafNode *)child)->keys; 998 | else 999 | key_set = ((InnerNode *)child)->keys; 1000 | if (child->slot_used == 0) { 1001 | CHECK(node == tree_root) << "Non root node has empty child " << i; 1002 | } else { 1003 | int idx = 0; 1004 | for (int j = 1; j < child->slot_used; j++) { 1005 | if (key_less(key_set[j], key_set[idx])) { 1006 | idx = j; 1007 | } 1008 | } 1009 | 1010 | auto child_min_key = key_set[idx]; 1011 | if(child->type() == INNERNODE) { 1012 | CHECK(idx == 0) << "InnerNode " << i << "'s first key isn't the smallest"; 1013 | } 1014 | CHECK(!key_less(child_min_key, inode->keys[i])) << "My child " << i << " is beyond the key range"; 1015 | } 1016 | } 1017 | 1018 | for (int i = 0; i < inode->slot_used; i++) { 1019 | ensure_tree_structure_helper(inode->values[i], indent + 4, layer_size_recorder); 1020 | } 1021 | } 1022 | } 1023 | 1024 | /************************** 1025 | * Concurrent executions ** 1026 | * 1027 | * Design: we have a potential infinite long task queue, where clients add 1028 | * requests by calling find, insert or remove. We also have a fixed length 1029 | * pool of worker threads. One of the thread (thread 0) will collect task from the 1030 | * work queue, if it has collected enough task for a batch, or has timed out 1031 | * before collecting enough tasks, it will partition the work and start the 1032 | * Palm algorithm among the threads. 
1033 | * ************************/ 1034 | // boost::barrier barrier_; 1035 | Barrier barrier_; 1036 | boost::lockfree::spsc_queue task_batch_queue_; 1037 | 1038 | // The current batch that is being processed by the workers 1039 | TaskBatch *current_batch_; 1040 | 1041 | void sync(int worker_id) { 1042 | auto begin_tick = CycleTimer::currentTicks(); 1043 | barrier_.wait(); 1044 | auto passed_tick = CycleTimer::currentTicks() - begin_tick; 1045 | STAT.add_stat(worker_id, "sync_time", passed_tick); 1046 | } 1047 | 1048 | struct WorkerThread { 1049 | WorkerThread(int worker_id, PalmTree *palmtree): 1050 | worker_id_(worker_id), 1051 | palmtree_(palmtree), 1052 | done_(false) { 1053 | // Initialize 2 layers of modifications 1054 | node_mods_.push_back(NodeModsMapType()); 1055 | node_mods_.push_back(NodeModsMapType()); 1056 | } 1057 | // Worker id, the thread with worker id 0 will need to be the coordinator 1058 | int worker_id_; 1059 | // The work for the worker at each stage 1060 | std::vector current_tasks_; 1061 | std::unordered_map> leaf_ops_; 1062 | // Node modifications on each layer, the size of the vector will be the 1063 | // same as the tree height 1064 | typedef std::unordered_map> NodeModsMapType; 1065 | std::vector node_mods_; 1066 | // Spawn a thread and run the worker loop 1067 | boost::thread wthread_; 1068 | // The palm tree the worker belong to 1069 | PalmTree *palmtree_; 1070 | bool done_; 1071 | void start() { 1072 | wthread_ = boost::thread(&WorkerThread::worker_loop, this); 1073 | } 1074 | 1075 | inline int LOWER() { 1076 | auto batch_size = palmtree_->current_batch_->size(); 1077 | auto task_per_thread = batch_size / palmtree_->NUM_WORKER + 1; 1078 | auto LOWER = worker_id_*task_per_thread; 1079 | return LOWER; 1080 | } 1081 | 1082 | inline int UPPER() { 1083 | auto batch_size = palmtree_->current_batch_->size(); 1084 | auto task_per_thread = batch_size / palmtree_->NUM_WORKER + 1; 1085 | auto LOWER = worker_id_*task_per_thread; 1086 | return (worker_id_ == palmtree_->NUM_WORKER-1) ? 
(batch_size) : (LOWER+task_per_thread); 1087 | } 1088 | // The #0 thread is responsible to collect tasks to a batch 1089 | void collect_batch() { 1090 | DLOG(INFO) << "Thread " << worker_id_ << " collect tasks " << palmtree_->BATCH_SIZE; 1091 | 1092 | if (worker_id_ == 0) { 1093 | if (batch_id % 2 == 0) { 1094 | int sleep_time = 0; 1095 | while (sleep_time < 1024) { 1096 | 1097 | bool res = palmtree_->task_batch_queue_.pop(palmtree_->current_batch_); 1098 | if (res) { 1099 | break; 1100 | } else { 1101 | DLOG(INFO) << sleep_time; 1102 | sleep_time++; 1103 | } 1104 | } 1105 | } 1106 | batch_id++; 1107 | // STAT.add_stat(0, "fetch_batch", CycleTimer::currentTicks() - bt); 1108 | // DLOG(INFO) << "Collected a batch of " << palmtree_->current_batch_->size(); 1109 | } 1110 | 1111 | palmtree_->sync(worker_id_); 1112 | if (palmtree_->current_batch_ == nullptr) { 1113 | return; 1114 | } 1115 | 1116 | if (palmtree_->current_batch_->size() == 0) { 1117 | return; 1118 | } 1119 | // STAT.add_stat(worker_id_, "batch_sort", CycleTimer::currentTicks() - bt); 1120 | 1121 | // Partition the task among threads 1122 | int batch_size = palmtree_->current_batch_->size(); 1123 | int task_per_thread = batch_size / palmtree_->NUM_WORKER; 1124 | int task_residue = batch_size - task_per_thread * palmtree_->NUM_WORKER; 1125 | 1126 | int lower = task_per_thread * worker_id_ + std::min(task_residue, worker_id_); 1127 | int upper = lower + task_per_thread + (worker_id_ < task_residue); 1128 | 1129 | DLOG(INFO) << worker_id_ << " got " << lower << " to " << upper << " tasks"; 1130 | for (int i = lower; i < upper; i++) { 1131 | palmtree_->workers_[worker_id_].current_tasks_ 1132 | .push_back(palmtree_->current_batch_->get_op(i)); 1133 | } 1134 | } 1135 | 1136 | // Redistribute the tasks on leaf node 1137 | void redistribute_leaf_tasks(std::unordered_map> &result) { 1138 | #ifdef PROFILE 1139 | auto bt = CycleTimer::currentTicks(); 1140 | #endif 1141 | // First add current tasks 1142 | for (auto op : current_tasks_) { 1143 | if (result.find(op->target_node_) == result.end()) { 1144 | result.emplace(op->target_node_, std::vector()); 1145 | } 1146 | 1147 | result[op->target_node_].push_back(op); 1148 | } 1149 | 1150 | // Then remove nodes that don't belong to the current worker 1151 | for (int i = 0; i < worker_id_; i++) { 1152 | WorkerThread &wthread = palmtree_->workers_[i]; 1153 | for (int j = wthread.current_tasks_.size()-1; j >= 0; j--) { 1154 | auto &op = wthread.current_tasks_[j]; 1155 | if (result.count(op->target_node_) == 0) 1156 | break; 1157 | result.erase(op->target_node_); 1158 | } 1159 | } 1160 | 1161 | for (int i = worker_id_+1; i < palmtree_->NUM_WORKER; i++) { 1162 | WorkerThread &wthread = palmtree_->workers_[i]; 1163 | bool early_break = false; 1164 | for (auto op : wthread.current_tasks_) { 1165 | CHECK(op->target_node_ != nullptr) << "worker " << i <<" hasn't finished search"; 1166 | if (result.find(op->target_node_) != result.end()) { 1167 | result[op->target_node_].push_back(op); 1168 | } else { 1169 | early_break = true; 1170 | break; 1171 | } 1172 | } 1173 | 1174 | if (early_break) 1175 | break; 1176 | } 1177 | 1178 | 1179 | // LOG(INFO) << "Worker " << worker_id_ << " has " << result.size() << " nodes of tasks after task redistribution"; 1180 | 1181 | 1182 | // Calculate number of tasks 1183 | int sum = 0; 1184 | for (auto itr = result.begin(); itr != result.end(); itr++) { 1185 | sum += itr->second.size(); 1186 | } 1187 | 1188 | STAT.add_stat(worker_id_, "leaf_task", sum); 1189 | 1190 | // 
LOG(INFO) << "Worker " << worker_id_ << " has " << result.size() << " nodes of tasks after task redistribution, " << sum << " tasks in total"; 1191 | // std::this_thread::sleep_for(std::chrono::milliseconds(1)); 1192 | 1193 | #ifdef PROFILE 1194 | STAT.add_stat(worker_id_, "redist_leaf", CycleTimer::currentTicks() - bt); 1195 | #endif 1196 | } 1197 | 1198 | /** 1199 | * @brief redistribute inner node tasks for the current thread. It will 1200 | * read @depth layer's information about node modifications and determine 1201 | * tasks that belongs to the current thread. 1202 | * 1203 | * @param layer which layer's modifications are we trying to colelct 1204 | * @param cur_mods the collected tasks will be stored in @cur_mods 1205 | */ 1206 | void redistribute_inner_tasks(int layer, NodeModsMapType &cur_mods) { 1207 | cur_mods = node_mods_[layer]; 1208 | 1209 | // discard 1210 | for (int i = 0; i < worker_id_; i++) { 1211 | auto &wthread = palmtree_->workers_[i]; 1212 | for (auto other_itr = wthread.node_mods_[layer].begin(); other_itr != wthread.node_mods_[layer].end(); other_itr++) { 1213 | cur_mods.erase(other_itr->first); 1214 | } 1215 | } 1216 | 1217 | // Steal work from other threads 1218 | for (int i = worker_id_+1; i < palmtree_->NUM_WORKER; i++) { 1219 | auto &wthread = palmtree_->workers_[i]; 1220 | for (auto other_itr = wthread.node_mods_[layer].begin(); other_itr != wthread.node_mods_[layer].end(); other_itr++) { 1221 | auto itr = cur_mods.find(other_itr->first); 1222 | if (itr != cur_mods.end()) { 1223 | auto &my_mods = itr->second; 1224 | auto &other_mods = other_itr->second; 1225 | my_mods.insert(my_mods.end(), other_mods.begin(), other_mods.end()); 1226 | } 1227 | } 1228 | } 1229 | } 1230 | 1231 | /** 1232 | * @brief carry out all operations on the tree in a serializable order, 1233 | * reduce operations on the same key. 
The result of this function is to 1234 | * provide proper return result for all the operations, as well as filter 1235 | * out the todo node modifications on the #0 layer 1236 | * */ 1237 | void resolve_hazards(const std::unordered_map> &tree_ops UNUSED) { 1238 | #ifdef PROFILE 1239 | auto bt = CycleTimer::currentTicks(); 1240 | #endif 1241 | node_mods_[0].clear(); 1242 | auto &leaf_mods = node_mods_[0]; 1243 | std::unordered_map changed_values; 1244 | std::unordered_set deleted; 1245 | for (auto itr = tree_ops.begin(); itr != tree_ops.end(); itr++) { 1246 | LeafNode *leaf = static_cast(itr->first); 1247 | auto &ops = itr->second; 1248 | for (auto op : ops) { 1249 | if (op->op_type_ == TREE_OP_FIND) { 1250 | if (deleted.find(op->key_) != deleted.end()) { 1251 | op->boolean_result_ = false; 1252 | } else { 1253 | if (changed_values.count(op->key_) != 0) { 1254 | op->result_ = changed_values[op->key_]; 1255 | op->boolean_result_ = true; 1256 | } else { 1257 | int idx = palmtree_->search_leaf(leaf->keys, leaf->slot_used, op->key_); 1258 | if (idx == -1 || !palmtree_->key_eq(leaf->keys[idx], op->key_)) { 1259 | // Not find 1260 | op->boolean_result_ = false; 1261 | } else { 1262 | op->result_ = leaf->values[idx]; 1263 | op->boolean_result_ = true; 1264 | } 1265 | } 1266 | } 1267 | } else if (op->op_type_ == TREE_OP_INSERT) { 1268 | DLOG(INFO) << "Try to insert " << op->key_ << ": " << op->value_; 1269 | deleted.erase(op->key_); 1270 | changed_values[op->key_] = op->value_; 1271 | if (leaf_mods.count(leaf) == 0) 1272 | leaf_mods.emplace(leaf, std::vector()); 1273 | leaf_mods[leaf].push_back(NodeMod(*op)); 1274 | } else { 1275 | CHECK(op->op_type_ == TREE_OP_REMOVE) << "Invalid tree operation"; 1276 | changed_values.erase(op->key_); 1277 | if (leaf_mods.count(leaf) == 0) 1278 | leaf_mods.emplace(leaf, std::vector()); 1279 | leaf_mods[leaf].push_back(NodeMod(*op)); 1280 | } 1281 | } 1282 | } 1283 | 1284 | #ifdef PROFILE 1285 | STAT.add_stat(worker_id_, "resolve_hazards", CycleTimer::currentTicks() - bt); 1286 | #endif 1287 | } // End resolve_hazards 1288 | 1289 | /** 1290 | * @brief Handle root split and re-insert orphaned keys. 
It may need to grow the tree height 1291 | */ 1292 | void handle_root() { 1293 | 1294 | int root_depth = palmtree_->tree_depth_; 1295 | std::vector root_mods; 1296 | // Collect root modifications from all threads 1297 | for (auto &wthread : palmtree_->workers_) { 1298 | auto itr = wthread.node_mods_[root_depth].begin(); 1299 | if (itr != wthread.node_mods_[root_depth].end()) { 1300 | root_mods.insert(root_mods.end(), itr->second.begin(), itr->second.end()); 1301 | } 1302 | } 1303 | // Handle over to modify_node 1304 | auto new_mod = palmtree_->modify_node(palmtree_->tree_root, root_mods); 1305 | if (new_mod.type_ == MOD_TYPE_NONE) { 1306 | DLOG(INFO) << "Root won't split"; 1307 | } else if (new_mod.type_ == MOD_TYPE_ADD) { 1308 | DLOG(INFO) << "Split root"; 1309 | InnerNode *new_root = new InnerNode(nullptr, palmtree_->tree_root->level+1); 1310 | palmtree_->tree_root->parent = new_root; 1311 | palmtree_->add_item(new_root, palmtree_->min_key_, palmtree_->tree_root); 1312 | for (auto itr = new_mod.node_items.begin(); itr != new_mod.node_items.end(); itr++) { 1313 | itr->second->parent = new_root; 1314 | palmtree_->add_item(new_root, itr->first, itr->second); 1315 | } 1316 | palmtree_->tree_root = new_root; 1317 | palmtree_->tree_depth_ += 1; 1318 | for (auto &wthread : palmtree_->workers_) { 1319 | wthread.node_mods_.push_back(NodeModsMapType()); 1320 | } 1321 | palmtree_->layer_width_.emplace_back(new std::atomic(1)); 1322 | } 1323 | // Merge root if neccessary 1324 | while (palmtree_->tree_depth_ >= 2 && palmtree_->tree_root->slot_used == 1) { 1325 | DLOG(INFO) << "Decrease tree depth"; 1326 | // Decrease root height 1327 | auto old_root = static_cast(palmtree_->tree_root); 1328 | palmtree_->tree_root = old_root->values[0]; 1329 | delete old_root; 1330 | palmtree_->tree_depth_ -= 1; 1331 | for (auto &wthread : palmtree_->workers_) { 1332 | wthread.node_mods_.pop_back(); 1333 | } 1334 | delete palmtree_->layer_width_.back(); 1335 | palmtree_->layer_width_.pop_back(); 1336 | } 1337 | DLOG(INFO) << "Insert orphaned"; 1338 | // Naively insert orphaned 1339 | for (auto itr = new_mod.orphaned_kv.begin(); itr != new_mod.orphaned_kv.end(); itr++) { 1340 | DLOG(INFO) << "Insert " << itr->first << " " << itr->second; 1341 | auto leaf = palmtree_->search(itr->first); 1342 | palmtree_->add_item(leaf, itr->first, itr->second); 1343 | } 1344 | palmtree_->ensure_min_key(); 1345 | DLOG(INFO) << "Root handled"; 1346 | } // End of handle_root() 1347 | 1348 | // Worker loop: process tasks 1349 | void worker_loop() { 1350 | while (!done_) { 1351 | // Stage 0, collect work batch and partition 1352 | CycleTimer::SysClock start_tick = CycleTimer::currentTicks(); 1353 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 0: collect tasks"; 1354 | collect_batch(); 1355 | if (worker_id_ == 0) { 1356 | // Check if the tree is destroyed, we must do it before the sync point 1357 | if (palmtree_->destroyed_) { 1358 | for (int i = 0; i < palmtree_->NUM_WORKER; i++) 1359 | palmtree_->workers_[i].done_ = true; 1360 | }; 1361 | } 1362 | CycleTimer::SysClock passed = CycleTimer::currentTicks() - start_tick; 1363 | STAT.add_stat(worker_id_, "stage0", passed); 1364 | palmtree_->sync(worker_id_); 1365 | if (done_) 1366 | LOG(INFO) << "Worker " << worker_id_ << " exit"; 1367 | 1368 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 0 finished"; 1369 | #ifdef PROFILE 1370 | auto s1_bt = CycleTimer::currentTicks(); 1371 | #endif 1372 | // Stage 1, Search for leafs 1373 | DLOG(INFO) << "Worker " << worker_id_ << " got " << 
current_tasks_.size() << " tasks"; 1374 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 1: search for leaves"; 1375 | 1376 | leaf_ops_.clear(); 1377 | std::unordered_map> collected_tasks; 1378 | for (auto op : current_tasks_) { 1379 | op->target_node_ = palmtree_->search(op->key_); 1380 | 1381 | CHECK(op->target_node_ != nullptr) << "search returns nullptr"; 1382 | } 1383 | #ifdef PROFILE 1384 | STAT.add_stat(worker_id_, "stage1", CycleTimer::currentTicks() - s1_bt); 1385 | #endif 1386 | palmtree_->sync(worker_id_); 1387 | 1388 | #ifdef PROFILE 1389 | auto s2_bt = CycleTimer::currentTicks(); 1390 | #endif 1391 | 1392 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 1 finished"; 1393 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 2: Process leaves"; 1394 | // Stage 2, redistribute work, read the tree then modify, each thread 1395 | // will handle the nodes it has searched for, except the nodes that 1396 | // have been handled by workers whose worker_id is less than me. 1397 | // Currently we use a unordered_map to record the ownership of tasks upon 1398 | // certain nodes. 1399 | 1400 | redistribute_leaf_tasks(collected_tasks); 1401 | resolve_hazards(collected_tasks); 1402 | DLOG_IF(INFO, worker_id_ == 0) << "resolved hazards"; 1403 | // Modify nodes 1404 | auto &upper_mods = node_mods_[1]; 1405 | auto &cur_mods = node_mods_[0]; 1406 | upper_mods.clear(); 1407 | for (auto itr = cur_mods.begin() ; itr != cur_mods.end(); itr++) { 1408 | auto node = itr->first; 1409 | auto &mods = itr->second; 1410 | CHECK(node != nullptr) << "Modifying a null node"; 1411 | auto upper_mod = palmtree_->modify_node(node, mods); 1412 | // FIXME: now we have orphaned_keys 1413 | if (upper_mod.type_ == MOD_TYPE_NONE && upper_mod.orphaned_kv.empty()) { 1414 | DLOG(INFO) << "No node modification happened, don't propagate upwards"; 1415 | continue; 1416 | } 1417 | DLOG(INFO) << "Add node modification " << upper_mod.type_ << " to upper layer " << 1; 1418 | if (upper_mods.find(node->parent) == upper_mods.end()) { 1419 | upper_mods.emplace(node->parent, std::vector()); 1420 | } 1421 | upper_mods[node->parent].push_back(upper_mod); 1422 | } 1423 | 1424 | #ifdef PROFILE 1425 | STAT.add_stat(worker_id_, "stage2", CycleTimer::currentTicks() - s2_bt); 1426 | #endif 1427 | palmtree_->sync(worker_id_); 1428 | #ifdef PROFILE 1429 | auto s3_bt = CycleTimer::currentTicks(); 1430 | #endif 1431 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 2 finished"; 1432 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 3: propagate tree modification"; 1433 | // Stage 3, propagate tree modifications back 1434 | // Propagate modifications until root 1435 | for (int layer = 1; layer <= palmtree_->tree_depth_-1; layer++) { 1436 | // DLOG_IF(INFO, worker_id_ == 0) << "Layer #" << layer << " begin"; 1437 | NodeModsMapType cur_mods; 1438 | redistribute_inner_tasks(layer, cur_mods); 1439 | auto &upper_mods = node_mods_[layer+1]; 1440 | upper_mods.clear(); 1441 | for (auto itr = cur_mods.begin(); itr != cur_mods.end(); itr++) { 1442 | auto node = itr->first; 1443 | auto &mods = itr->second; 1444 | DLOG(INFO) << "Stage 3 modify " << node->id; 1445 | auto mod_res = palmtree_->modify_node(node, mods); 1446 | if (upper_mods.count(node->parent) == 0) { 1447 | upper_mods.emplace(node->parent, std::vector()); 1448 | } 1449 | upper_mods[node->parent].push_back(mod_res); 1450 | } 1451 | palmtree_->sync(worker_id_); 1452 | // DLOG_IF(INFO, worker_id_ == 0) << "Layer #" << layer << " done"; 1453 | } // End propagate 1454 | #ifdef PROFILE 1455 | 
STAT.add_stat(worker_id_, "stage3", CycleTimer::currentTicks() - s3_bt); 1456 | #endif 1457 | palmtree_->sync(worker_id_); 1458 | #ifdef PROFILE 1459 | auto s4_bt = CycleTimer::currentTicks(); 1460 | #endif 1461 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 3 finished"; 1462 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 4: Handle root"; 1463 | 1464 | // Stage 4, modify the root, re-insert orphaned, mark work as done 1465 | if (worker_id_ == 0) { 1466 | CycleTimer::SysClock st = CycleTimer::currentTicks(); 1467 | // Mark tasks as done 1468 | handle_root(); 1469 | STAT.add_stat(worker_id_, "end_stage", CycleTimer::currentTicks() - st); 1470 | // palmtree_->ensure_tree_structure(palmtree_->tree_root, 0); 1471 | } 1472 | 1473 | auto st2 = CycleTimer::currentTicks(); 1474 | STAT.add_stat(worker_id_, "deliver tasks", CycleTimer::currentTicks() - st2); 1475 | 1476 | auto st3 = CycleTimer::currentTicks(); 1477 | palmtree_->task_nums -= current_tasks_.size(); 1478 | STAT.add_stat(worker_id_, "dec task num", CycleTimer::currentTicks() - st3); 1479 | 1480 | current_tasks_.clear(); 1481 | #ifdef PROFILE 1482 | STAT.add_stat(worker_id_, "stage4", CycleTimer::currentTicks() - s4_bt); 1483 | #endif 1484 | palmtree_->sync(worker_id_); 1485 | 1486 | // Free the current batch 1487 | 1488 | if (worker_id_ == 0 && batch_id % 2 == 0 && palmtree_->current_batch_ != nullptr) { 1489 | DLOG(INFO) << "Free the current batch"; 1490 | palmtree_->current_batch_->destroy(); 1491 | free(palmtree_->current_batch_); 1492 | palmtree_->current_batch_ = nullptr; 1493 | DLOG(INFO) << "Free-ed"; 1494 | } 1495 | 1496 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 4 finished"; 1497 | 1498 | CycleTimer::SysClock end_tick = CycleTimer::currentTicks(); 1499 | 1500 | STAT.add_stat(worker_id_, "round_time", end_tick-start_tick); 1501 | } // End worker loop 1502 | DLOG(INFO) << "Worker " << worker_id_ << " exited"; 1503 | } 1504 | }; // End WorkerThread 1505 | 1506 | std::vector workers_; 1507 | /********************** 1508 | * PalmTree public * 1509 | * ********************/ 1510 | public: 1511 | std::atomic task_nums; 1512 | 1513 | PalmTree(KeyType min_key, int num_worker): 1514 | tree_depth_(1), 1515 | destroyed_(false), 1516 | min_key_(min_key), 1517 | barrier_(num_worker), 1518 | task_batch_queue_{1024*500} 1519 | { 1520 | NUM_WORKER = num_worker; 1521 | BATCH_SIZE = BATCH_SIZE_PER_WORKER * NUM_WORKER; 1522 | 1523 | LOG(INFO) << "init palm tree with " << NUM_WORKER << " workers"; 1524 | // Init the root node 1525 | tree_root = new InnerNode(nullptr, 1); 1526 | add_item((InnerNode *)tree_root, min_key_, new LeafNode(tree_root, 0)); 1527 | // Init layer width 1528 | layer_width_.push_back(new std::atomic(1)); 1529 | layer_width_.push_back(new std::atomic(1)); 1530 | // Init current batch 1531 | current_batch_ = nullptr; 1532 | tree_current_batch_ = (TaskBatch *)malloc(sizeof(TaskBatch)); 1533 | new (tree_current_batch_) TaskBatch(BATCH_SIZE); 1534 | // Init stats 1535 | 1536 | STAT = Stats(NUM_WORKER); 1537 | STAT.init_metric("batch_sort"); 1538 | STAT.init_metric("stage0"); 1539 | STAT.init_metric("stage1"); 1540 | STAT.init_metric("redist_leaf"); 1541 | STAT.init_metric("resolve_hazards"); 1542 | STAT.init_metric("stage2"); 1543 | STAT.init_metric("stage3"); 1544 | STAT.init_metric("stage4"); 1545 | STAT.init_metric("end_stage"); 1546 | 1547 | STAT.init_metric("search_inner"); 1548 | STAT.init_metric("search_leaf"); 1549 | 1550 | STAT.init_metric("leaf_task"); 1551 | 1552 | STAT.init_metric("sync_time"); 1553 | 
STAT.init_metric("round_time"); 1554 | 1555 | STAT.init_metric("deliver tasks"); 1556 | STAT.init_metric("dec task num"); 1557 | 1558 | // Init the worker thread 1559 | // Init the worker thread and start them 1560 | for (int worker_id = 0; worker_id < NUM_WORKER; worker_id++) { 1561 | workers_.emplace_back(worker_id, this); 1562 | } 1563 | for (auto &worker : workers_) { 1564 | worker.start(); 1565 | } 1566 | 1567 | task_nums = 0; 1568 | } 1569 | 1570 | // Recursively free the resources of one tree node 1571 | void free_recursive(Node *node UNUSED) { 1572 | if (node->type() == INNERNODE) { 1573 | auto ptr = (InnerNode *)node; 1574 | for(int i = 0; i < ptr->slot_used; i++) { 1575 | free_recursive(ptr->values[i]); 1576 | } 1577 | } 1578 | 1579 | delete node; 1580 | } 1581 | 1582 | ~PalmTree() { 1583 | 1584 | // Mark the tree as destroyed 1585 | destroyed_ = true; 1586 | // Join all workter thread 1587 | for (auto &wthread : workers_) 1588 | wthread.wthread_.join(); 1589 | // Free atomic layer width 1590 | while (!layer_width_.empty()) { 1591 | delete layer_width_.back(); 1592 | layer_width_.pop_back(); 1593 | } 1594 | 1595 | STAT.print_stat(); 1596 | 1597 | free_recursive(tree_root); 1598 | 1599 | 1600 | if (tree_current_batch_ != nullptr) { 1601 | tree_current_batch_->destroy(); 1602 | free(tree_current_batch_); 1603 | } 1604 | } 1605 | 1606 | /** 1607 | * @brief execute a batch of tree operations, the batch will be executed 1608 | * cooperatively by all worker threads 1609 | */ 1610 | void execute_batch(std::vector &operations UNUSED) { 1611 | 1612 | } 1613 | 1614 | /** 1615 | * @brief Find the value for a key 1616 | * @param key the key to be retrieved 1617 | * @return nullptr if no such k,v pair 1618 | */ 1619 | bool find(const KeyType &key UNUSED, ValueType &value UNUSED) { 1620 | push_task(TREE_OP_FIND, &key, nullptr); 1621 | 1622 | // op.wait(); 1623 | //if (op.boolean_result_) 1624 | //value = op.result_; 1625 | //return op.boolean_result_; 1626 | return true; 1627 | } 1628 | 1629 | /** 1630 | * @brief insert a k,v into the tree 1631 | */ 1632 | void insert(const KeyType &key UNUSED, const ValueType &value UNUSED) { 1633 | // TreeOp op(TREE_OP_INSERT, key, value); 1634 | 1635 | push_task(TREE_OP_INSERT, &key, &value); 1636 | 1637 | // op.wait(); 1638 | } 1639 | 1640 | /** 1641 | * @brief remove a k,v from the tree 1642 | */ 1643 | void remove(const KeyType &key UNUSED) { 1644 | push_task(TREE_OP_REMOVE, &key, nullptr); 1645 | 1646 | // op->wait(); 1647 | } 1648 | 1649 | void reset_metric() { 1650 | STAT.reset_metric(); 1651 | } 1652 | 1653 | int batch_size() { 1654 | return BATCH_SIZE_PER_WORKER * NUM_WORKER; 1655 | } 1656 | 1657 | // Wait until all task finished 1658 | void wait_finish() { 1659 | if (tree_current_batch_->size() != 0) { 1660 | task_batch_queue_.push(tree_current_batch_); 1661 | tree_current_batch_ = (TaskBatch *)malloc(sizeof(TaskBatch)); 1662 | new (tree_current_batch_) TaskBatch(BATCH_SIZE); 1663 | } 1664 | while (task_nums != 0) 1665 | ; 1666 | } 1667 | }; // End of PalmTree 1668 | // Explicit template initialization 1669 | template class PalmTree; 1670 | } // End of namespace palmtree 1671 | 1672 | --------------------------------------------------------------------------------