├── INSTALL.md ├── .gitignore ├── fineTree ├── CMakeLists.txt ├── Makefile ├── main.cpp └── fineTree.h ├── TODO.md ├── stx_bench ├── Makefile └── stx_bench.cpp ├── Makefile ├── map_test.cpp ├── barrier.h ├── CycleTimer.h ├── main.cpp ├── README.md └── palmtree.h /INSTALL.md: -------------------------------------------------------------------------------- 1 | ### Begin development palmtree 2 | 3 | #### Dependencies 4 | * Boost -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | palmtree 3 | palmtree_test 4 | configure 5 | *.log 6 | build/ 7 | autom4te.cache/ 8 | cscope.out 9 | tags 10 | palmtree_test.dSYM/ 11 | *.data 12 | *.gch 13 | -------------------------------------------------------------------------------- /fineTree/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.5) 2 | project(fineTree) 3 | 4 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") 5 | 6 | set(SOURCE_FILES main.cpp) 7 | add_executable(fineTree ${SOURCE_FILES}) -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | ### TODOs 2 | 3 | - [ ] Support multi-thread no SIMD 4 | - [ ] Multi thread benchmark 5 | - [ ] Support multi-thread with SIMD on searching and pre-sorting of batch 6 | - [ ] Support multiple values for the same key, and remove of k,v pairs 7 | - [ ] Support scan (or STL like iterator interfaces) 8 | -------------------------------------------------------------------------------- /stx_bench/Makefile: -------------------------------------------------------------------------------- 1 | # CFLAGS=-Werror -Wall -std=c++11 -g -ggdb -I/usr/local/include 2 | CFLAGS=-Werror -Wall -std=c++11 -O3 -I/usr/local/include 3 | LDFLAGS=-L/usr/local/lib -lglog 4 | CC=g++ 5 | 6 | all: stx_bench 7 | 8 | stx_bench: stx_bench.cpp 9 | $(CC) $(CFLAGS) -o stx_bench stx_bench.cpp $(LDFLAGS) 10 | 11 | 12 | clean: 13 | rm -rf stx_bench *.o 14 | -------------------------------------------------------------------------------- /fineTree/Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS=-Werror -Wall -std=c++11 -mavx2 -pedantic -O3 -I/usr/local/include 2 | 3 | LDFLAGS=-L/usr/local/lib -lboost_atomic -lboost_system -lboost_thread -lglog #-ljemalloc 4 | CC=g++ 5 | 6 | all: fineTree_test 7 | 8 | fineTree_test: main.cpp fineTree.h 9 | $(CC) $(CFLAGS) -o fineTree_test main.cpp $(LDFLAGS) 10 | 11 | clean: 12 | rm -rf fineTree_test *.o -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # CFLAGS=-Werror -Wall -std=c++11 -mavx2 -g -ggdb -I/usr/local/include 2 | # CFLAGS=-Werror -Wall -std=c++11 -msse4.1 -pedantic -O3 -I/usr/local/include 3 | # CFLAGS=-Werror -Wall -std=c++11 -mavx2 -g -ggdb -I/usr/local/include 4 | CFLAGS=-Werror -Wall -std=c++11 -mavx2 -O3 -I/usr/local/include 5 | 6 | LDFLAGS=-L/usr/local/lib -lboost_atomic -lboost_system -lboost_thread -lglog -ljemalloc -lpthread 7 | CC=g++ 8 | 9 | all: palmtree_test 10 | 11 | palmtree_test: main.cpp palmtree.h barrier.h 12 | $(CC) $(CFLAGS) -o palmtree_test main.cpp $(LDFLAGS) 13 | 14 | map_test: map_test.cpp 15 | $(CC) -std=c++11 -O3 -o map_test map_test.cpp 16 | time ./map_test 17 | 
18 | clean: 19 | rm -rf palmtree_test map_test *.o 20 | -------------------------------------------------------------------------------- /map_test.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Zrs_y on 4/28/16. 3 | // 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "CycleTimer.h" 10 | 11 | #define TEST_SIZE 10240000 12 | 13 | int main() { 14 | 15 | int *buff = new int[TEST_SIZE]; 16 | for(int i = 0; i < TEST_SIZE; i++) { 17 | buff[i] = i; 18 | } 19 | 20 | //std::random_shuffle(buff, buff + TEST_SIZE); 21 | 22 | auto begin_time = CycleTimer::currentSeconds(); 23 | std::map t; 24 | 25 | for(int i = 0; i < TEST_SIZE; i++) { 26 | //auto kv = buff[i]; 27 | auto kv = i; 28 | t[kv] = kv; 29 | if (t[kv] != kv) { 30 | return 0; 31 | } 32 | } 33 | 34 | auto end_time = CycleTimer::currentSeconds(); 35 | std::cout << "dict's size is " << t.size() << std::endl; 36 | std::cout << "running time is " << end_time - begin_time << " seconds" << std::endl; 37 | delete buff; 38 | return 0; 39 | } 40 | -------------------------------------------------------------------------------- /barrier.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Runshen Zhu on 4/28/16. 3 | // 4 | 5 | #pragma once 6 | #include 7 | 8 | // a re-useable barrier for sync-ing among multiple working threads 9 | class Barrier { 10 | public: 11 | Barrier() = delete; 12 | Barrier(int n): P(n) { 13 | m_generation = 0; 14 | m_count = n; 15 | } 16 | 17 | // wait blocks until all P threads arrive the barrier and call it. 18 | bool wait() { 19 | lock.lock(); 20 | auto gen = m_generation.load(); 21 | 22 | if (--m_count == 0) { 23 | m_generation++; 24 | m_count = P; 25 | lock.unlock(); 26 | return true; 27 | } 28 | 29 | lock.unlock(); 30 | 31 | while (gen == m_generation); 32 | return false; 33 | } 34 | 35 | private: 36 | class spinlock { 37 | // a tick based spinlock 38 | // traditional CAS spin lock has lots of bus traffic 39 | // this implementation is aimed to ease the bus traffic 40 | // 41 | // traditional CAS lock: 42 | // let const int unlock = 1, lock = 0; 43 | // lock() { while( CAS(LOCK_, unlock, lock) == false ) {} } 44 | // 45 | // each CAS call is equal to a bus write, which invalids cache line 46 | // and bring bus traffic. So it's not a good idea to keep CAS-ing on 47 | // the value LOCK_ 48 | // 49 | // a simple way to ease it is to use `test and CAS` approach: 50 | // lock() { 51 | // for(;;) { 52 | // if (LOCK_ == unlock && CAS(LOCK_, unlock, lock) == true) { 53 | // return; 54 | // } 55 | // } 56 | // } 57 | // 58 | // compared with `test and CAS` approach, tick based spin lock takes a further 59 | // step and reduces more traffic than `test and CAS` spin lock. 
(because there are 60 | // still confilicts in `test and CAS` approach) 61 | public: 62 | spinlock() { 63 | next_ticket = 0; 64 | now_serving = 0; 65 | } 66 | 67 | void lock() { 68 | auto my_ticket = next_ticket++; 69 | while(my_ticket != now_serving) ; 70 | } 71 | 72 | void unlock() { 73 | now_serving++; 74 | } 75 | 76 | private: 77 | std::atomic next_ticket; 78 | std::atomic now_serving; 79 | }; 80 | spinlock lock; 81 | 82 | // # of threads that haven't arrived this barrier 83 | std::atomic m_count; 84 | // generation of this barrier 85 | std::atomic m_generation; 86 | // # of threads that are using this barrier 87 | int P; 88 | }; -------------------------------------------------------------------------------- /stx_bench/stx_bench.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Zrs_y on 5/4/16. 3 | // 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "../CycleTimer.h" 12 | 13 | class fast_random { 14 | public: 15 | fast_random(unsigned long seed) : seed(0) { set_seed0(seed); } 16 | 17 | inline unsigned long next() { 18 | return ((unsigned long)next(32) << 32) + next(32); 19 | } 20 | 21 | inline uint32_t next_u32() { return next(32); } 22 | 23 | inline uint16_t next_u16() { return (uint16_t)next(16); } 24 | 25 | /** [0.0, 1.0) */ 26 | inline double next_uniform() { 27 | return (((unsigned long)next(26) << 27) + next(27)) / (double)(1L << 53); 28 | } 29 | 30 | inline char next_char() { return next(8) % 256; } 31 | 32 | inline char next_readable_char() { 33 | static const char readables[] = 34 | "0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"; 35 | return readables[next(6)]; 36 | } 37 | 38 | inline std::string next_string(size_t len) { 39 | std::string s(len, 0); 40 | for (size_t i = 0; i < len; i++) s[i] = next_char(); 41 | return s; 42 | } 43 | 44 | inline std::string next_readable_string(size_t len) { 45 | std::string s(len, 0); 46 | for (size_t i = 0; i < len; i++) s[i] = next_readable_char(); 47 | return s; 48 | } 49 | 50 | inline unsigned long get_seed() { return seed; } 51 | 52 | inline void set_seed(unsigned long seed) { this->seed = seed; } 53 | 54 | private: 55 | inline void set_seed0(unsigned long seed) { 56 | this->seed = (seed ^ 0x5DEECE66DL) & ((1L << 48) - 1); 57 | } 58 | 59 | inline unsigned long next(unsigned int bits) { 60 | seed = (seed * 0x5DEECE66DL + 0xBL) & ((1L << 48) - 1); 61 | return (unsigned long)(seed >> (48 - bits)); 62 | } 63 | 64 | unsigned long seed; 65 | }; 66 | 67 | 68 | void readonly_bench(size_t entry_count, size_t read_count) { 69 | 70 | LOG(INFO) << "Running std map"; 71 | stx::btree_map map; 72 | for (size_t i = 0; i < entry_count; i++) 73 | map.insert(std::make_pair(i, i)); 74 | 75 | fast_random rng(time(0)); 76 | auto start = CycleTimer::currentSeconds(); 77 | for (size_t i = 0; i < read_count; i++) { 78 | int rand_key = rng.next_u32() % entry_count; 79 | map.find(rand_key); 80 | } 81 | auto end = CycleTimer::currentSeconds(); 82 | LOG(INFO) << "stx map run for " << end-start << "s, " << "thput:" << std::fixed << read_count/(end-start)/1000 << " K rps"; 83 | 84 | } 85 | 86 | int main() { 87 | readonly_bench(1024*512, 1024*1024*10); 88 | return 0; 89 | } 90 | -------------------------------------------------------------------------------- /fineTree/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include "fineTree.h" 
6 | #include "../CycleTimer.h" 7 | using namespace std; 8 | 9 | class fast_random { 10 | public: 11 | fast_random(unsigned long seed) : seed(0) { set_seed0(seed); } 12 | 13 | inline unsigned long next() { 14 | return ((unsigned long)next(32) << 32) + next(32); 15 | } 16 | 17 | inline uint32_t next_u32() { return next(32); } 18 | 19 | inline uint16_t next_u16() { return (uint16_t)next(16); } 20 | 21 | /** [0.0, 1.0) */ 22 | inline double next_uniform() { 23 | return (((unsigned long)next(26) << 27) + next(27)) / (double)(1L << 53); 24 | } 25 | 26 | inline char next_char() { return next(8) % 256; } 27 | 28 | inline char next_readable_char() { 29 | static const char readables[] = 30 | "0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"; 31 | return readables[next(6)]; 32 | } 33 | 34 | inline std::string next_string(size_t len) { 35 | std::string s(len, 0); 36 | for (size_t i = 0; i < len; i++) s[i] = next_char(); 37 | return s; 38 | } 39 | 40 | inline std::string next_readable_string(size_t len) { 41 | std::string s(len, 0); 42 | for (size_t i = 0; i < len; i++) s[i] = next_readable_char(); 43 | return s; 44 | } 45 | 46 | inline unsigned long get_seed() { return seed; } 47 | 48 | inline void set_seed(unsigned long seed) { this->seed = seed; } 49 | 50 | private: 51 | inline void set_seed0(unsigned long seed) { 52 | this->seed = (seed ^ 0x5DEECE66DL) & ((1L << 48) - 1); 53 | } 54 | 55 | inline unsigned long next(unsigned int bits) { 56 | seed = (seed * 0x5DEECE66DL + 0xBL) & ((1L << 48) - 1); 57 | return (unsigned long)(seed >> (48 - bits)); 58 | } 59 | 60 | unsigned long seed; 61 | }; 62 | 63 | 64 | int main() { 65 | cout << "Hello, World!" << endl; 66 | 67 | fineTree fTree(0xffffffff); 68 | 69 | auto entry_count = 1024 * 51200; 70 | auto read_count = 1024*1024*1; 71 | int *buff = new int[entry_count]; 72 | for(int i = 0; i < entry_count; i++) { 73 | buff[i] = i; 74 | } 75 | 76 | std::random_shuffle(buff, buff + entry_count); 77 | 78 | for(int j = 0; j < entry_count; j++) { 79 | auto kv = buff[j]; 80 | fTree.insert(kv, kv); 81 | } 82 | auto fp = &fTree; 83 | 84 | 85 | auto start = CycleTimer::currentSeconds(); 86 | std::vector threads; 87 | int w_n = 4; 88 | for(int j = 0; j < w_n; j ++) { 89 | threads.push_back(std::thread([fp, w_n, read_count, entry_count]() { 90 | fast_random rng(time(0)); 91 | for (int i = 0; i < read_count / w_n; i++) { 92 | int rand_key = rng.next_u32() % entry_count; 93 | int val; 94 | auto res = fp->search(rand_key, val); 95 | if (res != 0 || val != rand_key) { 96 | LOG(FATAL) << "search fail"; 97 | } 98 | } 99 | })); 100 | } 101 | 102 | for (auto &t : threads) { 103 | t.join(); 104 | } 105 | auto end = CycleTimer::currentSeconds(); 106 | LOG(INFO) << "fineTree run for " << end-start << "s, " << "thput:" << std::fixed << read_count/(end-start)/1000 << " K rps"; 107 | 108 | delete buff; 109 | } -------------------------------------------------------------------------------- /CycleTimer.h: -------------------------------------------------------------------------------- 1 | #ifndef _SYRAH_CYCLE_TIMER_H_ 2 | #define _SYRAH_CYCLE_TIMER_H_ 3 | 4 | #if defined(__APPLE__) 5 | #if defined(__x86_64__) 6 | #include 7 | #else 8 | #include 9 | #include 10 | #endif // __x86_64__ or not 11 | 12 | #include // fprintf 13 | #include // exit 14 | 15 | #elif _WIN32 16 | # include 17 | # include 18 | #else 19 | # include 20 | # include 21 | # include 22 | # include 23 | #endif 24 | 25 | 26 | // This uses the cycle counter of the processor. 
Different 27 | // processors in the system will have different values for this. If 28 | // you process moves across processors, then the delta time you 29 | // measure will likely be incorrect. This is mostly for fine 30 | // grained measurements where the process is likely to be on the 31 | // same processor. For more global things you should use the 32 | // Time interface. 33 | 34 | // Also note that if you processors' speeds change (i.e. processors 35 | // scaling) or if you are in a heterogenous environment, you will 36 | // likely get spurious results. 37 | class CycleTimer { 38 | public: 39 | typedef unsigned long long SysClock; 40 | 41 | ////////// 42 | // Return the current CPU time, in terms of clock ticks. 43 | // Time zero is at some arbitrary point in the past. 44 | static SysClock currentTicks() { 45 | #if defined(__APPLE__) && !defined(__x86_64__) 46 | return mach_absolute_time(); 47 | #elif defined(_WIN32) 48 | LARGE_INTEGER qwTime; 49 | QueryPerformanceCounter(&qwTime); 50 | return qwTime.QuadPart; 51 | #elif defined(__x86_64__) 52 | unsigned int a, d; 53 | asm volatile("rdtsc" : "=a" (a), "=d" (d)); 54 | return static_cast(a) | 55 | (static_cast(d) << 32); 56 | #elif defined(__ARM_NEON__) && 0 // mrc requires superuser. 57 | unsigned int val; 58 | asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(val)); 59 | return val; 60 | #else 61 | timespec spec; 62 | clock_gettime(CLOCK_THREAD_CPUTIME_ID, &spec); 63 | return CycleTimer::SysClock(static_cast(spec.tv_sec) * 1e9 + static_cast(spec.tv_nsec)); 64 | #endif 65 | } 66 | 67 | ////////// 68 | // Return the current CPU time, in terms of seconds. 69 | // This is slower than currentTicks(). Time zero is at 70 | // some arbitrary point in the past. 71 | static double currentSeconds() { 72 | return currentTicks() * secondsPerTick(); 73 | } 74 | 75 | ////////// 76 | // Return the conversion from seconds to ticks. 77 | static double ticksPerSecond() { 78 | return 1.0/secondsPerTick(); 79 | } 80 | 81 | static const char* tickUnits() { 82 | #if defined(__APPLE__) && !defined(__x86_64__) 83 | return "ns"; 84 | #elif defined(__WIN32__) || defined(__x86_64__) 85 | return "cycles"; 86 | #else 87 | return "ns"; // clock_gettime 88 | #endif 89 | } 90 | 91 | ////////// 92 | // Return the conversion from ticks to seconds. 93 | static double secondsPerTick() { 94 | static bool initialized = false; 95 | static double secondsPerTick_val; 96 | if (initialized) return secondsPerTick_val; 97 | #if defined(__APPLE__) 98 | #ifdef __x86_64__ 99 | int args[] = {CTL_HW, HW_CPU_FREQ}; 100 | unsigned int Hz; 101 | size_t len = sizeof(Hz); 102 | if (sysctl(args, 2, &Hz, &len, NULL, 0) != 0) { 103 | fprintf(stderr, "Failed to initialize secondsPerTick_val!\n"); 104 | exit(-1); 105 | } 106 | secondsPerTick_val = 1.0 / (double) Hz; 107 | #else 108 | mach_timebase_info_data_t time_info; 109 | mach_timebase_info(&time_info); 110 | 111 | // Scales to nanoseconds without 1e-9f 112 | secondsPerTick_val = (1e-9*static_cast(time_info.numer))/ 113 | static_cast(time_info.denom); 114 | #endif // x86_64 or not 115 | #elif defined(_WIN32) 116 | LARGE_INTEGER qwTicksPerSec; 117 | QueryPerformanceFrequency(&qwTicksPerSec); 118 | secondsPerTick_val = 1.0/static_cast(qwTicksPerSec.QuadPart); 119 | #else 120 | FILE *fp = fopen("/proc/cpuinfo","r"); 121 | char input[1024]; 122 | if (!fp) { 123 | fprintf(stderr, "CycleTimer::resetScale failed: couldn't find /proc/cpuinfo."); 124 | exit(-1); 125 | } 126 | // In case we don't find it, e.g. 
on the N900 127 | secondsPerTick_val = 1e-9; 128 | while (!feof(fp) && fgets(input, 1024, fp)) { 129 | // NOTE(boulos): Because reading cpuinfo depends on dynamic 130 | // frequency scaling it's better to read the @ sign first 131 | float GHz, MHz; 132 | if (strstr(input, "model name")) { 133 | char* at_sign = strstr(input, "@"); 134 | if (at_sign) { 135 | char* after_at = at_sign + 1; 136 | char* GHz_str = strstr(after_at, "GHz"); 137 | char* MHz_str = strstr(after_at, "MHz"); 138 | if (GHz_str) { 139 | *GHz_str = '\0'; 140 | if (1 == sscanf(after_at, "%f", &GHz)) { 141 | //printf("GHz = %f\n", GHz); 142 | secondsPerTick_val = 1e-9f / GHz; 143 | break; 144 | } 145 | } else if (MHz_str) { 146 | *MHz_str = '\0'; 147 | if (1 == sscanf(after_at, "%f", &MHz)) { 148 | //printf("MHz = %f\n", MHz); 149 | secondsPerTick_val = 1e-6f / GHz; 150 | break; 151 | } 152 | } 153 | } 154 | } else if (1 == sscanf(input, "cpu MHz : %f", &MHz)) { 155 | //printf("MHz = %f\n", MHz); 156 | secondsPerTick_val = 1e-6f / MHz; 157 | break; 158 | } 159 | } 160 | fclose(fp); 161 | #endif 162 | 163 | initialized = true; 164 | return secondsPerTick_val; 165 | } 166 | 167 | ////////// 168 | // Return the conversion from ticks to milliseconds. 169 | static double msPerTick() { 170 | return secondsPerTick() * 1000.0; 171 | } 172 | 173 | private: 174 | CycleTimer(); 175 | }; 176 | 177 | #endif // #ifndef _SYRAH_CYCLE_TIMER_H_ 178 | -------------------------------------------------------------------------------- /fineTree/fineTree.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Zrs_y on 5/5/16. 3 | // 4 | 5 | #ifndef FINETREE_FINETREE_H 6 | #define FINETREE_FINETREE_H 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "immintrin.h" 13 | 14 | #define UNUSED __attribute__((unused)) 15 | 16 | enum NodeType { 17 | INNERNODE = 0, 18 | LEAFNODE 19 | }; 20 | 21 | static std::atomic NODE_NUM(0); 22 | 23 | template , 26 | typename KeyComparator = std::less > 27 | class fineTree { 28 | 29 | public: 30 | fineTree(KeyType min_key) { 31 | this->min_key = min_key; 32 | root = new InnerNode(nullptr, 1); 33 | auto child = new LeafNode(root, 0); 34 | 35 | add_item_inner(root, min_key, child); 36 | 37 | } 38 | 39 | int search(KeyType UNUSED key, ValueType &res) { 40 | auto ptr = (InnerNode *)root; 41 | ptr->lock_shared(); 42 | for (;;) { 43 | CHECK(ptr->slot_used > 0) << "Search empty inner node"; 44 | 45 | auto idx = this->search_inner(ptr->keys, ptr->slot_used, key); 46 | CHECK(idx != -1) << "search innerNode fail"; 47 | CHECK(key_less(ptr->keys[idx], key) || key_eq(ptr->keys[idx], key)); 48 | if(idx + 1 < ptr->slot_used) { 49 | CHECK(key_less(key, ptr->keys[idx + 1])); 50 | } 51 | auto child = ptr->values[idx]; 52 | child->lock_shared(); 53 | ptr->unlock_shared(); 54 | if (child->type() == LEAFNODE) { 55 | auto leaf = (LeafNode *)child; 56 | idx = search_leaf(leaf->keys, leaf->slot_used, key); 57 | if (idx < 0) { 58 | child->unlock_shared(); 59 | return -1; 60 | }else{ 61 | res = leaf->values[idx]; 62 | child->unlock_shared(); 63 | return 0; 64 | } 65 | } else { 66 | ptr = (InnerNode *)child; 67 | } 68 | } 69 | 70 | 71 | return 0; 72 | } 73 | 74 | void insert(KeyType UNUSED key, ValueType UNUSED val) { 75 | root->lock_exclusive(); 76 | 77 | auto new_child = insert_inner((InnerNode *)root, key, val); 78 | 79 | if(new_child == nullptr) { 80 | root->unlock_execlusive(); 81 | return; 82 | } 83 | 84 | auto new_root = new InnerNode(nullptr, root->level + 1); 85 | 
root->parent = new_root; 86 | new_child->parent = new_root; 87 | 88 | add_item_inner(new_root, root->keys[0], root); 89 | add_item_inner(new_root, ((InnerNode *)new_child)->keys[0], new_child); 90 | 91 | root = new_root; 92 | root->values[0]->unlock_execlusive(); 93 | } 94 | 95 | void test() { 96 | root->upgrade_lock(); 97 | std::cout << "lock shared" << std::endl; 98 | root->upgrade_lock(); 99 | std::cout << "lock exclusive" << std::endl; 100 | } 101 | 102 | private: 103 | KeyType min_key; 104 | // Max number of slots per inner node 105 | static const int INNER_MAX_SLOT = 64; 106 | // Max number of slots per leaf node 107 | static const int LEAF_MAX_SLOT = 128; 108 | 109 | class spinlock { 110 | public: 111 | spinlock() { 112 | next_ticket = 0; 113 | now_serving = 0; 114 | } 115 | 116 | void lock() { 117 | auto my_ticket = next_ticket++; 118 | while(my_ticket != now_serving) ; 119 | } 120 | 121 | void unlock() { 122 | now_serving++; 123 | } 124 | 125 | private: 126 | std::atomic next_ticket; 127 | std::atomic now_serving; 128 | }; 129 | 130 | 131 | struct Node { 132 | // Number of actually used slots 133 | int slot_used; 134 | int id; 135 | int level; 136 | KeyType lower_bound; 137 | Node *parent; 138 | 139 | 140 | Node() = delete; 141 | Node(Node *p, int lvl): slot_used(0), level(lvl), parent(p) { 142 | id = NODE_NUM++; 143 | }; 144 | 145 | void lock_shared() { 146 | lock.lock(); 147 | } 148 | 149 | void unlock_shared() { 150 | lock.unlock(); 151 | } 152 | 153 | void lock_exclusive() { 154 | lock.lock(); 155 | } 156 | 157 | void unlock_execlusive() { 158 | lock.unlock(); 159 | } 160 | 161 | // upgrade to exclusive lock 162 | void upgrade_lock() { 163 | lock.lock(); 164 | } 165 | 166 | // downgrade to shared lock 167 | void downgrade_lock() { 168 | lock.unlock(); 169 | } 170 | 171 | 172 | // boost::upgrade_mutex lock; 173 | spinlock lock; 174 | virtual ~Node() {}; 175 | virtual std::string to_string() = 0; 176 | virtual NodeType type() const = 0; 177 | virtual bool is_few() = 0; 178 | }; 179 | 180 | struct InnerNode : public Node { 181 | InnerNode() = delete; 182 | InnerNode(Node *parent, int level): Node(parent, level){}; 183 | virtual ~InnerNode() {}; 184 | // Keys for values 185 | KeyType keys[LEAF_MAX_SLOT]; 186 | // Pointers for child nodes 187 | Node *values[LEAF_MAX_SLOT]; 188 | 189 | virtual NodeType type() const { 190 | return INNERNODE; 191 | } 192 | 193 | virtual std::string to_string() { 194 | std::string res; 195 | res += "InnerNode[" + std::to_string(Node::id) + " @ " + std::to_string(Node::level) + "] "; 196 | // res += std::to_string(Node::slot_used); 197 | for (int i = 0 ; i < Node::slot_used ; i++) { 198 | res += " " + std::to_string(keys[i]) + ":" + std::to_string(values[i]->id); 199 | } 200 | return res; 201 | } 202 | 203 | inline bool is_full() const { 204 | return Node::slot_used == MAX_SLOT(); 205 | } 206 | 207 | 208 | inline size_t MAX_SLOT() const { 209 | return LEAF_MAX_SLOT; 210 | } 211 | 212 | virtual inline bool is_few() { 213 | return Node::slot_used < MAX_SLOT()/4 || Node::slot_used == 0; 214 | } 215 | 216 | }; 217 | 218 | struct LeafNode : public Node { 219 | LeafNode() = delete; 220 | LeafNode(Node *parent, int level): Node(parent, level){}; 221 | virtual ~LeafNode() {}; 222 | 223 | // Keys and values for leaf node 224 | KeyType keys[INNER_MAX_SLOT]; 225 | ValueType values[INNER_MAX_SLOT]; 226 | 227 | virtual NodeType type() const { 228 | return LEAFNODE; 229 | } 230 | 231 | virtual std::string to_string() { 232 | std::string res; 233 | res += "LeafNode[" 
+ std::to_string(Node::id) + " @ " + std::to_string(Node::level) + "] "; 234 | 235 | for (int i = 0 ; i < Node::slot_used ; i++) { 236 | res += " " + std::to_string(keys[i]) + ":" + std::to_string(values[i]); 237 | } 238 | return res; 239 | } 240 | 241 | inline bool is_full() const { 242 | return Node::slot_used == MAX_SLOT(); 243 | } 244 | 245 | inline size_t MAX_SLOT() const { 246 | return INNER_MAX_SLOT; 247 | } 248 | 249 | virtual inline bool is_few() { 250 | return Node::slot_used < MAX_SLOT()/4 || Node::slot_used == 0; 251 | } 252 | }; 253 | 254 | // Return true if k1 < k2 255 | inline bool key_less(const KeyType &k1, const KeyType &k2) { 256 | return k1 < k2; 257 | } 258 | // Return true if k1 == k2 259 | inline bool key_eq(const KeyType &k1, const KeyType &k2) { 260 | return k1 == k2; 261 | } 262 | 263 | // Return the index of the largest slot whose key <= @target 264 | // assume there is no duplicated element 265 | int search_inner(const KeyType *input, int size, const KeyType &target) { 266 | // auto bt = CycleTimer::currentTicks(); 267 | int low = 0, high = size - 1; 268 | while (low != high) { 269 | int mid = (low + high) / 2 + 1; 270 | if (key_less(target, input[mid])) { 271 | // target < input[mid] 272 | high = mid - 1; 273 | } 274 | else { 275 | // target >= input[mid]; 276 | low = mid; 277 | } 278 | } 279 | // STAT.add_stat(0, "search_inner", CycleTimer::currentTicks() - bt); 280 | 281 | if (low == size) { 282 | return -1; 283 | } 284 | return low; 285 | } 286 | 287 | int search_leaf(const KeyType *data, int size, const KeyType &target) { 288 | // auto bt = CycleTimer::currentTicks(); 289 | const __m256i keys = _mm256_set1_epi32(target); 290 | 291 | const auto n = size; 292 | const auto rounded = 8 * (n/8); 293 | 294 | for (int i=0; i < rounded; i += 8) { 295 | 296 | const __m256i vec1 = _mm256_loadu_si256(reinterpret_cast(&data[i])); 297 | 298 | const __m256i cmp1 = _mm256_cmpeq_epi32(vec1, keys); 299 | 300 | const uint32_t mask = _mm256_movemask_epi8(cmp1); 301 | 302 | if (mask != 0) { 303 | // STAT.add_stat(0, "search_leaf", CycleTimer::currentTicks() - bt); 304 | return i + __builtin_ctz(mask)/4; 305 | } 306 | } 307 | 308 | for (int i = rounded; i < n; i++) { 309 | if (data[i] == target) { 310 | // STAT.add_stat(0, "search_leaf", CycleTimer::currentTicks() - bt); 311 | return i; 312 | } 313 | } 314 | 315 | // STAT.add_stat(0, "search_leaf", CycleTimer::currentTicks() - bt); 316 | return -1; 317 | } 318 | 319 | 320 | 321 | void add_item_leaf(LeafNode *node, KeyType key, ValueType value) { 322 | auto idx = node->slot_used++; 323 | node->keys[idx] = key; 324 | node->values[idx] = value; 325 | return; 326 | } 327 | 328 | void add_item_inner(InnerNode *node, KeyType key, Node *value) { 329 | // add item to inner node 330 | // ensure it's order 331 | 332 | if(node->slot_used == 0) { 333 | node->keys[0] = key; 334 | node->values[0] = value; 335 | node->slot_used++; 336 | return; 337 | } 338 | 339 | auto idx = search_inner(node->keys, node->slot_used, key); 340 | 341 | CHECK(idx != -1) << "search innerNode fail" << key <<" " <keys[0]; 342 | CHECK(key_less(node->keys[idx], key) || key_eq(node->keys[idx], key)); 343 | if(idx + 1 < node->slot_used) { 344 | CHECK(key_less(key, node->keys[idx + 1])) << "search inner fail"; 345 | } 346 | 347 | auto k = key; 348 | auto v = value; 349 | 350 | for(int i = idx + 1; i < node->slot_used; i++) { 351 | std::swap(node->keys[i], k); 352 | std::swap(node->values[i], v); 353 | } 354 | 355 | node->keys[node->slot_used] = k; 356 | 
node->values[node->slot_used] = v; 357 | node->slot_used++; 358 | } 359 | 360 | Node *insert_leaf(LeafNode *node, KeyType key, ValueType value) { 361 | // assume we hold the exclusive lock of leaf 362 | 363 | 364 | // node not full 365 | // simple add item to leaf 366 | if(!node->is_full()) { 367 | add_item_leaf(node, key, value); 368 | return nullptr; 369 | } 370 | 371 | // otherwise, firstly buff all elements 372 | std::vector> buff; 373 | for(int i = 0; i < node->slot_used; i++) { 374 | buff.push_back(std::make_pair(node->keys[i], node->values[i])); 375 | } 376 | buff.push_back(std::make_pair(key, value)); 377 | 378 | 379 | // sort 380 | std::sort(buff.begin(), buff.end(), [this](const std::pair &p1, const std::pair &p2) { 381 | return key_less(p1.first, p2.first); 382 | }); 383 | 384 | 385 | // split into 2 parts 386 | // store the second half to new node 387 | auto half = buff.size() / 2; 388 | auto itr = buff.begin(); 389 | node->slot_used = 0; 390 | for(int i = 0; i < half; i++) { 391 | add_item_leaf(node, itr->first, itr->second); 392 | itr++; 393 | } 394 | 395 | auto new_child = new LeafNode(node->parent, 0); 396 | 397 | while(itr != buff.end()) { 398 | add_item_leaf(new_child, itr->first, itr->second); 399 | itr++; 400 | } 401 | 402 | 403 | // return the new node to upper layer 404 | return new_child; 405 | } 406 | 407 | Node *insert_inner(InnerNode *node, KeyType key, ValueType value) { 408 | // assume we hold the exclusive lock before entering this function 409 | 410 | // firstly, find the child to insert 411 | auto idx = search_inner(node->keys, node->slot_used, key); 412 | CHECK(idx != -1) << "search fail"; 413 | auto child = node->values[idx]; 414 | Node *new_child = nullptr; 415 | if(child->type() == LEAFNODE) { 416 | child->lock_exclusive(); 417 | new_child = insert_leaf((LeafNode *)child, key, value); 418 | }else { 419 | // child->lock_shared(); 420 | child->lock_exclusive(); 421 | new_child = insert_inner((InnerNode *)child, key, value); 422 | } 423 | 424 | // child not split 425 | if(new_child == nullptr) { 426 | child->unlock_execlusive(); 427 | return nullptr; 428 | } 429 | 430 | // child split 431 | KeyType new_key; 432 | if(new_child->type() == LEAFNODE) { 433 | new_key = ((LeafNode *)new_child)->keys[0]; 434 | }else{ 435 | new_key = ((InnerNode *)new_child)->keys[0]; 436 | } 437 | 438 | // node not split 439 | if(!node->is_full()) { 440 | add_item_inner(node, new_key, new_child); 441 | child->unlock_execlusive(); 442 | return nullptr; 443 | } 444 | 445 | // node also need split 446 | 447 | // lock all children 448 | for(int i = 0; i < node->slot_used; i++) { 449 | if(node->values[i] != child) { 450 | node->values[i]->lock_exclusive(); 451 | } 452 | } 453 | 454 | 455 | // buff all elements 456 | std::vector> buff; 457 | for(int i = 0; i < node->slot_used; i++) { 458 | buff.push_back(std::make_pair(node->keys[i], node->values[i])); 459 | } 460 | buff.push_back(std::make_pair(new_key, new_child)); 461 | 462 | 463 | // sort 464 | std::sort(buff.begin(), buff.end(), [this](const std::pair &p1, const std::pair &p2) { 465 | return key_less(p1.first, p2.first); 466 | }); 467 | 468 | 469 | // store half 470 | auto half = buff.size() / 2; 471 | auto itr = buff.begin(); 472 | node->slot_used = 0; 473 | for(int i = 0; i < half; i++) { 474 | node->keys[i] = itr->first; 475 | node->values[i] = itr->second; 476 | node->slot_used++; 477 | itr++; 478 | } 479 | 480 | // new node store another half 481 | auto new_inner = new InnerNode(node->parent, node->level); 482 | 483 | int i = 
0; 484 | while(itr != buff.end()) { 485 | new_inner->keys[i] = itr->first; 486 | new_inner->values[i] = itr->second; 487 | new_inner->slot_used++; 488 | itr->second->parent = new_inner; 489 | itr++; 490 | i++; 491 | } 492 | 493 | 494 | // unlock children 495 | for(int i = 0; i < node->slot_used; i++) { 496 | if(node->values[i] != new_child) { 497 | node->values[i]->unlock_execlusive(); 498 | } 499 | } 500 | 501 | for(int i = 0; i < new_inner->slot_used; i++) { 502 | if(new_inner->values[i] != new_child) { 503 | new_inner->values[i]->unlock_execlusive(); 504 | } 505 | } 506 | return new_inner; 507 | } 508 | 509 | InnerNode *root; 510 | 511 | }; 512 | 513 | 514 | #endif //FINETREE_FINETREE_H 515 | -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #define NDEBUG 2 | 3 | #include 4 | #include 5 | #include 6 | #include "palmtree.h" 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "CycleTimer.h" 17 | 18 | #define TEST_SIZE 10240000 19 | using namespace std; 20 | 21 | int worker_num; 22 | 23 | class fast_random { 24 | public: 25 | fast_random(unsigned long seed) : seed(0) { set_seed0(seed); } 26 | 27 | inline unsigned long next() { 28 | return ((unsigned long)next(32) << 32) + next(32); 29 | } 30 | 31 | inline uint32_t next_u32() { return next(32); } 32 | 33 | inline uint16_t next_u16() { return (uint16_t)next(16); } 34 | 35 | /** [0.0, 1.0) */ 36 | inline double next_uniform() { 37 | return (((unsigned long)next(26) << 27) + next(27)) / (double)(1L << 53); 38 | } 39 | 40 | inline char next_char() { return next(8) % 256; } 41 | 42 | inline char next_readable_char() { 43 | static const char readables[] = 44 | "0123456789@ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"; 45 | return readables[next(6)]; 46 | } 47 | 48 | inline std::string next_string(size_t len) { 49 | std::string s(len, 0); 50 | for (size_t i = 0; i < len; i++) s[i] = next_char(); 51 | return s; 52 | } 53 | 54 | inline std::string next_readable_string(size_t len) { 55 | std::string s(len, 0); 56 | for (size_t i = 0; i < len; i++) s[i] = next_readable_char(); 57 | return s; 58 | } 59 | 60 | inline unsigned long get_seed() { return seed; } 61 | 62 | inline void set_seed(unsigned long seed) { this->seed = seed; } 63 | 64 | private: 65 | inline void set_seed0(unsigned long seed) { 66 | this->seed = (seed ^ 0x5DEECE66DL) & ((1L << 48) - 1); 67 | } 68 | 69 | inline unsigned long next(unsigned int bits) { 70 | seed = (seed * 0x5DEECE66DL + 0xBL) & ((1L << 48) - 1); 71 | return (unsigned long)(seed >> (48 - bits)); 72 | } 73 | 74 | unsigned long seed; 75 | }; 76 | 77 | void test() { 78 | palmtree::PalmTree palmtree(std::numeric_limits::min(), worker_num); 79 | palmtree::PalmTree *palmtreep = &palmtree; 80 | 81 | for (int i = 0; i < 32; i++) { 82 | palmtreep->insert(i, i); 83 | } 84 | 85 | for (int i = 16; i <= 30; i++) { 86 | palmtreep->remove(i); 87 | } 88 | 89 | for (int i = 0; i <= 15; i++) { 90 | palmtreep->remove(i); 91 | } 92 | 93 | palmtreep->remove(31); 94 | 95 | for (int i = 0; i < 32; i++) { 96 | DLOG(INFO) << "Remove " << i; 97 | palmtreep->remove(i); 98 | int res; 99 | DLOG(INFO) << "Find " << i; 100 | bool success = palmtreep->find(i, res); 101 | if (success) { 102 | assert(false); 103 | } else { 104 | DLOG(INFO) << "Thread " << i << " get nothing"; 105 | } 106 | } 107 | 108 | srand(15618); 109 | 110 | 
std::map reference; 111 | for (int i = 10; i < 256; i++) { 112 | int key1 = i; 113 | int value1 = rand() % 10; 114 | int key2 = i - 10; 115 | 116 | palmtreep->insert(key1, value1); 117 | palmtreep->remove(key2); 118 | 119 | reference.emplace(key1, value1); 120 | reference.erase(key2); 121 | } 122 | 123 | for (auto itr = reference.begin(); itr != reference.end(); itr++) { 124 | DLOG(INFO) << itr->first << " " << itr->second; 125 | } 126 | 127 | for (int i = 246; i < 256; i++) { 128 | int res; 129 | bool suc = palmtreep->find(i, res); 130 | CHECK(suc == true && res == reference[i]) << "Should find " << i << " " << reference[i]; 131 | } 132 | 133 | while(palmtree.task_nums > 0) 134 | ; 135 | } 136 | 137 | void bench() { 138 | int *buff = new int[TEST_SIZE]; 139 | for(int i = 0; i < TEST_SIZE; i++) { 140 | buff[i] = i; 141 | } 142 | 143 | std::random_shuffle(buff, buff + TEST_SIZE); 144 | 145 | palmtree::PalmTree palmtree(std::numeric_limits::min(), worker_num); 146 | palmtree::PalmTree *palmtreep = &palmtree; 147 | 148 | std::vector threads; 149 | 150 | double start = CycleTimer::currentSeconds(); 151 | 152 | for (int i = 0; i < 1; i++) { 153 | threads.push_back(std::thread([palmtreep, i, buff]() { 154 | for(int j = 0; j < TEST_SIZE; j++) { 155 | auto kv = buff[j]; 156 | int res; 157 | palmtreep->insert(kv, kv); 158 | palmtreep->find(kv, res); 159 | } 160 | })); 161 | } 162 | 163 | for (auto &thread : threads) 164 | thread.join(); 165 | 166 | delete buff; 167 | LOG(INFO) << "task_nums: " << palmtree.task_nums; 168 | while(palmtree.task_nums > 0) 169 | ; 170 | 171 | double end = CycleTimer::currentSeconds(); 172 | cout << "run for " << end-start << "s"; 173 | } 174 | 175 | // Populate a palm tree with @entry_count entries 176 | void populate_palm_tree(palmtree::PalmTree *palmtreep, size_t entry_count) { 177 | int *buff = new int[entry_count]; 178 | for(size_t i = 0; i < entry_count; i++) { 179 | buff[i] = i; 180 | } 181 | 182 | std::random_shuffle(buff, buff + entry_count); 183 | 184 | for(size_t j = 0; j < entry_count; j++) { 185 | // auto kv = buff[j]; 186 | palmtreep->insert(2 * j, 2 * j); 187 | } 188 | 189 | delete buff; 190 | 191 | // Wait for task finished 192 | palmtreep->wait_finish(); 193 | } 194 | 195 | 196 | void readonly_skew(size_t entry_count, size_t op_count, float contention_ratio, bool run_std_map = false) { 197 | LOG(INFO) << "Begin palmtree readonly skew benchmark, contention ratio: " << contention_ratio; 198 | palmtree::PalmTree palmtree(std::numeric_limits::min(), worker_num); 199 | palmtree::PalmTree *palmtreep = &palmtree; 200 | 201 | populate_palm_tree(palmtreep, entry_count); 202 | // Reset the metrics 203 | palmtreep->reset_metric(); 204 | 205 | // Wait for insertion finished 206 | LOG(INFO) << entry_count << " entries inserted"; 207 | 208 | fast_random rng(time(0)); 209 | 210 | double start = CycleTimer::currentSeconds(); 211 | LOG(INFO) << "Benchmark started"; 212 | 213 | int one_step = entry_count / (palmtreep->batch_size()+1); 214 | int last_key = 0; 215 | int batch_task_count = 0; 216 | for (size_t i = 0; i < op_count; i++) { 217 | last_key += rng.next_u32() % one_step; 218 | last_key %= entry_count; 219 | batch_task_count++; 220 | 221 | auto id = rng.next_uniform(); 222 | auto k = last_key; 223 | if(id < contention_ratio) { 224 | k = (int) (k * 0.2); 225 | } 226 | int res; 227 | palmtreep->find(2 * k, res); 228 | if (batch_task_count >= palmtreep->batch_size()) { 229 | batch_task_count = 0; 230 | last_key = 0; 231 | } 232 | } 233 | 234 | LOG(INFO) << 
palmtreep->task_nums << " left"; 235 | palmtreep->wait_finish(); 236 | double end = CycleTimer::currentSeconds(); 237 | LOG(INFO) << "Palmtree run for " << end-start << "s, " << "thput: " << std::fixed << 2 * op_count/(end-start)/1000 << " K rps"; 238 | double runtime = (end-start) / 2; 239 | 240 | if (run_std_map) { 241 | LOG(INFO) << "Running std map"; 242 | std::map map; 243 | for (size_t i = 0; i < entry_count; i++) 244 | map.insert(std::make_pair(i, i)); 245 | 246 | pthread_rwlock_t lock_rw = PTHREAD_RWLOCK_INITIALIZER; 247 | pthread_rwlock_t *l = &lock_rw; 248 | 249 | auto map_p = ↦ 250 | start = CycleTimer::currentSeconds(); 251 | std::vector threads; 252 | 253 | 254 | auto w_n = worker_num; 255 | for(int i = 0; i < w_n; i++) { 256 | threads.push_back(std::thread([map_p, op_count, entry_count, l, w_n, contention_ratio]() { 257 | fast_random rng(time(0)); 258 | for (size_t i = 0; i < op_count / w_n; i++) { 259 | int rand_key = rng.next_u32() % entry_count; 260 | auto id = rng.next_uniform(); 261 | if(id < contention_ratio) { 262 | rand_key = (int) (rand_key * 0.2); 263 | } 264 | pthread_rwlock_rdlock(l); 265 | map_p->find(rand_key); 266 | pthread_rwlock_unlock(l); 267 | } 268 | })); 269 | } 270 | 271 | for(auto &t : threads) { 272 | t.join(); 273 | } 274 | end = CycleTimer::currentSeconds(); 275 | LOG(INFO) << "std::map run for " << end-start << "s, " << "thput:" << std::fixed << op_count/(end-start)/1000 << " K rps"; 276 | double runtime_ref = end-start; 277 | LOG(INFO) << "SPEEDUP over std map: " << runtime_ref / runtime << " X"; 278 | 279 | threads.clear(); 280 | 281 | // stx 282 | LOG(INFO) << "Running stx map"; 283 | stx::btree_map stx_map; 284 | for (size_t i = 0; i < entry_count; i++) 285 | stx_map.insert(std::make_pair(i, i)); 286 | 287 | start = CycleTimer::currentSeconds(); 288 | auto stx_p = &stx_map; 289 | for(int i = 0; i < w_n; i++) { 290 | threads.push_back(std::thread([stx_p, op_count, entry_count, l, w_n, contention_ratio]() { 291 | fast_random rng(time(0)); 292 | for (size_t i = 0; i < op_count / w_n; i++) { 293 | int rand_key = rng.next_u32() % entry_count; 294 | auto id = rng.next_uniform(); 295 | if(id < contention_ratio) { 296 | rand_key = (int) (rand_key * 0.2); 297 | } 298 | pthread_rwlock_rdlock(l); 299 | stx_p->find(rand_key); 300 | pthread_rwlock_unlock(l); 301 | } 302 | })); 303 | } 304 | 305 | for(auto &t : threads) { 306 | t.join(); 307 | } 308 | 309 | end = CycleTimer::currentSeconds(); 310 | LOG(INFO) << "stx map run for " << end-start << "s, " << "thput:" << std::fixed << op_count/(end-start)/1000 << " K rps"; 311 | 312 | runtime_ref = end-start; 313 | LOG(INFO) << "SPEEDUP over PalmTree: " << runtime_ref / runtime << " X"; 314 | } 315 | } 316 | 317 | 318 | 319 | 320 | 321 | void update_skew(size_t entry_count, size_t op_count, float contention_ratio, bool run_std_map = false) { 322 | LOG(INFO) << "Begin palmtree update skew benchmark, contention ratio: " << contention_ratio; 323 | // palmtree::PalmTree palmtree(std::numeric_limits::min(), worker_num); 324 | palmtree::PalmTree *palmtreep = new palmtree::PalmTree (std::numeric_limits::min(), worker_num);; 325 | 326 | populate_palm_tree(palmtreep, entry_count); 327 | // Reset the metrics 328 | palmtreep->reset_metric(); 329 | 330 | // Wait for insertion finished 331 | LOG(INFO) << entry_count << " entries inserted"; 332 | 333 | fast_random rng(time(0)); 334 | 335 | double start = CycleTimer::currentSeconds(); 336 | LOG(INFO) << "Benchmark started"; 337 | 338 | int one_step = 2 * entry_count / 
(palmtreep->batch_size()+1); 339 | int last_key = 0; 340 | int batch_task_count = 0; 341 | for (size_t i = 0; i < op_count; i++) { 342 | last_key += rng.next_u32() % one_step; 343 | last_key %= entry_count; 344 | batch_task_count++; 345 | auto id = rng.next_uniform(); 346 | int k = last_key; 347 | if(id < contention_ratio) { 348 | k = (int) (k * 0.2); 349 | } 350 | 351 | id = rng.next_uniform(); 352 | 353 | if(id < 0.1) { 354 | palmtreep->insert(last_key, last_key); 355 | } else if(id < 0.2) { 356 | palmtreep->remove(last_key); 357 | }else { 358 | int res; 359 | palmtreep->find(last_key, res); 360 | } 361 | 362 | if (batch_task_count >= palmtreep->batch_size()) { 363 | batch_task_count = 0; 364 | last_key = 0; 365 | } 366 | } 367 | 368 | LOG(INFO) << palmtreep->task_nums << " left"; 369 | palmtreep->wait_finish(); 370 | double end = CycleTimer::currentSeconds(); 371 | LOG(INFO) << "Palmtree run for " << end-start << "s, " << "thput: " << std::fixed << 2 * op_count/(end-start)/1000 << " K rps"; 372 | double runtime = (end-start) / 2; 373 | 374 | delete palmtreep; 375 | 376 | if (run_std_map) { 377 | LOG(INFO) << "Running std map"; 378 | std::map map; 379 | for (size_t i = 0; i < entry_count; i++) 380 | map.insert(std::make_pair(i, i)); 381 | 382 | 383 | pthread_rwlock_t lock_rw = PTHREAD_RWLOCK_INITIALIZER; 384 | pthread_rwlock_t *l = &lock_rw; 385 | 386 | 387 | start = CycleTimer::currentSeconds(); 388 | auto map_p = ↦ 389 | start = CycleTimer::currentSeconds(); 390 | std::vector threads; 391 | 392 | auto w_n = worker_num; 393 | for(int i = 0; i < w_n; i++) { 394 | threads.push_back(std::thread([map_p, op_count, entry_count, l, w_n, contention_ratio]() { 395 | fast_random rng(time(0)); 396 | 397 | auto map = *map_p; 398 | for (size_t i = 0; i < op_count / w_n; i++) { 399 | int k = rng.next_u32() % entry_count; 400 | auto id = rng.next_uniform(); 401 | 402 | auto rand_key = k; 403 | if(id < contention_ratio) { 404 | rand_key = (int) rand_key * 0.2; 405 | } 406 | id = rng.next_uniform(); 407 | if(id < 0.1) { 408 | pthread_rwlock_wrlock(l); 409 | map[rand_key] = rand_key; 410 | }else if (id < 0.2) { 411 | pthread_rwlock_wrlock(l); 412 | map.erase(rand_key); 413 | }else { 414 | pthread_rwlock_rdlock(l); 415 | map.find(rand_key); 416 | } 417 | pthread_rwlock_unlock(l); 418 | } 419 | })); 420 | } 421 | 422 | for(auto &t : threads) { 423 | t.join(); 424 | } 425 | 426 | threads.clear(); 427 | 428 | end = CycleTimer::currentSeconds(); 429 | LOG(INFO) << "std::map run for " << end-start << "s, " << "thput:" << std::fixed << op_count/(end-start)/1000 << " K rps"; 430 | 431 | double runtime_ref = end-start; 432 | LOG(INFO) << "SPEEDUP over PalmTree: " << runtime_ref / runtime << " X"; 433 | 434 | // stx 435 | LOG(INFO) << "Running stx map"; 436 | stx::btree_map stx_map; 437 | for (size_t i = 0; i < entry_count; i++) 438 | stx_map.insert(std::make_pair(i, i)); 439 | 440 | start = CycleTimer::currentSeconds(); 441 | auto stx_p = &stx_map; 442 | for(int i = 0; i < w_n; i++) { 443 | threads.push_back(std::thread([stx_p, op_count, entry_count, l, w_n, contention_ratio]() { 444 | fast_random rng(time(0)); 445 | auto stx = *stx_p; 446 | for (size_t i = 0; i < op_count / w_n; i++) { 447 | int k = rng.next_u32() % entry_count; 448 | auto id = rng.next_uniform(); 449 | 450 | auto rand_key = k; 451 | if(id < contention_ratio) { 452 | rand_key = (int) rand_key * 0.2; 453 | } 454 | 455 | id = rng.next_uniform(); 456 | if(id < 0.1) { 457 | pthread_rwlock_wrlock(l); 458 | stx.insert(rand_key, rand_key); 459 | }else 
if (id < 0.2) { 460 | pthread_rwlock_wrlock(l); 461 | stx.erase(rand_key); 462 | }else { 463 | pthread_rwlock_rdlock(l); 464 | stx.find(rand_key); 465 | } 466 | 467 | pthread_rwlock_unlock(l); 468 | } 469 | })); 470 | } 471 | 472 | for(auto &t : threads) { 473 | t.join(); 474 | } 475 | 476 | end = CycleTimer::currentSeconds(); 477 | LOG(INFO) << "stx map run for " << end-start << "s, " << "thput:" << std::fixed << op_count/(end-start)/1000 << " K rps"; 478 | 479 | runtime_ref = end-start; 480 | LOG(INFO) << "SPEEDUP over PalmTree: " << runtime_ref / runtime << " X"; 481 | 482 | } 483 | } 484 | 485 | 486 | int main(int argc, char *argv[]) { 487 | // Google logging 488 | FLAGS_logtostderr = 1; 489 | google::InitGoogleLogging(argv[0]); 490 | 491 | if(argc < 5) { 492 | // print usage 493 | cout << "usage example: 8 true r 0.8" << endl; 494 | cout << "\trunning 8 workers, running map to compare performance, readonly, contention ratio 0.8" << endl; 495 | exit(0); 496 | } 497 | 498 | worker_num = atoi(argv[1]); 499 | bool c; 500 | if(strcmp(argv[2], "true") == 0) { 501 | c = true; 502 | }else{ 503 | c = false; 504 | } 505 | 506 | bool r; 507 | if(strcmp(argv[3], "r") == 0) { 508 | r = true; 509 | }else{ 510 | r = false; 511 | } 512 | 513 | float contention_ratio; 514 | 515 | contention_ratio = atof(argv[4]); 516 | 517 | 518 | auto insert = 1024 * 512 * 10; 519 | auto op_num = 1024 * 1024 * 10; 520 | if(r) { 521 | readonly_skew(insert, op_num, contention_ratio, c); 522 | }else { 523 | update_skew(insert, op_num, contention_ratio, c); 524 | } 525 | 526 | return 0; 527 | } 528 | 529 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Summary: 2 | 3 | We have implemented a concurrent lock free B+Tree (called Palm Tree) that scales to 16 cores, with 60M queries per second (QPS) on read only and R/W mixed workload, which is 15.5x speed up comparing to our single thread implementation. Our implementation can also maintain a nearly linear speed up even for skewed workload. 4 | 5 | ### Backgroud: 6 | B+Tree is intensivily used in database management systems (DBMS). All most all relational database system uses B+Tree as the primary data structure for index. Hence the performance of B+Tree index is critical to fast query performance. On the other hand, there are two hardware trends in recent years for DBMS, systems with high core counts and large memory capacity], which results in the rising of in memory database systems. 7 | 8 | The design of B+Tree index data structure of traditional DBMS is far different than that of an in memory database today. Traditional DBMSs assume that the primary storage is on disk (maganetic disk or SSD), and it is fine in most of the case to acquire a latch to provide concurrent accesses to the index because disk IO is anyway slow. However for in main memory DBMS, fetching data from memory is so much faster than from disk, such that the overhead of locking would easily doom the power of underlying hardwares. 9 | 10 | In this sense, a high performance concurrent B+Tree is demanded for next generation main memory DBMS. This project is an effort to explore the pallelisim of B+Tree data structures and make it scalable to higher core counts. 11 | 12 | A B+Tree is an self balancing tree struture that allows searches, scan, insertions and deletions on key/value pairs. 
It is a generalization of the binary search tree, with similar concepts of internal nodes and leaf nodes. Each internal node contains a set of key ranges, and each range points to a subtree holding the data within that range. Each leaf node contains the actual key/value pairs.
13 | 
14 | A B+Tree keeps itself balanced by splitting a leaf or internal node when it becomes too large, and merging nodes when they become too small. In particular, when the root of a B+Tree splits, a new root is allocated and the tree depth increases by one; when an entire layer of the tree merges away, the root descends and the tree depth decreases by one. The split and merge operations are critical for maintaining a balanced tree with similarly sized nodes.
15 | 
16 | To implement a B+Tree, the following (or similar) operations need to be provided:
17 | 
18 | * **search(key)**: searches from the root down for the leaf node that contains *key* and returns that leaf node.
19 | * **add_item(node, key, value)**: adds an item to a node for a given key; the value is either a child pointer (for an internal node) or the actual value (for a leaf node). This operation may cause the node to split.
20 | * **del_item(node, key)**: deletes the item for a given key from a node. As opposed to add_item(), this operation may cause the node to merge.
21 | * **split(node)**: splits a node into multiple nodes; the ranges of the resulting nodes are contiguous and the items are sorted within each node. Returns the new nodes that were split out. The parent of the split node then inserts the newly created child nodes.
22 | * **merge(node)**: if a node contains too few keys, it is merged away. The parent of the merged node re-inserts the merged key/values into other nodes and reclaims the space of the merged child node.
23 | * **handle_root()**: a special handler for the root node, because a split or merge of the root changes the tree depth and a new root may need to be assigned.
24 | 
25 | For our prototype system, we implemented 3 public APIs in C++:
26 | 
27 | * **bool find(const KeyType &key, ValueType &value)**. find() will search for *key* and fill in the corresponding *value*. It returns true if the key/value pair is found, and false otherwise.
28 | * **void delete(const KeyType &key)**. delete() will delete the entry from the tree if *key* is present.
29 | * **void insert(const KeyType &key, const ValueType &value)**. insert() will insert an entry into the tree.
30 | 
31 | ### Approach:
32 | #### Approach #1: Coarse Grained Locking
33 | There are several ways to implement a concurrent B+Tree. The easiest is to use a coarse-grained lock to protect the whole tree; for example, we can use a shared (reader/writer) lock to support `find()`, `delete()` and `insert()`. The strategy is simple: `find()` takes a read lock, as it does not change the structure of the tree, while `delete()` and `insert()` take a write lock, because they modify the tree. The advantage of coarse-grained locking is its simplicity, but it is rarely the optimal solution, since `find()` blocks `delete()` and `insert()`, while `delete()` and `insert()` block all other operations on the tree. A minimal sketch of this strategy is shown below.
34 | 
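To make the coarse-grained strategy concrete, here is a minimal sketch, not code from this repository: `CoarseLockedTree` is a hypothetical name, `std::map` stands in for the underlying B+Tree, and a single pthread reader/writer lock (the same primitive the benchmarks in `main.cpp` use) protects every operation.

```cpp
// Coarse-grained locking sketch: one reader/writer lock guards the whole tree.
#include <pthread.h>
#include <map>

template <typename KeyType, typename ValueType>
class CoarseLockedTree {
 public:
  CoarseLockedTree() { pthread_rwlock_init(&lock_, nullptr); }
  ~CoarseLockedTree() { pthread_rwlock_destroy(&lock_); }

  // Readers share the lock, so concurrent find()s do not block each other.
  bool find(const KeyType &key, ValueType &value) {
    pthread_rwlock_rdlock(&lock_);
    auto it = tree_.find(key);
    bool found = (it != tree_.end());
    if (found) value = it->second;
    pthread_rwlock_unlock(&lock_);
    return found;
  }

  // Writers hold the lock exclusively and block every other operation.
  void insert(const KeyType &key, const ValueType &value) {
    pthread_rwlock_wrlock(&lock_);
    tree_[key] = value;
    pthread_rwlock_unlock(&lock_);
  }

  void remove(const KeyType &key) {
    pthread_rwlock_wrlock(&lock_);
    tree_.erase(key);
    pthread_rwlock_unlock(&lock_);
  }

 private:
  pthread_rwlock_t lock_;
  std::map<KeyType, ValueType> tree_;
};
```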
35 | #### Approach #2: Fine Grained Locking
36 | The second approach is to use fine-grained locking to protect the tree data structure. One viable way is to use hand-over-hand locking while searching down the tree, and to lock the affected nodes before the tree structure is modified. In this project, to compare against our lock-free implementation, we also designed and implemented a fine-grained locking B+Tree:
37 | 
38 | * For `find()`, we first acquire a lock on the root node, find the corresponding child node, acquire the lock on that child, and only then release the lock held on the root. At every internal node we follow the same rule: acquire the lock on the target child before releasing the lock on the current node.
39 | * For `delete()` and `insert()`, because they may modify the parent node (by splitting or merging) and possibly propagate modifications all the way up to the root, we acquire a lock on every node along the path as we search down the tree, so we are sure that no other thread can be searching or modifying that path.
40 | 
41 | The advantage of this approach is that readers do not block readers, and writers are blocked only in a fine-grained way (unlike the first approach: because `search()` uses a hand-over-hand locking scheme, a writer may still be able to proceed behind an unfinished reader). It is also reasonably simple to implement.
42 | 
43 | The disadvantage of this approach is that writers still block readers. A writer takes an exclusive path down the tree, meaning that no other operation can happen on that path at the same time. A sketch of the hand-over-hand `find()` is shown below.
44 | 
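Below is a minimal sketch of hand-over-hand (lock-coupling) search. It assumes a simplified node layout with one mutex per node and is not the code in `fineTree.h`; the point is only the locking order: lock the child, then release the parent, so no other thread can slip in between.

```cpp
// Hand-over-hand (lock coupling) search on a simplified node type.
#include <mutex>

struct Node {
  std::mutex lock;
  bool is_leaf;
  int num_keys;
  int keys[64];
  Node *children[64];  // valid when !is_leaf
  int values[64];      // valid when is_leaf
};

// Returns true and fills `value` if `key` is found.
bool find(Node *root, int key, int &value) {
  root->lock.lock();
  Node *cur = root;
  while (!cur->is_leaf) {
    // Pick the child whose range covers `key` (largest key <= target).
    int idx = 0;
    while (idx + 1 < cur->num_keys && cur->keys[idx + 1] <= key) idx++;
    Node *child = cur->children[idx];
    child->lock.lock();  // lock the child first...
    cur->lock.unlock();  // ...then release the parent
    cur = child;
  }
  for (int i = 0; i < cur->num_keys; i++) {  // linear scan in the leaf
    if (cur->keys[i] == key) {
      value = cur->values[i];
      cur->lock.unlock();
      return true;
    }
  }
  cur->lock.unlock();
  return false;
}
```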
45 | #### Approach #3: Lock Free
46 | Approaches #1 and #2 both use locks to protect the data structure, and in both cases writers block readers and other writers. It is more appealing to implement a lock-free B+Tree in which readers and writers can proceed without blocking each other. One such example is Palm Tree, a lock-free concurrent B+Tree proposed by Intel in [1]; it uses a Bulk Synchronous Parallel (BSP) approach to process B+Tree operations in bulk and resolve hazards gracefully. The main contribution of this project is an efficient implementation of Palm Tree.
47 | 
48 | The first idea of Palm Tree is to group queries into batches; the batches are then processed one at a time, cooperatively, by a pool of worker threads. The rationale is that performing more queries at a time amortizes the communication and scheduling overhead.
49 | 
50 | Second, to resolve conflicting accesses to the tree, Palm Tree processes a batch stage by stage in a bulk-synchronous fashion: the batch is processed in different stages on different layers of the tree. Between stages there is a synchronization point to make sure that every worker has finished the previous stage and is ready for the next one (it behaves like a barrier, although the real implementation is not necessarily one).
51 | 
52 | 
53 | 1. Stage 0: The queries in a batch are evenly assigned to the workers.
54 | 
55 | 2. Stage 1: Every query first has to search down the tree to locate its leaf; in stage 1 the workers perform this search and record the target leaf node for each query.
56 | 
57 | 3. Stage 2: At this stage, `insert()` and `delete()` may modify leaf nodes. To prevent race conditions, these operations are partitioned by node and redistributed to the worker threads on a node-by-node basis. This redistribution guarantees that each node is accessed by exactly one worker, so conflicting accesses are avoided inherently.
58 | 
59 | After the redistribution, the workers execute the insert()s and delete()s. During this process, a worker may generate split and merge requests for the parent node. These requests are registered at the upper layer but are **not** executed immediately, because sibling nodes may also need to split or merge, which would otherwise update the parent node concurrently without protection.
60 | 
61 | 4. Stage 3: During this stage, each node gathers the split and merge requests from its children. These requests are again grouped by node (here the node is the parent of the nodes touched in stage 2) and assigned to workers. Stage 3 may in turn generate split and merge requests for the layer above. We repeat stage 3 on each layer up to the root, at which point all necessary tree modifications have been carried out in this manner except those on the root itself.
62 | 
63 | 5. Stage 4: This is the final stage. A single thread handles the special cases of a root split and a root merge. For a root split, a new root is allocated that points to the old root and the newly split node. For a root merge, we use a small trick: the root is merged only when it has a single child, in which case we descend the root and use that single child as the new root. At the end of stage 4 all queries in the batch are fulfilled, and the results of the batch are delivered back to the clients.
64 | 
65 | During the upward passes, the tasks within each layer need to be redistributed to ensure correctness while still exploiting parallelism. Palm Tree's partition algorithm is as follows: each worker thread records all the nodes it has accessed in the lower level, then discards every node that has also been accessed by a worker with a lower worker id (each worker is assigned an id from 0 to `WORKER_NUM` - 1). One drawback of this approach is workload imbalance, as workers with lower ids take precedence over the others. A sketch of this ownership rule is shown below.
66 | 
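As an illustration of that ownership rule, here is a minimal sketch under simplifying assumptions: each worker's set of accessed nodes is published in a shared vector before a synchronization point, and the name `nodes_owned_by` is illustrative and does not appear in `palmtree.h`.

```cpp
// A worker keeps a lower-level node only if no worker with a smaller id
// also accessed it; that worker then executes all tasks for the node.
#include <set>
#include <vector>

using NodeId = int;

std::vector<NodeId> nodes_owned_by(int my_id,
                                   const std::vector<std::set<NodeId>> &accessed) {
  std::vector<NodeId> mine;
  for (NodeId node : accessed[my_id]) {
    bool claimed_by_lower = false;
    for (int w = 0; w < my_id; w++) {
      if (accessed[w].count(node)) { claimed_by_lower = true; break; }
    }
    if (!claimed_by_lower) mine.push_back(node);
  }
  return mine;
}
```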
67 | ### Optimizations: 68 | * The first optimization we made is to pre-sort the queries in a batch and assign them to threads in an ordered way. We will see below that pre-sorting also benefits the task distribution process; here the main benefit is load balancing. If the batch is pre-sorted, each worker thread is assigned a contiguous range of queries, and the workers reach the leaf layer with properly ordered leaf nodes. With a random assignment of tasks, the 0th worker may end up with many leaf nodes and potentially more work than the others, due to the task distribution policy. 69 | 70 | * Next, we soon found out that memory allocation was a bottleneck. When we measured with perf, the time spent in malloc and free grew longer and longer with higher thread counts, so we suspected the memory allocator was not scalable. We searched online and found a good scalable memory allocator called jemalloc; it required nearly zero changes to our code to use it. 71 | 72 | * SIMD acceleration for key lookup. There are two ways to look up a key during tree search. If the keys in the node are sorted, we can use binary search; if the node is not sorted, we can linearly scan and match the keys. While binary search has better asymptotic complexity than linear scan, it suffers from branch mispredictions and requires the keys in the node to be kept sorted. Linear scan, on the other hand, has roughly the same overhead as binary search when the node size is small, can further exploit SIMD acceleration, and allows fast insert and delete because the node is not required to stay sorted. Using SIMD to linearly search for a key turns out to be a counter-intuitive but efficient way to do key lookups. 73 | 74 | * Reduce communication overhead. 75 | * Pre-sort the batch, this time for a different purpose. Task distribution is essentially a probing process: as described in the previous section, a worker determines which tasks belong to it by probing other threads' tasks. If the queries are sorted beforehand, a worker can potentially determine its tasks by looking only at its neighbours. 76 | * Previously, the 0th worker was a special worker responsible for distributing the queries in a batch to all the other workers. This portion of code was sequential; we improved it by letting each thread calculate the range of queries it is responsible for and collect its own tasks cooperatively. 77 | 78 | ### Results: 79 | The platform we ran our evaluation on: 80 | 81 | * 18 cores, 36 hardware threads 82 | * 2.9 GHz CPU, 32K L1 cache, 256K L2 cache, 26M L3 cache 83 | * 2 NUMA nodes, 60GB memory 84 | 85 | First, look at our final evaluation with all optimizations implemented. We evaluated a read-only benchmark and a mixed benchmark with 20% updates and 80% reads. We pre-populate the tree with different numbers of items before generating the workloads. 86 | 87 | Below is a graph showing the different optimizations we made on the way to the final scalable algorithm. The workload used in this graph is a read-only workload with a uniform access pattern on a tree with 0.5M keys. 88 | 89 | 90 | 91 | The baseline version has a throughput of about 2000 KQPS. We didn't see a huge speedup from adopting the pre-sort optimization mentioned in the paper, mainly because the system was bottlenecked by the memory allocator. We then replaced the default libc `malloc` with jemalloc and the performance went up greatly; however, beyond 6 cores there was no further throughput gain, with the B+Tree throughput at about 10 MQPS. At this point, applying SIMD to the data structure provides another 10%-20% speedup. 92 | 93 | The next huge performance gain came from reducing the communication overhead. We first implemented a customized profiler to collect the running time of the different stages of the system. As can be seen from the log output when profiling Palm Tree with 4 and 8 workers, batch collection (Stage 0) and result distribution (Stage 4) were not scalable, mainly because by design they were done only by the 0th worker. 94 | 95 | ``` 96 | I0505 01:02:58.919889 70461 palmtree.h:63] [collect_batch] 97 | I0505 01:02:58.919924 70461 palmtree.h:68] 0: 1.06791 <= 98 | I0505 01:02:58.919939 70461 palmtree.h:68] 1: 0 99 | I0505 01:02:58.919947 70461 palmtree.h:68] 2: 0 100 | ... 101 | I0505 01:02:58.920054 70461 palmtree.h:63] [end_stage] 102 | I0505 01:02:58.920061 70461 palmtree.h:68] 0: 1.09612 <= 103 | I0505 01:02:58.920070 70461 palmtree.h:68] 1: 0 104 | I0505 01:02:58.920078 70461 palmtree.h:68] 2: 0 105 | ... 106 | I0505 01:02:58.920110 70461 palmtree.h:63] [total_time] 107 | I0505 01:02:58.920117 70461 palmtree.h:68] 0: 3.12207 108 | I0505 01:02:58.920125 70461 palmtree.h:68] 1: 3.12296 109 | I0505 01:02:58.920133 70461 palmtree.h:68] 2: 3.1128 110 | ... 111 | ``` 112 | 113 | 114 | To fix this problem, we let each thread calculate its own task range in the batch and fetch those tasks without communicating with the others, so it is no longer the 0th worker's responsibility to distribute the batch tasks (a sketch of the range computation is given below). When the tasks are finished, the worker threads also return the results cooperatively, instead of everything being done by the 0th worker. 115 | 116 | Another communication overhead is in stage 2's redistribution of node modification tasks, shown in the following screenshot. By pre-sorting the batch, a worker may be able to probe only its neighbours' tasks to determine its own.
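As a concrete illustration, the coordinator-free range computation looks like the following minimal sketch (the same arithmetic as in `collect_batch()` in palmtree.h; the standalone helper name is ours). Each worker derives its own [lower, upper) slice of the batch purely from its id, the number of workers and the batch size, so no thread has to hand out work.

```
#include <algorithm>
#include <utility>

// Each worker computes its own [lower, upper) slice of the batch from its id.
// The first `task_residue` workers take one extra task each, so the slices
// cover the whole batch without gaps or overlap.
std::pair<int, int> my_task_range(int worker_id, int num_workers, int batch_size) {
  int task_per_thread = batch_size / num_workers;
  int task_residue = batch_size - task_per_thread * num_workers;
  int lower = task_per_thread * worker_id + std::min(task_residue, worker_id);
  int upper = lower + task_per_thread + (worker_id < task_residue ? 1 : 0);
  return {lower, upper};
}
```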
117 | 118 | As shown in the graph, the final speedup is promising: we achieved 60M QPS on a 16-core system, and the algorithm scales very well! 119 | 120 | The following graph shows the scalability of our implementation as we vary the number of workers in the worker pool as well as the size of the pre-populated tree. When the tree is of small or medium size, the speedup is close to linear. When the tree is large, we believe the system becomes memory bound, so the speedup is not as good (it is still 10x, however). This workload is a 20% update, 80% read workload with uniform access to the keys in the tree. 121 | 122 | 123 | 124 | 125 | Our implementation is also resilient to skewed data access patterns. The following graph compares the throughput under uniform access and contended access. The contended workload is generated by having 80% of the operations access 20% of the entries in the tree. For small, medium and large trees alike, the throughput drops slightly but not by much under skewed access, showing that our implementation resists skewness quite well. 126 | 127 | 128 | 129 | We have also compared the performance of Palm Tree with single-threaded `std::map`, single-threaded `stx::btree` (an efficient open-source implementation of a B+Tree), and our not-so-efficient fine-grained locking B+Tree that uses hand-over-hand locking. As can be seen, `std::map` is generally not performant even for a single thread; `stx::btree` is performant for a single thread but is not a concurrent data structure. We tried adding a shared lock to both `std::map` and `stx::btree`, and it turns out they perform even worse in a many-core setting. The hand-over-hand B+Tree can't scale beyond 4 threads. We wish we had a better implementation of a fine-grained locking B+Tree, but it turned out to be even harder to get right than Palm Tree, with many corner cases, and given the limited time we were not able to pursue that. 130 | 131 | 132 | 133 | The final graph shows the decomposition of the time spent in each stage for one workload. The workload is 20% update, 80% read, with 0.5M keys in the tree and uniform access. We generated 1B operations on the tree. 134 | 135 | From the runtime decomposition we can see that the time spent in stage 2 becomes less and less significant as more threads are used. Recall that stage 2 does the actual key matching, insertion and removal on the leaf nodes, which is one of the most expensive and frequent operations in Palm Tree. In the beginning, when there is just one thread, most of the time is spent in stage 2. However, as the number of workers increases, the communication overhead becomes more and more significant: it grows from nearly 0% for 1 thread to around 33% for 16 threads. This is not surprising: the more threads we have, the more likely it is that they can't keep up with each other, so waiting becomes common. One way to overcome this problem would be to focus on eliminating this all-to-all communication. 136 | 137 | 138 | 139 | #### References: 140 | [1] J. Sewall, J. Chhugani, C. Kim, N. Satish, and P. Dubey. PALM: Parallel architecture-friendly latch-free modifications to B+ trees on many-core processors. Proc. VLDB Endowment, 4(11):795--806, August 2011. 141 | 142 | [2] David B. Lomet, Sudipta Sengupta, and Justin J. Levandoski. 2013. The Bw-Tree: A B-tree for new hardware platforms.
In Proceedings of the 2013 IEEE International Conference on Data Engineering (ICDE 2013) (ICDE '13). IEEE Computer Society, Washington, DC, USA, 302-313. DOI=http://dx.doi.org/10.1109/ICDE.2013.6544834 143 | 144 | #### Work Partition: 145 | Equal work was performed by both project members (@Ran Xian and @Runshen Zhu). 146 | -------------------------------------------------------------------------------- /palmtree.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include "immintrin.h" 18 | // #include "smmintrin.h" 19 | #include "CycleTimer.h" 20 | #include "barrier.h" 21 | #include 22 | 23 | using std::cout; 24 | using std::endl; 25 | 26 | #define UNUSED __attribute__((unused)) 27 | 28 | #define PROFILE 29 | 30 | namespace palmtree { 31 | 32 | static std::atomic NODE_NUM(0); 33 | unsigned int batch_id = 0; 34 | /** 35 | * Tree operation types 36 | */ 37 | enum TreeOpType { 38 | TREE_OP_FIND = 0, 39 | TREE_OP_INSERT, 40 | TREE_OP_REMOVE 41 | }; 42 | 43 | enum NodeType { 44 | INNERNODE = 0, 45 | LEAFNODE 46 | }; 47 | 48 | class Stats { 49 | public: 50 | Stats(int worker_num): worker_num_(worker_num) {} 51 | Stats() {} 52 | /** 53 | * add stat for one metric of one worker 54 | */ 55 | void add_stat(int worker_id, std::string metric_name, double metric_value) { 56 | stats_[metric_name][worker_id] += metric_value; 57 | } 58 | 59 | void init_metric(std::string metric_name) { 60 | stats_[metric_name] = std::vector(worker_num_); 61 | for (int i = 0; i < worker_num_; i++) 62 | stats_[metric_name][i] = 0; 63 | metric_names_.push_back(metric_name); 64 | } 65 | 66 | /** 67 | * Print the stats out 68 | */ 69 | void print_stat() { 70 | for (auto &metric_name : metric_names_) { 71 | LOG(INFO) << "\033[1m[" << metric_name << "]\033[0m "; 72 | auto &values = stats_[metric_name]; 73 | std::string line = ""; 74 | for (int i = 0; i < worker_num_; i++) { 75 | if (metric_name == "leaf_task") { 76 | line += "\t" + std::to_string(i) + ": " + std::to_string(values[i]); 77 | 78 | } else { 79 | line += "\t" + std::to_string(i) + ": " + std::to_string(values[i] * CycleTimer::secondsPerTick()); 80 | } 81 | } 82 | LOG(INFO) << line; 83 | } 84 | } 85 | 86 | void reset_metric() { 87 | for (auto itr = stats_.begin(); itr != stats_.end(); itr++) { 88 | for (int i = 0; i < worker_num_; i++) { 89 | itr->second[i] = 0; 90 | } 91 | } 92 | } 93 | private: 94 | std::unordered_map> stats_; 95 | std::vector metric_names_; 96 | int worker_num_; 97 | } STAT; 98 | 99 | template , 102 | typename KeyComparator = std::less > 103 | class PalmTree { 104 | public: 105 | // Number of working threads 106 | int NUM_WORKER; 107 | int BATCH_SIZE; 108 | 109 | private: 110 | // Max number of slots per inner node 111 | static const int INNER_MAX_SLOT = 256; 112 | // Max number of slots per leaf node 113 | static const int LEAF_MAX_SLOT = 64; 114 | // Threshold to control bsearch or linear search 115 | static const int BIN_SEARCH_THRESHOLD = 32; 116 | // Number of working threads 117 | static const int BATCH_SIZE_PER_WORKER = 4096; 118 | 119 | private: 120 | /** 121 | * Tree node base class 122 | */ 123 | struct InnerNode; 124 | struct Node { 125 | // Number of actually used slots 126 | int slot_used; 127 | int id; 128 | int level; 129 | KeyType lower_bound; 130 | Node *parent; 131 | 132 | 133 | Node() 
= delete; 134 | Node(Node *p, int lvl): slot_used(0), level(lvl), parent(p) { 135 | id = NODE_NUM++; 136 | }; 137 | virtual ~Node() {}; 138 | virtual std::string to_string() = 0; 139 | virtual NodeType type() const = 0; 140 | virtual bool is_few() = 0; 141 | }; 142 | 143 | struct InnerNode : public Node { 144 | InnerNode() = delete; 145 | InnerNode(Node *parent, int level): Node(parent, level){}; 146 | virtual ~InnerNode() {}; 147 | // Keys for values 148 | KeyType keys[LEAF_MAX_SLOT]; 149 | // Pointers for child nodes 150 | Node *values[LEAF_MAX_SLOT]; 151 | 152 | virtual NodeType type() const { 153 | return INNERNODE; 154 | } 155 | 156 | virtual std::string to_string() { 157 | std::string res; 158 | res += "InnerNode[" + std::to_string(Node::id) + " @ " + std::to_string(Node::level) + "] "; 159 | for (int i = 0 ; i < Node::slot_used ; i++) { 160 | res += " " + std::to_string(keys[i]) + ":" + std::to_string(values[i]->id); 161 | } 162 | return res; 163 | } 164 | 165 | inline bool is_full() const { 166 | return Node::slot_used == MAX_SLOT(); 167 | } 168 | 169 | 170 | inline size_t MAX_SLOT() const { 171 | return LEAF_MAX_SLOT; 172 | } 173 | 174 | virtual inline bool is_few() { 175 | return Node::slot_used < MAX_SLOT()/4 || Node::slot_used == 0; 176 | } 177 | 178 | }; 179 | 180 | struct LeafNode : public Node { 181 | LeafNode() = delete; 182 | LeafNode(Node *parent, int level): Node(parent, level){}; 183 | virtual ~LeafNode() {}; 184 | 185 | // Keys and values for leaf node 186 | KeyType keys[INNER_MAX_SLOT]; 187 | ValueType values[INNER_MAX_SLOT]; 188 | 189 | virtual NodeType type() const { 190 | return LEAFNODE; 191 | } 192 | 193 | virtual std::string to_string() { 194 | std::string res; 195 | res += "LeafNode[" + std::to_string(Node::id) + " @ " + std::to_string(Node::level) + "] "; 196 | 197 | for (int i = 0 ; i < Node::slot_used ; i++) { 198 | res += " " + std::to_string(keys[i]) + ":" + std::to_string(values[i]); 199 | } 200 | return res; 201 | } 202 | 203 | inline bool is_full() const { 204 | return Node::slot_used == MAX_SLOT(); 205 | } 206 | 207 | inline size_t MAX_SLOT() const { 208 | return INNER_MAX_SLOT; 209 | } 210 | 211 | virtual inline bool is_few() { 212 | return Node::slot_used < MAX_SLOT()/4 || Node::slot_used == 0; 213 | } 214 | }; 215 | /** 216 | * Tree operation wrappers 217 | */ 218 | struct TreeOp { 219 | // Op can either be none, add or delete 220 | TreeOp(TreeOpType op_type, const KeyType &key, const ValueType &value): 221 | op_type_(op_type), key_(key), value_(value), target_node_(nullptr), 222 | boolean_result_(false), done_(false) {}; 223 | 224 | 225 | TreeOp(TreeOpType op_type, const KeyType &key): 226 | op_type_(op_type), key_(key), target_node_(nullptr), 227 | boolean_result_(false), done_(false) {}; 228 | 229 | TreeOpType op_type_; 230 | KeyType key_; 231 | ValueType value_; 232 | 233 | LeafNode *target_node_; 234 | ValueType result_; 235 | bool boolean_result_; 236 | bool done_; 237 | 238 | // Wait until this operation is done 239 | // Now use busy waiting, should use something more smart. 
But be careful 240 | // that conditional variable could be very expensive 241 | inline void wait() { 242 | while (!done_) { 243 | boost::this_thread::sleep_for(boost::chrono::milliseconds(1)); 244 | } 245 | } 246 | }; 247 | 248 | /** 249 | * A batch of tree operations, this data structure is not thread safe 250 | * The major goal of this class is to amortize memory allocation of 251 | * tree operations 252 | */ 253 | class TaskBatch { 254 | public: 255 | TaskBatch(size_t capacity): capacity_(capacity), ntask_(0) { 256 | ops_ = (TreeOp *)malloc(sizeof(TreeOp) * capacity_); 257 | } 258 | 259 | void destroy() { 260 | free(ops_); 261 | ops_ = nullptr; 262 | } 263 | 264 | // Add a tree operation to the batch 265 | inline void add_op(TreeOpType op_type, const KeyType *keyp, const ValueType *valp) { 266 | assert(ntask_ != capacity_); 267 | 268 | if (op_type == TREE_OP_INSERT) { 269 | assert(valp != nullptr); 270 | ops_[ntask_++] = TreeOp(op_type, *keyp, *valp); 271 | } else { 272 | ops_[ntask_++] = TreeOp(op_type, *keyp); 273 | } 274 | } 275 | 276 | // Whether the tree is full or not 277 | inline bool is_full() { return ntask_ == capacity_; } 278 | // The size of the batch 279 | inline size_t size() { return ntask_; } 280 | // Overloading [] to return the ith operation in the batch 281 | TreeOp * get_op(int i) { 282 | assert(i < ntask_); 283 | return ops_ + i; 284 | } 285 | 286 | // Capacity of the batch 287 | size_t capacity_; 288 | // Number of tasks currently in the batch 289 | size_t ntask_; 290 | // Tree opearations 291 | TreeOp *ops_; 292 | }; 293 | 294 | enum ModType { 295 | MOD_TYPE_ADD, 296 | MOD_TYPE_DEC, 297 | MOD_TYPE_NONE 298 | }; 299 | 300 | /** 301 | * Wrapper for node modification 302 | */ 303 | struct NodeMod { 304 | NodeMod(ModType type): type_(type) {} 305 | NodeMod(const TreeOp &op) { 306 | CHECK(op.op_type_ != TREE_OP_FIND) << "NodeMod can't convert from a find operation" << endl; 307 | if (op.op_type_ == TREE_OP_REMOVE) { 308 | this->type_ = MOD_TYPE_DEC; 309 | this->value_items.emplace_back(std::make_pair(op.key_, ValueType())); 310 | } else { 311 | this->type_ = MOD_TYPE_ADD; 312 | this->value_items.emplace_back(std::make_pair(op.key_, op.value_)); 313 | } 314 | } 315 | ModType type_; 316 | // For leaf modification 317 | std::vector> value_items; 318 | // For inner node modification 319 | std::vector> node_items; 320 | // For removed keys 321 | std::vector> orphaned_kv; 322 | }; 323 | 324 | /******************** 325 | * PalmTree private 326 | * ******************/ 327 | private: 328 | // Root of the palm tree 329 | Node *tree_root; 330 | // Height of the tree 331 | int tree_depth_; 332 | // Number of nodes on each layer 333 | std::vector *> layer_width_; 334 | // Is the tree being destroyed or not 335 | bool destroyed_; 336 | // Minimal key 337 | KeyType min_key_; 338 | // Key comparator 339 | KeyComparator kcmp; 340 | // Current batch of the tree 341 | TaskBatch *tree_current_batch_; 342 | 343 | // Push a task into the current batch, if the batch is full, push the batch 344 | // into the batch queue. 
345 | void push_task(TreeOpType op_type, const KeyType *keyp, const ValueType *valp) { 346 | tree_current_batch_->add_op(op_type, keyp, valp); 347 | task_nums += 2; 348 | 349 | if (tree_current_batch_->is_full()) { 350 | task_batch_queue_.push(tree_current_batch_); 351 | tree_current_batch_ = (TaskBatch *)malloc(sizeof(TaskBatch)); 352 | new (tree_current_batch_) TaskBatch(BATCH_SIZE); 353 | DLOG(INFO) << "Push one batch into the queue "; 354 | } 355 | } 356 | 357 | // Return true if k1 < k2 358 | inline bool key_less(const KeyType &k1, const KeyType &k2) { 359 | return kcmp(k1, k2); 360 | } 361 | // Return true if k1 == k2 362 | inline bool key_eq(const KeyType &k1, const KeyType &k2) { 363 | return !kcmp(k1, k2) && !kcmp(k2, k1); 364 | } 365 | 366 | 367 | // Return the index of the largest slot whose key <= @target 368 | // assume there is no duplicated element 369 | int search_helper(const KeyType *input, int size, const KeyType &target) { 370 | int res = -1; 371 | // loop all element 372 | for (int i = 0; i < size; i++) { 373 | if(key_less(target, input[i])){ 374 | // target < input 375 | // ignore 376 | continue; 377 | 378 | } 379 | if (res == -1 || key_less(input[res], input[i])) { 380 | res = i; 381 | } 382 | } 383 | 384 | return res; 385 | } 386 | 387 | // liner search in leaf 388 | // assume there is no duplicated element 389 | // int search_leaf(const KeyType *data, int size, const KeyType &target) { 390 | // const __m128i keys = _mm_set1_epi32(target); 391 | // 392 | // const auto n = size; 393 | // const auto rounded = 8 * (n / 8); 394 | // 395 | // for (int i = 0; i < rounded; i += 8) { 396 | // 397 | // const __m128i vec1 = _mm_loadu_si128(reinterpret_cast(&data[i])); 398 | // const __m128i vec2 = _mm_loadu_si128(reinterpret_cast(&data[i + 4])); 399 | // 400 | // const __m128i cmp1 = _mm_cmpeq_epi32(vec1, keys); 401 | // const __m128i cmp2 = _mm_cmpeq_epi32(vec2, keys); 402 | // 403 | // const __m128i tmp = _mm_packs_epi32(cmp1, cmp2); 404 | // const uint32_t mask = _mm_movemask_epi8(tmp); 405 | // 406 | // if (mask != 0) { 407 | // return i + __builtin_ctz(mask) / 2; 408 | // } 409 | // } 410 | // 411 | // for (int i = rounded; i < n; i++) { 412 | // if (data[i] == target) { 413 | // return i; 414 | // } 415 | // } 416 | // 417 | // return -1; 418 | // } 419 | 420 | 421 | 422 | 423 | int search_leaf(const KeyType *data, int size, const KeyType &target) { 424 | // #ifdef PROFILE 425 | // auto bt = CycleTimer::currentTicks(); 426 | // #endif 427 | const __m256i keys = _mm256_set1_epi32(target); 428 | 429 | const auto n = size; 430 | const auto rounded = 8 * (n/8); 431 | 432 | for (int i=0; i < rounded; i += 8) { 433 | 434 | const __m256i vec1 = _mm256_loadu_si256(reinterpret_cast(&data[i])); 435 | 436 | const __m256i cmp1 = _mm256_cmpeq_epi32(vec1, keys); 437 | 438 | const uint32_t mask = _mm256_movemask_epi8(cmp1); 439 | 440 | if (mask != 0) { 441 | // #ifdef PROFILE 442 | // STAT.add_stat(0, "search_leaf", CycleTimer::currentTicks() - bt); 443 | // #endif 444 | return i + __builtin_ctz(mask)/4; 445 | } 446 | } 447 | 448 | for (int i = rounded; i < n; i++) { 449 | if (data[i] == target) { 450 | // #ifdef PROFILE 451 | // STAT.add_stat(0, "search_leaf", CycleTimer::currentTicks() - bt); 452 | // #endif 453 | return i; 454 | } 455 | } 456 | 457 | // #ifdef PROFILE 458 | // STAT.add_stat(0, "search_leaf", CycleTimer::currentTicks() - bt); 459 | // #endif 460 | return -1; 461 | } 462 | 463 | 464 | 465 | // Return the index of the largest slot whose key <= @target 466 | // assume 
there is no duplicated element 467 | int search_inner(const KeyType *input, int size, const KeyType &target) { 468 | // #ifdef PROFILE 469 | // auto bt = CycleTimer::currentTicks(); 470 | // #endif 471 | int low = 0, high = size - 1; 472 | while (low != high) { 473 | int mid = (low + high) / 2 + 1; 474 | if (key_less(target, input[mid])) { 475 | // target < input[mid] 476 | high = mid - 1; 477 | } 478 | else { 479 | // target >= input[mid]; 480 | low = mid; 481 | } 482 | } 483 | // #ifdef PROFILE 484 | // STAT.add_stat(0, "search_inner", CycleTimer::currentTicks() - bt); 485 | // #endif 486 | 487 | if (low == size) { 488 | return -1; 489 | } 490 | return low; 491 | } 492 | 493 | /** 494 | * @brief Return the leaf node that contains the @key 495 | */ 496 | LeafNode *search(const KeyType &key UNUSED) { 497 | 498 | auto ptr = (InnerNode *)tree_root; 499 | for (;;) { 500 | CHECK(ptr->slot_used > 0) << "Search empty inner node"; 501 | auto idx = this->search_inner(ptr->keys, ptr->slot_used, key); 502 | CHECK(idx != -1) << "search innerNode fail" << endl; 503 | CHECK(key_less(ptr->keys[idx], key) || key_eq(ptr->keys[idx], key)); 504 | if(idx + 1 < ptr->slot_used) { 505 | CHECK(key_less(key, ptr->keys[idx + 1])); 506 | } 507 | Node *child = ptr->values[idx]; 508 | if (child->type() == LEAFNODE) { 509 | return (LeafNode *)child; 510 | } else { 511 | ptr = (InnerNode *)child; 512 | } 513 | } 514 | // we shouldn't reach here 515 | assert(0); 516 | } 517 | 518 | /** 519 | * @brief big_split will split the kv pair vector into multiple tree nodes 520 | * that is within the threshold. The actual type of value is templated as V. 521 | * The splited nodes should be stored in Node, respect to appropriate 522 | * node types 523 | */ 524 | template 525 | void big_split(std::vector> &input, NodeType *node, std::vector> &new_nodes) { 526 | std::sort(input.begin(), input.end(), [this](const std::pair &p1, const std::pair &p2) { 527 | return key_less(p1.first, p2.first); 528 | }); 529 | 530 | auto itr = input.begin(); 531 | 532 | auto item_per_node = node->MAX_SLOT() / 2; 533 | auto node_num = input.size() / (item_per_node); 534 | // save first half items (small part) in old node 535 | node->slot_used = 0; 536 | for (int i = 0; i < item_per_node; i++) { 537 | // add_item(node, itr->first, itr->second); 538 | node->keys[i] = itr->first; 539 | node->values[i] = itr->second; 540 | node->slot_used++; 541 | itr++; 542 | } 543 | 544 | // Add a new node 545 | int node_create_num = 1; 546 | while(node_create_num < node_num) { 547 | 548 | NodeType *new_node = new NodeType(node->parent, node->Node::level); 549 | layer_width_[node->Node::level]->fetch_add(1); 550 | 551 | // save the second-half in new node 552 | auto new_key = (*itr).first; 553 | int i = 0; 554 | while (itr != input.end() && new_node->slot_used < item_per_node) { 555 | // add_item(new_node, itr->first, itr->second); 556 | new_node->keys[i] = itr->first; 557 | new_node->values[i] = itr->second; 558 | new_node->slot_used++; 559 | itr++; 560 | i++; 561 | } 562 | if(node_create_num == node_num - 1) { 563 | while(itr != input.end()) { 564 | new_node->keys[i] = itr->first; 565 | new_node->values[i] = itr->second; 566 | new_node->slot_used++; 567 | itr++; 568 | i++; 569 | } 570 | } 571 | 572 | new_nodes.push_back(std::make_pair(new_key, new_node)); 573 | node_create_num++; 574 | } 575 | } 576 | 577 | // Warning: if this function return true, the width of the layer will be 578 | // decreased by 1, so the caller must actually merge the node 579 | bool 
must_merge(Node *node) { 580 | if (!node->is_few()) 581 | return false; 582 | 583 | int old_width = layer_width_[node->level]->fetch_add(-1); 584 | if (old_width == 1) { 585 | // Can't merge 586 | layer_width_[node->level]->fetch_add(1); 587 | return false; 588 | } 589 | 590 | return true; 591 | } 592 | 593 | template 594 | void add_item(NodeType *node, const KeyType &key, V value) { 595 | // add item to leaf node 596 | // just append it to the end of the slot 597 | if (node->type() == LEAFNODE) { 598 | // auto idx = node->slot_used++; 599 | auto idx = search_leaf(node->keys, node->slot_used, key); 600 | if(idx != -1) { 601 | return; 602 | } 603 | idx = node->slot_used++; 604 | node->keys[idx] = key; 605 | node->values[idx] = value; 606 | return; 607 | } 608 | 609 | if(node->slot_used == 0) { 610 | node->keys[0] = key; 611 | node->values[0] = value; 612 | node->slot_used++; 613 | return; 614 | } 615 | 616 | // add item to inner node 617 | // ensure it's order 618 | DLOG(INFO) << "search inner begin"; 619 | auto idx = search_inner(node->keys, node->slot_used, key); 620 | 621 | CHECK(idx != -1) << "search innerNode fail" << key <<" " <keys[0]; 622 | CHECK(key_less(node->keys[idx], key) || key_eq(node->keys[idx], key)); 623 | if(idx + 1 < node->slot_used) { 624 | CHECK(key_less(key, node->keys[idx + 1])) << "search inner fail"; 625 | } 626 | 627 | DLOG(INFO) << "search inner end"; 628 | auto k = key; 629 | auto v = value; 630 | 631 | for(int i = idx + 1; i < node->slot_used; i++) { 632 | std::swap(node->keys[i], k); 633 | std::swap(node->values[i], v); 634 | } 635 | 636 | node->keys[node->slot_used] = k; 637 | node->values[node->slot_used] = v; 638 | node->slot_used++; 639 | } 640 | 641 | 642 | template 643 | void del_item(NodeType *node, const KeyType &key) { 644 | auto lastIdx = node->slot_used - 1; 645 | auto idx = search_helper(node->keys, node->slot_used, key); 646 | DLOG(INFO) << "search in del, idx: " << idx; 647 | if (idx == -1) { 648 | DLOG(WARNING) << "del fail, can't find key in node"; 649 | return; 650 | } 651 | 652 | if (!key_eq(key, node->keys[idx])) { 653 | DLOG(WARNING) << "del in inner, del idx: " << idx << " key != del_key" << endl; 654 | if (node->type() == LEAFNODE) 655 | return; 656 | } 657 | 658 | if (node->type() == INNERNODE) { 659 | Node *child_node = reinterpret_cast(&node->values[idx]); 660 | DLOG(INFO) << "Delete node " << child_node->id; 661 | free_recursive(child_node); 662 | 663 | KeyType del_key = node->keys[idx]; 664 | 665 | // auto k = node->keys[idx]; 666 | // auto v = node->value[idx]; 667 | for(int i = idx; i < node->slot_used - 1; i++) { 668 | std::swap(node->keys[i], node->keys[i + 1]); 669 | std::swap(node->values[i], node->values[i + 1]); 670 | } 671 | 672 | if(idx == 0) { 673 | node->keys[0] = del_key; 674 | } 675 | 676 | node->slot_used--; 677 | 678 | 679 | }else { 680 | // del in leaf 681 | if (idx == lastIdx) { 682 | // if it's the last element, just pop it 683 | node->slot_used--; 684 | } else { 685 | // otherwise, swap 686 | node->keys[idx] = node->keys[lastIdx]; 687 | node->values[idx] = node->values[lastIdx]; 688 | node->slot_used--; 689 | } 690 | } 691 | 692 | return; 693 | } 694 | 695 | // collect kv pairs in (or under) this node 696 | // used for merge 697 | void collect_leaf(Node *node, std::vector> &container) { 698 | if (node->type() == LEAFNODE) { 699 | auto ptr = (LeafNode *)node; 700 | for(int i = 0; i < node->slot_used; i++) { 701 | container.push_back(std::make_pair(ptr->keys[i], ptr->values[i])); 702 | } 703 | } else if 
(node->type() == INNERNODE) { 704 | auto ptr = (InnerNode *)node; 705 | for(int i = 0; i < node->slot_used; i++) { 706 | collect_leaf(ptr->values[i], container); 707 | } 708 | layer_width_[node->level-1]->fetch_add(-node->slot_used); 709 | } else { 710 | assert(0); 711 | } 712 | 713 | return; 714 | } 715 | 716 | /** 717 | * @brief Modify @node by applying node modifications in @modes. If @node 718 | * is a leaf node, @mods will be a list of add kv and del kv. If @node is 719 | * a inner node, @mods will be a list of add range and del range. If new 720 | * node modifications are triggered, record them in @new_mods. 721 | */ 722 | NodeMod modify_node(Node *node, const std::vector &mods) { 723 | DLOG(INFO) << "Modifying node " << node->id << " with " << mods.size() << " operations"; 724 | if(node->type() == LEAFNODE) { 725 | return modify_node_leaf((LeafNode *)node, mods); 726 | }else{ 727 | CHECK(node->type() == INNERNODE) << "unKnown node" << endl; 728 | return modify_node_inner((InnerNode *)node, mods); 729 | } 730 | } 731 | 732 | NodeMod modify_node_leaf(LeafNode *node, const std::vector &mods) { 733 | NodeMod ret(MOD_TYPE_NONE); 734 | auto& kv = ret.orphaned_kv; 735 | 736 | // randomly pick up a key, used for merge 737 | auto node_key = node->keys[0]; 738 | 739 | // firstly, we loop all items to save orphaned and count nodes 740 | int num = node->slot_used; 741 | for (auto& item : mods) { 742 | // save all orphaned_* 743 | kv.insert(kv.end(), item.orphaned_kv.begin(), item.orphaned_kv.end()); 744 | 745 | auto item_size = (int)item.value_items.size(); 746 | if (item.type_ == MOD_TYPE_ADD) { 747 | num += item_size; 748 | } else if (item.type_ == MOD_TYPE_DEC) { 749 | num -= item_size; 750 | } else { 751 | assert(item_size == 0); 752 | } 753 | } 754 | 755 | DLOG(INFO) << "Result node size " << num; 756 | if (num > node->MAX_SLOT()) { 757 | DLOG(INFO) << "Going to split"; 758 | auto comp = [this](const std::pair &p1, const std::pair &p2) { 759 | return key_less(p1.first, p2.first); 760 | }; 761 | 762 | std::set, decltype(comp)> buf(comp); 763 | 764 | // execute add/del 765 | for (auto& item : mods) { 766 | if (item.type_ == MOD_TYPE_ADD) { 767 | for (auto& kv : item.value_items) { 768 | buf.insert(kv); 769 | } 770 | } else if(item.type_ == MOD_TYPE_DEC) { 771 | for (auto& kv : item.value_items) { 772 | if(buf.count(kv)) { 773 | buf.erase(kv); 774 | }else{ 775 | del_item(node, kv.first); 776 | } 777 | } 778 | } 779 | } 780 | 781 | // construct input for split 782 | std::vector> split_input; 783 | for(auto itr = buf.begin(); itr != buf.end(); itr++) { 784 | split_input.push_back(*itr); 785 | } 786 | 787 | for(auto i = 0; i < node->slot_used; i++) { 788 | split_input.push_back(std::make_pair(node->keys[i], node->values[i])); 789 | } 790 | // do split based on this buf 791 | big_split(split_input, node, ret.node_items); 792 | ret.type_ = MOD_TYPE_ADD; 793 | return ret; 794 | } else { 795 | DLOG(INFO) << "don't split"; 796 | for (auto& item : mods) { 797 | if (item.type_ == MOD_TYPE_ADD) { 798 | for (auto& kv : item.value_items) { 799 | add_item(node, kv.first, kv.second); 800 | } 801 | } else if(item.type_ == MOD_TYPE_DEC) { 802 | for (auto& kv : item.value_items) { 803 | del_item(node, kv.first); 804 | } 805 | } 806 | } 807 | } 808 | 809 | // merge 810 | // fixme: never merge the first leafnode 811 | // because the min_key is in this node 812 | // we can't delete min_key 813 | if (must_merge(node)) { 814 | DLOG(INFO) << "Merge leaf node " << node->id; 815 | collect_leaf(node, ret.orphaned_kv); 
816 | ret.node_items.push_back(std::make_pair(node_key, node)); 817 | ret.type_ = MOD_TYPE_DEC; 818 | } 819 | 820 | return ret; 821 | } 822 | 823 | NodeMod modify_node_inner(InnerNode *node UNUSED, const std::vector &mods UNUSED) { 824 | NodeMod ret(MOD_TYPE_NONE); 825 | auto& kv = ret.orphaned_kv; 826 | 827 | // randomly pick up a key, used for merge 828 | auto node_key = node->keys[0]; 829 | 830 | // firstly, we loop all items to save orphaned and count nodes 831 | int num = node->slot_used; 832 | for (auto& item : mods) { 833 | // save all orphaned_* 834 | kv.insert(kv.end(), item.orphaned_kv.begin(), item.orphaned_kv.end()); 835 | 836 | auto item_size = (int)item.node_items.size(); 837 | if (item.type_ == MOD_TYPE_ADD) { 838 | num += item_size; 839 | } else if (item.type_ == MOD_TYPE_DEC) { 840 | num -= item_size; 841 | } else { 842 | assert(item_size == 0); 843 | } 844 | } 845 | 846 | if (num > node->MAX_SLOT()) { 847 | DLOG(INFO) << "inner will split"; 848 | auto comp = [this](const std::pair &p1, const std::pair &p2) { 849 | return key_less(p1.first, p2.first); 850 | }; 851 | 852 | std::set, decltype(comp)> buf(comp); 853 | 854 | // execute add/del 855 | for (auto& item : mods) { 856 | if (item.type_ == MOD_TYPE_ADD) { 857 | for (auto& kv : item.node_items) { 858 | buf.insert(kv); 859 | } 860 | } else if(item.type_ == MOD_TYPE_DEC) { 861 | for (auto& kv : item.node_items) { 862 | if(buf.count(kv)) { 863 | buf.erase(kv); 864 | // TODO: memleak 865 | }else{ 866 | // cout << "del " << kv.first<(node, kv.first); 868 | 869 | } 870 | } 871 | } 872 | } 873 | 874 | // construct input for split 875 | std::vector> split_input; 876 | for(auto itr = buf.begin(); itr != buf.end(); itr++) { 877 | split_input.push_back(*itr); 878 | } 879 | 880 | for(auto i = 0; i < node->slot_used; i++) { 881 | split_input.push_back(std::make_pair(node->keys[i], node->values[i])); 882 | } 883 | // do split based on this buf 884 | big_split(split_input, node, ret.node_items); 885 | for (auto itr = ret.node_items.begin(); itr != ret.node_items.end(); itr++) { 886 | // Reset parent, the children of the newly splited node should point 887 | // to the new parent 888 | auto new_node = itr->second; 889 | for (int i = 0; i < new_node->slot_used; i++) { 890 | CHECK(new_node->type() == INNERNODE) << " split leaf node in modify_node_inner"; 891 | ((InnerNode *)new_node)->values[i]->parent = new_node; 892 | } 893 | } 894 | ret.type_ = MOD_TYPE_ADD; 895 | return ret; 896 | } else { 897 | DLOG(INFO) << "inner not split"; 898 | for (auto& item : mods) { 899 | if (item.type_ == MOD_TYPE_ADD) { 900 | for (auto& kv : item.node_items) { 901 | DLOG(INFO) << "Add item " << kv.first; 902 | add_item(node, kv.first, kv.second); 903 | } 904 | } else if(item.type_ == MOD_TYPE_DEC) { 905 | for (auto& kv : item.node_items) { 906 | DLOG(INFO) << "Del item " << kv.first; 907 | del_item(node, kv.first); 908 | } 909 | } else { 910 | DLOG(INFO) << "A NOOP has propagated"; 911 | } 912 | } 913 | } 914 | 915 | // merge 916 | if (must_merge(node)) { 917 | collect_leaf(node, ret.orphaned_kv); 918 | ret.node_items.push_back(std::make_pair(node_key, node)); 919 | ret.type_ = MOD_TYPE_DEC; 920 | 921 | } else { 922 | DLOG(INFO) << "Don't merge " << layer_width_[node->level]->load() << " " << node->is_few() << " " << node->slot_used; 923 | } 924 | 925 | return ret; 926 | } 927 | 928 | // set the smallest key in node to min_key 929 | void ensure_min_range(InnerNode *node UNUSED, const KeyType &min) { 930 | if (node->slot_used <= 1) { 931 | return; 932 | } 
933 | // find the second smallest 934 | int idx = 0; 935 | for(int i = 1; i < node->slot_used; i++) { 936 | if(key_less(node->keys[i], node->keys[idx])) { 937 | idx = i; 938 | } 939 | } 940 | 941 | CHECK(key_less(min, node->keys[idx])); 942 | 943 | if(idx == 0) { 944 | return; 945 | } 946 | 947 | // swap idx with slot 0 948 | 949 | std::swap(node->keys[0], node->keys[idx]); 950 | std::swap(node->values[0], node->values[idx]); 951 | 952 | } 953 | 954 | void ensure_min_key() { 955 | auto ptr = (Node *)tree_root; 956 | while(ptr->type() == INNERNODE) { 957 | auto inner = (InnerNode *)ptr; 958 | inner->keys[0] = min_key_; 959 | ptr = inner->values[0]; 960 | } 961 | } 962 | 963 | void ensure_tree_structure(Node *node, int indent) { 964 | std::map recorder; 965 | ensure_tree_structure_helper(node, indent, recorder); 966 | 967 | CHECK(layer_width_.size() == recorder.size()) << "mismatch layer"; 968 | for(auto itr = recorder.begin(); itr != recorder.end(); itr++) { 969 | CHECK(layer_width_[itr->first]->load() == itr->second) << "mismatch layer size in "<< itr->first <<" , expect: " << layer_width_[itr->first]->load()<< " actual "<second; 970 | } 971 | 972 | } 973 | void ensure_tree_structure_helper(Node *node, int indent, std::map& layer_size_recorder) { 974 | if(layer_size_recorder.count(node->level)) { 975 | layer_size_recorder[node->level]++; 976 | } else { 977 | layer_size_recorder[node->level] = 1; 978 | } 979 | std::string space; 980 | for (int i = 0; i < indent; i++) 981 | space += " "; 982 | DLOG(INFO) << space << node->to_string() << " | Layer size " << layer_width_[node->level]->load();; 983 | 984 | if (node->type() == INNERNODE) { 985 | InnerNode *inode = (InnerNode *)node; 986 | for (int i = 0; i < inode->slot_used; i++) { 987 | auto child = inode->values[i]; 988 | CHECK(child->parent == node) << "My child " << i << " does not point to me"; 989 | } 990 | } 991 | if (node->type() == INNERNODE) { 992 | InnerNode *inode = (InnerNode *)node; 993 | for (int i = 0; i < inode->slot_used; i++) { 994 | auto child = inode->values[i]; 995 | KeyType *key_set; 996 | if (child->type() == LEAFNODE) 997 | key_set = ((LeafNode *)child)->keys; 998 | else 999 | key_set = ((InnerNode *)child)->keys; 1000 | if (child->slot_used == 0) { 1001 | CHECK(node == tree_root) << "Non root node has empty child " << i; 1002 | } else { 1003 | int idx = 0; 1004 | for (int j = 1; j < child->slot_used; j++) { 1005 | if (key_less(key_set[j], key_set[idx])) { 1006 | idx = j; 1007 | } 1008 | } 1009 | 1010 | auto child_min_key = key_set[idx]; 1011 | if(child->type() == INNERNODE) { 1012 | CHECK(idx == 0) << "InnerNode " << i << "'s first key isn't the smallest"; 1013 | } 1014 | CHECK(!key_less(child_min_key, inode->keys[i])) << "My child " << i << " is beyond the key range"; 1015 | } 1016 | } 1017 | 1018 | for (int i = 0; i < inode->slot_used; i++) { 1019 | ensure_tree_structure_helper(inode->values[i], indent + 4, layer_size_recorder); 1020 | } 1021 | } 1022 | } 1023 | 1024 | /************************** 1025 | * Concurrent executions ** 1026 | * 1027 | * Design: we have a potential infinite long task queue, where clients add 1028 | * requests by calling find, insert or remove. We also have a fixed length 1029 | * pool of worker threads. One of the thread (thread 0) will collect task from the 1030 | * work queue, if it has collected enough task for a batch, or has timed out 1031 | * before collecting enough tasks, it will partition the work and start the 1032 | * Palm algorithm among the threads. 
1033 | * ************************/ 1034 | // boost::barrier barrier_; 1035 | Barrier barrier_; 1036 | boost::lockfree::spsc_queue task_batch_queue_; 1037 | 1038 | // The current batch that is being processed by the workers 1039 | TaskBatch *current_batch_; 1040 | 1041 | void sync(int worker_id) { 1042 | auto begin_tick = CycleTimer::currentTicks(); 1043 | barrier_.wait(); 1044 | auto passed_tick = CycleTimer::currentTicks() - begin_tick; 1045 | STAT.add_stat(worker_id, "sync_time", passed_tick); 1046 | } 1047 | 1048 | struct WorkerThread { 1049 | WorkerThread(int worker_id, PalmTree *palmtree): 1050 | worker_id_(worker_id), 1051 | palmtree_(palmtree), 1052 | done_(false) { 1053 | // Initialize 2 layers of modifications 1054 | node_mods_.push_back(NodeModsMapType()); 1055 | node_mods_.push_back(NodeModsMapType()); 1056 | } 1057 | // Worker id, the thread with worker id 0 will need to be the coordinator 1058 | int worker_id_; 1059 | // The work for the worker at each stage 1060 | std::vector current_tasks_; 1061 | std::unordered_map> leaf_ops_; 1062 | // Node modifications on each layer, the size of the vector will be the 1063 | // same as the tree height 1064 | typedef std::unordered_map> NodeModsMapType; 1065 | std::vector node_mods_; 1066 | // Spawn a thread and run the worker loop 1067 | boost::thread wthread_; 1068 | // The palm tree the worker belong to 1069 | PalmTree *palmtree_; 1070 | bool done_; 1071 | void start() { 1072 | wthread_ = boost::thread(&WorkerThread::worker_loop, this); 1073 | } 1074 | 1075 | inline int LOWER() { 1076 | auto batch_size = palmtree_->current_batch_->size(); 1077 | auto task_per_thread = batch_size / palmtree_->NUM_WORKER + 1; 1078 | auto LOWER = worker_id_*task_per_thread; 1079 | return LOWER; 1080 | } 1081 | 1082 | inline int UPPER() { 1083 | auto batch_size = palmtree_->current_batch_->size(); 1084 | auto task_per_thread = batch_size / palmtree_->NUM_WORKER + 1; 1085 | auto LOWER = worker_id_*task_per_thread; 1086 | return (worker_id_ == palmtree_->NUM_WORKER-1) ? 
(batch_size) : (LOWER+task_per_thread); 1087 | } 1088 | // The #0 thread is responsible to collect tasks to a batch 1089 | void collect_batch() { 1090 | DLOG(INFO) << "Thread " << worker_id_ << " collect tasks " << palmtree_->BATCH_SIZE; 1091 | 1092 | if (worker_id_ == 0) { 1093 | if (batch_id % 2 == 0) { 1094 | int sleep_time = 0; 1095 | while (sleep_time < 1024) { 1096 | 1097 | bool res = palmtree_->task_batch_queue_.pop(palmtree_->current_batch_); 1098 | if (res) { 1099 | break; 1100 | } else { 1101 | DLOG(INFO) << sleep_time; 1102 | sleep_time++; 1103 | } 1104 | } 1105 | } 1106 | batch_id++; 1107 | // STAT.add_stat(0, "fetch_batch", CycleTimer::currentTicks() - bt); 1108 | // DLOG(INFO) << "Collected a batch of " << palmtree_->current_batch_->size(); 1109 | } 1110 | 1111 | palmtree_->sync(worker_id_); 1112 | if (palmtree_->current_batch_ == nullptr) { 1113 | return; 1114 | } 1115 | 1116 | if (palmtree_->current_batch_->size() == 0) { 1117 | return; 1118 | } 1119 | // STAT.add_stat(worker_id_, "batch_sort", CycleTimer::currentTicks() - bt); 1120 | 1121 | // Partition the task among threads 1122 | int batch_size = palmtree_->current_batch_->size(); 1123 | int task_per_thread = batch_size / palmtree_->NUM_WORKER; 1124 | int task_residue = batch_size - task_per_thread * palmtree_->NUM_WORKER; 1125 | 1126 | int lower = task_per_thread * worker_id_ + std::min(task_residue, worker_id_); 1127 | int upper = lower + task_per_thread + (worker_id_ < task_residue); 1128 | 1129 | DLOG(INFO) << worker_id_ << " got " << lower << " to " << upper << " tasks"; 1130 | for (int i = lower; i < upper; i++) { 1131 | palmtree_->workers_[worker_id_].current_tasks_ 1132 | .push_back(palmtree_->current_batch_->get_op(i)); 1133 | } 1134 | } 1135 | 1136 | // Redistribute the tasks on leaf node 1137 | void redistribute_leaf_tasks(std::unordered_map> &result) { 1138 | #ifdef PROFILE 1139 | auto bt = CycleTimer::currentTicks(); 1140 | #endif 1141 | // First add current tasks 1142 | for (auto op : current_tasks_) { 1143 | if (result.find(op->target_node_) == result.end()) { 1144 | result.emplace(op->target_node_, std::vector()); 1145 | } 1146 | 1147 | result[op->target_node_].push_back(op); 1148 | } 1149 | 1150 | // Then remove nodes that don't belong to the current worker 1151 | for (int i = 0; i < worker_id_; i++) { 1152 | WorkerThread &wthread = palmtree_->workers_[i]; 1153 | for (int j = wthread.current_tasks_.size()-1; j >= 0; j--) { 1154 | auto &op = wthread.current_tasks_[j]; 1155 | if (result.count(op->target_node_) == 0) 1156 | break; 1157 | result.erase(op->target_node_); 1158 | } 1159 | } 1160 | 1161 | for (int i = worker_id_+1; i < palmtree_->NUM_WORKER; i++) { 1162 | WorkerThread &wthread = palmtree_->workers_[i]; 1163 | bool early_break = false; 1164 | for (auto op : wthread.current_tasks_) { 1165 | CHECK(op->target_node_ != nullptr) << "worker " << i <<" hasn't finished search"; 1166 | if (result.find(op->target_node_) != result.end()) { 1167 | result[op->target_node_].push_back(op); 1168 | } else { 1169 | early_break = true; 1170 | break; 1171 | } 1172 | } 1173 | 1174 | if (early_break) 1175 | break; 1176 | } 1177 | 1178 | 1179 | // LOG(INFO) << "Worker " << worker_id_ << " has " << result.size() << " nodes of tasks after task redistribution"; 1180 | 1181 | 1182 | // Calculate number of tasks 1183 | int sum = 0; 1184 | for (auto itr = result.begin(); itr != result.end(); itr++) { 1185 | sum += itr->second.size(); 1186 | } 1187 | 1188 | STAT.add_stat(worker_id_, "leaf_task", sum); 1189 | 1190 | // 
LOG(INFO) << "Worker " << worker_id_ << " has " << result.size() << " nodes of tasks after task redistribution, " << sum << " tasks in total"; 1191 | // std::this_thread::sleep_for(std::chrono::milliseconds(1)); 1192 | 1193 | #ifdef PROFILE 1194 | STAT.add_stat(worker_id_, "redist_leaf", CycleTimer::currentTicks() - bt); 1195 | #endif 1196 | } 1197 | 1198 | /** 1199 | * @brief redistribute inner node tasks for the current thread. It will 1200 | * read @depth layer's information about node modifications and determine 1201 | * tasks that belongs to the current thread. 1202 | * 1203 | * @param layer which layer's modifications are we trying to colelct 1204 | * @param cur_mods the collected tasks will be stored in @cur_mods 1205 | */ 1206 | void redistribute_inner_tasks(int layer, NodeModsMapType &cur_mods) { 1207 | cur_mods = node_mods_[layer]; 1208 | 1209 | // discard 1210 | for (int i = 0; i < worker_id_; i++) { 1211 | auto &wthread = palmtree_->workers_[i]; 1212 | for (auto other_itr = wthread.node_mods_[layer].begin(); other_itr != wthread.node_mods_[layer].end(); other_itr++) { 1213 | cur_mods.erase(other_itr->first); 1214 | } 1215 | } 1216 | 1217 | // Steal work from other threads 1218 | for (int i = worker_id_+1; i < palmtree_->NUM_WORKER; i++) { 1219 | auto &wthread = palmtree_->workers_[i]; 1220 | for (auto other_itr = wthread.node_mods_[layer].begin(); other_itr != wthread.node_mods_[layer].end(); other_itr++) { 1221 | auto itr = cur_mods.find(other_itr->first); 1222 | if (itr != cur_mods.end()) { 1223 | auto &my_mods = itr->second; 1224 | auto &other_mods = other_itr->second; 1225 | my_mods.insert(my_mods.end(), other_mods.begin(), other_mods.end()); 1226 | } 1227 | } 1228 | } 1229 | } 1230 | 1231 | /** 1232 | * @brief carry out all operations on the tree in a serializable order, 1233 | * reduce operations on the same key. 
The result of this function is to 1234 | * provide proper return result for all the operations, as well as filter 1235 | * out the todo node modifications on the #0 layer 1236 | * */ 1237 | void resolve_hazards(const std::unordered_map> &tree_ops UNUSED) { 1238 | #ifdef PROFILE 1239 | auto bt = CycleTimer::currentTicks(); 1240 | #endif 1241 | node_mods_[0].clear(); 1242 | auto &leaf_mods = node_mods_[0]; 1243 | std::unordered_map changed_values; 1244 | std::unordered_set deleted; 1245 | for (auto itr = tree_ops.begin(); itr != tree_ops.end(); itr++) { 1246 | LeafNode *leaf = static_cast(itr->first); 1247 | auto &ops = itr->second; 1248 | for (auto op : ops) { 1249 | if (op->op_type_ == TREE_OP_FIND) { 1250 | if (deleted.find(op->key_) != deleted.end()) { 1251 | op->boolean_result_ = false; 1252 | } else { 1253 | if (changed_values.count(op->key_) != 0) { 1254 | op->result_ = changed_values[op->key_]; 1255 | op->boolean_result_ = true; 1256 | } else { 1257 | int idx = palmtree_->search_leaf(leaf->keys, leaf->slot_used, op->key_); 1258 | if (idx == -1 || !palmtree_->key_eq(leaf->keys[idx], op->key_)) { 1259 | // Not find 1260 | op->boolean_result_ = false; 1261 | } else { 1262 | op->result_ = leaf->values[idx]; 1263 | op->boolean_result_ = true; 1264 | } 1265 | } 1266 | } 1267 | } else if (op->op_type_ == TREE_OP_INSERT) { 1268 | DLOG(INFO) << "Try to insert " << op->key_ << ": " << op->value_; 1269 | deleted.erase(op->key_); 1270 | changed_values[op->key_] = op->value_; 1271 | if (leaf_mods.count(leaf) == 0) 1272 | leaf_mods.emplace(leaf, std::vector()); 1273 | leaf_mods[leaf].push_back(NodeMod(*op)); 1274 | } else { 1275 | CHECK(op->op_type_ == TREE_OP_REMOVE) << "Invalid tree operation"; 1276 | changed_values.erase(op->key_); 1277 | if (leaf_mods.count(leaf) == 0) 1278 | leaf_mods.emplace(leaf, std::vector()); 1279 | leaf_mods[leaf].push_back(NodeMod(*op)); 1280 | } 1281 | } 1282 | } 1283 | 1284 | #ifdef PROFILE 1285 | STAT.add_stat(worker_id_, "resolve_hazards", CycleTimer::currentTicks() - bt); 1286 | #endif 1287 | } // End resolve_hazards 1288 | 1289 | /** 1290 | * @brief Handle root split and re-insert orphaned keys. 
It may need to grow the tree height 1291 | */ 1292 | void handle_root() { 1293 | 1294 | int root_depth = palmtree_->tree_depth_; 1295 | std::vector root_mods; 1296 | // Collect root modifications from all threads 1297 | for (auto &wthread : palmtree_->workers_) { 1298 | auto itr = wthread.node_mods_[root_depth].begin(); 1299 | if (itr != wthread.node_mods_[root_depth].end()) { 1300 | root_mods.insert(root_mods.end(), itr->second.begin(), itr->second.end()); 1301 | } 1302 | } 1303 | // Handle over to modify_node 1304 | auto new_mod = palmtree_->modify_node(palmtree_->tree_root, root_mods); 1305 | if (new_mod.type_ == MOD_TYPE_NONE) { 1306 | DLOG(INFO) << "Root won't split"; 1307 | } else if (new_mod.type_ == MOD_TYPE_ADD) { 1308 | DLOG(INFO) << "Split root"; 1309 | InnerNode *new_root = new InnerNode(nullptr, palmtree_->tree_root->level+1); 1310 | palmtree_->tree_root->parent = new_root; 1311 | palmtree_->add_item(new_root, palmtree_->min_key_, palmtree_->tree_root); 1312 | for (auto itr = new_mod.node_items.begin(); itr != new_mod.node_items.end(); itr++) { 1313 | itr->second->parent = new_root; 1314 | palmtree_->add_item(new_root, itr->first, itr->second); 1315 | } 1316 | palmtree_->tree_root = new_root; 1317 | palmtree_->tree_depth_ += 1; 1318 | for (auto &wthread : palmtree_->workers_) { 1319 | wthread.node_mods_.push_back(NodeModsMapType()); 1320 | } 1321 | palmtree_->layer_width_.emplace_back(new std::atomic(1)); 1322 | } 1323 | // Merge root if neccessary 1324 | while (palmtree_->tree_depth_ >= 2 && palmtree_->tree_root->slot_used == 1) { 1325 | DLOG(INFO) << "Decrease tree depth"; 1326 | // Decrease root height 1327 | auto old_root = static_cast(palmtree_->tree_root); 1328 | palmtree_->tree_root = old_root->values[0]; 1329 | delete old_root; 1330 | palmtree_->tree_depth_ -= 1; 1331 | for (auto &wthread : palmtree_->workers_) { 1332 | wthread.node_mods_.pop_back(); 1333 | } 1334 | delete palmtree_->layer_width_.back(); 1335 | palmtree_->layer_width_.pop_back(); 1336 | } 1337 | DLOG(INFO) << "Insert orphaned"; 1338 | // Naively insert orphaned 1339 | for (auto itr = new_mod.orphaned_kv.begin(); itr != new_mod.orphaned_kv.end(); itr++) { 1340 | DLOG(INFO) << "Insert " << itr->first << " " << itr->second; 1341 | auto leaf = palmtree_->search(itr->first); 1342 | palmtree_->add_item(leaf, itr->first, itr->second); 1343 | } 1344 | palmtree_->ensure_min_key(); 1345 | DLOG(INFO) << "Root handled"; 1346 | } // End of handle_root() 1347 | 1348 | // Worker loop: process tasks 1349 | void worker_loop() { 1350 | while (!done_) { 1351 | // Stage 0, collect work batch and partition 1352 | CycleTimer::SysClock start_tick = CycleTimer::currentTicks(); 1353 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 0: collect tasks"; 1354 | collect_batch(); 1355 | if (worker_id_ == 0) { 1356 | // Check if the tree is destroyed, we must do it before the sync point 1357 | if (palmtree_->destroyed_) { 1358 | for (int i = 0; i < palmtree_->NUM_WORKER; i++) 1359 | palmtree_->workers_[i].done_ = true; 1360 | }; 1361 | } 1362 | CycleTimer::SysClock passed = CycleTimer::currentTicks() - start_tick; 1363 | STAT.add_stat(worker_id_, "stage0", passed); 1364 | palmtree_->sync(worker_id_); 1365 | if (done_) 1366 | LOG(INFO) << "Worker " << worker_id_ << " exit"; 1367 | 1368 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 0 finished"; 1369 | #ifdef PROFILE 1370 | auto s1_bt = CycleTimer::currentTicks(); 1371 | #endif 1372 | // Stage 1, Search for leafs 1373 | DLOG(INFO) << "Worker " << worker_id_ << " got " << 
current_tasks_.size() << " tasks"; 1374 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 1: search for leaves"; 1375 | 1376 | leaf_ops_.clear(); 1377 | std::unordered_map> collected_tasks; 1378 | for (auto op : current_tasks_) { 1379 | op->target_node_ = palmtree_->search(op->key_); 1380 | 1381 | CHECK(op->target_node_ != nullptr) << "search returns nullptr"; 1382 | } 1383 | #ifdef PROFILE 1384 | STAT.add_stat(worker_id_, "stage1", CycleTimer::currentTicks() - s1_bt); 1385 | #endif 1386 | palmtree_->sync(worker_id_); 1387 | 1388 | #ifdef PROFILE 1389 | auto s2_bt = CycleTimer::currentTicks(); 1390 | #endif 1391 | 1392 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 1 finished"; 1393 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 2: Process leaves"; 1394 | // Stage 2, redistribute work, read the tree then modify, each thread 1395 | // will handle the nodes it has searched for, except the nodes that 1396 | // have been handled by workers whose worker_id is less than me. 1397 | // Currently we use a unordered_map to record the ownership of tasks upon 1398 | // certain nodes. 1399 | 1400 | redistribute_leaf_tasks(collected_tasks); 1401 | resolve_hazards(collected_tasks); 1402 | DLOG_IF(INFO, worker_id_ == 0) << "resolved hazards"; 1403 | // Modify nodes 1404 | auto &upper_mods = node_mods_[1]; 1405 | auto &cur_mods = node_mods_[0]; 1406 | upper_mods.clear(); 1407 | for (auto itr = cur_mods.begin() ; itr != cur_mods.end(); itr++) { 1408 | auto node = itr->first; 1409 | auto &mods = itr->second; 1410 | CHECK(node != nullptr) << "Modifying a null node"; 1411 | auto upper_mod = palmtree_->modify_node(node, mods); 1412 | // FIXME: now we have orphaned_keys 1413 | if (upper_mod.type_ == MOD_TYPE_NONE && upper_mod.orphaned_kv.empty()) { 1414 | DLOG(INFO) << "No node modification happened, don't propagate upwards"; 1415 | continue; 1416 | } 1417 | DLOG(INFO) << "Add node modification " << upper_mod.type_ << " to upper layer " << 1; 1418 | if (upper_mods.find(node->parent) == upper_mods.end()) { 1419 | upper_mods.emplace(node->parent, std::vector()); 1420 | } 1421 | upper_mods[node->parent].push_back(upper_mod); 1422 | } 1423 | 1424 | #ifdef PROFILE 1425 | STAT.add_stat(worker_id_, "stage2", CycleTimer::currentTicks() - s2_bt); 1426 | #endif 1427 | palmtree_->sync(worker_id_); 1428 | #ifdef PROFILE 1429 | auto s3_bt = CycleTimer::currentTicks(); 1430 | #endif 1431 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 2 finished"; 1432 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 3: propagate tree modification"; 1433 | // Stage 3, propagate tree modifications back 1434 | // Propagate modifications until root 1435 | for (int layer = 1; layer <= palmtree_->tree_depth_-1; layer++) { 1436 | // DLOG_IF(INFO, worker_id_ == 0) << "Layer #" << layer << " begin"; 1437 | NodeModsMapType cur_mods; 1438 | redistribute_inner_tasks(layer, cur_mods); 1439 | auto &upper_mods = node_mods_[layer+1]; 1440 | upper_mods.clear(); 1441 | for (auto itr = cur_mods.begin(); itr != cur_mods.end(); itr++) { 1442 | auto node = itr->first; 1443 | auto &mods = itr->second; 1444 | DLOG(INFO) << "Stage 3 modify " << node->id; 1445 | auto mod_res = palmtree_->modify_node(node, mods); 1446 | if (upper_mods.count(node->parent) == 0) { 1447 | upper_mods.emplace(node->parent, std::vector()); 1448 | } 1449 | upper_mods[node->parent].push_back(mod_res); 1450 | } 1451 | palmtree_->sync(worker_id_); 1452 | // DLOG_IF(INFO, worker_id_ == 0) << "Layer #" << layer << " done"; 1453 | } // End propagate 1454 | #ifdef PROFILE 1455 | 
STAT.add_stat(worker_id_, "stage3", CycleTimer::currentTicks() - s3_bt); 1456 | #endif 1457 | palmtree_->sync(worker_id_); 1458 | #ifdef PROFILE 1459 | auto s4_bt = CycleTimer::currentTicks(); 1460 | #endif 1461 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 3 finished"; 1462 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 4: Handle root"; 1463 | 1464 | // Stage 4, modify the root, re-insert orphaned, mark work as done 1465 | if (worker_id_ == 0) { 1466 | CycleTimer::SysClock st = CycleTimer::currentTicks(); 1467 | // Mark tasks as done 1468 | handle_root(); 1469 | STAT.add_stat(worker_id_, "end_stage", CycleTimer::currentTicks() - st); 1470 | // palmtree_->ensure_tree_structure(palmtree_->tree_root, 0); 1471 | } 1472 | 1473 | auto st2 = CycleTimer::currentTicks(); 1474 | STAT.add_stat(worker_id_, "deliver tasks", CycleTimer::currentTicks() - st2); 1475 | 1476 | auto st3 = CycleTimer::currentTicks(); 1477 | palmtree_->task_nums -= current_tasks_.size(); 1478 | STAT.add_stat(worker_id_, "dec task num", CycleTimer::currentTicks() - st3); 1479 | 1480 | current_tasks_.clear(); 1481 | #ifdef PROFILE 1482 | STAT.add_stat(worker_id_, "stage4", CycleTimer::currentTicks() - s4_bt); 1483 | #endif 1484 | palmtree_->sync(worker_id_); 1485 | 1486 | // Free the current batch 1487 | 1488 | if (worker_id_ == 0 && batch_id % 2 == 0 && palmtree_->current_batch_ != nullptr) { 1489 | DLOG(INFO) << "Free the current batch"; 1490 | palmtree_->current_batch_->destroy(); 1491 | free(palmtree_->current_batch_); 1492 | palmtree_->current_batch_ = nullptr; 1493 | DLOG(INFO) << "Free-ed"; 1494 | } 1495 | 1496 | DLOG_IF(INFO, worker_id_ == 0) << "#### STAGE 4 finished"; 1497 | 1498 | CycleTimer::SysClock end_tick = CycleTimer::currentTicks(); 1499 | 1500 | STAT.add_stat(worker_id_, "round_time", end_tick-start_tick); 1501 | } // End worker loop 1502 | DLOG(INFO) << "Worker " << worker_id_ << " exited"; 1503 | } 1504 | }; // End WorkerThread 1505 | 1506 | std::vector workers_; 1507 | /********************** 1508 | * PalmTree public * 1509 | * ********************/ 1510 | public: 1511 | std::atomic task_nums; 1512 | 1513 | PalmTree(KeyType min_key, int num_worker): 1514 | tree_depth_(1), 1515 | destroyed_(false), 1516 | min_key_(min_key), 1517 | barrier_(num_worker), 1518 | task_batch_queue_{1024*500} 1519 | { 1520 | NUM_WORKER = num_worker; 1521 | BATCH_SIZE = BATCH_SIZE_PER_WORKER * NUM_WORKER; 1522 | 1523 | LOG(INFO) << "init palm tree with " << NUM_WORKER << " workers"; 1524 | // Init the root node 1525 | tree_root = new InnerNode(nullptr, 1); 1526 | add_item((InnerNode *)tree_root, min_key_, new LeafNode(tree_root, 0)); 1527 | // Init layer width 1528 | layer_width_.push_back(new std::atomic(1)); 1529 | layer_width_.push_back(new std::atomic(1)); 1530 | // Init current batch 1531 | current_batch_ = nullptr; 1532 | tree_current_batch_ = (TaskBatch *)malloc(sizeof(TaskBatch)); 1533 | new (tree_current_batch_) TaskBatch(BATCH_SIZE); 1534 | // Init stats 1535 | 1536 | STAT = Stats(NUM_WORKER); 1537 | STAT.init_metric("batch_sort"); 1538 | STAT.init_metric("stage0"); 1539 | STAT.init_metric("stage1"); 1540 | STAT.init_metric("redist_leaf"); 1541 | STAT.init_metric("resolve_hazards"); 1542 | STAT.init_metric("stage2"); 1543 | STAT.init_metric("stage3"); 1544 | STAT.init_metric("stage4"); 1545 | STAT.init_metric("end_stage"); 1546 | 1547 | STAT.init_metric("search_inner"); 1548 | STAT.init_metric("search_leaf"); 1549 | 1550 | STAT.init_metric("leaf_task"); 1551 | 1552 | STAT.init_metric("sync_time"); 1553 | 
STAT.init_metric("round_time"); 1554 | 1555 | STAT.init_metric("deliver tasks"); 1556 | STAT.init_metric("dec task num"); 1557 | 1558 | // Init the worker thread 1559 | // Init the worker thread and start them 1560 | for (int worker_id = 0; worker_id < NUM_WORKER; worker_id++) { 1561 | workers_.emplace_back(worker_id, this); 1562 | } 1563 | for (auto &worker : workers_) { 1564 | worker.start(); 1565 | } 1566 | 1567 | task_nums = 0; 1568 | } 1569 | 1570 | // Recursively free the resources of one tree node 1571 | void free_recursive(Node *node UNUSED) { 1572 | if (node->type() == INNERNODE) { 1573 | auto ptr = (InnerNode *)node; 1574 | for(int i = 0; i < ptr->slot_used; i++) { 1575 | free_recursive(ptr->values[i]); 1576 | } 1577 | } 1578 | 1579 | delete node; 1580 | } 1581 | 1582 | ~PalmTree() { 1583 | 1584 | // Mark the tree as destroyed 1585 | destroyed_ = true; 1586 | // Join all workter thread 1587 | for (auto &wthread : workers_) 1588 | wthread.wthread_.join(); 1589 | // Free atomic layer width 1590 | while (!layer_width_.empty()) { 1591 | delete layer_width_.back(); 1592 | layer_width_.pop_back(); 1593 | } 1594 | 1595 | STAT.print_stat(); 1596 | 1597 | free_recursive(tree_root); 1598 | 1599 | 1600 | if (tree_current_batch_ != nullptr) { 1601 | tree_current_batch_->destroy(); 1602 | free(tree_current_batch_); 1603 | } 1604 | } 1605 | 1606 | /** 1607 | * @brief execute a batch of tree operations, the batch will be executed 1608 | * cooperatively by all worker threads 1609 | */ 1610 | void execute_batch(std::vector &operations UNUSED) { 1611 | 1612 | } 1613 | 1614 | /** 1615 | * @brief Find the value for a key 1616 | * @param key the key to be retrieved 1617 | * @return nullptr if no such k,v pair 1618 | */ 1619 | bool find(const KeyType &key UNUSED, ValueType &value UNUSED) { 1620 | push_task(TREE_OP_FIND, &key, nullptr); 1621 | 1622 | // op.wait(); 1623 | //if (op.boolean_result_) 1624 | //value = op.result_; 1625 | //return op.boolean_result_; 1626 | return true; 1627 | } 1628 | 1629 | /** 1630 | * @brief insert a k,v into the tree 1631 | */ 1632 | void insert(const KeyType &key UNUSED, const ValueType &value UNUSED) { 1633 | // TreeOp op(TREE_OP_INSERT, key, value); 1634 | 1635 | push_task(TREE_OP_INSERT, &key, &value); 1636 | 1637 | // op.wait(); 1638 | } 1639 | 1640 | /** 1641 | * @brief remove a k,v from the tree 1642 | */ 1643 | void remove(const KeyType &key UNUSED) { 1644 | push_task(TREE_OP_REMOVE, &key, nullptr); 1645 | 1646 | // op->wait(); 1647 | } 1648 | 1649 | void reset_metric() { 1650 | STAT.reset_metric(); 1651 | } 1652 | 1653 | int batch_size() { 1654 | return BATCH_SIZE_PER_WORKER * NUM_WORKER; 1655 | } 1656 | 1657 | // Wait until all task finished 1658 | void wait_finish() { 1659 | if (tree_current_batch_->size() != 0) { 1660 | task_batch_queue_.push(tree_current_batch_); 1661 | tree_current_batch_ = (TaskBatch *)malloc(sizeof(TaskBatch)); 1662 | new (tree_current_batch_) TaskBatch(BATCH_SIZE); 1663 | } 1664 | while (task_nums != 0) 1665 | ; 1666 | } 1667 | }; // End of PalmTree 1668 | // Explicit template initialization 1669 | template class PalmTree; 1670 | } // End of namespace palmtree 1671 | 1672 | --------------------------------------------------------------------------------