├── .gitignore ├── Makefile ├── test.cc ├── README.md ├── bplus_tree.h └── bplus_tree.cc /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | info.plist 3 | test 4 | test.db 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CXX = g++ 2 | CXXFLAGS = -Wall -Wextra -Werror=return-type -pedantic -std=c++2a -g -o2 -fsanitize=leak 3 | EXEC = test 4 | all: $(EXEC) 5 | 6 | $(EXEC): test.cc bplus_tree.cc bplus_tree.h 7 | $(CXX) $(CXXFLAGS) test.cc bplus_tree.cc -o $(EXEC) 8 | rm -f test.db 9 | 10 | clean: 11 | rm -rf $(EXEC) *.o test.db 12 | -------------------------------------------------------------------------------- /test.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "bplus_tree.h" 5 | 6 | int main(int argc, char const* argv[]) { 7 | (void)argc; 8 | (void)argv; 9 | 10 | srand(time(0)); 11 | BPlusTree bpt("test.db"); 12 | char k[33]; 13 | char v[101]; 14 | for (int n = 10000; n <= 1000000; n *= 10) { 15 | std::cout << "----------------------------------------------------" 16 | << "\n"; 17 | auto t1 = std::chrono::steady_clock::now(); 18 | // Random Insert 19 | for (int i = 0; i < n; ++i) { 20 | int r = rand() % n; 21 | snprintf(k, 33, "k%d", r); 22 | snprintf(v, 101, "v%d", r); 23 | bpt.Put(k, v); 24 | } 25 | auto t2 = std::chrono::steady_clock::now(); 26 | std::cout << "Random Insert " << n << " items: time span=" 27 | << std::chrono::duration_cast(t2 - t1) 28 | .count() 29 | << "ms" 30 | << "\n"; 31 | 32 | // Random Find 33 | for (int i = 0; i < n; ++i) { 34 | int r = rand() % n; 35 | snprintf(k, 33, "k%d", r); 36 | std::string value; 37 | bpt.Get(k, value); 38 | } 39 | auto t3 = std::chrono::steady_clock::now(); 40 | std::cout << "Random Get " << n << " items: time span=" 41 | << 
std::chrono::duration_cast(t3 - t2) 42 | .count() 43 | << "ms" 44 | << "\n"; 45 | 46 | // Random Delete 47 | for (int i = 0; i < n; ++i) { 48 | int r = rand() % n; 49 | snprintf(k, 33, "k%d", r); 50 | bpt.Delete(k); 51 | } 52 | auto t4 = std::chrono::steady_clock::now(); 53 | std::cout << "Random Delete " << n << " items: time span=" 54 | << std::chrono::duration_cast(t4 - t3) 55 | .count() 56 | << "ms" 57 | << "\n"; 58 | } 59 | return 0; 60 | } 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BPlusTree 2 | A B+ tree is an m-ary tree with a variable but often large number of children per node. A B+ tree consists of a root, internal nodes and leaves. The root may be either a leaf or a node with two or more children.\ 3 | The primary value of a B+ tree is in storing data for efficient retrieval in a block-oriented storage context — in particular, filesystems(In my repo, I use mmap.). This is primarily because unlike binary search trees, B+ trees have very high fanout (number of pointers to child nodes in a node, typically on the order of 100 or more), which reduces the number of I/O operations required to find an element in the tree.\ 4 | In theory, if the size of the index node in B+ tree is close to the size of the disk block(eg.4k bytes page size in linux), a query operation needs to access the disk logb(N) times. 5 | ## Feature 6 | * Use mmap to read and write to disk. 7 | * Use LRU to cache mapped blocks. 8 | ## Benchmark 9 | Magnitude | Put | Get | Delete | 10 | :----------- | :-----------| :----------|:-----------| 11 | 10K | 72ms | 28ms | 52ms | 12 | 100K | 900ms | 507ms | 777ms | 13 | 1000K | 10333ms | 4726ms | 8154ms | 14 | 15 | The above data was tested on my 2013 macbook-pro with Intel Core i7 4 cores 2.3 GHz.\ 16 | Each record has a value length of 100 bytes and I set cache size to 5MB. 17 | See also [test](test.cc). 
18 | ## Build 19 | ``` 20 | make && ./test 21 | ``` 22 | ## API 23 | ```C++ 24 | BPlusTree(const char* path); 25 | void Put(const std::string& key, const std::string& value); 26 | bool Delete(const std::string& key); 27 | bool Get(const std::string& key, std::string& value) const; 28 | std::vector GetRange(const std::string& left, const std::string& right) const; 29 | bool Empty() const; 30 | size_t Size() const; 31 | ``` 32 | ## TODO List 33 | - [ ] Support for variable key-value length. 34 | - [ ] When Dealloc is executed, put block into reuse-pool. 35 | - [ ] Defragment db file. 36 | - [ ] Add WAL(Write Ahead Log). 37 | - [ ] Data compression. 38 | ## Reference 39 | [1] https://en.wikipedia.org/wiki/B%2B_tree \ 40 | [2] https://www.cnblogs.com/nullzx/p/8729425.html \ 41 | [3] http://man7.org/linux/man-pages/man2/mmap.2.html 42 | -------------------------------------------------------------------------------- /bplus_tree.h: -------------------------------------------------------------------------------- 1 | #ifndef BPLUS_TREE_H 2 | #define BPLUS_TREE_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #define DEBUG 9 | 10 | #ifdef DEBUG 11 | #define LOG(fmt, ...) \ 12 | do { \ 13 | fprintf(stderr, "%s:%d:" fmt, __FILE__, __LINE__, __VA_ARGS__); \ 14 | } while (0) 15 | 16 | #define LOG2(fmt, ...) 
\ 17 | do { \ 18 | fprintf(stderr, fmt, __VA_ARGS__); \ 19 | } while (0) 20 | #endif 21 | 22 | class BPlusTree { 23 | struct Meta; 24 | struct Index; 25 | struct Record; 26 | struct Node; 27 | struct IndexNode; 28 | struct LeafNode; 29 | class BlockCache; 30 | 31 | public: 32 | BPlusTree(const char* path); 33 | ~BPlusTree(); 34 | 35 | void Put(const std::string& key, const std::string& value); 36 | bool Delete(const std::string& key); 37 | bool Get(const std::string& key, std::string& value) const; 38 | std::vector> GetRange( 39 | const std::string& left_key, const std::string& right_key) const; 40 | bool Empty() const; 41 | size_t Size() const; 42 | 43 | #ifdef DEBUG 44 | void Dump(); 45 | #endif 46 | 47 | private: 48 | template 49 | T* Map(off_t offset) const; 50 | template 51 | void UnMap(T* map_obj) const; 52 | template 53 | T* Alloc(); 54 | template 55 | void Dealloc(T* node); 56 | 57 | constexpr size_t GetMinKeys() const; 58 | constexpr size_t GetMaxKeys() const; 59 | 60 | template 61 | int UpperBound(T arr[], int n, const char* target) const; 62 | template 63 | int LowerBound(T arr[], int n, const char* target) const; 64 | 65 | off_t GetLeafOffset(const char* key) const; 66 | LeafNode* SplitLeafNode(LeafNode* leaf_node); 67 | IndexNode* SplitIndexNode(IndexNode* index_node); 68 | size_t InsertKeyIntoIndexNode(IndexNode* index_node, const char* key, 69 | Node* left_node, Node* right_node); 70 | size_t InsertKVIntoLeafNode(LeafNode* leaf_node, const char* key, 71 | const char* value); 72 | int GetIndexFromLeafNode(LeafNode* leaf_node, const char* key) const; 73 | IndexNode* GetOrCreateParent(Node* node); 74 | 75 | bool BorrowFromLeftLeafSibling(LeafNode* leaf_node); 76 | bool BorrowFromRightLeafSibling(LeafNode* leaf_node); 77 | bool BorrowFromLeafSibling(LeafNode* leaf_node); 78 | bool MergeLeftLeaf(LeafNode* leaf_node); 79 | bool MergeRightLeaf(LeafNode* leaf_node); 80 | LeafNode* MergeLeaf(LeafNode* leaf_node); 81 | 82 | bool 
BorrowFromLeftIndexSibling(IndexNode* index_node); 83 | bool BorrowFromRightIndexSibling(IndexNode* index_node); 84 | bool BorrowFromIndexSibling(IndexNode* index_node); 85 | bool MergeLeftIndex(IndexNode* index_node); 86 | bool MergeRightIndex(IndexNode* index_node); 87 | IndexNode* MergeIndex(IndexNode* index_node); 88 | 89 | int fd_; 90 | BlockCache* block_cache_; 91 | Meta* meta_; 92 | }; 93 | 94 | #endif // BPLUS_TREE_H -------------------------------------------------------------------------------- /bplus_tree.cc: -------------------------------------------------------------------------------- 1 | #include "bplus_tree.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | const off_t kMetaOffset = 0; 14 | const int kOrder = 128; 15 | static_assert(kOrder >= 3, 16 | "The order of B+Tree should be greater than or equal to 3."); 17 | const int kMaxKeySize = 32; 18 | const int kMaxValueSize = 256; 19 | const int kMaxCacheSize = 1024 * 1024 * 5; 20 | typedef char Key[kMaxKeySize]; 21 | typedef char Value[kMaxValueSize]; 22 | 23 | void Exit(const char* msg) { 24 | perror(msg); 25 | exit(EXIT_FAILURE); 26 | } 27 | 28 | struct BPlusTree::Meta { 29 | off_t offset; // ofset of self 30 | off_t root; // offset of root 31 | off_t block; // offset of next new node 32 | size_t height; // height of B+Tree 33 | size_t size; // key size 34 | }; 35 | 36 | struct BPlusTree::Index { 37 | Index() : offset(0) { std::memset(key, 0, sizeof(key)); } 38 | 39 | off_t offset; 40 | Key key; 41 | 42 | void UpdateIndex(off_t of, const char* k) { 43 | offset = of; 44 | strncpy(key, k, kMaxKeySize); 45 | } 46 | void UpdateKey(const char* k) { strncpy(key, k, kMaxKeySize); } 47 | }; 48 | 49 | struct BPlusTree::Record { 50 | Key key; 51 | Value value; 52 | 53 | void UpdateKV(const char* k, const char* v) { 54 | strncpy(key, k, kMaxKeySize); 55 | strncpy(value, v, kMaxValueSize); 56 | } 57 | void UpdateKey(const char* k) { 
strncpy(key, k, kMaxKeySize); }
58 |   void UpdateValue(const char* v) { strncpy(value, v, kMaxValueSize); }
59 | };
60 | // Header shared by index and leaf nodes; every link is a file offset, 0 meaning "none".
61 | struct BPlusTree::Node {
62 |   Node() : offset(0), parent(0), left(0), right(0), count(0) {}
63 |   Node(off_t parent_, off_t left_, off_t right_, size_t count_)
64 |       : offset(0), parent(parent_), left(left_), right(right_), count(count_) {}
65 |   ~Node() = default;
66 | 
67 |   off_t offset;  // offset of self
68 |   off_t parent;  // offset of parent
69 |   off_t left;    // offset of left node(may be sibling)
70 |   off_t right;   // offset of right node(may be sibling)
71 |   size_t count;  // count of keys
72 | };
73 | // Internal node: indexes[i] holds a child offset and a key; it has room for kOrder + 1 slots.
74 | struct BPlusTree::IndexNode : BPlusTree::Node {
75 |   IndexNode() = default;
76 |   ~IndexNode() = default;
77 | 
78 |   const char* FirstKey() const {
79 |     assert(count > 0);
80 |     return indexes[0].key;
81 |   }
82 | 
83 |   const char* LastKey() const {
84 |     assert(count > 0);
85 |     return indexes[count - 1].key;
86 |   }
87 | 
88 |   const char* Key(int index) const {
89 |     assert(count > 0);
90 |     assert(index >= 0);
91 |     assert(index <= kOrder);
92 |     return indexes[index].key;
93 |   }
94 | 
95 |   void UpdateKey(int index, const char* k) {
96 |     assert(index >= 0);
97 |     assert(index <= kOrder);
98 |     indexes[index].UpdateKey(k);
99 |   }
100 | 
101 |   void UpdateOffset(int index, off_t offset) {
102 |     assert(index >= 0);
103 |     assert(index <= kOrder);
104 |     indexes[index].offset = offset;
105 |   }
106 | 
107 |   void UpdateIndex(int index, const char* k, off_t offset) {
108 |     assert(index >= 0);
109 |     assert(index <= kOrder);
110 |     UpdateKey(index, k);
111 |     UpdateOffset(index, offset);
112 |   }
113 |   // Removes slot `index` by shifting the following slots left, then decrements count.
114 |   void DeleteKeyAtIndex(int index) {
115 |     assert(index >= 0);
116 |     assert(index <= kOrder);
117 |     std::memmove(&indexes[index], &indexes[index + 1],
118 |                  sizeof(indexes[0]) * (count-- - index));
119 |   }
120 |   // Opens a slot at `index` by shifting right, increments count, and stores k there.
121 |   void InsertKeyAtIndex(int index, const char* k) {
122 |     assert(index >= 0);
123 |     assert(index <= kOrder);
124 |     std::memmove(&indexes[index + 1], &indexes[index],
125 |                  sizeof(indexes[0]) * (++count - index));
126 |     UpdateKey(index, k);
127 |   }
128 | 
129 |   void InsertIndexAtIndex(int index, const char* k, off_t offset) {
130 |     assert(index >= 0);
131 |     assert(index <= kOrder);
132 |     std::memmove(&indexes[index + 1], &indexes[index],
133 |                  sizeof(indexes[0]) * (++count - index));
134 |     UpdateIndex(index, k, offset);
135 |   }
136 |   // Prepends all sibling->count + 1 slots of `sibling` to this node.
137 |   void MergeLeftSibling(IndexNode* sibling) {
138 |     std::memmove(&indexes[sibling->count + 1], &indexes[0],
139 |                  sizeof(indexes[0]) * (count + 1));
140 |     std::memcpy(&indexes[0], &sibling->indexes[0],
141 |                 sizeof(indexes[0]) * (sibling->count + 1));
142 |     count += (sibling->count + 1);
143 |   }
144 |   // Copies sibling's slots starting at indexes[count]; NOTE(review): count grows by sibling->count only, unlike MergeLeftSibling - presumably the caller adjusts count first; confirm.
145 |   void MergeRightSibling(IndexNode* sibling) {
146 |     std::memcpy(&indexes[count], &sibling->indexes[0],
147 |                 sizeof(indexes[0]) * (sibling->count + 1));
148 |     count += sibling->count;
149 |   }
150 | 
151 |   Index indexes[kOrder + 1];
152 | };
153 | // Leaf node: up to kOrder key/value records kept in key order.
154 | struct BPlusTree::LeafNode : BPlusTree::Node {
155 |   LeafNode() = default;
156 |   ~LeafNode() = default;
157 | 
158 |   const char* FirstKey() const {
159 |     assert(count > 0);
160 |     return records[0].key;
161 |   }
162 | 
163 |   const char* LastKey() const {
164 |     assert(count > 0);
165 |     return records[count - 1].key;
166 |   }
167 | 
168 |   const char* Key(int index) const {
169 |     assert(count > 0);
170 |     assert(index >= 0);
171 |     return records[index].key;
172 |   }
173 | 
174 |   const char* FirstValue() const {
175 |     assert(count > 0);
176 |     return records[0].value;
177 |   }
178 | 
179 |   const char* LastValue() const {
180 |     assert(count > 0);
181 |     return records[count - 1].value;
182 |   }
183 | 
184 |   const char* Value(int index) const {
185 |     assert(count > 0);
186 |     return records[index].value;
187 |   }
188 | 
189 |   void UpdateValue(int index, const char* v) {
190 |     assert(index >= 0);
191 |     records[index].UpdateValue(v);
192 |   }
193 | 
194 |   void UpdateKey(int index, const char* k) {
195 |     assert(index >= 0);
196 |     records[index].UpdateKey(k);
197 |   }
198 | 
199 | 
void UpdateKV(int index, const char* k, const char* v) { 200 | assert(index >= 0); 201 | records[index].UpdateKV(k, v); 202 | } 203 | 204 | void InsertKVAtIndex(int index, const char* k, const char* v) { 205 | assert(index >= 0); 206 | assert(index < kOrder); 207 | std::memmove(&records[index + 1], &records[index], 208 | sizeof(records[0]) * (count++ - index)); 209 | UpdateKV(index, k, v); 210 | } 211 | 212 | void DeleteKVAtIndex(int index) { 213 | assert(index >= 0); 214 | assert(index < kOrder); 215 | std::memmove(&records[index], &records[index + 1], 216 | sizeof(records[0]) * (--count - index)); 217 | } 218 | 219 | void MergeLeftSibling(LeafNode* sibling) { 220 | std::memmove(&records[sibling->count], &records[0], 221 | sizeof(records[0]) * count); 222 | std::memcpy(&records[0], &sibling->records[0], 223 | sizeof(records[0]) * sibling->count); 224 | count += sibling->count; 225 | } 226 | 227 | void MergeRightSibling(LeafNode* sibling) { 228 | std::memcpy(&records[count], &sibling->records[0], 229 | sizeof(records[0]) * sibling->count); 230 | count += sibling->count; 231 | } 232 | 233 | BPlusTree::Record records[kOrder]; 234 | }; 235 | 236 | class BPlusTree::BlockCache { 237 | struct Node; 238 | 239 | public: 240 | BlockCache() : head_(new Node()), size_(0) { 241 | head_->next = head_; 242 | head_->prev = head_; 243 | } 244 | 245 | ~BlockCache() { 246 | for (auto it = offset2node_.begin(); it != offset2node_.end(); it++) { 247 | Node* node = it->second; 248 | off_t page_offset = node->offset & ~(sysconf(_SC_PAGE_SIZE) - 1); 249 | char* start = reinterpret_cast(node->block); 250 | void* addr = static_cast(&start[page_offset - node->offset]); 251 | if (munmap(addr, node->size + node->offset - page_offset) != 0) { 252 | Exit("munmap"); 253 | } 254 | delete node; 255 | } 256 | delete head_; 257 | } 258 | 259 | void DeleteNode(Node* node) { 260 | if (node->next == node->prev && nullptr == node->next) return; 261 | node->prev->next = node->next; 262 | node->next->prev 
= node->prev; 263 | node->next = node->prev = nullptr; 264 | size_ -= node->size; 265 | } 266 | 267 | void InsertHead(Node* node) { 268 | node->next = head_->next; 269 | node->prev = head_; 270 | head_->next->prev = node; 271 | head_->next = node; 272 | size_ += node->size; 273 | } 274 | 275 | Node* DeleteTail() { 276 | if (size_ == 0) { 277 | assert(head_->next == head_); 278 | assert(head_->prev == head_); 279 | return nullptr; 280 | } 281 | Node* tail = head_->prev; 282 | DeleteNode(tail); 283 | return tail; 284 | } 285 | 286 | template 287 | void Put(T* block) { 288 | while (size_ > kMaxCacheSize) Kick(); 289 | 290 | if (offset2node_.find(block->offset) == offset2node_.end()) { 291 | Node* node = new Node(block, block->offset, sizeof(T)); 292 | offset2node_.emplace(block->offset, node); 293 | InsertHead(node); 294 | } else { 295 | Node* node = offset2node_[block->offset]; 296 | if (--node->ref == 0) InsertHead(node); 297 | } 298 | } 299 | 300 | template 301 | T* Get(int fd, off_t offset) { 302 | if (offset2node_.find(offset) == offset2node_.end()) { 303 | struct stat st; 304 | if (fstat(fd, &st) != 0) Exit("fstat"); 305 | constexpr int size = sizeof(T); 306 | if (st.st_size < offset + size && ftruncate(fd, offset + size) != 0) { 307 | Exit("ftruncate"); 308 | } 309 | // Align offset to page size. 
310 | // See http://man7.org/linux/man-pages/man2/mmap.2.html 311 | off_t page_offset = offset & ~(sysconf(_SC_PAGE_SIZE) - 1); 312 | void* addr = mmap(nullptr, size + offset - page_offset, 313 | PROT_READ | PROT_WRITE, MAP_SHARED, fd, page_offset); 314 | if (MAP_FAILED == addr) Exit("mmap"); 315 | char* start = static_cast(addr); 316 | return reinterpret_cast(&start[offset - page_offset]); 317 | } 318 | 319 | Node* node = offset2node_[offset]; 320 | ++node->ref; 321 | DeleteNode(node); 322 | return static_cast(node->block); 323 | } 324 | 325 | private: 326 | void Kick() { 327 | Node* tail = DeleteTail(); 328 | if (nullptr == tail) return; 329 | 330 | assert(tail != head_); 331 | 332 | off_t page_offset = tail->offset & ~(sysconf(_SC_PAGE_SIZE) - 1); 333 | char* start = reinterpret_cast(tail->block); 334 | void* addr = static_cast(&start[page_offset - tail->offset]); 335 | if (munmap(addr, tail->size + tail->offset - page_offset) != 0) { 336 | Exit("munmap"); 337 | } 338 | offset2node_.erase(tail->offset); 339 | delete tail; 340 | } 341 | 342 | struct Node { 343 | Node() 344 | : block(nullptr), 345 | offset(0), 346 | size(0), 347 | ref(0), 348 | prev(nullptr), 349 | next(nullptr) {} 350 | 351 | Node(void* block_, off_t offset_, size_t size_) 352 | : block(block_), 353 | offset(offset_), 354 | size(size_), 355 | ref(1), 356 | prev(nullptr), 357 | next(nullptr) {} 358 | 359 | void* block; 360 | off_t offset; 361 | size_t size; 362 | size_t ref; 363 | Node* prev; 364 | Node* next; 365 | }; 366 | 367 | Node* head_; 368 | size_t size_; 369 | std::unordered_map offset2node_; 370 | }; 371 | 372 | BPlusTree::BPlusTree(const char* path) 373 | : fd_(open(path, O_CREAT | O_RDWR, 0600)), block_cache_(new BlockCache()) { 374 | if (fd_ == -1) Exit("open"); 375 | meta_ = Map(kMetaOffset); 376 | if (meta_->height == 0) { 377 | // Initialize B+tree; 378 | constexpr off_t of_root = kMetaOffset + sizeof(Meta); 379 | LeafNode* root = new (Map(of_root)) LeafNode(); 380 | root->offset = 
of_root; 381 | meta_->height = 1; 382 | meta_->root = of_root; 383 | meta_->block = of_root + sizeof(LeafNode); 384 | UnMap(root); 385 | } 386 | } 387 | 388 | BPlusTree::~BPlusTree() { 389 | UnMap(meta_); 390 | delete block_cache_; 391 | close(fd_); 392 | } 393 | 394 | void BPlusTree::Put(const std::string& key, const std::string& value) { 395 | // 1. Find Leaf node. 396 | off_t of_leaf = GetLeafOffset(key.data()); 397 | LeafNode* leaf_node = Map(of_leaf); 398 | if (InsertKVIntoLeafNode(leaf_node, key.data(), value.data()) <= 399 | GetMaxKeys()) { 400 | // 2.If records of leaf node less than or equals kOrder - 1 then finish. 401 | UnMap(leaf_node); 402 | return; 403 | } 404 | 405 | // 3. Split leaf node to two leaf nodes. 406 | LeafNode* split_node = SplitLeafNode(leaf_node); 407 | const char* mid_key = split_node->FirstKey(); 408 | IndexNode* parent_node = GetOrCreateParent(leaf_node); 409 | off_t of_parent = leaf_node->parent; 410 | split_node->parent = of_parent; 411 | 412 | // 4.Insert key to parent of splited leaf nodes and 413 | // link two splited left nodes to parent. 414 | if (InsertKeyIntoIndexNode(parent_node, mid_key, leaf_node, split_node) <= 415 | GetMaxKeys()) { 416 | UnMap(leaf_node); 417 | UnMap(split_node); 418 | UnMap(parent_node); 419 | return; 420 | } 421 | 422 | // 5.Split index node from bottom to up repeatedly 423 | // until count <= kOrder - 1. 
424 | size_t count; 425 | do { 426 | IndexNode* child_node = parent_node; 427 | IndexNode* split_node = SplitIndexNode(child_node); 428 | const char* mid_key = child_node->Key(child_node->count); 429 | parent_node = GetOrCreateParent(child_node); 430 | of_parent = child_node->parent; 431 | split_node->parent = of_parent; 432 | count = 433 | InsertKeyIntoIndexNode(parent_node, mid_key, child_node, split_node); 434 | UnMap(child_node); 435 | } while (count > GetMaxKeys()); 436 | UnMap(parent_node); 437 | } 438 | 439 | bool BPlusTree::Delete(const std::string& key) { 440 | off_t of_leaf = GetLeafOffset(key.data()); 441 | LeafNode* leaf_node = Map(of_leaf); 442 | // 1. Delete key from leaf node 443 | int index = GetIndexFromLeafNode(leaf_node, key.data()); 444 | if (index == -1) { 445 | UnMap(leaf_node); 446 | return false; 447 | } 448 | 449 | leaf_node->DeleteKVAtIndex(index); 450 | --meta_->size; 451 | // 2. If leaf_node is root then return. 452 | if (leaf_node->parent == 0) { 453 | UnMap(leaf_node); 454 | return true; 455 | } 456 | 457 | // 3. If count of leaf_node >= GetMinKeys() then return else execute step 3. 458 | if (leaf_node->count >= GetMinKeys()) { 459 | UnMap(leaf_node); 460 | return true; 461 | } 462 | 463 | // 4. If borrow from siblings successfully then return else execute step 4. 464 | if (BorrowFromLeafSibling(leaf_node)) { 465 | UnMap(leaf_node); 466 | return true; 467 | } 468 | 469 | // 5. Merge two leaf nodes. 470 | leaf_node = MergeLeaf(leaf_node); 471 | 472 | IndexNode* index_node = Map(leaf_node->parent); 473 | UnMap(leaf_node); 474 | 475 | // 6. If count of index_node >= GetMinKeys() then return or execute 6. 476 | // 7. If count of one of sibling > GetMinKeys() then swap its key and parent's 477 | // key then return or execute 7. 478 | while (index_node->parent != 0 && index_node->count < GetMinKeys() && 479 | !BorrowFromIndexSibling(index_node)) { 480 | // 8. Merge index_node and its' parent and sibling. 
481 |     IndexNode* old_index_node = MergeIndex(index_node);
482 |     index_node = Map<IndexNode>(old_index_node->parent);
483 |     UnMap(old_index_node);
484 |   }
485 | 
486 |   if (index_node->parent == 0 && index_node->count == 0) {
487 |     // 9. Root is removed, update new root and height.
488 |     Node* new_root = Map<Node>(index_node->indexes[0].offset);  // NOTE(review): template argument lost in extraction; Node assumed from the declared type - confirm
489 |     assert(new_root->left == 0);
490 |     assert(new_root->right == 0);
491 |     new_root->parent = 0;
492 |     meta_->root = new_root->offset;
493 |     --meta_->height;
494 |     UnMap(new_root);
495 |     Dealloc(index_node);
496 |     return true;
497 |   }
498 | 
499 |   UnMap(index_node);
500 |   return true;
501 | }
502 | // Looks up key; on success copies the stored value into `value` and returns true.
503 | bool BPlusTree::Get(const std::string& key, std::string& value) const {
504 |   off_t of_leaf = GetLeafOffset(key.data());
505 |   LeafNode* leaf_node = Map<LeafNode>(of_leaf);
506 |   int index = GetIndexFromLeafNode(leaf_node, key.data());
507 |   if (index == -1) {
508 |     UnMap(leaf_node);
509 |     return false;
510 |   }
511 |   value.assign(leaf_node->Value(index), strnlen(leaf_node->Value(index), kMaxValueSize));  // a value that fills its buffer has no NUL, so bound the read
512 |   UnMap(leaf_node);
513 |   return true;
514 | }
515 | 
516 | template <typename T>
517 | T* BPlusTree::Map(off_t offset) const {
518 |   return block_cache_->Get<T>(fd_, offset);
519 | }
520 | 
521 | template <typename T>
522 | void BPlusTree::UnMap(T* map_obj) const {
523 |   block_cache_->Put(map_obj);
524 | }
525 | 
526 | constexpr size_t BPlusTree::GetMinKeys() const { return (kOrder + 1) / 2 - 1; }
527 | 
528 | constexpr size_t BPlusTree::GetMaxKeys() const { return kOrder - 1; }
529 | // Returns node's parent mapped; for the root it allocates a new root index node and raises the height.
530 | BPlusTree::IndexNode* BPlusTree::GetOrCreateParent(Node* node) {
531 |   if (node->parent == 0) {
532 |     // Split root node.
533 |     IndexNode* parent_node = Alloc<IndexNode>();
534 |     node->parent = parent_node->offset;
535 |     meta_->root = parent_node->offset;
536 |     ++meta_->height;
537 |     return parent_node;
538 |   }
539 |   return Map<IndexNode>(node->parent);
540 | }
541 | // Binary search: index of the first of arr[0..n) whose key compares greater than `key`, or n.
542 | template <typename T>
543 | int BPlusTree::UpperBound(T arr[], int n, const char* key) const {
544 |   assert(n >= 0 && static_cast<size_t>(n) <= GetMaxKeys());
545 |   int l = 0, r = n - 1;
546 |   while (l <= r) {
547 |     int mid = (l + r) >> 1;
548 |     if (std::strncmp(arr[mid].key, key, kMaxKeySize) <= 0) {
549 |       l = mid + 1;
550 |     } else {
551 |       r = mid - 1;
552 |     }
553 |   }
554 |   return l;
555 | }
556 | // Binary search: index of the first of arr[0..n) whose key compares not less than `key`, or n.
557 | template <typename T>
558 | int BPlusTree::LowerBound(T arr[], int n, const char* key) const {
559 |   assert(n >= 0 && static_cast<size_t>(n) <= GetMaxKeys());
560 |   int l = 0, r = n - 1;
561 |   while (l <= r) {
562 |     int mid = (l + r) >> 1;
563 |     if (std::strncmp(arr[mid].key, key, kMaxKeySize) < 0) {
564 |       l = mid + 1;
565 |     } else {
566 |       r = mid - 1;
567 |     }
568 |   }
569 |   return l;
570 | }
571 | // Constructs a T at meta_->block and advances meta_->block past it; the node is returned mapped.
572 | template <typename T>
573 | T* BPlusTree::Alloc() {
574 |   T* node = new (Map<T>(meta_->block)) T();
575 |   node->offset = meta_->block;
576 |   meta_->block += sizeof(T);
577 |   return node;
578 | }
579 | // Only releases the mapping; the file space is not reused.
580 | template <typename T>
581 | void BPlusTree::Dealloc(T* node) {
582 |   UnMap(node);
583 | }
584 | // Descends from the root and returns the offset of the leaf that should hold key.
585 | off_t BPlusTree::GetLeafOffset(const char* key) const {
586 |   size_t height = meta_->height;
587 |   off_t offset = meta_->root;
588 |   if (height <= 1) {
589 |     assert(height == 1);
590 |     return offset;
591 |   }
592 |   // 1. Find bottom index node.
593 |   IndexNode* index_node = Map<IndexNode>(offset);
594 |   while (--height > 1) {
595 |     int index = UpperBound(index_node->indexes, index_node->count, key);
596 |     off_t of_child = index_node->indexes[index].offset;
597 |     UnMap(index_node);
598 |     index_node = Map<IndexNode>(of_child);
599 |     offset = of_child;
600 |   }
601 |   // 2. Get offset of leaf node.
602 | int index = UpperBound(index_node->indexes, index_node->count, key); 603 | off_t of_child = index_node->indexes[index].offset; 604 | UnMap(index_node); 605 | return of_child; 606 | } 607 | 608 | inline size_t BPlusTree::InsertKeyIntoIndexNode(IndexNode* index_node, 609 | const char* key, 610 | Node* left_node, 611 | Node* right_node) { 612 | assert(index_node->count <= GetMaxKeys()); 613 | int index = UpperBound(index_node->indexes, index_node->count, key); 614 | index_node->InsertIndexAtIndex(index, key, left_node->offset); 615 | index_node->UpdateOffset(index + 1, right_node->offset); 616 | return index_node->count; 617 | } 618 | 619 | size_t BPlusTree::InsertKVIntoLeafNode(LeafNode* leaf_node, const char* key, 620 | const char* value) { 621 | assert(leaf_node->count <= GetMaxKeys()); 622 | int index = UpperBound(leaf_node->records, leaf_node->count, key); 623 | if (index > 0 && 624 | std::strncmp(leaf_node->Key(index - 1), key, kMaxKeySize) == 0) { 625 | leaf_node->UpdateValue(index - 1, value); 626 | return leaf_node->count; 627 | } 628 | 629 | leaf_node->InsertKVAtIndex(index, key, value); 630 | ++meta_->size; 631 | return leaf_node->count; 632 | } 633 | 634 | BPlusTree::LeafNode* BPlusTree::SplitLeafNode(LeafNode* leaf_node) { 635 | assert(leaf_node->count == kOrder); 636 | constexpr int mid = (kOrder - 1) >> 1; 637 | constexpr int left_count = mid; 638 | constexpr int right_count = kOrder - mid; 639 | 640 | LeafNode* split_node = Alloc(); 641 | 642 | // Change count. 643 | leaf_node->count = left_count; 644 | split_node->count = right_count; 645 | 646 | // Copy right part of index_node. 647 | std::memcpy(&split_node->records[0], &leaf_node->records[mid], 648 | sizeof(split_node->records[0]) * right_count); 649 | 650 | // Link siblings. 
651 | split_node->left = leaf_node->offset; 652 | split_node->right = leaf_node->right; 653 | leaf_node->right = split_node->offset; 654 | if (split_node->right != 0) { 655 | LeafNode* new_sibling = Map(split_node->right); 656 | new_sibling->left = split_node->offset; 657 | UnMap(new_sibling); 658 | } 659 | return split_node; 660 | } 661 | 662 | BPlusTree::IndexNode* BPlusTree::SplitIndexNode(IndexNode* index_node) { 663 | assert(index_node->count == kOrder); 664 | constexpr int mid = (kOrder - 1) >> 1; 665 | constexpr int left_count = mid; 666 | constexpr int right_count = kOrder - mid - 1; 667 | 668 | IndexNode* split_node = Alloc(); 669 | 670 | // Change count. 671 | index_node->count = left_count; 672 | split_node->count = right_count; 673 | 674 | // Copy right part of index_node. 675 | std::memcpy(&split_node->indexes[0], &index_node->indexes[mid + 1], 676 | sizeof(split_node->indexes[0]) * (right_count + 1)); 677 | 678 | // Link old childs to new splited parent. 679 | for (int i = mid + 1; i <= kOrder; ++i) { 680 | off_t of_child = index_node->indexes[i].offset; 681 | LeafNode* child_node = Map(of_child); 682 | child_node->parent = split_node->offset; 683 | UnMap(child_node); 684 | } 685 | 686 | // Link siblings. 687 | split_node->left = index_node->offset; 688 | split_node->right = index_node->right; 689 | index_node->right = split_node->offset; 690 | if (split_node->right != 0) { 691 | IndexNode* new_sibling = Map(split_node->right); 692 | new_sibling->left = split_node->offset; 693 | UnMap(new_sibling); 694 | } 695 | return split_node; 696 | } 697 | 698 | inline int BPlusTree::GetIndexFromLeafNode(LeafNode* leaf_node, 699 | const char* key) const { 700 | int index = LowerBound(leaf_node->records, leaf_node->count, key); 701 | return index < static_cast(leaf_node->count) && 702 | std::strncmp(leaf_node->Key(index), key, kMaxKeySize) == 0 703 | ? 
index
704 |              : -1;
705 | }
706 | // Returns every record with left_key <= key <= right_key, in key order.
707 | std::vector<std::pair<std::string, std::string>> BPlusTree::GetRange(
708 |     const std::string& left_key, const std::string& right_key) const {
709 |   std::vector<std::pair<std::string, std::string>> res;
710 |   off_t of_leaf = GetLeafOffset(left_key.data());
711 |   LeafNode* leaf_node = Map<LeafNode>(of_leaf);
712 |   int index = LowerBound(leaf_node->records, leaf_node->count, left_key.data());
713 |   bool finish = false;
714 |   // Walk the leaf chain to the right. Every leaf, including the first one,
715 |   // is checked against right_key (the first leaf used to be copied unchecked).
716 |   while (true) {
717 |     const int count = static_cast<int>(leaf_node->count);
718 |     for (int i = index; i < count; ++i) {
719 |       const char* k = leaf_node->Key(i);
720 |       const char* v = leaf_node->Value(i);
721 |       if (strncmp(k, right_key.data(), kMaxKeySize) > 0) {
722 |         finish = true;
723 |         break;
724 |       }
725 |       // A field that fills its buffer has no NUL, so bound the copy.
726 |       res.emplace_back(std::string(k, strnlen(k, kMaxKeySize)),
727 |                        std::string(v, strnlen(v, kMaxValueSize)));
728 |     }
729 |     of_leaf = leaf_node->right;
730 |     UnMap(leaf_node);
731 |     if (finish || of_leaf == 0) break;
732 |     leaf_node = Map<LeafNode>(of_leaf);
733 |     index = 0;
734 |   }
735 |   return res;
736 | }
737 | 
738 | bool BPlusTree::Empty() const { return meta_->size == 0; }
739 | 
740 | size_t BPlusTree::Size() const { return meta_->size; }
741 | 
742 | // Try Borrow key from left sibling.
743 | bool BPlusTree::BorrowFromLeftLeafSibling(LeafNode* leaf_node) {
744 |   if (leaf_node->left == 0) return false;
745 |   LeafNode* sibling = Map<LeafNode>(leaf_node->left);
746 |   if (sibling->parent != leaf_node->parent || sibling->count <= GetMinKeys()) {
747 |     if (sibling->parent == leaf_node->parent) {
748 |       assert(sibling->count == GetMinKeys());
749 |     }
750 |     UnMap(sibling);
751 |     return false;
752 |   }
753 |   // 1. Borrow last key from left sibling.
754 |   leaf_node->InsertKVAtIndex(0, sibling->LastKey(), sibling->LastValue());
755 |   --sibling->count;
756 | 
757 |   // 2. Update parent's key.
758 | IndexNode* parent_node = Map(leaf_node->parent); 759 | int index = 760 | UpperBound(parent_node->indexes, parent_node->count, sibling->LastKey()); 761 | parent_node->UpdateKey(index, leaf_node->FirstKey()); 762 | UnMap(parent_node); 763 | UnMap(sibling); 764 | return true; 765 | } 766 | 767 | // Try Borrow key from right sibling. 768 | bool BPlusTree::BorrowFromRightLeafSibling(LeafNode* leaf_node) { 769 | if (leaf_node->right == 0) return false; 770 | LeafNode* sibling = Map(leaf_node->right); 771 | 772 | if (sibling->parent != leaf_node->parent || sibling->count <= GetMinKeys()) { 773 | if (sibling->parent == leaf_node->parent) { 774 | assert(sibling->count == GetMinKeys()); 775 | } 776 | UnMap(sibling); 777 | return false; 778 | } 779 | 780 | // 1. Borrow frist key from right sibling. 781 | leaf_node->UpdateKV(leaf_node->count++, sibling->FirstKey(), 782 | sibling->FirstValue()); 783 | sibling->DeleteKVAtIndex(0); 784 | 785 | // 2. Update parent's key. 786 | IndexNode* parent_node = Map(leaf_node->parent); 787 | int index = 788 | UpperBound(parent_node->indexes, parent_node->count, sibling->LastKey()); 789 | parent_node->UpdateKey(index - 1, sibling->FirstKey()); 790 | 791 | UnMap(parent_node); 792 | UnMap(sibling); 793 | return true; 794 | } 795 | 796 | inline bool BPlusTree::BorrowFromLeafSibling(LeafNode* leaf_node) { 797 | assert(leaf_node->count == GetMinKeys() - 1); 798 | assert(leaf_node->parent != 0); 799 | return BorrowFromLeftLeafSibling(leaf_node) || 800 | BorrowFromRightLeafSibling(leaf_node); 801 | } 802 | 803 | // Try merge left leaf node. 804 | bool BPlusTree::MergeLeftLeaf(LeafNode* leaf_node) { 805 | if (leaf_node->left == 0) return false; 806 | LeafNode* sibling = Map(leaf_node->left); 807 | if (sibling->parent != leaf_node->parent) { 808 | UnMap(sibling); 809 | return false; 810 | } 811 | 812 | assert(sibling->count == GetMinKeys()); 813 | // 1. Delete key from parent. 
814 | IndexNode* parent_node = Map(leaf_node->parent); 815 | int index = 816 | UpperBound(parent_node->indexes, parent_node->count, sibling->LastKey()); 817 | parent_node->DeleteKeyAtIndex(index); 818 | 819 | // 2. Merge left sibling. 820 | leaf_node->MergeLeftSibling(sibling); 821 | 822 | // 3. Link new sibling. 823 | leaf_node->left = sibling->left; 824 | if (sibling->left != 0) { 825 | LeafNode* new_sibling = Map(sibling->left); 826 | new_sibling->right = leaf_node->offset; 827 | UnMap(new_sibling); 828 | } 829 | 830 | UnMap(parent_node); 831 | Dealloc(sibling); 832 | return true; 833 | } 834 | 835 | // Try Merge right node. 836 | bool BPlusTree::MergeRightLeaf(LeafNode* leaf_node) { 837 | if (leaf_node->right == 0) return false; 838 | LeafNode* sibling = Map(leaf_node->right); 839 | if (sibling->parent != leaf_node->parent) { 840 | UnMap(sibling); 841 | return false; 842 | } 843 | 844 | // 1. Delete key from parent. 845 | IndexNode* parent_node = Map(leaf_node->parent); 846 | int index = 847 | UpperBound(parent_node->indexes, parent_node->count, sibling->LastKey()); 848 | parent_node->UpdateKey(index - 1, parent_node->Key(index)); 849 | parent_node->DeleteKeyAtIndex(index); 850 | UnMap(parent_node); 851 | 852 | // 2. Merge right sibling. 853 | leaf_node->MergeRightSibling(sibling); 854 | 855 | // 3. Link new sibling. 856 | leaf_node->right = sibling->right; 857 | if (sibling->right != 0) { 858 | LeafNode* new_sibling = Map(sibling->right); 859 | new_sibling->left = leaf_node->offset; 860 | UnMap(new_sibling); 861 | } 862 | 863 | Dealloc(sibling); 864 | return true; 865 | } 866 | 867 | inline BPlusTree::LeafNode* BPlusTree::MergeLeaf(LeafNode* leaf_node) { 868 | // Merge left node to leaf_node or right node to leaf_node. 
869 | assert(leaf_node->count == GetMinKeys() - 1); 870 | assert(leaf_node->parent != 0); 871 | assert(meta_->root != leaf_node->offset); 872 | assert(MergeLeftLeaf(leaf_node) || MergeRightLeaf(leaf_node)); 873 | return leaf_node; 874 | } 875 | 876 | // Try Swap key between index_node's left sibling and index_node's parent. 877 | bool BPlusTree::BorrowFromLeftIndexSibling(IndexNode* index_node) { 878 | if (index_node->left == 0) return false; 879 | IndexNode* sibling = Map(index_node->left); 880 | if (sibling->parent != index_node->parent || sibling->count <= GetMinKeys()) { 881 | if (sibling->parent == index_node->parent) { 882 | assert(sibling->count == GetMinKeys()); 883 | } 884 | UnMap(sibling); 885 | return false; 886 | } 887 | 888 | // 1.Insert parent'key to the first of index_node's keys. 889 | IndexNode* parent_node = Map(index_node->parent); 890 | int index = 891 | UpperBound(parent_node->indexes, parent_node->count, sibling->LastKey()); 892 | index_node->InsertKeyAtIndex(0, parent_node->Key(index)); 893 | 894 | // 2. Change parent's key. 895 | parent_node->UpdateKey(index, sibling->LastKey()); 896 | 897 | // 3. Link sibling's last child to index_node, 898 | // and delete sibling's last child. 
899 | Node* last_sibling_child = 900 | Map(sibling->indexes[sibling->count--].offset); 901 | index_node->indexes[0].offset = last_sibling_child->offset; 902 | last_sibling_child->parent = index_node->offset; 903 | 904 | UnMap(last_sibling_child); 905 | UnMap(parent_node); 906 | UnMap(sibling); 907 | return true; 908 | } 909 | 910 | bool BPlusTree::BorrowFromRightIndexSibling(IndexNode* index_node) { 911 | if (index_node->right == 0) return false; 912 | IndexNode* sibling = Map(index_node->right); 913 | if (sibling->parent != index_node->parent || sibling->count <= GetMinKeys()) { 914 | if (sibling->parent == index_node->parent) { 915 | assert(sibling->count == GetMinKeys()); 916 | } 917 | UnMap(sibling); 918 | return false; 919 | } 920 | 921 | // 1.Insert parent‘key to the last of index_node's keys. 922 | IndexNode* parent = Map(index_node->parent); 923 | int index = UpperBound(parent->indexes, parent->count, sibling->LastKey()); 924 | index_node->UpdateKey(index_node->count++, parent->Key(index - 1)); 925 | 926 | // 2. Change parent's key. 927 | parent->UpdateKey(index - 1, sibling->FirstKey()); 928 | 929 | // 3. Link index_node's last child to sibling's first child, 930 | // and delete sibling's first child. 931 | Node* first_sibling_child = Map(sibling->indexes[0].offset); 932 | index_node->indexes[index_node->count].offset = first_sibling_child->offset; 933 | first_sibling_child->parent = index_node->offset; 934 | sibling->DeleteKeyAtIndex(0); 935 | 936 | UnMap(first_sibling_child); 937 | UnMap(parent); 938 | UnMap(sibling); 939 | return true; 940 | } 941 | 942 | inline bool BPlusTree::BorrowFromIndexSibling(IndexNode* index_node) { 943 | assert(index_node->count == GetMinKeys() - 1); 944 | return BorrowFromLeftIndexSibling(index_node) || 945 | BorrowFromRightIndexSibling(index_node); 946 | } 947 | 948 | // Try merge left index node. 
949 | bool BPlusTree::MergeLeftIndex(IndexNode* index_node) { 950 | if (index_node->left == 0) return false; 951 | IndexNode* sibling = Map(index_node->left); 952 | if (sibling->parent != index_node->parent) { 953 | UnMap(sibling); 954 | return false; 955 | } 956 | 957 | assert(sibling->count == GetMinKeys()); 958 | // 1. Merge left sibling to index_node. 959 | index_node->MergeLeftSibling(sibling); 960 | 961 | // 2. Link sibling's childs to index_node. 962 | for (size_t i = 0; i < sibling->count + 1; ++i) { 963 | Node* child_node = Map(sibling->indexes[i].offset); 964 | child_node->parent = index_node->offset; 965 | UnMap(child_node); 966 | } 967 | 968 | // 3. Link new sibling. 969 | index_node->left = sibling->left; 970 | if (sibling->left != 0) { 971 | IndexNode* new_sibling = Map(sibling->left); 972 | new_sibling->right = index_node->offset; 973 | UnMap(new_sibling); 974 | } 975 | 976 | // 4. Update index_node's mid key. 977 | IndexNode* parent_node = Map(index_node->parent); 978 | int index = 979 | UpperBound(parent_node->indexes, parent_node->count, sibling->LastKey()); 980 | index_node->UpdateKey(sibling->count, parent_node->Key(index)); 981 | 982 | // 5. Delete parent's key. 983 | parent_node->DeleteKeyAtIndex(index); 984 | 985 | Dealloc(sibling); 986 | return true; 987 | } 988 | 989 | // Try merge right index node. 990 | bool BPlusTree::MergeRightIndex(IndexNode* index_node) { 991 | if (index_node->right == 0) return false; 992 | IndexNode* sibling = Map(index_node->right); 993 | if (sibling->parent != index_node->parent) { 994 | UnMap(sibling); 995 | return false; 996 | } 997 | 998 | assert(sibling->count == GetMinKeys()); 999 | // 1. Update index_node's last key. 1000 | IndexNode* parent = Map(index_node->parent); 1001 | int index = UpperBound(parent->indexes, parent->count, sibling->LastKey()); 1002 | index_node->UpdateKey(index_node->count++, parent->Key(index - 1)); 1003 | 1004 | // 2. Merge right sibling to index_node. 
1005 | index_node->MergeRightSibling(sibling); 1006 | 1007 | // 3. Link sibling's childs to index_node. 1008 | for (size_t i = 0; i < sibling->count + 1; ++i) { 1009 | Node* child_node = Map(sibling->indexes[i].offset); 1010 | child_node->parent = index_node->offset; 1011 | UnMap(child_node); 1012 | } 1013 | 1014 | // 4. Link new sibling. 1015 | index_node->right = sibling->right; 1016 | if (sibling->right != 0) { 1017 | IndexNode* new_sibling = Map(sibling->right); 1018 | new_sibling->left = index_node->offset; 1019 | UnMap(new_sibling); 1020 | } 1021 | 1022 | // 5. Delete parent's key. 1023 | parent->UpdateKey(index - 1, parent->Key(index)); 1024 | parent->DeleteKeyAtIndex(index); 1025 | 1026 | Dealloc(sibling); 1027 | return true; 1028 | } 1029 | 1030 | inline BPlusTree::IndexNode* BPlusTree::MergeIndex(IndexNode* index_node) { 1031 | assert(index_node->count == GetMinKeys() - 1); 1032 | assert(index_node->parent != 0); 1033 | assert(meta_->root != index_node->offset); 1034 | assert(MergeLeftIndex(index_node) || MergeRightIndex(index_node)); 1035 | return index_node; 1036 | } 1037 | 1038 | #ifdef DEBUG 1039 | #include 1040 | void BPlusTree::Dump() { 1041 | std::vector>> res( 1042 | 5, std::vector>()); 1043 | std::queue> q; 1044 | q.emplace(meta_->root, 1); 1045 | while (!q.empty()) { 1046 | auto cur = q.front(); 1047 | q.pop(); 1048 | if (cur.second < meta_->height) { 1049 | IndexNode* index_node = Map(cur.first); 1050 | std::vector v; 1051 | for (int i = 0; i < index_node->count + 1; ++i) { 1052 | if (i == index_node->count) { 1053 | v.push_back(""); 1054 | } else { 1055 | v.push_back(index_node->indexes[i].key); 1056 | } 1057 | if (index_node->indexes[i].offset != 0) { 1058 | q.emplace(index_node->indexes[i].offset, cur.second + 1); 1059 | } 1060 | } 1061 | res[cur.second].push_back(v); 1062 | UnMap(index_node); 1063 | } else { 1064 | LeafNode* leaf_node = Map(cur.first); 1065 | std::vector v; 1066 | for (int i = 0; i < leaf_node->count; ++i) { 1067 | 
v.push_back(leaf_node->records[i].key); 1068 | } 1069 | res[cur.second].push_back(v); 1070 | UnMap(leaf_node); 1071 | } 1072 | } 1073 | 1074 | for (int i = 1; i <= meta_->height; ++i) { 1075 | for (int j = 0; j < meta_->height - i; ++j) { 1076 | LOG2("%s", "\t"); 1077 | } 1078 | for (auto& v : res[i]) { 1079 | for (auto& k : v) { 1080 | LOG2("%s,", k.data()); 1081 | } 1082 | LOG2("%s", " "); 1083 | } 1084 | LOG2("%s", "\n"); 1085 | } 1086 | } 1087 | #endif --------------------------------------------------------------------------------