├── .gitignore ├── ATree ├── bptree.h ├── config.h ├── db_impl.h └── shrinkingcone_segmentation.h ├── CMakeLists.txt ├── README.md ├── demo_dataset ├── books_200M_uint64.csv ├── fb_200M_uint64.csv ├── linear.csv ├── normal.csv ├── osmc_200M_uint64.csv ├── seg1.csv ├── seg10.csv └── wiki_ts_200M_uint64.csv ├── license ├── scripts └── exp.sh ├── test ├── test_latency.cc ├── test_throughput.cc └── test_workload.cc └── util ├── datagen.py └── process_SOSD_data.cc /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | build/ 3 | .vscode/ 4 | .vs/ 5 | cmake-build-debug/ 6 | cmake-build-release/ 7 | # dataset/ 8 | CMakeFiles/ 9 | -------------------------------------------------------------------------------- /ATree/bptree.h: -------------------------------------------------------------------------------- 1 | #ifndef _BPTREE_H_ 2 | #define _BPTREE_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include "shrinkingcone_segmentation.h" 9 | #include "config.h" 10 | using namespace std; 11 | 12 | typedef long long ll; 13 | int Se; 14 | 15 | // BP node 16 | class Node { 17 | bool IS_LEAF; 18 | ll *key; 19 | int size; 20 | Node **ptr; 21 | Segment **seg; 22 | friend class BPTree; 23 | 24 | public: 25 | Node(); 26 | ~Node(); 27 | }; 28 | 29 | // BP tree 30 | class BPTree { 31 | Node *root; 32 | void insertInternal(ll, Node *, Node *); 33 | Node *findParent(Node *, Node *); 34 | 35 | public: 36 | BPTree(); 37 | State search(ll); 38 | void insert(ll); 39 | State construct(const vector& underdata); 40 | void display(Node *cursor); 41 | // void display_seg(); 42 | State delta_insert(ll); 43 | int calculate_size(); 44 | int internal_calculate_size(Node*); 45 | ~BPTree(); 46 | void internal_destruct(Node* &); 47 | void destruct(); 48 | }; 49 | 50 | Node::Node() { 51 | size = 0; 52 | key = new ll[config::FANOUT]; 53 | ptr = new Node* [config::FANOUT + 1]; 54 | seg = new Segment* [config::FANOUT + 1]; 55 | for (int i = 
0; i < config::FANOUT + 1; i += 1) { 56 | ptr[i] = NULL; 57 | seg[i] = NULL; 58 | } 59 | seg[0] = new Segment; 60 | } 61 | 62 | Node::~Node() { 63 | delete key; 64 | 65 | for (int i = 0; i < size; i += 1) { 66 | if (ptr[i] != NULL) { 67 | delete ptr[i]; 68 | ptr[i] = NULL; 69 | } 70 | if (seg[i] != NULL) { 71 | delete seg[i]; 72 | seg[i] = NULL; 73 | } 74 | } 75 | // 这一步不必要也不能加,因为在该步处理之前, 就已经达到该步的效果了 76 | // 不可重复delete 77 | // if (size > 0) { 78 | // if (ptr[size] != NULL) delete ptr[size]; ptr[size] = NULL; 79 | // if (seg[size] != NULL) delete seg[size]; ptr[size] = NULL; 80 | // } 81 | 82 | if (ptr != NULL) { 83 | delete [] ptr; 84 | ptr = NULL; 85 | } 86 | if (seg != NULL) { 87 | delete [] seg; 88 | seg = NULL; 89 | } 90 | } 91 | 92 | BPTree::BPTree() { 93 | root = NULL; 94 | } 95 | 96 | void BPTree::internal_destruct(Node* &node) { 97 | if (node == NULL) { 98 | return; 99 | } 100 | if (node -> IS_LEAF != true) { 101 | for (int i = node->size; i >= 0; i -= 1) { 102 | internal_destruct(node -> ptr[i]); 103 | } 104 | } 105 | delete node; 106 | node = NULL; 107 | } 108 | 109 | BPTree::~BPTree() { 110 | destruct(); 111 | } 112 | 113 | void BPTree::destruct() { 114 | // display(root); 115 | internal_destruct(root); 116 | root = NULL; 117 | } 118 | 119 | // Search operation 120 | State BPTree::search(ll x) { 121 | if (root == NULL) { 122 | cout << "Tree is empty\n"; 123 | return State::FAIL; 124 | } else { 125 | Node *cursor = root; 126 | // cout << "Lookup中结果: " << "\n"; 127 | while (cursor->IS_LEAF == false) { 128 | for (int i = 0; i < cursor->size; i++) { 129 | if (x < cursor->key[i]) { 130 | cursor = cursor->ptr[i]; 131 | break; 132 | } 133 | if (i == cursor->size - 1) { 134 | cursor = cursor->ptr[i + 1]; 135 | break; 136 | } 137 | } 138 | } 139 | // 进入到叶子节点中,使用Segment算法 140 | if (x < cursor->key[0]) { 141 | if (cursor->seg[0]->search_buffer(x) == x) { 142 | return State::SUCCESS; 143 | } 144 | return State::FAIL; 145 | } 146 | int i = 0; 147 | for (; i < 
cursor->size; i++) { 148 | if (i == cursor->size - 1 || (x >= cursor->key[i] && x < cursor->key[i + 1])) { 149 | break; 150 | } 151 | } 152 | Segment *seg = cursor->seg[i + 1]; 153 | int pos = seg->slope * (x - seg->start); 154 | int l_bound = max_double(pos - config::ERROR, 0); 155 | int r_bound = min_double(pos + config::ERROR, seg->data.size()); 156 | pos = distance(seg->data.begin(), lower_bound(seg->data.begin() + l_bound, seg->data.begin() + r_bound, x)); 157 | if (seg->data[pos] == x) { 158 | //cout << x << endl; 159 | return State::SUCCESS; 160 | } 161 | if (seg->search_buffer(x) == x) { 162 | return State::SUCCESS; 163 | } 164 | } 165 | return State::FAIL; 166 | } 167 | 168 | // Insert Operation 169 | void BPTree::insert(ll x) { 170 | if (root == NULL) { 171 | root = new Node; 172 | root->key[0] = x; 173 | root->IS_LEAF = true; 174 | root->size = 1; 175 | } else { 176 | Node *cursor = root; 177 | Node *parent; 178 | while (cursor->IS_LEAF == false) { 179 | parent = cursor; 180 | for (int i = 0; i < cursor->size; i++) { 181 | if (x < cursor->key[i]) { 182 | cursor = cursor->ptr[i]; 183 | break; 184 | } 185 | if (i == cursor->size - 1) { 186 | cursor = cursor->ptr[i + 1]; 187 | break; 188 | } 189 | } 190 | } 191 | if (cursor->size < config::FANOUT) { 192 | int i = 0; 193 | while (x > cursor->key[i] && i < cursor->size) 194 | i++; 195 | for (int j = cursor->size; j > i; j--) { 196 | cursor->key[j] = cursor->key[j - 1]; 197 | } 198 | cursor->key[i] = x; 199 | cursor->size++; 200 | cursor->ptr[cursor->size] = cursor->ptr[cursor->size - 1]; 201 | cursor->ptr[cursor->size - 1] = NULL; 202 | } else { 203 | Node *newLeaf = new Node; 204 | ll virtualNode[config::FANOUT + 2]; 205 | for (int i = 0; i < config::FANOUT; i++) { 206 | virtualNode[i] = cursor->key[i]; 207 | } 208 | int i = 0, j; 209 | while (x > virtualNode[i] && i < config::FANOUT) 210 | i++; 211 | for (int j = config::FANOUT + 1; j > i; j--) { 212 | virtualNode[j] = virtualNode[j - 1]; 213 | } 214 | 
virtualNode[i] = x; 215 | newLeaf->IS_LEAF = true; 216 | cursor->size = (config::FANOUT + 1) / 2; 217 | newLeaf->size = config::FANOUT + 1 - (config::FANOUT + 1) / 2; 218 | cursor->ptr[cursor->size] = newLeaf; 219 | newLeaf->ptr[newLeaf->size] = cursor->ptr[config::FANOUT]; 220 | cursor->ptr[config::FANOUT] = NULL; 221 | for (i = 0; i < cursor->size; i++) { 222 | cursor->key[i] = virtualNode[i]; 223 | } 224 | for (i = 0, j = cursor->size; i < newLeaf->size; i++, j++) { 225 | newLeaf->key[i] = virtualNode[j]; 226 | } 227 | if (cursor == root) { 228 | Node *newRoot = new Node; 229 | newRoot->key[0] = newLeaf->key[0]; 230 | newRoot->ptr[0] = cursor; 231 | newRoot->ptr[1] = newLeaf; 232 | newRoot->IS_LEAF = false; 233 | newRoot->size = 1; 234 | root = newRoot; 235 | } else { 236 | insertInternal(newLeaf->key[0], parent, newLeaf); 237 | } 238 | } 239 | } 240 | } 241 | 242 | // Insert Operation 243 | void BPTree::insertInternal(ll x, Node *cursor, Node *child) { 244 | if (cursor->size < config::FANOUT) { 245 | int i = 0; 246 | while (x > cursor->key[i] && i < cursor->size) 247 | i++; 248 | for (int j = cursor->size; j > i; j--) { 249 | cursor->key[j] = cursor->key[j - 1]; 250 | } 251 | for (int j = cursor->size + 1; j > i + 1; j--) { 252 | cursor->ptr[j] = cursor->ptr[j - 1]; 253 | } 254 | cursor->key[i] = x; 255 | cursor->size++; 256 | cursor->ptr[i + 1] = child; 257 | } else { 258 | Node *newInternal = new Node; 259 | ll virtualKey[config::FANOUT + 1]; 260 | Node *virtualPtr[config::FANOUT + 2]; 261 | for (int i = 0; i < config::FANOUT; i++) { 262 | virtualKey[i] = cursor->key[i]; 263 | } 264 | for (int i = 0; i < config::FANOUT + 1; i++) { 265 | virtualPtr[i] = cursor->ptr[i]; 266 | } 267 | int i = 0, j; 268 | while (x > virtualKey[i] && i < config::FANOUT) 269 | i++; 270 | for (int j = config::FANOUT + 1; j > i; j--) { 271 | virtualKey[j] = virtualKey[j - 1]; 272 | } 273 | virtualKey[i] = x; 274 | for (int j = config::FANOUT + 2; j > i + 1; j--) { 275 | virtualPtr[j] 
= virtualPtr[j - 1]; 276 | } 277 | virtualPtr[i + 1] = child; 278 | newInternal->IS_LEAF = false; 279 | cursor->size = (config::FANOUT + 1) / 2; 280 | newInternal->size = config::FANOUT - (config::FANOUT + 1) / 2; 281 | for (i = 0, j = cursor->size + 1; i < newInternal->size; i++, j++) { 282 | newInternal->key[i] = virtualKey[j]; 283 | } 284 | for (i = 0, j = cursor->size + 1; i < newInternal->size + 1; i++, j++) { 285 | newInternal->ptr[i] = virtualPtr[j]; 286 | } 287 | if (cursor == root) { 288 | Node *newRoot = new Node; 289 | newRoot->key[0] = cursor->key[cursor->size]; 290 | newRoot->ptr[0] = cursor; 291 | newRoot->ptr[1] = newInternal; 292 | newRoot->IS_LEAF = false; 293 | newRoot->size = 1; 294 | root = newRoot; 295 | } else { 296 | insertInternal(cursor->key[cursor->size], findParent(root, cursor), newInternal); 297 | } 298 | } 299 | } 300 | 301 | // Find the parent 302 | Node *BPTree::findParent(Node *cursor, Node *child) { 303 | Node *parent; 304 | if (cursor->IS_LEAF || (cursor->ptr[0])->IS_LEAF) { 305 | return NULL; 306 | } 307 | for (int i = 0; i < cursor->size + 1; i++) { 308 | if (cursor->ptr[i] == child) { 309 | parent = cursor; 310 | return parent; 311 | } else { 312 | parent = findParent(cursor->ptr[i], child); 313 | if (parent != NULL) 314 | return parent; 315 | } 316 | } 317 | return parent; 318 | } 319 | 320 | State BPTree::construct(const vector& underdata) { 321 | try { 322 | vector _; 323 | _.resize(0); 324 | vector underlying_data(underdata.begin(), underdata.end()); 325 | destruct(); 326 | 327 | vector underlying_segs = shrinkingcore_segmentation(underlying_data, _); 328 | Se = underlying_segs.size(); 329 | for (Segment seg : underlying_segs) { 330 | insert(seg.start); 331 | } 332 | // display(root); 333 | for (int j = 0; j < Se; j += 1) { 334 | Node *cursor = root; 335 | while (cursor->IS_LEAF == false) { 336 | for (int i = 0; i < cursor->size; i++) { 337 | if (underlying_segs[j].start < cursor->key[i]) { 338 | cursor = cursor->ptr[i]; 
339 | break; 340 | } 341 | if (i == cursor->size - 1) { 342 | cursor = cursor->ptr[i + 1]; 343 | break; 344 | } 345 | } 346 | } 347 | // 抵达叶子节点, 找到合适的指针, 使其指向下层的'线段节点' 348 | for (int i = 0; i < cursor->size; i++) { 349 | if (cursor->key[i] == underlying_segs[j].start) { 350 | cursor->seg[i + 1] = new Segment(underlying_segs[j]); 351 | break; 352 | } 353 | } 354 | } 355 | // cout << "FITing-Tree内部节点: " << "\n"; 356 | // display(root); 357 | return State::SUCCESS; 358 | } catch (exception& e) { 359 | cout << e.what() << "\n"; 360 | return State::FAIL; 361 | } 362 | } 363 | 364 | // Print the tree 365 | void BPTree::display(Node *cursor) { 366 | if (cursor != NULL) { 367 | for (int i = 0; i < cursor->size; i++) { 368 | cout << cursor->key[i] << " "; 369 | } 370 | cout << "\n"; 371 | if (cursor->IS_LEAF != true) { 372 | for (int i = 0; i < cursor->size + 1; i++) { 373 | display(cursor->ptr[i]); 374 | } 375 | } 376 | } 377 | } 378 | 379 | //void BPTree::display_seg() { 380 | // for (int j = 0; j < root->size; j += 1) { 381 | // cout << "seg: " << root->seg[j + 1]->start << " " << root->seg[j + 1]->slope << "\n"; 382 | // } 383 | //} 384 | 385 | State BPTree::delta_insert(ll x) { 386 | try { 387 | Node *cursor = root; 388 | while (cursor->IS_LEAF == false) { 389 | for (int i = 0; i < cursor->size; i++) { 390 | if (x < cursor->key[i]) { 391 | cursor = cursor->ptr[i]; 392 | break; 393 | } 394 | if (i == cursor->size - 1) { 395 | cursor = cursor->ptr[i + 1]; 396 | break; 397 | } 398 | } 399 | } 400 | 401 | Segment *seg; 402 | // 进入到叶子节点中,使用Segment算法 403 | if (x < cursor->key[0]) { 404 | seg = cursor->seg[0]; 405 | } 406 | else { 407 | int i = 0; 408 | for (; i < cursor->size; i++) { 409 | if (i == cursor->size - 1 || (x >= cursor->key[i] && x < cursor->key[i + 1])) { 410 | break; 411 | } 412 | } 413 | seg = cursor->seg[i + 1]; 414 | } 415 | seg->insert_buffer(x); 416 | if (seg->is_buffer_full()) { 417 | vector segs = shrinkingcore_segmentation(seg->data, seg->buf); 418 | 
for (Segment seg : segs) { 419 | insert(seg.start); 420 | } 421 | for (int j = 0; j < segs.size(); j += 1) { 422 | Node *cursor = root; 423 | while (cursor->IS_LEAF == false) { 424 | for (int i = 0; i < cursor->size; i++) { 425 | if (segs[j].start < cursor->key[i]) { 426 | cursor = cursor->ptr[i]; 427 | break; 428 | } 429 | if (i == cursor->size - 1) { 430 | cursor = cursor->ptr[i + 1]; 431 | break; 432 | } 433 | } 434 | } 435 | // 抵达叶子节点, 找到合适的指针, 使其指向下层的'线段节点' 436 | for (int i = 0; i < cursor->size; i++) { 437 | if (cursor->key[i] == segs[j].start) { 438 | cursor->seg[i + 1] = new Segment(segs[j]); 439 | break; 440 | } 441 | } 442 | } 443 | delete seg; 444 | seg = NULL; 445 | return State::SUCCESS; 446 | } 447 | } catch (exception& e) { 448 | std::cout << e.what() << "\n"; 449 | return State::FAIL; 450 | } 451 | return State::FAIL; 452 | } 453 | 454 | int BPTree::internal_calculate_size(Node* cursor) { 455 | int size = 0; 456 | if (cursor != NULL) { 457 | size += sizeof(*cursor); 458 | if (cursor->IS_LEAF != true) { 459 | for (int i = 0; i < cursor->size + 1; i++) { 460 | size += internal_calculate_size(cursor->ptr[i]); 461 | } 462 | } else { 463 | // Segment 索引部分仅包含两个参数:double, long long,计12个Byte 464 | size += 16; 465 | } 466 | } 467 | return size; 468 | } 469 | 470 | int BPTree::calculate_size() { 471 | return internal_calculate_size(root); 472 | } 473 | 474 | double get_latency() { 475 | return config::C * (log2(Se) / log2(config::FANOUT) + log2(config::ERROR) + log2(config::BUFFER_SIZE)); 476 | } 477 | 478 | #endif 479 | -------------------------------------------------------------------------------- /ATree/config.h: -------------------------------------------------------------------------------- 1 | #ifndef _CONFIG_H_ 2 | #define _CONFIG_H_ 3 | 4 | namespace config { 5 | const int FANOUT = 30; 6 | const int BUFFER_SIZE = 30 + 1; 7 | double ERROR = 240; 8 | const int C = 50; 9 | const int PERFORMANCE_FIRST = 0; 10 | const int STORAGE_FIRST = 1; 11 | } 12 | 13 
| enum State { 14 | SUCCESS, 15 | FAIL 16 | }; 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /ATree/db_impl.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Jiananyuan on 2022/10/19. 3 | // 4 | 5 | #ifndef _DB_IMPL_H_ 6 | #define _DB_IMPL_H_ 7 | 8 | #include 9 | #include "bptree.h" 10 | #include "config.h" 11 | typedef long long ll; 12 | BPTree* fiting_tree = new BPTree; 13 | 14 | State get(ll x) { 15 | return fiting_tree -> search(x); 16 | } 17 | 18 | State insert(ll x) { 19 | return fiting_tree -> delta_insert(x); 20 | } 21 | 22 | State construct(const vector& underdata) { 23 | return fiting_tree->construct(underdata); 24 | } 25 | 26 | int getsize() { 27 | return fiting_tree->calculate_size(); 28 | } 29 | 30 | // 网格搜索 31 | const int e[] = {1000, 100, 10}; 32 | 33 | // 性能优先保障 34 | int performance_tradeoff(double Lreq, const vector& underdata) { 35 | int best_e = 0; 36 | int min_size = INT_MAX; 37 | for (int ei : e) { 38 | config::ERROR = ei; 39 | construct(underdata); 40 | double latency = get_latency(); 41 | if (latency < Lreq) { 42 | int size = fiting_tree -> calculate_size(); 43 | if (size < min_size) { 44 | min_size = size; 45 | best_e = ei; 46 | } 47 | } 48 | } 49 | return best_e; 50 | } 51 | 52 | // 存储优先保障 53 | int size_tradeoff(double Sreq, const vector& underdata) { 54 | int best_e = 0; 55 | int min_latency = INT_MAX; 56 | for (int ei : e) { 57 | config::ERROR = ei; 58 | construct(underdata); 59 | int size = fiting_tree -> calculate_size(); 60 | if (size < Sreq) { 61 | double latency = get_latency(); 62 | if (latency < min_latency) { 63 | min_latency = latency; 64 | best_e = ei; 65 | } 66 | } 67 | } 68 | return best_e; 69 | } 70 | 71 | int get_e(int op, double req, const vector& underdata) { 72 | if (op == config::PERFORMANCE_FIRST) { 73 | return performance_tradeoff(req, underdata); 74 | } 75 | else if (op == config::STORAGE_FIRST) { 76 | 
return size_tradeoff(req, underdata); 77 | } 78 | return -1; 79 | } 80 | 81 | #endif 82 | -------------------------------------------------------------------------------- /ATree/shrinkingcone_segmentation.h: -------------------------------------------------------------------------------- 1 | // 2 | // Created by Jiananyuan on 2022/10/19. 3 | // 4 | 5 | #ifndef _SHRINKINGCORE_SEGMENTATION_H_ 6 | #define _SHRINKINGCORE_SEGMENTATION_H_ 7 | 8 | #include 9 | #include 10 | #include "config.h" 11 | typedef long long ll; 12 | 13 | struct Segment { 14 | double slope; 15 | ll start; 16 | std::vector data; 17 | std::vector buf; 18 | 19 | Segment() { 20 | slope = 0; 21 | start = 0; 22 | buf.resize(config::BUFFER_SIZE); 23 | } 24 | 25 | Segment(double _slope, ll _start, std::vector& _data) { 26 | slope = _slope; 27 | start = _start; 28 | data.assign(_data.begin(), _data.end()); 29 | buf.resize(config::BUFFER_SIZE); 30 | } 31 | 32 | Segment(const Segment& s) { 33 | slope = s.slope; 34 | start = s.start; 35 | data.assign(s.data.begin(), s.data.end()); 36 | buf.assign(s.buf.begin(), s.buf.end()); 37 | } 38 | 39 | void insert_buffer(ll key) { // 1 2 4 5 <-- (3) 40 | int i = buf.size() - 1; 41 | buf.push_back(0); 42 | while (i >= 0 && buf[i] > key) { 43 | buf[i + 1] = buf[i]; 44 | i -= 1; 45 | } 46 | buf[i + 1] = key; 47 | } 48 | 49 | bool is_buffer_full() { 50 | return buf.size() == config::BUFFER_SIZE; 51 | } 52 | 53 | int search_buffer(ll x) { 54 | if (buf.size() != 0) { 55 | int pos = std::distance(buf.begin(), lower_bound(buf.begin(), buf.end(), x)); 56 | return buf[pos]; 57 | } 58 | return -1; 59 | } 60 | 61 | ~Segment() { 62 | data.resize(0); 63 | buf.resize(0); 64 | } 65 | 66 | }; 67 | 68 | double min_double(double d1, double d2) { 69 | if (d1 - d2 < 0) { 70 | return d1; 71 | } 72 | return d2; 73 | } 74 | 75 | double max_double(double d1, double d2) { 76 | if (d1 - d2 < 0) { 77 | return d2; 78 | } 79 | return d1; 80 | } 81 | 82 | std::vector 
shrinkingcore_segmentation(std::vector& keys, std::vector& buf) { 83 | keys.insert(keys.end(), buf.begin(), buf.end()); 84 | buf.resize(0); 85 | std::sort(keys.begin(), keys.end()); 86 | double sl_high = 1e7; // infinite 87 | double sl_low = 0; 88 | int origin_loc = 0; 89 | std::vector s_segs; 90 | std::vector data; 91 | data.push_back(keys[0]); 92 | for (int i = 1; i < keys.size(); i += 1) { 93 | double k_up = i + config::ERROR; 94 | double k_low = i - config::ERROR; 95 | double max_bound = sl_high * keys[i]; 96 | double min_bound = sl_low * keys[i]; 97 | if (k_up >= min_bound || k_low <= max_bound) { 98 | if (k_up >= min_bound) { 99 | sl_high = min_double((k_up - origin_loc) / (keys[i] - keys[origin_loc]), sl_high); 100 | } 101 | if (k_low <= max_bound) { 102 | sl_low = max_double((k_low - origin_loc) / (keys[i] - keys[origin_loc]), sl_low); 103 | } 104 | data.push_back(keys[i]); 105 | if (i == keys.size() - 1) { 106 | double slope = (i - origin_loc) / (keys[i] - keys[origin_loc]); 107 | s_segs.emplace_back(Segment(slope, keys[origin_loc], data)); 108 | data.resize(0); 109 | } 110 | } 111 | else { 112 | double slope = 1.0 * ((i - 1) - origin_loc) / (keys[i - 1] - keys[origin_loc]); 113 | s_segs.emplace_back(Segment(slope, keys[origin_loc], data)); 114 | origin_loc = i; 115 | sl_high = 1e9; 116 | sl_low = 0; 117 | data.clear(); 118 | data.resize(0); 119 | data.push_back(keys[i]); 120 | } 121 | } 122 | if (!data.empty()) { 123 | s_segs.emplace_back(Segment(0, keys[origin_loc], data)); 124 | data.resize(0); 125 | } 126 | // for (int i = 0; i < s_segs.size(); i += 1) { 127 | // Segment& seg = s_segs[i]; 128 | // std::cout << "seg" << i << ": " << seg.start << " slope: " << seg.slope << " data_range: (" << *(seg.data.begin()) << ", " << *(seg.data.rbegin()) << ")" << "\n"; 129 | // } 130 | return s_segs; 131 | } 132 | 133 | #endif 134 | -------------------------------------------------------------------------------- /CMakeLists.txt: 
-------------------------------------------------------------------------------- 1 | # CMake 最低版本号要求 2 | cmake_minimum_required (VERSION 2.8) 3 | 4 | # 项目信息 5 | project (FITing_Tree) 6 | 7 | set(CMAKE_CXX_STANDARD 14) 8 | 9 | option(NDEBUG_SWITCH "disable assertion" OFF) 10 | 11 | # 编译指导 12 | if(NOT CMAKE_BUILD_TYPE) 13 | set(CMAKE_BUILD_TYPE Release) 14 | endif() 15 | 16 | if (NDEBUG_SWITCH) 17 | add_definitions(-DNDEBUG) 18 | list(APPEND CMAKE_REQUIRED_FLAGS -Werror -Wthread-safety -O3) 19 | else (NDEBUG_SWITCH) 20 | list(APPEND CMAKE_REQUIRED_FLAGS -Werror -Wthread-safety) 21 | endif(NDEBUG_SWITCH) 22 | 23 | set(CMAKE_CXX_FLAGS_RELEASE -Ofast) 24 | set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -O3 -Wall") 25 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3 -Wall") 26 | 27 | # 指定生成目标 28 | add_executable(test_latency test/test_latency.cc ATree/bptree.h ATree/db_impl.h ATree/shrinkingcone_segmentation.h) 29 | add_executable(test_throughput test/test_throughput.cc ATree/bptree.h ATree/db_impl.h ATree/shrinkingcone_segmentation.h) 30 | add_executable(test_workload test/test_workload.cc ATree/bptree.h ATree/db_impl.h ATree/shrinkingcone_segmentation.h) 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![poster](https://github.com/JiananYuan/FITing-Tree/assets/53621620/63940e14-fdcc-4414-828a-284d121df0b7) 2 | 3 | ## Intro 4 | This is an implementation (easy demo) of FITing-Tree written in C++. For the original SIGMOD '19 paper, see also [FITing-Tree: A Data-aware Index Structure](https://dl.acm.org/doi/10.1145/3299869.3319860). 
5 | 6 | ## Install and Run 7 | - git clone this project and execute `mkdir build` in it 8 | - enter the build directory and execute `cmake ..`, `make -j` 9 | - go to the `scripts` directory of the project and execute `chmod +x ./exp.sh`, `./exp.sh` (the script refers to `../build` and `../demo_dataset`) 10 | - wait for the program to produce the experiment results 11 | -------------------------------------------------------------------------------- /license: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Andy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /scripts/exp.sh: -------------------------------------------------------------------------------- 1 | # cmake -DCMAKE_BUILD_TYPE=Release .. 
2 | 3 | dataset=(linear seg1 seg10 normal books_200M_uint64 fb_200M_uint64 osmc_200M_uint64 wiki_ts_200M_uint64) 4 | for ds in ${dataset[@]} 5 | do 6 | echo ">>>>>>>>>> $ds: 时延 <<<<<<<<<<" 7 | ../build/test_latency ../demo_dataset/$ds.csv 0 1 8 | done 9 | 10 | for ds in ${dataset[@]} 11 | do 12 | echo ">>>>>>>>>> $ds: 不同静态数据集吞吐量 <<<<<<<<<<" 13 | ../build/test_throughput ../demo_dataset/$ds.csv 0 1 14 | done 15 | 16 | # # 读多写少 17 | # for ds in ${dataset[@]} 18 | # do 19 | # echo ">>>>>>>>>> $ds: 读多写少负载的吞吐量 <<<<<<<<<<" 20 | # ../build/test_workload ./demo_dataset/$ds.csv 0 1 0.8 0.2 21 | # done 22 | 23 | # # 写多读少 24 | # for ds in ${dataset[@]} 25 | # do 26 | # echo ">>>>>>>>>> $ds: 写多读少负载的吞吐量 <<<<<<<<<<" 27 | # ../build/test_workload ./demo_dataset/$ds.csv 0 1 0.2 0.8 28 | # done 29 | 30 | # # 读写均衡 31 | # for ds in ${dataset[@]} 32 | # do 33 | # echo ">>>>>>>>>> $ds: 读写均衡负载的吞吐量 <<<<<<<<<<" 34 | # ../build/test_workload ./demo_dataset/$ds.csv 0 1 0.5 0.5 35 | # done 36 | -------------------------------------------------------------------------------- /test/test_latency.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by yuanj on 2022/11/6. 3 | // 测量时延和index大小 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "../ATree/db_impl.h" 16 | #include "../ATree/config.h" 17 | 18 | using namespace std; 19 | using namespace chrono; 20 | typedef long long ll; 21 | vector under_data; 22 | 23 | int main(int argc, char** argv) { 24 | assert(argc == 3 + 1); 25 | string PATH = string(argv[1]); 26 | int op = atoi(argv[2]); // 0: 性能优先 1: 存储优先 27 | double req = atoi(argv[3]); // 性能或存储限制 28 | 29 | cout << "[Stage 1]: 从外部文件读取数据..." 
<< "\n"; 30 | ifstream fp(PATH); 31 | string line; 32 | // getline(fp, line); 33 | while (getline(fp, line)) { 34 | istringstream readstr(line); 35 | string number; 36 | getline(readstr, number, ','); 37 | under_data.push_back(atoll(number.data())); 38 | } 39 | cout << "数据规模: " << under_data.size() << "\n"; 40 | 41 | // 挑选e值 42 | // cout << "[Stage 2]: 使用Cost Model返回最佳e值..." << "\n"; 43 | // config::ERROR = get_e(op, req, under_data); 44 | // assert(op == 0 || op == 1); 45 | // assert(config::ERROR == 10 || config::ERROR == 100 || config::ERROR == 1000); 46 | cout << "e值: " << config::ERROR << "\n"; 47 | 48 | cout << "[Stage 3]: 建立FITing-tree..." << "\n"; 49 | construct(under_data); 50 | cout << "索引大小" << getsize() << "\n"; 51 | 52 | default_random_engine e(255); 53 | uniform_int_distribution uniform_dist_file(0, under_data.size() - 1); 54 | uniform_int_distribution uniform_dist_file2(0, 1000000); 55 | double total_time = 0; 56 | 57 | cout << "[Stage 4]: 读过程..." << "\n"; 58 | const int READ_SCALE = 10000; 59 | vector p99; 60 | for (int i = 1; i <= READ_SCALE; i += 1) { 61 | ll tk = uniform_dist_file(e); 62 | tk = under_data[tk]; 63 | auto st = system_clock::now(); 64 | // cout << tk << "\n"; 65 | get(tk); 66 | auto en = system_clock::now(); 67 | auto duration = duration_cast(en - st); 68 | auto ns_d = duration_cast(en - st); 69 | p99.push_back(double(ns_d.count())); 70 | total_time += double(duration.count()) * microseconds::period::num / microseconds::period::den; 71 | } 72 | printf("读时延: %.15f s \n", total_time / READ_SCALE); 73 | std::sort(p99.begin(), p99.end()); 74 | printf("p99读延迟: %.15f s \n", p99[int(READ_SCALE * 0.99)]); 75 | 76 | cout << "[Stage 5]: 写过程..." 
<< "\n"; 77 | p99.clear(); 78 | const int WRITE_SCALE = 3000; 79 | total_time = 0; 80 | for (int i = 1; i <= WRITE_SCALE; i += 1) { 81 | ll tk = uniform_dist_file2(e); 82 | auto st = system_clock::now(); 83 | insert(tk); 84 | auto en = system_clock::now(); 85 | auto duration = duration_cast(en - st); 86 | auto ns_d = duration_cast(en - st); 87 | p99.push_back(double(ns_d.count())); 88 | total_time += double(duration.count()) * microseconds::period::num / microseconds::period::den; 89 | } 90 | printf("写时延: %.15f s \n", total_time / WRITE_SCALE); 91 | std::sort(p99.begin(), p99.end()); 92 | printf("p99写延迟: %.15f s \n", p99[int(WRITE_SCALE * 0.99)]); 93 | return 0; 94 | } 95 | -------------------------------------------------------------------------------- /test/test_throughput.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by yuanj on 2022/11/6. 3 | // 测量吞吐量 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "../ATree/db_impl.h" 15 | #include "../ATree/config.h" 16 | 17 | using namespace std; 18 | using namespace chrono; 19 | typedef long long ll; 20 | vector under_data; 21 | 22 | int main(int argc, char** argv) { 23 | assert(argc == 3 + 1); 24 | string PATH = string(argv[1]); 25 | int op = atoi(argv[2]); // 0: 性能优先 1: 存储优先 26 | double req = atoi(argv[3]); // 性能或存储限制 27 | 28 | cout << "[Stage 1]: 从外部文件读取数据..." << "\n"; 29 | ifstream fp(PATH); 30 | string line; 31 | // getline(fp, line); 32 | while (getline(fp, line)) { 33 | istringstream readstr(line); 34 | string number; 35 | getline(readstr, number, ','); 36 | under_data.push_back(atoll(number.data())); 37 | } 38 | cout << "数据规模: " << under_data.size() << "\n"; 39 | 40 | // 挑选e值 41 | // cout << "[Stage 2]: 使用Cost Model返回最佳e值..." 
<< "\n"; 42 | // config::ERROR = get_e(op, req, under_data); 43 | // assert(op == 0 || op == 1); 44 | // assert(config::ERROR == 10 || config::ERROR == 100 || config::ERROR == 1000); 45 | cout << "e值: " << config::ERROR << "\n"; 46 | 47 | cout << "[Stage 3]: 建立FITing-tree..." << "\n"; 48 | construct(under_data); 49 | 50 | default_random_engine e(255); 51 | uniform_int_distribution uniform_dist_file(0, under_data.size() - 1); 52 | uniform_int_distribution uniform_dist_file2(0, 1000000); 53 | double total_time = 0; 54 | ll cnt = 50000; 55 | cout << "[Stage 4]: 读过程..." << "\n"; 56 | for (int i = 0; i < cnt; i += 1) { 57 | ll tk = uniform_dist_file(e); 58 | tk = under_data[tk]; 59 | auto st = system_clock::now(); 60 | get(tk); 61 | auto en = system_clock::now(); 62 | auto duration = duration_cast(en - st); 63 | total_time += double(duration.count()) * microseconds::period::num / microseconds::period::den; // 单位: s 64 | } 65 | cout << "读吞吐量: " << cnt / total_time << " ops/sec \n"; 66 | 67 | cout << "[Stage 5]: 写过程..." << "\n"; 68 | total_time = 0; 69 | for (int i = 0; i < cnt; i += 1) { 70 | ll tk = uniform_dist_file2(e); 71 | auto st = system_clock::now(); 72 | insert(tk); 73 | auto en = system_clock::now(); 74 | auto duration = duration_cast(en - st); 75 | total_time += double(duration.count()) * microseconds::period::num / microseconds::period::den; // 单位: s 76 | } 77 | cout << "写吞吐量: " << cnt / total_time << " ops/sec \n"; 78 | return 0; 79 | } 80 | -------------------------------------------------------------------------------- /test/test_workload.cc: -------------------------------------------------------------------------------- 1 | // 2 | // Created by yuanj on 2022/11/6. 
3 | // 测量吞吐量 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "../ATree/db_impl.h" 15 | #include "../ATree/config.h" 16 | 17 | using namespace std; 18 | using namespace chrono; 19 | typedef long long ll; 20 | vector under_data; 21 | 22 | int main(int argc, char** argv) { 23 | string PATH = string(argv[1]); 24 | int op = atoi(argv[2]); // 0: 性能优先 1: 存储优先 25 | double req = atoi(argv[3]); // 性能或存储限制 26 | double read_percentage = atof(argv[4]); 27 | double write_percentage = atof(argv[5]); 28 | 29 | cout << "[Stage 1]: 从外部文件读取数据..." << "\n"; 30 | ifstream fp(PATH); 31 | string line; 32 | // getline(fp, line); 33 | while (getline(fp, line)) { 34 | istringstream readstr(line); 35 | string number; 36 | getline(readstr, number, ','); 37 | under_data.push_back(atoll(number.data())); 38 | } 39 | cout << "数据规模: " << under_data.size() << "\n"; 40 | 41 | // 挑选e值 42 | // cout << "[Stage 2]: 使用Cost Model返回最佳e值..." << "\n"; 43 | // config::ERROR = get_e(op, req, under_data); 44 | // assert(op == 0 || op == 1); 45 | // assert(config::ERROR == 10 || config::ERROR == 100 || config::ERROR == 1000); 46 | cout << "e值: " << config::ERROR << "\n"; 47 | 48 | cout << "[Stage 3]: 建立FITing-tree..." << "\n"; 49 | // vector first_in_data(under_data.begin(), under_data.begin() + int(under_data.size() * read_percentage)); 50 | // construct(first_in_data); 51 | construct(under_data); 52 | 53 | default_random_engine e(255); 54 | uniform_int_distribution uniform_dist_file(0, under_data.size() - 1); 55 | uniform_int_distribution uniform_dist_file2(0, 1000000); 56 | double total_time = 0; 57 | ll cnt = 50000; 58 | int rwop = 0; // 概率操作, < 为读操作, > 为写操作 59 | srand((unsigned)time(nullptr)); 60 | cout << "[Stage 4]: 混合读写..." 
<< "\n"; 61 | for (int i = 0; i < cnt; i += 1) { 62 | rwop = rand() % 10; 63 | // 读操作 64 | if (rwop < read_percentage * 10) { 65 | ll tk = uniform_dist_file(e); 66 | tk = under_data[tk]; 67 | auto st = system_clock::now(); 68 | get(tk); 69 | auto en = system_clock::now(); 70 | auto duration = duration_cast(en - st); 71 | total_time += double(duration.count()) * microseconds::period::num / microseconds::period::den; 72 | } 73 | // 写操作 74 | else { 75 | ll tk = uniform_dist_file2(e); 76 | auto st = system_clock::now(); 77 | insert(tk); 78 | auto en = system_clock::now(); 79 | auto duration = duration_cast(en - st); 80 | total_time += double(duration.count()) * microseconds::period::num / microseconds::period::den; 81 | } 82 | } 83 | cout << "混合吞吐量: " << cnt / total_time << "\n"; 84 | return 0; 85 | } 86 | -------------------------------------------------------------------------------- /util/datagen.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import sys 3 | import os 4 | import numpy as np 5 | 6 | def generate_data(_type, _n): 7 | if os.path.exists('../demo_dataset') == False: 8 | print('path do not exist, create new directory!') 9 | os.mkdir('../demo_dataset') 10 | csv_path = '../demo_dataset/' + _type + '.csv' 11 | with open(csv_path, 'w', newline='') as f: 12 | writer = csv.writer(f) 13 | lst = [] 14 | if _type == 'linear': 15 | for i in range(1, _n + 1): 16 | lst.append([i]) 17 | elif _type == 'seg1': 18 | k = 1 19 | b = 2000 20 | for i in range(1, _n + 1): 21 | if i % (_n / 100) == 0: 22 | k += 1 23 | b += 500 24 | lst.append([k * i + b]) 25 | elif _type == 'seg10': 26 | k = 1 27 | b = 2000 28 | for i in range(1, _n + 1): 29 | if i % (_n / 10) == 0: 30 | k += 1 31 | b += 500 32 | lst.append([k * i + b]) 33 | elif _type == 'normal': 34 | mu ,sigma = 10, 1 35 | sampleNo = _n 36 | np.random.seed(0) 37 | s = np.random.normal(mu, sigma, sampleNo) 38 | s = np.sort(s) 39 | for i in s: 40 | lst.append([int(i * 
typedef long long ll;

/**
 * Loads a binary key file laid out as a uint64 key count followed by that many
 * 8-byte keys.
 *
 * @param filename path of the binary file
 * @return the keys in file order; when the file holds fewer keys than its
 *         header announces, only the keys actually present are returned
 *
 * Terminates the process with EXIT_FAILURE when the file cannot be opened or
 * the key count cannot be read.
 */
std::vector<ll> load_data(const std::string &filename) {
    /* Open file. */
    std::ifstream in(filename, std::ios::binary);
    if (!in.is_open())
        std::exit(EXIT_FAILURE);

    /* Read number of keys. */
    uint64_t n_keys = 0;
    if (!in.read(reinterpret_cast<char *>(&n_keys), sizeof(uint64_t)))
        std::exit(EXIT_FAILURE);

    /* Never trust the header beyond what the file really contains: a corrupt
     * or truncated file must not trigger a huge allocation or leave a
     * zero-filled tail that looks like real keys. */
    const std::streampos payload_begin = in.tellg();
    in.seekg(0, std::ios::end);
    const std::streamoff payload_bytes = in.tellg() - payload_begin;
    in.seekg(payload_begin);
    const uint64_t available =
        payload_bytes > 0 ? static_cast<uint64_t>(payload_bytes) / sizeof(ll) : 0;
    if (n_keys > available)
        n_keys = available;

    /* Initialize vector. */
    std::vector<ll> data(n_keys);

    /* Read keys. */
    if (n_keys > 0)
        in.read(reinterpret_cast<char *>(data.data()),
                static_cast<std::streamsize>(n_keys * sizeof(ll)));

    return data;
}