├── alex └── src.txt ├── rs └── src.txt ├── stx_btree ├── src.txt ├── README.md ├── btree ├── btree_set ├── btree_map ├── btree_multiset └── btree_multimap ├── .DS_Store ├── .gitignore ├── .vscode ├── tasks.json ├── launch.json └── settings.json ├── src ├── experiment.cpp ├── include │ ├── func │ │ ├── get_node_info.h │ │ ├── calculate_space.h │ │ ├── delete_function.h │ │ ├── split_function.h │ │ ├── insert_function.h │ │ └── find_function.h │ ├── construct │ │ ├── dp.h │ │ ├── dp_inner.h │ │ ├── dp_leaf.h │ │ ├── store_node.h │ │ ├── construct_root.h │ │ ├── structures.h │ │ ├── minor_function.h │ │ └── greedy.h │ ├── nodes │ │ ├── rootNode │ │ │ ├── root_nodes.h │ │ │ └── trainModel │ │ │ │ └── linear_regression.h │ │ └── innerNode │ │ │ ├── bs_model.h │ │ │ ├── lr_model.h │ │ │ └── candidate_plr.h │ ├── memoryLayout │ │ ├── empty_block.h │ │ └── node_array.h │ ├── base_node.h │ └── params.h ├── experiment │ ├── dataset │ │ ├── normal_distribution.h │ │ ├── exponential_distribution.h │ │ ├── uniform_distribution.h │ │ ├── lognormal_distribution.h │ │ ├── longlat.h │ │ ├── longitudes.h │ │ ├── osmc.h │ │ ├── ycsb.h │ │ └── base_dataset.h │ ├── workload │ │ ├── public_functions.h │ │ ├── zipfian.h │ │ ├── public_functions.cpp │ │ └── workloads_external.h │ ├── experiment_params.h │ ├── functions.h │ ├── core.cpp │ └── main_experiment.cpp ├── unitTest │ ├── rootNodeTest │ │ ├── lr_test.cpp │ │ └── piecewiseLR_test.cpp │ ├── innerNodeTest │ │ ├── binary_search_test.cpp │ │ ├── linear_regression_test.cpp │ │ ├── piecewise_lr_test.cpp │ │ └── histogram_test.cpp │ ├── leafNodeTest │ │ ├── external_array_test.cpp │ │ └── cfarray_test.cpp │ └── carmiTest │ │ ├── carmi_map_test.cpp │ │ └── carmi_external_map_test.cpp ├── CMakeLists.txt ├── profiler │ ├── binary_search.cpp │ └── inner_node_time.cpp └── example │ └── example.cpp ├── LICENSE └── README.md /alex/src.txt: -------------------------------------------------------------------------------- 1 | 
https://github.com/microsoft/ALEX -------------------------------------------------------------------------------- /rs/src.txt: -------------------------------------------------------------------------------- 1 | https://github.com/learnedsystems/RadixSpline -------------------------------------------------------------------------------- /stx_btree/src.txt: -------------------------------------------------------------------------------- 1 | https://panthema.net/2007/stx-btree/. -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/embryo-labs/CARMI/HEAD/.DS_Store -------------------------------------------------------------------------------- /stx_btree/README.md: -------------------------------------------------------------------------------- 1 | URL: https://github.com/bingmann/stx-btree.git 2 | 3 | Commit ID: 68db9cc6c7bdbc145f99ef323ea3ef031dda4425 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/settings.json 2 | build/* 3 | CMakeFiles/* 4 | .vscode/* 5 | *.csv 6 | Doxyfile 7 | src/.vscode/* 8 | src/build/* 9 | doc/* 10 | alex/alex_base.h 11 | alex/alex_fanout_tree.h 12 | alex/alex_map.h 13 | alex/alex_multimap.h 14 | alex/alex_nodes.h 15 | alex/alex.h 16 | rs/builder.h 17 | rs/common.h 18 | rs/multi_map.h 19 | rs/radix_spline.h 20 | src/include/baseNode 21 | src/include/nodes/innerNode/candidate_plr -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": [ 3 | { 4 | "type": "shell", 5 | "label": "g++.exe build active file", 6 | "command": "g++", 7 | "args": [ 8 | "-g", 9 | "${file}", 10 | "-std=c++11", 11 | "-o", 12 | // 
"${fileDirname}\\${fileBasenameNoExtension}.exe" 13 | "${fileBasenameNoExtension}.out" 14 | ] 15 | // , 16 | // "options": { 17 | // "cwd": "F:\\TDM-GCC-64\\bin" 18 | // } 19 | } 20 | ], 21 | "version": "2.0.0" 22 | } -------------------------------------------------------------------------------- /src/experiment.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file main.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "./experiment/functions.h" 17 | 18 | std::ofstream outRes; 19 | 20 | int main() { 21 | kPrimaryIndex = false; 22 | outRes.open("res_1122.csv", std::ios::app); 23 | 24 | time_t timep; 25 | time(&timep); 26 | char tmpTime[64]; 27 | strftime(tmpTime, sizeof(tmpTime), "%Y-%m-%d %H:%M:%S", localtime(&timep)); 28 | std::cout << "\nTest time: " << tmpTime << std::endl; 29 | outRes << "\nTest time: " << tmpTime << std::endl; 30 | 31 | mainExperiment(); 32 | 33 | outRes << "----------------------------------------------" << std::endl; 34 | 35 | return 0; 36 | } 37 | -------------------------------------------------------------------------------- /src/include/func/get_node_info.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file get_node_info.h 3 | * @author Jiaoyi 4 | * @brief get the information of the node in CARMI 5 | * @version 3.0 6 | * @date 2021-10-24 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef FUNC_GET_NODE_INFO_H_ 12 | #define FUNC_GET_NODE_INFO_H_ 13 | #include 14 | 15 | #include "../carmi.h" 16 | 17 | template 19 | int CARMI::GetNodeInfo( 20 | int idx, int *childNumber, int *childStartIndex) { 21 | // Case 1: the index of the node is invalid 22 | if (idx < 0 || idx >= node.nowNodeNumber) { 23 | return -1; 24 | } 25 | // Case 2: the node is valid 26 | int type = 
node.nodeArray[idx].lr.flagNumber >> 24; 27 | *childNumber = node.nodeArray[idx].lr.flagNumber & 0xFFFFFF; 28 | *childStartIndex = node.nodeArray[idx].lr.childLeft; 29 | return type; 30 | } 31 | 32 | #endif // FUNC_GET_NODE_INFO_H_ 33 | -------------------------------------------------------------------------------- /src/experiment/dataset/normal_distribution.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file normal_distribution.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_NORMAL_DISTRIBUTION_H_ 12 | #define EXPERIMENT_DATASET_NORMAL_DISTRIBUTION_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "./base_dataset.h" 21 | 22 | class NormalDataset : public BaseDataset { 23 | public: 24 | explicit NormalDataset(float initRatio) : BaseDataset(initRatio) {} 25 | 26 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 27 | DataVecType *testInsertQuery) { 28 | // create dataset randomly 29 | std::default_random_engine generator; 30 | std::normal_distribution distribution(0.0, 1.0); 31 | 32 | SplitInitTest>( 33 | distribution, initDataset, insertDataset, testInsertQuery); 34 | } 35 | }; 36 | 37 | #endif // EXPERIMENT_DATASET_NORMAL_DISTRIBUTION_H_ 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 JiaoyiZhang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to 
permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/experiment/dataset/exponential_distribution.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file exponential_distribution.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_EXPONENTIAL_DISTRIBUTION_H_ 12 | #define EXPERIMENT_DATASET_EXPONENTIAL_DISTRIBUTION_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "./base_dataset.h" 21 | 22 | class ExponentialDataset : public BaseDataset { 23 | public: 24 | explicit ExponentialDataset(float initRatio) : BaseDataset(initRatio) {} 25 | 26 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 27 | DataVecType *testInsertQuery) { 28 | // create dataset randomly 29 | std::default_random_engine generator; 30 | std::exponential_distribution distribution(0.25); 31 | 32 | SplitInitTest>( 33 | distribution, initDataset, insertDataset, testInsertQuery); 34 | } 35 | }; 36 | 37 | #endif // EXPERIMENT_DATASET_EXPONENTIAL_DISTRIBUTION_H_ 38 | 
-------------------------------------------------------------------------------- /src/experiment/dataset/uniform_distribution.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file uniform_distribution.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-15 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_UNIFORM_DISTRIBUTION_H_ 12 | #define EXPERIMENT_DATASET_UNIFORM_DISTRIBUTION_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "./base_dataset.h" 22 | 23 | class UniformDataset : public BaseDataset { 24 | public: 25 | explicit UniformDataset(float initRatio) : BaseDataset(initRatio) {} 26 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 27 | DataVecType *testInsertQuery) { 28 | std::default_random_engine generator; 29 | std::uniform_real_distribution distribution(0.0, 1.0); 30 | 31 | SplitInitTest>( 32 | distribution, initDataset, insertDataset, testInsertQuery); 33 | return; 34 | } 35 | }; 36 | 37 | #endif // EXPERIMENT_DATASET_UNIFORM_DISTRIBUTION_H_ 38 | -------------------------------------------------------------------------------- /src/experiment/workload/public_functions.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file public_functions.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-04-07 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_WORKLOAD_PUBLIC_FUNCTIONS_H_ 12 | #define EXPERIMENT_WORKLOAD_PUBLIC_FUNCTIONS_H_ 13 | 14 | #include 15 | #include 16 | 17 | #include "../../include/carmi_map.h" 18 | #include "../experiment_params.h" 19 | #include "./zipfian.h" 20 | 21 | extern std::ofstream outRes; 22 | 23 | /** 24 | * @brief prepare query workloads 25 | * 26 | * @param[in] findQueryset 27 | * @param[in] insertDataset 28 | * @param[inout] findQuery 29 | 
* @param[inout] insertQuery 30 | * @param[inout] index 31 | */ 32 | void InitTestSet(const DataVecType &findQueryset, 33 | const DataVecType &insertDataset, bool isZipfian, 34 | DataVecType *findQuery, DataVecType *insertQuery, 35 | std::vector *index); 36 | 37 | /** 38 | * @brief print the average time of the workload 39 | * 40 | * @param[in] time 41 | */ 42 | void PrintAvgTime(double time); 43 | 44 | #endif // EXPERIMENT_WORKLOAD_PUBLIC_FUNCTIONS_H_ 45 | -------------------------------------------------------------------------------- /src/experiment/dataset/lognormal_distribution.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file lognormal_distribution.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_LOGNORMAL_DISTRIBUTION_H_ 12 | #define EXPERIMENT_DATASET_LOGNORMAL_DISTRIBUTION_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "./base_dataset.h" 21 | 22 | class LognormalDataset : public BaseDataset { 23 | public: 24 | explicit LognormalDataset(float initRatio) : BaseDataset(initRatio) {} 25 | 26 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 27 | DataVecType *testInsertQuery) { 28 | // create dataset randomly 29 | std::default_random_engine generator; 30 | std::lognormal_distribution distribution(0.0, 1.0); 31 | 32 | SplitInitTest>( 33 | distribution, initDataset, insertDataset, testInsertQuery); 34 | } 35 | }; 36 | 37 | #endif // EXPERIMENT_DATASET_LOGNORMAL_DISTRIBUTION_H_ 38 | -------------------------------------------------------------------------------- /src/experiment/workload/zipfian.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file zipfian.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 
9 | * 10 | */ 11 | #ifndef EXPERIMENT_WORKLOAD_ZIPFIAN_H_ 12 | #define EXPERIMENT_WORKLOAD_ZIPFIAN_H_ 13 | 14 | #include 15 | #include 16 | 17 | #include 18 | #include 19 | class Zipfian { 20 | public: 21 | double *pf; 22 | void InitZipfian(double A, int num) { 23 | pf = new double[num]; 24 | double sum = 0.0; 25 | for (int i = 0; i < num; i++) { 26 | sum += 1 / pow(static_cast(i + 2), A); 27 | } 28 | for (int i = 0; i < num; i++) { 29 | if (i == 0) 30 | pf[i] = 1 / pow(static_cast(i + 2), A) / sum; 31 | else 32 | pf[i] = pf[i - 1] + 1 / pow(static_cast(i + 2), A) / sum; 33 | } 34 | } 35 | 36 | int GenerateNextIndex() { 37 | int index = 0; 38 | std::default_random_engine e(time(0)); 39 | std::uniform_real_distribution dis(0, 1); 40 | double data = dis(e); // 0-1 41 | while (data > pf[index]) index++; 42 | return index; 43 | } 44 | }; 45 | 46 | #endif // EXPERIMENT_WORKLOAD_ZIPFIAN_H_ 47 | -------------------------------------------------------------------------------- /src/unitTest/rootNodeTest/lr_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file lr_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-03 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #include "../../experiment/dataset/lognormal_distribution.h" 13 | #include "../../include/nodes/rootNode/trainModel/linear_regression.h" 14 | #include "gtest/gtest.h" 15 | 16 | std::vector> initData; 17 | std::vector> insertData; 18 | std::vector> testInsert; 19 | 20 | const int kChildNum = 512; 21 | const int kTestMaxValue = kMaxValue; 22 | 23 | LognormalDataset logData(0.9); 24 | LinearRegression model; 25 | 26 | TEST(TestTrain, TrainLRModel) { 27 | logData.GenerateDataset(&initData, &insertData, &testInsert); 28 | model.maxChildIdx = kChildNum - 1; 29 | model.Train(initData); 30 | EXPECT_EQ(kChildNum - 1, model.maxChildIdx); 31 | } 32 | 33 | TEST(TestPredictInitData, PredictInitData) { 34 | for (int i = 0; i 
< initData.size(); i++) { 35 | int p = model.Predict(initData[i].first); 36 | EXPECT_GE(p, 0); 37 | EXPECT_LT(p, kChildNum); 38 | } 39 | } 40 | 41 | TEST(TestPredictInsertData, PredictInsertData) { 42 | for (int i = 0; i < insertData.size(); i++) { 43 | int p = model.Predict(insertData[i].first); 44 | EXPECT_GE(p, 0); 45 | EXPECT_LT(p, kChildNum); 46 | } 47 | } -------------------------------------------------------------------------------- /src/experiment/experiment_params.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file experiment_params.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-05-19 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #ifndef EXPERIMENT_EXPERIMENT_PARAMS_H_ 13 | #define EXPERIMENT_EXPERIMENT_PARAMS_H_ 14 | 15 | #define PARAM_ZIPFIAN 0.99 16 | #define DEBUG 17 | // #define TEST_UINT64 18 | 19 | #include 20 | #include 21 | 22 | #ifdef TEST_UINT64 23 | typedef uint64_t KeyType; 24 | typedef uint64_t ValueType; 25 | #else 26 | typedef double KeyType; 27 | typedef double ValueType; 28 | #endif // TEST_UINT 29 | 30 | typedef std::pair DataType; 31 | typedef std::vector DataVecType; 32 | 33 | static bool kPrimaryIndex = false; 34 | 35 | const int kDatasetSize = 36 | 1024.0 / sizeof(DataType) * 1024 * 1024; // 1 GB / 16 byte 37 | const float kTestSize = 100000.0; 38 | const float kMaxValue = 100000000; 39 | 40 | const float kReadOnly = 1; 41 | const float kWriteHeavy = 0.5; 42 | const float kReadHeavy = 0.95; 43 | const float kWritePartial = 0.85; 44 | const float kRangeScan = 2; 45 | 46 | const float kSecondToNanosecond = 1000000000.0; 47 | 48 | #ifdef DEBUG 49 | const std::vector rate = {0.025}; 50 | #else 51 | const std::vector rate = {0.01, 0.02, 0.025, 0.03, 0.05, 0.1}; 52 | #endif // !DEBUG 53 | 54 | #endif // EXPERIMENT_EXPERIMENT_PARAMS_H_ 55 | -------------------------------------------------------------------------------- 
/src/unitTest/rootNodeTest/piecewiseLR_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file piecewiseLR_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-03 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #include "../../include/nodes/rootNode/trainModel/piecewiseLR.h" 13 | 14 | #include "../../experiment/dataset/lognormal_distribution.h" 15 | #include "gtest/gtest.h" 16 | 17 | std::vector> initData; 18 | std::vector> insertData; 19 | std::vector> testInsert; 20 | 21 | const int kChildNum = 512; 22 | const int kTestMaxValue = kMaxValue; 23 | 24 | LognormalDataset logData(0.9); 25 | PiecewiseLR model; 26 | 27 | TEST(TestTrain, TrainPLRModel) { 28 | logData.GenerateDataset(&initData, &insertData, &testInsert); 29 | model.maxChildIdx = kChildNum - 1; 30 | model.Train(initData); 31 | EXPECT_EQ(kChildNum - 1, model.maxChildIdx); 32 | } 33 | 34 | TEST(TestPredictInitData, PredictInitData) { 35 | for (int i = 0; i < initData.size(); i++) { 36 | int p = model.Predict(initData[i].first); 37 | EXPECT_GE(p, 0); 38 | EXPECT_LT(p, kChildNum); 39 | } 40 | } 41 | 42 | TEST(TestPredictInsertData, PredictInsertData) { 43 | for (int i = 0; i < insertData.size(); i++) { 44 | int p = model.Predict(insertData[i].first); 45 | EXPECT_GE(p, 0); 46 | EXPECT_LT(p, kChildNum); 47 | } 48 | } -------------------------------------------------------------------------------- /src/include/func/calculate_space.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file calculate_space.h 3 | * @author Jiaoyi 4 | * @brief calculate the space of CARMI 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef FUNC_CALCULATE_SPACE_H_ 12 | #define FUNC_CALCULATE_SPACE_H_ 13 | 14 | #include 15 | 16 | #include "../carmi.h" 17 | #include "../params.h" 18 | 19 | template 21 | long long 
CARMI::CalculateSpace() const { 22 | // calculate the space of the plr root node 23 | long long space_cost = kPLRRootSpace * 1024.0 * 1024.0; 24 | // calculate the space of the node array 25 | space_cost += kBaseNodeSpace * node.nowNodeNumber * 1024.0 * 1024.0; 26 | #ifdef DEBUG 27 | std::cout << "node.size(), " << node.nodeArray.size() << ",\tnowChildNumber," 28 | << node.nowNodeNumber << std::endl; 29 | std::cout << "data.size(), " << data.dataArray.size() 30 | << ",\tkMaxLeafNodeSize," << carmi_params::kMaxLeafNodeSize 31 | << std::endl; 32 | #endif // DEBUG 33 | 34 | if (!isPrimary) { 35 | // calculate the space of the data array 36 | space_cost += static_cast(data.dataArray.size()) * 37 | carmi_params::kMaxLeafNodeSize; 38 | } 39 | return space_cost; 40 | } 41 | 42 | #endif // FUNC_CALCULATE_SPACE_H_ 43 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | # Ubuntu 3 | 4 | cmake_minimum_required(VERSION 3.0) 5 | project(CARMI) 6 | set(CMAKE_CXX_STANDARD 17) 7 | 8 | aux_source_directory(experiment/workload source_list_workload) 9 | SET(CMAKE_BUILD_TYPE "Release") 10 | 11 | # add_executable(CARMI profiler/inner_node_time.cpp) 12 | # add_executable(CARMI profiler/leaf_node_time.cpp) 13 | add_executable(CARMI experiment.cpp experiment/core.cpp experiment/main_experiment.cpp ${source_list_workload} ) 14 | target_link_libraries(CARMI) 15 | 16 | ## Test 17 | # add_executable(CARMI unitTest/carmiTest/carmi_map_test.cpp) 18 | # add_executable(CARMI unitTest/carmiTest/carmi_external_map_test.cpp) 19 | # add_executable(CARMI unitTest/carmiTest/map_test.cpp) 20 | # add_executable(CARMI unitTest/carmiTest/externalmap_test.cpp) 21 | 22 | ## rootNode 23 | # add_executable(CARMI unitTest/rootNodeTest/piecewiseLR_test.cpp) 24 | # add_executable(CARMI unitTest/rootNodeTest/lr_test.cpp) 25 | 26 | ## innerNode 27 | # add_executable(CARMI 
unitTest/innerNodeTest/linear_regression_test.cpp) 28 | # add_executable(CARMI unitTest/innerNodeTest/piecewise_lr_test.cpp) 29 | # add_executable(CARMI unitTest/innerNodeTest/histogram_test.cpp) 30 | # add_executable(CARMI unitTest/innerNodeTest/binary_search_test.cpp) 31 | 32 | ## leafNode 33 | # add_executable(CARMI unitTest/leafNodeTest/cfarray_test.cpp) 34 | # add_executable(CARMI unitTest/leafNodeTest/external_array_test.cpp) 35 | 36 | 37 | # target_link_libraries(CARMI gtest_main gtest pthread) -------------------------------------------------------------------------------- /src/experiment/dataset/longlat.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file longlat.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_LONGLAT_H_ 12 | #define EXPERIMENT_DATASET_LONGLAT_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "./base_dataset.h" 25 | class LonglatDataset : public BaseDataset { 26 | public: 27 | explicit LonglatDataset(float initRatio) : BaseDataset(initRatio) {} 28 | 29 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 30 | DataVecType *testInsertQuery) { 31 | DataVecType ds; 32 | std::ifstream inFile("../experiment/dataset/longlat.csv", std::ios::in); 33 | if (!inFile) { 34 | std::cout << "open longlat.csv failed" << std::endl; 35 | exit(1); 36 | } 37 | std::string line; 38 | while (getline(inFile, line)) { 39 | if (line.empty()) continue; 40 | std::istringstream sin(line); 41 | std::vector fields; 42 | std::string field; 43 | while (getline(sin, field, ',')) fields.push_back(field); 44 | std::string key = fields[0]; 45 | std::string value = fields[1]; 46 | double k = stod(key); 47 | double v = stod(value); 48 | ds.push_back({k, v}); 49 | if (ds.size() == 
kDatasetSize + round(kTestSize * (1 - proportion))) { 50 | break; 51 | } 52 | } 53 | 54 | SplitInitTest(&ds, initDataset, insertDataset, testInsertQuery); 55 | } 56 | }; 57 | 58 | #endif // EXPERIMENT_DATASET_LONGLAT_H_ 59 | -------------------------------------------------------------------------------- /src/experiment/dataset/longitudes.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file longitudes.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_LONGITUDES_H_ 12 | #define EXPERIMENT_DATASET_LONGITUDES_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "./base_dataset.h" 25 | class LongitudesDataset : public BaseDataset { 26 | public: 27 | explicit LongitudesDataset(float initRatio) : BaseDataset(initRatio) {} 28 | 29 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 30 | DataVecType *testInsertQuery) { 31 | DataVecType ds; 32 | std::ifstream inFile("../experiment/dataset/longitude.csv", std::ios::in); 33 | if (!inFile) { 34 | std::cout << "open longitude.csv failed" << std::endl; 35 | exit(1); 36 | } 37 | std::string line; 38 | while (getline(inFile, line)) { 39 | if (line.empty()) continue; 40 | std::istringstream sin(line); 41 | std::vector fields; 42 | std::string field; 43 | while (getline(sin, field, ',')) fields.push_back(field); 44 | std::string key = fields[0]; 45 | std::string value = fields[1]; 46 | double k = stod(key); 47 | double v = stod(value); 48 | ds.push_back({k, v}); 49 | if (ds.size() == kDatasetSize + round(kTestSize * (1 - proportion))) { 50 | break; 51 | } 52 | } 53 | 54 | SplitInitTest(&ds, initDataset, insertDataset, testInsertQuery); 55 | } 56 | }; 57 | 58 | #endif // EXPERIMENT_DATASET_LONGITUDES_H_ 59 | 
-------------------------------------------------------------------------------- /src/experiment/dataset/osmc.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file osmc.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-12-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_OSMC_H_ 12 | #define EXPERIMENT_DATASET_OSMC_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "./base_dataset.h" 25 | class OsmcDataset : public BaseDataset { 26 | public: 27 | explicit OsmcDataset(float initRatio) : BaseDataset(initRatio) {} 28 | 29 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 30 | DataVecType *testInsertQuery) { 31 | DataVecType ds; 32 | std::ifstream inFile("../experiment/dataset/osmc.csv", std::ios::in); 33 | if (!inFile) { 34 | std::cout << "open osmc.csv failed" << std::endl; 35 | exit(1); 36 | } 37 | std::string line; 38 | while (getline(inFile, line)) { 39 | if (line.empty()) continue; 40 | std::istringstream sin(line); 41 | std::vector fields; 42 | std::string field; 43 | while (getline(sin, field, ',')) fields.push_back(field); 44 | std::string key = fields[0]; 45 | std::string value = fields[1]; 46 | uint64_t k, v; 47 | std::stringstream strK, strV; 48 | strK << key; 49 | strK >> k; 50 | strV << value; 51 | strV >> v; 52 | ds.push_back({k, v}); 53 | if (ds.size() == kDatasetSize + round(kTestSize * (1 - proportion))) { 54 | break; 55 | } 56 | } 57 | 58 | SplitInitTest(&ds, initDataset, insertDataset, testInsertQuery); 59 | } 60 | }; 61 | 62 | #endif // EXPERIMENT_DATASET_OSMC_H_ 63 | -------------------------------------------------------------------------------- /src/experiment/workload/public_functions.cpp: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file 
public_functions.cpp 4 | * @author Jiaoyi 5 | * @brief 6 | * @version 3.0 7 | * @date 2021-04-07 8 | * 9 | * @copyright Copyright (c) 2021 10 | * 11 | */ 12 | 13 | #include "public_functions.h" 14 | 15 | #include 16 | 17 | #include "../experiment_params.h" 18 | 19 | /** 20 | * @brief prepare query workloads 21 | * 22 | * @param[in] findQueryset 23 | * @param[in] insertDataset 24 | * @param[inout] findQuery 25 | * @param[inout] insertQuery 26 | * @param[inout] index 27 | */ 28 | void InitTestSet(const DataVecType &findQueryset, 29 | const DataVecType &insertDataset, bool isZipfian, 30 | DataVecType *findQuery, DataVecType *insertQuery, 31 | std::vector *index) { 32 | (*findQuery) = findQueryset; 33 | (*insertQuery) = insertDataset; 34 | 35 | std::default_random_engine engine; 36 | 37 | unsigned seed = std::clock(); 38 | engine = std::default_random_engine(seed); 39 | shuffle((*findQuery).begin(), (*findQuery).end(), engine); 40 | 41 | if (!kPrimaryIndex) { 42 | unsigned seed1 = std::clock(); 43 | engine = std::default_random_engine(seed1); 44 | shuffle((*insertQuery).begin(), (*insertQuery).end(), engine); 45 | } 46 | 47 | if (isZipfian) { 48 | Zipfian zip; 49 | zip.InitZipfian(PARAM_ZIPFIAN, (*findQuery).size()); 50 | *index = std::vector(kTestSize, 0); 51 | for (int i = 0; i < kTestSize; i++) { 52 | int idx = zip.GenerateNextIndex(); 53 | (*index)[i] = idx; 54 | } 55 | } 56 | } 57 | 58 | /** 59 | * @brief print the average time of the workload 60 | * 61 | * @param[in] time 62 | */ 63 | void PrintAvgTime(double time) { 64 | std::cout << "average time," << time * kSecondToNanosecond / kTestSize 65 | << std::endl; 66 | outRes << time * kSecondToNanosecond / kTestSize << ","; 67 | } 68 | -------------------------------------------------------------------------------- /stx_btree/btree: -------------------------------------------------------------------------------- 1 | // -*- mode: c++ -*- 2 | 
/******************************************************************************* 3 | * include/stx/btree 4 | * 5 | * STX B+ Tree Template Classes v0.9 6 | * Copyright (C) 2008-2013 Timo Bingmann 7 | * 8 | * Boost Software License - Version 1.0 - August 17th, 2003 9 | * 10 | * Permission is hereby granted, free of charge, to any person or organization 11 | * obtaining a copy of the software and accompanying documentation covered by 12 | * this license (the "Software") to use, reproduce, display, distribute, 13 | * execute, and transmit the Software, and to prepare derivative works of the 14 | * Software, and to permit third-parties to whom the Software is furnished to 15 | * do so, all subject to the following: 16 | * 17 | * The copyright notices in the Software and this entire statement, including 18 | * the above license grant, this restriction and the following disclaimer, must 19 | * be included in all copies of the Software, in whole or in part, and all 20 | * derivative works of the Software, unless such copies or derivative works are 21 | * solely in the form of machine-executable object code generated by a source 22 | * language processor. 23 | * 24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 27 | * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 28 | * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 29 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 30 | * DEALINGS IN THE SOFTWARE. 
31 | ******************************************************************************/ 32 | 33 | #ifndef _STX_BTREE_ 34 | #define _STX_BTREE_ 35 | 36 | /** \file btree 37 | * Forwarder header to btree.h 38 | */ 39 | 40 | #include 41 | 42 | #endif // _STX_BTREE_ 43 | 44 | /******************************************************************************/ 45 | -------------------------------------------------------------------------------- /stx_btree/btree_set: -------------------------------------------------------------------------------- 1 | // -*- mode: c++ -*- 2 | /******************************************************************************* 3 | * include/stx/btree_set 4 | * 5 | * STX B+ Tree Template Classes v0.9 6 | * Copyright (C) 2008-2013 Timo Bingmann 7 | * 8 | * Boost Software License - Version 1.0 - August 17th, 2003 9 | * 10 | * Permission is hereby granted, free of charge, to any person or organization 11 | * obtaining a copy of the software and accompanying documentation covered by 12 | * this license (the "Software") to use, reproduce, display, distribute, 13 | * execute, and transmit the Software, and to prepare derivative works of the 14 | * Software, and to permit third-parties to whom the Software is furnished to 15 | * do so, all subject to the following: 16 | * 17 | * The copyright notices in the Software and this entire statement, including 18 | * the above license grant, this restriction and the following disclaimer, must 19 | * be included in all copies of the Software, in whole or in part, and all 20 | * derivative works of the Software, unless such copies or derivative works are 21 | * solely in the form of machine-executable object code generated by a source 22 | * language processor. 23 | * 24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT 27 | * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 28 | * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 29 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 30 | * DEALINGS IN THE SOFTWARE. 31 | ******************************************************************************/ 32 | 33 | #ifndef _STX_BTREE_SET_ 34 | #define _STX_BTREE_SET_ 35 | 36 | /** \file btree_set 37 | * Forwarder header to btree_set.h 38 | */ 39 | 40 | #include 41 | 42 | #endif // _STX_BTREE_SET_ 43 | 44 | /******************************************************************************/ 45 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // 使用 IntelliSense 了解相关属性。 3 | // 悬停以查看现有属性的描述。 4 | // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "(gdb) 启动", 9 | "type": "cppdbg", 10 | "request": "launch", 11 | // "program": "${workspaceFolder}/${fileBasenameNoExtension}.out", 12 | "program": "${workspaceFolder}/build/CARMI", 13 | "args": [], 14 | "stopAtEntry": false, 15 | "cwd": "${workspaceFolder}", 16 | "environment": [], 17 | "externalConsole": true, 18 | "MIMode": "gdb", 19 | "preLaunchTask": "CARMI", 20 | "setupCommands": [ 21 | { 22 | "description": "Enable pretty-printing for gdb", 23 | "text": "-enable-pretty-printing", 24 | "ignoreFailures": true 25 | } 26 | ], 27 | "sourceFileMap": { 28 | "/build/glibc-ZN95T4": "/usr/src/glibc" 29 | } 30 | } 31 | // , 32 | // { 33 | // "name": "g++.exe build and debug active file", 34 | // "type": "cppdbg", 35 | // "request": "launch", 36 | // "program": "${fileDirname}\\${fileBasenameNoExtension}.exe", 37 | // "args": [], 38 | // "stopAtEntry": false, 39 | // "cwd": "${workspaceFolder}", 40 | // "environment": [], 41 | // 
"externalConsole": false, 42 | // "MIMode": "gdb", 43 | // "miDebuggerPath": "F:\\TDM-GCC-64\\bin\\gdb.exe", 44 | // "setupCommands": [ 45 | // { 46 | // "description": "为 gdb 启用整齐打印", 47 | // "text": "-enable-pretty-printing", 48 | // "ignoreFailures": true 49 | // } 50 | // ], 51 | // "preLaunchTask": "g++.exe build active file" 52 | // } 53 | ] 54 | } -------------------------------------------------------------------------------- /stx_btree/btree_map: -------------------------------------------------------------------------------- 1 | // -*- mode: c++ -*- 2 | /******************************************************************************* 3 | * include/stx/btree_map 4 | * 5 | * STX B+ Tree Template Classes v0.9 6 | * Copyright (C) 2008-2013 Timo Bingmann 7 | * 8 | * Boost Software License - Version 1.0 - August 17th, 2003 9 | * 10 | * Permission is hereby granted, free of charge, to any person or organization 11 | * obtaining a copy of the software and accompanying documentation covered by 12 | * this license (the "Software") to use, reproduce, display, distribute, 13 | * execute, and transmit the Software, and to prepare derivative works of the 14 | * Software, and to permit third-parties to whom the Software is furnished to 15 | * do so, all subject to the following: 16 | * 17 | * The copyright notices in the Software and this entire statement, including 18 | * the above license grant, this restriction and the following disclaimer, must 19 | * be included in all copies of the Software, in whole or in part, and all 20 | * derivative works of the Software, unless such copies or derivative works are 21 | * solely in the form of machine-executable object code generated by a source 22 | * language processor. 23 | * 24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT 27 | * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 28 | * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 29 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 30 | * DEALINGS IN THE SOFTWARE. 31 | ******************************************************************************/ 32 | 33 | #ifndef _STX_BTREE_MAP_ 34 | #define _STX_BTREE_MAP_ 35 | 36 | /** \file btree_map 37 | * Forwarder header to btree_map.h 38 | */ 39 | 40 | #include 41 | 42 | #endif // _STX_BTREE_MAP_ 43 | 44 | /******************************************************************************/ 45 | -------------------------------------------------------------------------------- /stx_btree/btree_multiset: -------------------------------------------------------------------------------- 1 | // -*- mode: c++ -*- 2 | /******************************************************************************* 3 | * include/stx/btree_multiset 4 | * 5 | * STX B+ Tree Template Classes v0.9 6 | * Copyright (C) 2008-2013 Timo Bingmann 7 | * 8 | * Boost Software License - Version 1.0 - August 17th, 2003 9 | * 10 | * Permission is hereby granted, free of charge, to any person or organization 11 | * obtaining a copy of the software and accompanying documentation covered by 12 | * this license (the "Software") to use, reproduce, display, distribute, 13 | * execute, and transmit the Software, and to prepare derivative works of the 14 | * Software, and to permit third-parties to whom the Software is furnished to 15 | * do so, all subject to the following: 16 | * 17 | * The copyright notices in the Software and this entire statement, including 18 | * the above license grant, this restriction and the following disclaimer, must 19 | * be included in all copies of the Software, in whole or in part, and all 20 | * derivative works of the Software, unless such copies or derivative works are 21 | * solely in the form of machine-executable 
object code generated by a source 22 | * language processor. 23 | * 24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 27 | * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 28 | * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 29 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 30 | * DEALINGS IN THE SOFTWARE. 31 | ******************************************************************************/ 32 | 33 | #ifndef _STX_BTREE_MULTISET_ 34 | #define _STX_BTREE_MULTISET_ 35 | 36 | /** \file btree_multiset 37 | * Forwarder header to btree_multiset.h 38 | */ 39 | 40 | #include 41 | 42 | #endif // _STX_BTREE_MULTISET_ 43 | 44 | /******************************************************************************/ 45 | -------------------------------------------------------------------------------- /stx_btree/btree_multimap: -------------------------------------------------------------------------------- 1 | // -*- mode: c++ -*- 2 | /******************************************************************************* 3 | * include/stx/btree_multimap 4 | * 5 | * STX B+ Tree Template Classes v0.9 6 | * Copyright (C) 2008-2013 Timo Bingmann 7 | * 8 | * Boost Software License - Version 1.0 - August 17th, 2003 9 | * 10 | * Permission is hereby granted, free of charge, to any person or organization 11 | * obtaining a copy of the software and accompanying documentation covered by 12 | * this license (the "Software") to use, reproduce, display, distribute, 13 | * execute, and transmit the Software, and to prepare derivative works of the 14 | * Software, and to permit third-parties to whom the Software is furnished to 15 | * do so, all subject to the following: 16 | * 17 | * The copyright notices in the Software and this 
entire statement, including 18 | * the above license grant, this restriction and the following disclaimer, must 19 | * be included in all copies of the Software, in whole or in part, and all 20 | * derivative works of the Software, unless such copies or derivative works are 21 | * solely in the form of machine-executable object code generated by a source 22 | * language processor. 23 | * 24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 27 | * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 28 | * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 29 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 30 | * DEALINGS IN THE SOFTWARE. 31 | ******************************************************************************/ 32 | 33 | #ifndef _STX_BTREE_MULTIMAP_ 34 | #define _STX_BTREE_MULTIMAP_ 35 | 36 | /** \file btree_multimap 37 | * Forwarder header to btree_multimap.h 38 | */ 39 | 40 | #include 41 | 42 | #endif // _STX_BTREE_MULTIMAP_ 43 | 44 | /******************************************************************************/ 45 | -------------------------------------------------------------------------------- /src/profiler/binary_search.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file binary_search.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-05-24 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | const int kSize = 100000000; 19 | const float kSecondToNanosecond = 1000000000.0; 20 | std::vector data; 21 | std::vector idx; 22 | const int end = kSize / 64 - 8; 23 | 24 | inline int BinarySearch(double key, int start, int 
end) { 25 | while (start < end) { 26 | int mid = (start + end) / 2; 27 | if (data[mid] < key) 28 | start = mid + 1; 29 | else 30 | end = mid; 31 | } 32 | return start; 33 | } 34 | 35 | void GetBinarySearchTime(int nodeSize) { 36 | unsigned seed = std::clock(); 37 | std::default_random_engine engine(seed); 38 | shuffle(idx.begin(), idx.end(), engine); 39 | 40 | int start, endidx; 41 | double value; 42 | std::clock_t s, e; 43 | double tmp; 44 | int c; 45 | s = std::clock(); 46 | for (int i = 0, j = 0; i < end; i++, j++) { 47 | start = idx[i]; 48 | endidx = start + nodeSize - 1; 49 | 50 | value = start + j; 51 | c = BinarySearch(value, start, endidx); 52 | 53 | j &= nodeSize - 1; 54 | } 55 | e = std::clock(); 56 | tmp = (e - s) / static_cast(CLOCKS_PER_SEC); 57 | 58 | s = std::clock(); 59 | for (int i = 0, j = 0; i < end; i++, j++) { 60 | start = idx[i]; 61 | endidx = start + nodeSize - 1; 62 | value = start + j; 63 | j &= nodeSize - 1; 64 | } 65 | e = std::clock(); 66 | double tmp1 = (e - s) / static_cast(CLOCKS_PER_SEC); 67 | std::cout << nodeSize * 8 68 | << " bs average time:" << (tmp - tmp1) * kSecondToNanosecond / end 69 | << std::endl; 70 | } 71 | 72 | int main() { 73 | data = std::vector(kSize, 0); 74 | idx = std::vector(end); 75 | for (int i = 0; i < kSize; i++) { 76 | data[i] = i; 77 | } 78 | for (int i = 0; i < end; i++) { 79 | idx[i] = i * 64; 80 | } 81 | GetBinarySearchTime(64 / 8); 82 | GetBinarySearchTime(128 / 8); 83 | GetBinarySearchTime(256 / 8); 84 | GetBinarySearchTime(512 / 8); 85 | } 86 | -------------------------------------------------------------------------------- /src/unitTest/innerNodeTest/binary_search_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file binary_search_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-03 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include "../../experiment/dataset/lognormal_distribution.h" 12 | 
#include "../../include/nodes/innerNode/bs_model.h" 13 | #include "gtest/gtest.h" 14 | 15 | typedef double KeyType; 16 | typedef double ValueType; 17 | typedef std::pair DataType; 18 | 19 | std::vector initData; 20 | std::vector insertData; 21 | std::vector testInsert; 22 | 23 | const int kChildNum = 15; 24 | const int kTestMaxValue = kMaxValue; 25 | 26 | LognormalDataset logData(0.9); 27 | BSModel model(kChildNum); 28 | std::default_random_engine engine(time(0)); 29 | 30 | TEST(TestMultiTrain, MultiTrainBSModel) { 31 | std::uniform_real_distribution dis(0, kTestMaxValue); 32 | std::vector testTrainData; 33 | unsigned int seed = time(NULL); 34 | for (int i = 0; i < 9; i++) { 35 | int tmpSize = std::pow(10, i) - 1; 36 | testTrainData = std::vector(tmpSize); 37 | for (int j = 0; j < tmpSize; j++) { 38 | KeyType tmpKey = dis(engine); 39 | testTrainData[j] = {tmpKey, tmpKey}; 40 | } 41 | std::sort(testTrainData.begin(), testTrainData.end()); 42 | BSModel tmpModel(kChildNum); 43 | tmpModel.Train(0, testTrainData.size(), testTrainData); 44 | EXPECT_EQ(kChildNum, tmpModel.flagNumber & 0x00FFFFFF); 45 | for (int j = 0; j < 13; j++) { 46 | EXPECT_LE(tmpModel.keys[j], tmpModel.keys[j + 1]); 47 | } 48 | } 49 | } 50 | 51 | TEST(TestTrain, TrainBSModel) { 52 | logData.GenerateDataset(&initData, &insertData, &testInsert); 53 | model.Train(0, initData.size(), initData); 54 | EXPECT_EQ(kChildNum, model.flagNumber & 0x00FFFFFF); 55 | EXPECT_EQ(4, model.flagNumber >> 24); 56 | } 57 | 58 | TEST(TestPredictInitData, PredictInitData) { 59 | for (int i = 0; i < initData.size(); i++) { 60 | int p = model.Predict(initData[i].first); 61 | EXPECT_GE(p, 0); 62 | EXPECT_LT(p, kChildNum); 63 | } 64 | } 65 | 66 | TEST(TestPredictInsertData, PredictInsertData) { 67 | for (int i = 0; i < insertData.size(); i++) { 68 | int p = model.Predict(insertData[i].first); 69 | EXPECT_GE(p, 0); 70 | EXPECT_LT(p, kChildNum); 71 | } 72 | } 
-------------------------------------------------------------------------------- /src/experiment/dataset/ycsb.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file ycsb.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-22 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_YCSB_H_ 12 | #define EXPERIMENT_DATASET_YCSB_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include "./base_dataset.h" 24 | 25 | class YCSBDataset : public BaseDataset { 26 | public: 27 | explicit YCSBDataset(float initRatio) : BaseDataset(initRatio) {} 28 | 29 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 30 | DataVecType *testInsertQuery) { 31 | (*initDataset) = std::vector(kDatasetSize); 32 | int end = round(kTestSize * (1 - proportion)); 33 | (*testInsertQuery) = std::vector(end); 34 | 35 | DataVecType ds; 36 | std::ifstream inFile("../experiment/dataset/newycsbdata.csv", std::ios::in); 37 | if (!inFile) { 38 | std::cout << "open ycsb.csv failed" << std::endl; 39 | exit(1); 40 | } 41 | std::string line; 42 | while (getline(inFile, line)) { 43 | if (line.empty()) continue; 44 | std::istringstream sin(line); 45 | std::vector fields; 46 | std::string field; 47 | while (getline(sin, field, ',')) fields.push_back(field); 48 | std::string key = fields[0]; 49 | key.erase(0, 4); 50 | double k = stod(key); 51 | double v = k / 10; 52 | ds.push_back({k, v}); 53 | if (ds.size() == kDatasetSize + end) { 54 | break; 55 | } 56 | } 57 | 58 | std::sort(ds.begin(), ds.end()); 59 | for (int i = 0; i < kDatasetSize; i++) { 60 | (*initDataset)[i] = ds[i]; 61 | } 62 | double lastKey = ds[ds.size() - 1].first; 63 | if (ds.size() < kDatasetSize + end) { 64 | for (int i = 0; i < end; i++) { 65 | (*testInsertQuery)[i] = {lastKey + i, lastKey + i}; 66 | } 67 | } else { 68 | for (int i = 0; i < 
end; i++) { 69 | (*testInsertQuery)[i] = ds[i + kDatasetSize]; 70 | } 71 | } 72 | 73 | std::cout << "YCSB: init size:" << (*initDataset).size() 74 | << "\tWrite size:" << (*testInsertQuery).size() << std::endl; 75 | } 76 | }; 77 | 78 | #endif // EXPERIMENT_DATASET_YCSB_H_ 79 | -------------------------------------------------------------------------------- /src/include/func/delete_function.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file delete_function.h 3 | * @author Jiaoyi 4 | * @brief delete a record 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef FUNC_DELETE_FUNCTION_H_ 12 | #define FUNC_DELETE_FUNCTION_H_ 13 | 14 | #include 15 | 16 | #include "../carmi.h" 17 | 18 | template 20 | bool CARMI::Delete(const KeyType &key, 21 | size_t *cnt) { 22 | int idx = 0; // idx in the node array 23 | int type = root.flagNumber; 24 | while (1) { 25 | switch (type) { 26 | case PLR_ROOT_NODE: 27 | // Case 0: this node is the plr root node 28 | // use the plr root node to find the index of the next node 29 | idx = root.PLRType::model.Predict(key); 30 | break; 31 | case LR_INNER_NODE: 32 | // Case 1: this node is the lr inner node 33 | // use the predict function of lr inner node to obtain the index of the 34 | // next node 35 | idx = node.nodeArray[idx].lr.Predict(key); 36 | break; 37 | case PLR_INNER_NODE: 38 | // Case 2: this node is the plr inner node 39 | // use the predict function of plr inner node to obtain the index of the 40 | // next node 41 | idx = node.nodeArray[idx].plr.Predict(key); 42 | break; 43 | case HIS_INNER_NODE: 44 | // Case 3: this node is the his inner node 45 | // use the predict function of his inner node to obtain the index of the 46 | // next node 47 | idx = node.nodeArray[idx].his.Predict(key); 48 | break; 49 | case BS_INNER_NODE: 50 | // Case 4: this node is the bs inner node 51 | // use the predict function of bs inner node to obtain 
the index of the 52 | // next node 53 | idx = node.nodeArray[idx].bs.Predict(key); 54 | break; 55 | case ARRAY_LEAF_NODE: { 56 | // Case 5: this node is the cache-friendly array leaf node 57 | // Delete the data point in the cf leaf node 58 | return node.nodeArray[idx].cfArray.Delete(key, cnt, &data); 59 | } 60 | } 61 | 62 | type = node.nodeArray[idx].lr.flagNumber >> 24; 63 | } 64 | } 65 | 66 | #endif // FUNC_DELETE_FUNCTION_H_ 67 | -------------------------------------------------------------------------------- /src/unitTest/innerNodeTest/linear_regression_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file linear_regression_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-03 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | 13 | #include "../../experiment/dataset/lognormal_distribution.h" 14 | #include "../../include/nodes/innerNode/lr_model.h" 15 | #include "gtest/gtest.h" 16 | 17 | typedef double KeyType; 18 | typedef double ValueType; 19 | typedef std::pair DataType; 20 | 21 | std::vector initData; 22 | std::vector insertData; 23 | std::vector testInsert; 24 | 25 | const int kChildNum = 512; 26 | const int kTestMaxValue = kMaxValue; 27 | 28 | LognormalDataset logData(0.9); 29 | LRModel model(kChildNum); 30 | std::default_random_engine engine(time(0)); 31 | 32 | TEST(TestMultiTrain, MultiTrainLRModel) { 33 | std::vector testTrainData; 34 | std::uniform_real_distribution dis(0, kTestMaxValue); 35 | for (int i = 0; i < 9; i++) { 36 | int tmpSize = std::pow(10, i) - 1; 37 | std::cout << "Start test size: " << tmpSize << std::endl; 38 | testTrainData = std::vector(tmpSize); 39 | for (int j = 0; j < tmpSize; j++) { 40 | KeyType tmpKey = dis(engine); 41 | testTrainData[j] = {tmpKey, tmpKey}; 42 | } 43 | std::sort(testTrainData.begin(), testTrainData.end()); 44 | std::cout << "Dataset is ready, start to test." 
<< std::endl; 45 | LRModel tmpModel(kChildNum); 46 | tmpModel.Train(0, testTrainData.size(), testTrainData); 47 | EXPECT_EQ(kChildNum, tmpModel.flagNumber & 0x00FFFFFF); 48 | EXPECT_GE(tmpModel.slope, 0); 49 | } 50 | } 51 | 52 | TEST(TestTrain, TrainLRModel) { 53 | logData.GenerateDataset(&initData, &insertData, &testInsert); 54 | model.Train(0, initData.size(), initData); 55 | EXPECT_EQ(kChildNum, model.flagNumber & 0x00FFFFFF); 56 | EXPECT_EQ(1, model.flagNumber >> 24); 57 | } 58 | 59 | TEST(TestPredictInitData, PredictInitData) { 60 | for (int i = 0; i < initData.size(); i++) { 61 | int p = model.Predict(initData[i].first); 62 | EXPECT_GE(p, 0); 63 | EXPECT_LT(p, kChildNum); 64 | } 65 | } 66 | 67 | TEST(TestPredictInsertData, PredictInsertData) { 68 | for (int i = 0; i < insertData.size(); i++) { 69 | int p = model.Predict(insertData[i].first); 70 | EXPECT_GE(p, 0); 71 | EXPECT_LT(p, kChildNum); 72 | } 73 | } -------------------------------------------------------------------------------- /src/unitTest/innerNodeTest/piecewise_lr_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file piecewise_lr_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-03 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | 13 | #include "../../experiment/dataset/lognormal_distribution.h" 14 | #include "../../include/nodes/innerNode/plr_model.h" 15 | #include "gtest/gtest.h" 16 | 17 | typedef double KeyType; 18 | typedef double ValueType; 19 | typedef std::pair DataType; 20 | 21 | std::vector initData; 22 | std::vector insertData; 23 | std::vector testInsert; 24 | 25 | const int kChildNum = 512; 26 | const int kTestMaxValue = kMaxValue; 27 | 28 | LognormalDataset logData(0.9); 29 | PLRModel model(kChildNum); 30 | std::default_random_engine engine(time(0)); 31 | 32 | TEST(TestMultiTrain, MultiTrainPLRModel) { 33 | std::vector testTrainData; 34 | std::uniform_real_distribution 
dis(0, kTestMaxValue); 35 | for (int i = 0; i < 9; i++) { 36 | int tmpSize = std::pow(10, i) - 1; 37 | std::cout << "Start test size: " << tmpSize << std::endl; 38 | testTrainData = std::vector(tmpSize); 39 | for (int j = 0; j < tmpSize; j++) { 40 | KeyType tmpKey = dis(engine); 41 | testTrainData[j] = {tmpKey, tmpKey}; 42 | } 43 | std::sort(testTrainData.begin(), testTrainData.end()); 44 | std::cout << "Dataset is ready, start to test." << std::endl; 45 | PLRModel tmpModel(kChildNum); 46 | tmpModel.Train(0, testTrainData.size(), testTrainData); 47 | EXPECT_EQ(kChildNum, tmpModel.flagNumber & 0x00FFFFFF); 48 | for (int j = 0; j < 5; j++) { 49 | EXPECT_LE(tmpModel.index[j], tmpModel.index[j + 1]); 50 | } 51 | for (int j = 0; j < 7; j++) { 52 | EXPECT_LT(tmpModel.keys[j], tmpModel.keys[j + 1]); 53 | } 54 | std::cout << "Subtest " << i << " over!" << std::endl; 55 | } 56 | } 57 | 58 | TEST(TestTrain, TrainPLRModel) { 59 | logData.GenerateDataset(&initData, &insertData, &testInsert); 60 | model.Train(0, initData.size(), initData); 61 | EXPECT_EQ(kChildNum, model.flagNumber & 0x00FFFFFF); 62 | EXPECT_EQ(2, model.flagNumber >> 24); 63 | } 64 | 65 | TEST(TestPredictInitData, PredictInitData) { 66 | for (int i = 0; i < initData.size(); i++) { 67 | int p = model.Predict(initData[i].first); 68 | EXPECT_GE(p, 0); 69 | EXPECT_LT(p, kChildNum); 70 | } 71 | } 72 | 73 | TEST(TestPredictInsertData, PredictInsertData) { 74 | for (int i = 0; i < insertData.size(); i++) { 75 | int p = model.Predict(insertData[i].first); 76 | EXPECT_GE(p, 0); 77 | EXPECT_LT(p, kChildNum); 78 | } 79 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.associations": { 3 | "random": "cpp", 4 | "array": "cpp", 5 | "memory": "cpp", 6 | "tuple": "cpp", 7 | "type_traits": "cpp", 8 | "utility": "cpp", 9 | "xmemory0": "cpp", 10 | "xstddef": "cpp", 11 | 
"xtr1common": "cpp", 12 | "xutility": "cpp", 13 | "algorithm": "cpp", 14 | "chrono": "cpp", 15 | "cmath": "cpp", 16 | "cstddef": "cpp", 17 | "cstdint": "cpp", 18 | "cstdio": "cpp", 19 | "cstdlib": "cpp", 20 | "cstring": "cpp", 21 | "cwchar": "cpp", 22 | "exception": "cpp", 23 | "functional": "cpp", 24 | "initializer_list": "cpp", 25 | "ios": "cpp", 26 | "iosfwd": "cpp", 27 | "iostream": "cpp", 28 | "istream": "cpp", 29 | "iterator": "cpp", 30 | "limits": "cpp", 31 | "list": "cpp", 32 | "map": "cpp", 33 | "new": "cpp", 34 | "ostream": "cpp", 35 | "ratio": "cpp", 36 | "set": "cpp", 37 | "sstream": "cpp", 38 | "stdexcept": "cpp", 39 | "streambuf": "cpp", 40 | "string": "cpp", 41 | "system_error": "cpp", 42 | "typeinfo": "cpp", 43 | "unordered_set": "cpp", 44 | "vector": "cpp", 45 | "xfacet": "cpp", 46 | "xfunctional": "cpp", 47 | "xhash": "cpp", 48 | "xiosbase": "cpp", 49 | "xlocale": "cpp", 50 | "xlocinfo": "cpp", 51 | "xlocnum": "cpp", 52 | "xmemory": "cpp", 53 | "xstring": "cpp", 54 | "xtree": "cpp", 55 | "ctime": "cpp", 56 | "iomanip": "cpp", 57 | "stack": "cpp", 58 | "__locale": "cpp", 59 | "__bit_reference": "cpp", 60 | "__split_buffer": "cpp", 61 | "filesystem": "cpp", 62 | "deque": "cpp", 63 | "__functional_base": "cpp", 64 | "__functional_base_03": "cpp", 65 | "__hash_table": "cpp", 66 | "__tree": "cpp", 67 | "__tuple": "cpp", 68 | "any": "cpp", 69 | "__node_handle": "cpp", 70 | "atomic": "cpp", 71 | "*.tcc": "cpp", 72 | "cctype": "cpp", 73 | "clocale": "cpp", 74 | "cstdarg": "cpp", 75 | "cwctype": "cpp", 76 | "unordered_map": "cpp", 77 | "optional": "cpp", 78 | "string_view": "cpp", 79 | "fstream": "cpp", 80 | "numeric": "cpp", 81 | "bit": "cpp", 82 | "memory_resource": "cpp", 83 | "variant": "cpp", 84 | "codecvt": "cpp", 85 | "bitset": "cpp" 86 | }, 87 | "C_Cpp.default.configurationProvider": "go2sh.cmake-integration" 88 | } -------------------------------------------------------------------------------- /src/experiment/functions.h: 
-------------------------------------------------------------------------------- 1 | /** 2 | * @file functions.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-04-07 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_FUNCTIONS_H_ 12 | #define EXPERIMENT_FUNCTIONS_H_ 13 | 14 | #include 15 | 16 | #include "../include/carmi_external_map.h" 17 | #include "../include/carmi_map.h" 18 | #include "./workload/workloads.h" 19 | #include "./workload/workloads_external.h" 20 | #include "dataset/exponential_distribution.h" 21 | #include "dataset/lognormal_distribution.h" 22 | #include "dataset/longitudes.h" 23 | #include "dataset/longlat.h" 24 | #include "dataset/normal_distribution.h" 25 | #include "dataset/osmc.h" 26 | #include "dataset/uniform_distribution.h" 27 | #include "dataset/ycsb.h" 28 | 29 | /** 30 | * @brief prepare query workloads 31 | * 32 | * @param[in] Ratio the ratio of find queries 33 | * @param[in] findQueryset 34 | * @param[in] insertDataset 35 | * @param[inout] findQuery 36 | * @param[inout] insertQuery 37 | * @param[inout] index 38 | */ 39 | void InitTestSet(double Ratio, const DataVecType &findQueryset, 40 | const DataVecType &insertDataset, DataVecType *findQuery, 41 | DataVecType *insertQuery, std::vector *index); 42 | 43 | /** 44 | * @brief print the average time of the workload 45 | * 46 | * @param[in] time 47 | */ 48 | void PrintAvgTime(double time); 49 | 50 | /** 51 | * @brief the function of using CARMI 52 | * 53 | * @param[in] isZipfian whether to use zipfian access during the test 54 | * @param[in] initRatio the workload type 55 | * @param[in] rate the weight of space 56 | * @param[in] length the length of range scan 57 | * @param[in] initDataset 58 | * @param[in] insertDataset 59 | * @param[in] testInsertQuery 60 | */ 61 | void CoreCARMI(bool isZipfian, double initRatio, double rate, 62 | const std::vector &length, const DataVecType &initDataset, 63 | const DataVecType &insertDataset, 64 | 
const DataVecType &testInsertQuery); 65 | 66 | /** 67 | * @brief the function of using external CARMI 68 | * 69 | * @param[in] isZipfian whether to use zipfian access during the test 70 | * @param[in] initRatio the workload type 71 | * @param[in] rate the weight of space 72 | * @param[in] length the length of range scan 73 | * @param[in] initDataset 74 | * @param[in] testInsertQuery 75 | */ 76 | void CoreExternalCARMI(bool isZipfian, double initRatio, double rate, 77 | const std::vector &length, 78 | const DataVecType &initDataset, 79 | const DataVecType &testInsertQuery); 80 | 81 | void mainSynthetic(double initRatio, const std::vector &length); 82 | void mainYCSB(double initRatio, const std::vector &length); 83 | void mainMap(double initRatio, const std::vector &length); 84 | void mainExperiment(); 85 | 86 | #endif // EXPERIMENT_FUNCTIONS_H_ 87 | -------------------------------------------------------------------------------- /src/unitTest/leafNodeTest/external_array_test.cpp: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file external_array_test.cpp 4 | * @author Jiaoyi 5 | * @brief 6 | * @version 0.1 7 | * @date 2021-11-04 8 | * 9 | * @copyright Copyright (c) 2021 10 | * 11 | */ 12 | #include 13 | 14 | #include "../../include/nodes/leafNode/external_array_type.h" 15 | #include "gtest/gtest.h" 16 | 17 | typedef double KeyType; 18 | typedef double ValueType; 19 | 20 | const int kTestMaxValue = 10000; 21 | unsigned int seed = time(NULL); 22 | std::default_random_engine engine(time(0)); 23 | std::uniform_real_distribution dis(0, kTestMaxValue); 24 | 25 | template 26 | class DataType { 27 | public: 28 | typedef ValueType ValueType_; 29 | DataType() { 30 | k = 0; 31 | v = 0; 32 | } 33 | explicit DataType(KeyType key, ValueType_ value) { 34 | k = key; 35 | v = value; 36 | } 37 | const KeyType& key() const { return k; } 38 | const ValueType_& data() const { return v; } 39 | 40 | bool operator<(const DataType& a) 
const { 41 | if (k == a.k) { 42 | return v < a.v; 43 | } 44 | return k < a.k; 45 | } 46 | 47 | KeyType k; 48 | ValueType_ v; 49 | }; 50 | 51 | TEST(TestTrain, TrainExternalArrayNode) { 52 | for (int i = 0; i < carmi_params::kMaxLeafNodeSizeExternal; i++) { 53 | std::vector> testTrainData(i); 54 | ExternalArray externalNode; 55 | for (int j = 0; j < i; j++) { 56 | KeyType tmpKey = dis(engine); 57 | testTrainData[j] = {tmpKey, tmpKey * 10}; 58 | } 59 | std::sort(testTrainData.begin(), testTrainData.end()); 60 | externalNode.Train(testTrainData, 0, i); 61 | EXPECT_GE(externalNode.error, 0); 62 | } 63 | } 64 | 65 | TEST(TestFind, ExternalArrayNodeFind) { 66 | for (int i = 0; i < carmi_params::kMaxLeafNodeSizeExternal; i++) { 67 | std::vector> testTrainData(i); 68 | ExternalArray externalNode; 69 | KeyType* externalDataset = new KeyType[i * 2]; 70 | for (int j = 0, k = 0; j < i; j++, k += 2) { 71 | KeyType tmpKey = dis(engine); 72 | testTrainData[j] = {tmpKey, tmpKey * 10}; 73 | } 74 | std::sort(testTrainData.begin(), testTrainData.end()); 75 | for (int j = 0, k = 0; j < i; j++, k += 2) { 76 | *(externalDataset + k) = testTrainData[j].first; 77 | *(externalDataset + k + 1) = testTrainData[j].second; 78 | } 79 | externalNode.m_left = 0; 80 | externalNode.Train(testTrainData, 0, i); 81 | for (int j = 0; j < i; j++) { 82 | int currslot = 83 | externalNode.Find(testTrainData[j].first, 16, externalDataset); 84 | KeyType res = testTrainData[currslot].first; 85 | if (res != testTrainData[j].first) { 86 | currslot = 87 | externalNode.Find(testTrainData[j].first, 16, externalDataset); 88 | } 89 | ASSERT_EQ(res, testTrainData[j].first); 90 | } 91 | } 92 | } -------------------------------------------------------------------------------- /src/include/construct/dp.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file dp.h 3 | * @author Jiaoyi 4 | * @brief the main function of dynamic programming algorithm 5 | * @version 3.0 6 | * 
@date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_DP_H_ 12 | #define CONSTRUCT_DP_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #include "../params.h" 21 | #include "./dp_inner.h" 22 | #include "./dp_leaf.h" 23 | #include "./greedy.h" 24 | #include "./structures.h" 25 | 26 | template 28 | NodeCost CARMI::DP(const DataRange &range) { 29 | NodeCost nodeCost; 30 | // Case 1: the dataset is empty, construct an empty node and return directly 31 | if (range.initRange.size == 0) { 32 | nodeCost = emptyCost; 33 | // Construct an empty leaf node when the sub-dataset is empty and store 34 | // it in the structMap. The type of this leaf node depends on the isPrimary 35 | // parameter, if it is true, construct an external array leaf node, 36 | // otherwise, construct a cache-friendly array leaf node. 37 | BaseNode optimal_node_struct; 38 | if (isPrimary) { 39 | optimal_node_struct.externalArray = 40 | ExternalArray(); 41 | } else { 42 | optimal_node_struct.cfArray = 43 | CFArrayType(); 44 | } 45 | structMap.insert({range.initRange, optimal_node_struct}); 46 | return nodeCost; 47 | } 48 | 49 | // Case 2: this sub-dataset has been solved before, return the minimum cost 50 | // directly 51 | auto it = COST.find(range.initRange); 52 | if (it != COST.end()) { 53 | nodeCost = it->second; 54 | return nodeCost; 55 | } 56 | 57 | double minRatio = 0.95; 58 | // record the maximum capacity of the leaf node 59 | int maxStoredNum = 60 | CFArrayType::kMaxLeafCapacity; 61 | if (isPrimary) { 62 | maxStoredNum = carmi_params::kMaxLeafNodeSizeExternal; 63 | } 64 | if (range.initRange.size + range.insertRange.size <= 65 | minRatio * maxStoredNum) { 66 | // Case 3: if the size is smaller than the threshold, directly construct a 67 | // leaf node 68 | return DPLeaf(range); 69 | } else if (range.initRange.size + range.insertRange.size > maxStoredNum) { 70 | // Case 4: if the size is larger than the maximum capacity of a 
leaf node, 71 | // directly construct an inner node 72 | return DPInner(range); 73 | } else { 74 | // Case 5: construct a leaf node and an inner node respectively, and choose 75 | // the setting with a lower cost 76 | auto resInner = DPInner(range); 77 | auto resLeaf = DPLeaf(range); 78 | if (resInner.cost > resLeaf.cost) 79 | return resLeaf; 80 | else 81 | return resInner; 82 | } 83 | } 84 | 85 | #endif // CONSTRUCT_DP_H_ 86 | -------------------------------------------------------------------------------- /src/profiler/inner_node_time.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file inner_node_time.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-05-25 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "../include/carmi.h" 19 | #include "../include/nodes/innerNode/bs_model.h" 20 | #include "../include/nodes/innerNode/his_model.h" 21 | #include "../include/nodes/innerNode/lr_model.h" 22 | #include "../include/nodes/innerNode/plr_model.h" 23 | 24 | const int kSize = 1024; 25 | const float kSecondToNanosecond = 1000000000.0; 26 | const int kModelNumber = 100000000; 27 | const int block = 512; 28 | const int end = kModelNumber / block; 29 | std::vector> data(kSize); 30 | std::vector idx(end); 31 | 32 | template 33 | double GetNodePredictTime() { 34 | std::vector node(kModelNumber, TYPE(20)); 35 | node[0].Train(0, kSize, data); 36 | for (int i = 0; i < end; i++) { 37 | node[i * block] = node[0]; 38 | } 39 | std::vector keys(kSize); 40 | for (int i = 0; i < kSize; i++) { 41 | keys[i] = i; 42 | } 43 | 44 | std::default_random_engine engine(std::clock()); 45 | shuffle(idx.begin(), idx.end(), engine); 46 | shuffle(keys.begin(), keys.end(), engine); 47 | 48 | int start, endidx, tmpIdx, type, key; 49 | int nodeSize = 8; 50 | double res; 51 | std::clock_t s, e; 52 | double tmp, tmp1 = 0; 
53 | 54 | std::uniform_int_distribution dis_idx(0, end); 55 | std::uniform_int_distribution dis_key(0, kSize); 56 | s = std::clock(); 57 | for (int i = 0; i < end; i++) { 58 | tmpIdx = idx[dis_idx(engine)]; 59 | key = keys[dis_key(engine)]; 60 | res = node[tmpIdx].Predict(key); 61 | } 62 | e = std::clock(); 63 | tmp = (e - s) / static_cast(CLOCKS_PER_SEC); 64 | s = std::clock(); 65 | for (int i = 0; i < end; i++) { 66 | tmpIdx = idx[dis_idx(engine)]; 67 | key = keys[dis_key(engine)]; 68 | res = node[tmpIdx].flagNumber + node[tmpIdx].childLeft; 69 | } 70 | e = std::clock(); 71 | tmp1 = (e - s) / static_cast(CLOCKS_PER_SEC); 72 | return (tmp - tmp1) * kSecondToNanosecond / end; 73 | } 74 | 75 | int main() { 76 | for (int i = 0; i < kSize; i++) { 77 | data[i] = {i, i * 10}; 78 | } 79 | for (int i = 0; i < end; i++) { 80 | idx[i] = i * block; 81 | } 82 | double lr = 0, plr = 0, bs = 0, his = 0; 83 | float times = 1.0; 84 | for (int i = 0; i < times; i++) { 85 | lr += GetNodePredictTime>(); 86 | plr += GetNodePredictTime>(); 87 | his += GetNodePredictTime>(); 88 | bs += GetNodePredictTime>(); 89 | } 90 | 91 | std::cout << "lr average time:" << lr / times << std::endl; 92 | std::cout << "plr average time:" << plr / times << std::endl; 93 | std::cout << "his average time:" << his / times << std::endl; 94 | std::cout << "bs average time:" << bs / times << std::endl; 95 | } 96 | -------------------------------------------------------------------------------- /src/unitTest/innerNodeTest/histogram_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file histogram_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-03 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | 13 | #include "../../experiment/dataset/lognormal_distribution.h" 14 | #include "../../include/nodes/innerNode/his_model.h" 15 | #include "gtest/gtest.h" 16 | 17 | typedef double KeyType; 18 | typedef 
double ValueType; 19 | typedef std::pair DataType; 20 | 21 | std::vector initData; 22 | std::vector insertData; 23 | std::vector testInsert; 24 | 25 | const int kChildNum = 256; 26 | const int kTestMaxValue = kMaxValue; 27 | 28 | LognormalDataset logData(0.9); 29 | HisModel model(kChildNum); 30 | std::default_random_engine engine(time(0)); 31 | 32 | TEST(TestMultiTrain, MultiTrainHisModel) { 33 | std::vector testTrainData; 34 | std::uniform_real_distribution dis(0, kTestMaxValue); 35 | for (int i = 0; i < 9; i++) { 36 | int tmpSize = std::pow(10, i) - 1; 37 | std::cout << "Start test size: " << tmpSize << std::endl; 38 | testTrainData = std::vector(tmpSize); 39 | for (int j = 0; j < tmpSize; j++) { 40 | KeyType tmpKey = dis(engine); 41 | testTrainData[j] = {tmpKey, tmpKey}; 42 | } 43 | std::sort(testTrainData.begin(), testTrainData.end()); 44 | HisModel tmpModel(kChildNum); 45 | tmpModel.Train(0, testTrainData.size(), testTrainData); 46 | EXPECT_EQ(kChildNum, tmpModel.flagNumber & 0x00FFFFFF); 47 | EXPECT_NE(0, tmpModel.divisor); 48 | for (int j = 0; j < 16; j++) { 49 | EXPECT_GE(tmpModel.base[j], 0); 50 | EXPECT_LT(tmpModel.base[j], kChildNum); 51 | } 52 | for (int j = 0; j < 255; j++) { 53 | int l = tmpModel.offset[(j >> 4)] >> (15 - (j & 0x0000000F)); 54 | l = (l & 0x55555555) + ((l >> 1) & 0x55555555); 55 | l = (l & 0x33333333) + ((l >> 2) & 0x33333333); 56 | l = (l & 0x0f0f0f0f) + ((l >> 4) & 0x0f0f0f0f); 57 | l = (l & 0x00ff00ff) + ((l >> 8) & 0x00ff00ff); 58 | l += tmpModel.base[(j >> 4)]; 59 | 60 | int r = tmpModel.offset[((j + 1) >> 4)] >> (15 - ((j + 1) & 0x0000000F)); 61 | r = (r & 0x55555555) + ((r >> 1) & 0x55555555); 62 | r = (r & 0x33333333) + ((r >> 2) & 0x33333333); 63 | r = (r & 0x0f0f0f0f) + ((r >> 4) & 0x0f0f0f0f); 64 | r = (r & 0x00ff00ff) + ((r >> 8) & 0x00ff00ff); 65 | r += tmpModel.base[((j + 1) >> 4)]; 66 | EXPECT_LE(l, r); 67 | } 68 | std::cout << "Subtest " << i << " over!" 
<< std::endl; 69 | } 70 | } 71 | 72 | TEST(TestTrain, TrainHisModel) { 73 | logData.GenerateDataset(&initData, &insertData, &testInsert); 74 | model.Train(0, initData.size(), initData); 75 | EXPECT_EQ(kChildNum, model.flagNumber & 0x00FFFFFF); 76 | EXPECT_EQ(3, model.flagNumber >> 24); 77 | } 78 | 79 | TEST(TestPredictInitData, PredictInitData) { 80 | for (int i = 0; i < initData.size(); i++) { 81 | int p = model.Predict(initData[i].first); 82 | EXPECT_GE(p, 0); 83 | EXPECT_LT(p, kChildNum); 84 | } 85 | } 86 | 87 | TEST(TestPredictInsertData, PredictInsertData) { 88 | for (int i = 0; i < insertData.size(); i++) { 89 | int p = model.Predict(insertData[i].first); 90 | EXPECT_GE(p, 0); 91 | EXPECT_LT(p, kChildNum); 92 | } 93 | } -------------------------------------------------------------------------------- /src/experiment/dataset/base_dataset.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file base_dataset.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-26 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #ifndef EXPERIMENT_DATASET_BASE_DATASET_H_ 13 | #define EXPERIMENT_DATASET_BASE_DATASET_H_ 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "../experiment_params.h" 23 | 24 | class BaseDataset { 25 | public: 26 | float proportion; 27 | 28 | explicit BaseDataset(float init) { proportion = init; } 29 | virtual void GenerateDataset(DataVecType *initDataset, 30 | DataVecType *insertDataset, 31 | DataVecType *testInsertQuery) = 0; 32 | template 33 | void SplitInitTest(DistributionType &distribution, DataVecType *initDataset, 34 | DataVecType *insertDataset, DataVecType *testInsertQuery) { 35 | (*initDataset) = std::vector(kDatasetSize); 36 | int end = round(kTestSize * (1 - proportion)); 37 | (*testInsertQuery) = std::vector(end); 38 | std::default_random_engine generator; 39 | 40 | // generate initDataset 41 | for 
(int i = 0; i < kDatasetSize; i++) { 42 | double tmp = distribution(generator) * kMaxValue; 43 | (*initDataset)[i] = {tmp, tmp * 10}; 44 | } 45 | 46 | // generate testInsertQuery 47 | for (int i = 0; i < end; i++) { 48 | double tmp = distribution(generator) * kMaxValue; 49 | (*testInsertQuery)[i] = {tmp, tmp * 10}; 50 | } 51 | 52 | std::sort(initDataset->begin(), initDataset->end()); 53 | // generate insertQuery 54 | if (testInsertQuery->size() > 0) { 55 | for (int i = 10; i < kDatasetSize - 1; i += 10) { 56 | double tmp = 57 | ((*initDataset)[i].first + (*initDataset)[i + 1].first) / 2; 58 | (*insertDataset).push_back({tmp, tmp * 10}); 59 | } 60 | } 61 | std::sort(insertDataset->begin(), insertDataset->end()); 62 | 63 | std::cout << "generate dataset over! init size:" << initDataset->size() 64 | << "\tWrite size:" << testInsertQuery->size() << std::endl; 65 | } 66 | 67 | void SplitInitTest(DataVecType *dataset, DataVecType *initDataset, 68 | DataVecType *insertDataset, DataVecType *testInsertQuery) { 69 | (*initDataset) = std::vector(kDatasetSize); 70 | int end = round(kTestSize * (1 - proportion)); 71 | (*testInsertQuery) = std::vector(end); 72 | 73 | unsigned seed = std::clock(); 74 | std::default_random_engine engine(seed); 75 | shuffle((*dataset).begin(), (*dataset).end(), engine); 76 | 77 | int i = 0; 78 | for (int j = 0; i < end; i++, j++) { 79 | (*testInsertQuery)[j] = (*dataset)[i]; 80 | } 81 | end = (*dataset).size(); 82 | for (int j = 0; i < end; i++, j++) { 83 | (*initDataset)[j] = (*dataset)[i]; 84 | } 85 | 86 | std::sort(initDataset->begin(), initDataset->end()); 87 | if (testInsertQuery->size() > 0) { 88 | for (int i = 10; i < kDatasetSize - 1; i += 10) { 89 | double tmp = 90 | ((*initDataset)[i].first + (*initDataset)[i + 1].first) / 2; 91 | (*insertDataset).push_back({tmp, tmp * 10}); 92 | } 93 | } 94 | std::sort(insertDataset->begin(), insertDataset->end()); 95 | 96 | std::cout << " init size:" << (*initDataset).size() 97 | << "\tWrite size:" << 
(*testInsertQuery).size() << std::endl; 98 | } 99 | }; 100 | 101 | #endif // EXPERIMENT_DATASET_BASE_DATASET_H_ 102 | -------------------------------------------------------------------------------- /src/include/nodes/rootNode/root_nodes.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file root_nodes.h 3 | * @author Jiaoyi 4 | * @brief the details of root nodes 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef NODES_ROOTNODE_ROOT_NODES_H_ 12 | #define NODES_ROOTNODE_ROOT_NODES_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #include "../../construct/structures.h" 19 | #include "../../params.h" 20 | #include "trainModel/linear_regression.h" 21 | #include "trainModel/piecewiseLR.h" 22 | #include "trainModel/prefetch_plr.h" 23 | 24 | /** 25 | * @brief piecewise linear regression root node 26 | * 27 | * The piecewise linear regression model with five segments can allocate data 28 | * points more evenly. 29 | * 30 | * Since the root node is always in the cache, we do not limit its size here. We 31 | * use a five-segment P. LR model, occupying 76 bytes. In addition, to support 32 | * the prefetch function, we add a prefetch prediction model to speed up the 33 | * process of accessing a data point. 34 | * 35 | * @tparam DataVectorType the type of dataset 36 | * @tparam KeyType the type of the given key value 37 | */ 38 | template 39 | class PLRType { 40 | public: 41 | // *** Constructed Types and Constructor 42 | 43 | /** 44 | * @brief The type of the model: piecewise linear regression 45 | */ 46 | typedef PiecewiseLR ModelType; 47 | 48 | /** 49 | * @brief Construct a new PLRType object with the default constructor 50 | */ 51 | PLRType() = default; 52 | 53 | /** 54 | * @brief Construct a new PLRType object and train the plr model with the 55 | * given dataset. 
56 | * 57 | * PLR root node uses a piecewise linear regression model to predict the index 58 | * of the next node. When finding the position of the data point, we first 59 | * find the first breakpoint greater than or equal to the given key value, and 60 | * then use the corresponding model parameters for the calculation and 61 | * boundary processing. 62 | * 63 | * @param[in] childNum the number of the child nodes in the root node 64 | * @param[in] dataset the dataset used to train the plr model of the root node 65 | */ 66 | PLRType(int childNum, const DataVectorType &dataset) { 67 | flagNumber = PLR_ROOT_NODE; 68 | model.maxChildIdx = std::max(2, childNum - 1); 69 | model.Train(dataset); 70 | } 71 | 72 | public: 73 | // *** Static Constant Options and Values of P. LR Root Node Objects 74 | 75 | /** 76 | * @brief The time cost of the plr root node. 77 | */ 78 | static constexpr double kTimeCost = carmi_params::kPLRRootTime; 79 | 80 | public: 81 | //*** Public Data Members of P. LR Root Node Objects 82 | 83 | /** 84 | * @brief the main root model: piecewise linear regression model with five 85 | * segments to allocate the dataset to the child nodes. 86 | * 87 | * We use this model to predict the index of the next node, and use the raw 88 | * output of this model (leaf index before rounding down) as the input to the 89 | * prefetch prediction model. (72 bytes) 90 | */ 91 | ModelType model; 92 | 93 | /** 94 | * @brief the prefetch prediction model. 95 | * 96 | * This model is also a piecewise linear regression model, which uses the 97 | * output of the main root model to compute a block index. In this model, 98 | * the slope and intercept of each segment are forced to be integers, so that 99 | * within each segment, each leaf node is mapped to the same number of data 100 | * blocks. 
101 | */ 102 | PrefetchPLR fetch_model; 103 | 104 | /** 105 | * @brief the type of the root node: PLR_ROOT_NODE (4 bytes) 106 | */ 107 | int flagNumber; 108 | }; 109 | 110 | #endif // NODES_ROOTNODE_ROOT_NODES_H_ 111 | -------------------------------------------------------------------------------- /src/include/memoryLayout/empty_block.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file empty_block.h 3 | * @author Jiaoyi 4 | * @brief the class of empty memory blocks 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef MEMORYLAYOUT_EMPTY_BLOCK_H_ 12 | #define MEMORYLAYOUT_EMPTY_BLOCK_H_ 13 | 14 | #include 15 | #include 16 | 17 | /** 18 | * @brief Basic class used to manage empty memory blocks. 19 | * 20 | * This class is used to manage empty memory blocks with the size m_width. 21 | * In CARMI, this class can speed up the process of memory allocation, which 22 | * only needs to return the first element in m_block. 23 | * 24 | * This class is used as a member type of the vector of the DataArrayStructure 25 | * in data_array.h. Users can customize the granularity of the width of the 26 | * empty memory blocks according to the node type they implement. For example, 27 | * these can be 1~7 for the CF array leaf node. At the same time, it can also 28 | * have a coarser granularity:2, 4, 8, ..., 512, 1024, 2048. 29 | */ 30 | class EmptyMemoryBlock { 31 | public: 32 | //*** Constructor 33 | 34 | /** 35 | * @brief Construct a new EmptyMemoryBlock object, set the width of the empty 36 | * memory block 37 | * 38 | * @param[in] width the width of this type of empty memory block 39 | */ 40 | explicit EmptyMemoryBlock(int width) { m_width = width; } 41 | 42 | public: 43 | //*** Public Functions of EmptyMemoryBlock Objects 44 | 45 | /** 46 | * @brief Allocate a block of empty memory. 
If the set of memory blocks of 47 | * size m_width has empty blocks available for allocation, which means there 48 | * are still elements in m_block, then return the empty memory block index 49 | * with the smallest index among all the empty blocks. If there are no empty 50 | * blocks, allocation fails, and this function returns -1. 51 | * 52 | * @return int: if allocation is successful, return the smallest element in 53 | * m_block, otherwise return -1. 54 | * @retval -1 allocation fails 55 | */ 56 | int Allocate() { 57 | // Case 1: if the set is empty, allocation fails 58 | if (m_block.empty()) { 59 | return -1; 60 | } 61 | // Case 2: allocation succeeds, return the smallest element of m_block and 62 | // erase this block from the empty set 63 | int res = *m_block.begin(); 64 | m_block.erase(m_block.begin()); 65 | return res; 66 | } 67 | 68 | /** 69 | * @brief add the corresponding empty blocks (insert the left index of the 70 | * block into the m_block set) 71 | * 72 | * @param[in] idx the index of blocks 73 | * @param[in] size the size of blocks 74 | * @return int: the size of the empty block after this action 75 | */ 76 | int AddBlock(int idx, int size) { 77 | if (size < m_width) return -1; 78 | int newIdx = idx + size - m_width; 79 | m_block.insert(newIdx); 80 | return size - m_width; 81 | } 82 | 83 | /** 84 | * @brief check whether the memory block with the beginning index idx is 85 | * empty, return the check result 86 | * 87 | * @param[in] idx the beginning index of this block 88 | * @retval true this block is empty 89 | * @retval false this block is not empty and has been allocated 90 | */ 91 | bool IsEmpty(int idx) { 92 | std::set::iterator it = m_block.find(idx); 93 | if (it != m_block.end()) 94 | return true; 95 | else 96 | return false; 97 | } 98 | 99 | public: 100 | //*** Public Data Members of EmptyMemoryBlock Objects 101 | 102 | /** 103 | * @brief used to store the beginning indexes of all empty memory blocks with 104 | * m_width 105 | */ 106 | 
std::set m_block; 107 | 108 | /** 109 | * @brief the width of this empty memory block 110 | */ 111 | int m_width; 112 | }; 113 | 114 | #endif // MEMORYLAYOUT_EMPTY_BLOCK_H_ 115 | -------------------------------------------------------------------------------- /src/include/base_node.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file base_node.h 3 | * @author Jiaoyi 4 | * @brief the main structures of nodes 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef BASE_NODE_H_ 12 | #define BASE_NODE_H_ 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "nodes/innerNode/bs_model.h" 22 | #include "nodes/innerNode/his_model.h" 23 | #include "nodes/innerNode/lr_model.h" 24 | #include "nodes/innerNode/plr_model.h" 25 | #include "nodes/leafNode/cfarray_type.h" 26 | #include "nodes/leafNode/external_array_type.h" 27 | #include "nodes/rootNode/root_nodes.h" 28 | 29 | /** 30 | * @brief the root type of CARMI 31 | * 32 | * This class inherits PLRType as the root node. When accessing a data point, we 33 | * first use the root node's model to compute the next node's index. In the 34 | * CARMI framework, the object of this class serves as one of its private 35 | * members. 36 | * 37 | * @tparam DataVectorType the type of data vector 38 | * @tparam KeyType the type of the given key value 39 | */ 40 | template 41 | class CARMIRoot : public PLRType { 42 | public: 43 | // *** Constructed Types and Constructor 44 | 45 | /** 46 | * @brief Construct a new CARMIRoot object with the default constructor 47 | */ 48 | CARMIRoot() = default; 49 | 50 | /** 51 | * @brief Copy from a PLRType object to an object of the current object. 
52 | * 53 | * @param[in] currnode the PLRType object 54 | * @return CARMIRoot& the object of the current class 55 | */ 56 | CARMIRoot& operator=(const PLRType& currnode) { 57 | this->PLRType::model = currnode.model; 58 | this->PLRType::fetch_model = currnode.fetch_model; 59 | this->flagNumber = currnode.flagNumber; 60 | return *this; 61 | } 62 | }; 63 | 64 | /** 65 | * @brief the 64 bytes structure for all types of nodes to support the 66 | * cache-aware design 67 | * 68 | * The first byte is always the node type identifier, and the next three bytes 69 | * are used to store the number of child nodes (the number of data blocks for 70 | * leaf nodes). For inner nodes, the following 4 bytes represent the starting 71 | * index of the child nodes in the node array. For leaf nodes, they store the 72 | * starting index of data blocks in the data array instead. The remaining 56 73 | * bytes store additional information depending on the tree node type. 74 | * 75 | * @tparam KeyType the type of the given key value 76 | * @tparam ValueType the type of the value 77 | */ 78 | template , 80 | typename Alloc = std::allocator>> 81 | union BaseNode { 82 | /** 83 | * @brief the linear regression inner node 84 | */ 85 | LRModel lr; 86 | 87 | /** 88 | * @brief the piecewise linear regression inner node 89 | */ 90 | PLRModel plr; 91 | 92 | /** 93 | * @brief the histogram inner node 94 | */ 95 | HisModel his; 96 | 97 | /** 98 | * @brief the binary search inner node 99 | */ 100 | BSModel bs; 101 | 102 | /** 103 | * @brief the cache-friendly array leaf node 104 | */ 105 | CFArrayType cfArray; 106 | 107 | /** 108 | * @brief the external array leaf node 109 | */ 110 | ExternalArray externalArray; 111 | 112 | BaseNode() {} 113 | ~BaseNode() {} 114 | 115 | BaseNode& operator=(const BaseNode& currnode) { 116 | if (this != &currnode) { 117 | this->lr = currnode.lr; 118 | } 119 | return *this; 120 | } 121 | }; 122 | 123 | #endif // BASE_NODE_H_ 124 | 
-------------------------------------------------------------------------------- /src/include/memoryLayout/node_array.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file node_array.h 3 | * @author Jiaoyi 4 | * @brief manage the node array 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef MEMORYLAYOUT_NODE_ARRAY_H_ 12 | #define MEMORYLAYOUT_NODE_ARRAY_H_ 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "../base_node.h" 20 | 21 | /** 22 | * @brief the structure of node array 23 | * 24 | * @tparam KeyType the type of the keyword 25 | * @tparam ValueType the type of the value 26 | * @tparam Compare A binary predicate that takes two element keys as arguments 27 | * and returns a bool. 28 | * @tparam Alloc Type of the allocator object used to define the storage 29 | * allocation model. 30 | */ 31 | template , 33 | typename Alloc = std::allocator>> 34 | class NodeArrayStructure { 35 | public: 36 | // *** Constructed Types and Constructor 37 | 38 | /** 39 | * @brief Construct a new Node Array Structure object 40 | * Initialize the nowNodeNumber with 0 and construct 4096 empty members of 41 | * BaseNode vector 42 | */ 43 | NodeArrayStructure() { 44 | nowNodeNumber = 0; 45 | std::vector>( 46 | 4096, BaseNode()) 47 | .swap(nodeArray); 48 | } 49 | 50 | public: 51 | //*** Public Functions of NodeArrayStructure 52 | 53 | /** 54 | * @brief allocate a block of empty memory to store the nodes 55 | * 56 | * @param[in] size the size of nodes needed to be stored 57 | * @return int: the beginning index of this allocated memory 58 | */ 59 | int AllocateNodeMemory(int size); 60 | 61 | /** 62 | * @brief After the construction of CARMI is completed, the useless memory 63 | * exceeding the needed size will be released. 
64 | * 65 | * @param[in] neededSize the size of needed node blocks 66 | */ 67 | void ReleaseUselessMemory(int neededSize); 68 | 69 | public: 70 | //*** Public Data Member of Node Array Structure Objects 71 | 72 | /** 73 | * @brief the node array mentioned in the paper. 74 | * 75 | * All tree nodes, including both inner nodes and leaf nodes, are stored in 76 | * this node array. Each member occupies a fixed size according to the 77 | * BaseNode class. 78 | */ 79 | std::vector> nodeArray; 80 | 81 | /** 82 | * @brief the used size of nodeArray 83 | */ 84 | int nowNodeNumber; 85 | }; 86 | 87 | template 89 | int NodeArrayStructure::AllocateNodeMemory( 90 | int size) { 91 | if (size < 0) { 92 | throw std::invalid_argument( 93 | "NodeArrayStructure::AllocateNodeMemory: the size is less than 0."); 94 | } 95 | int newLeft = -1; 96 | unsigned int tmpSize = nodeArray.size(); 97 | 98 | // allocation fails, need to expand the nodeArray 99 | while (nowNodeNumber + size > tmpSize) { 100 | BaseNode t; 101 | tmpSize *= 1.25; 102 | nodeArray.resize(tmpSize, t); 103 | } 104 | newLeft = nowNodeNumber; 105 | nowNodeNumber += size; 106 | return newLeft; 107 | } 108 | 109 | template 111 | void NodeArrayStructure::ReleaseUselessMemory(int neededSize) { 113 | if (neededSize < 0) { 114 | throw std::invalid_argument( 115 | "NodeArrayStructure::ReleaseUselessMemory: the size is less than 0."); 116 | } 117 | std::vector> tmp( 118 | nodeArray.begin(), nodeArray.begin() + neededSize); 119 | std::vector>().swap(nodeArray); 120 | nodeArray = tmp; 121 | } 122 | #endif // MEMORYLAYOUT_NODE_ARRAY_H_ 123 | -------------------------------------------------------------------------------- /src/include/nodes/rootNode/trainModel/linear_regression.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file linear_regression.h 3 | * @author Jiaoyi 4 | * @brief linear regression model 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 
2021 9 | * 10 | */ 11 | #ifndef NODES_ROOTNODE_TRAINMODEL_LINEAR_REGRESSION_H_ 12 | #define NODES_ROOTNODE_TRAINMODEL_LINEAR_REGRESSION_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "../../../params.h" 22 | 23 | /** 24 | * @brief linear regression model for root node 25 | * 26 | * This model is a very simple root node consisting of a linear regression 27 | * model. In addition to the number of child nodes, only two linear regression 28 | * model parameters need to be stored. We only need one model prediction and the 29 | * boundary condition processing to get the index of the next child node. 30 | * 31 | * @tparam DataVectorType the vector type of the dataset 32 | * @tparam KeyType the type of the key value 33 | */ 34 | template 35 | class LinearRegression { 36 | public: 37 | // *** Constructor 38 | 39 | /** 40 | * @brief Construct a new Linear Regression object and set the default value 41 | * of the linear regression model parameters. 
42 | */ 43 | LinearRegression() { 44 | slope = 0.0001; 45 | intercept = 0.666; 46 | maxChildIdx = 2; 47 | minValue = 0; 48 | } 49 | 50 | /** 51 | * @brief use the given dataset to train the lr model 52 | * 53 | * @param[in] dataset the original dataset, each data point is: {key, value} 54 | */ 55 | void Train(const DataVectorType &dataset) { 56 | int idx = 0; 57 | int size = dataset.size(); 58 | if (size == 0) return; 59 | minValue = dataset[0].first; 60 | std::vector index(size, 0); 61 | // construct the training dataset, x is the key value in the dataset, y is 62 | // the corresponding ratio of index in the maxChildIdx 63 | for (int i = 0; i < size; i++) { 64 | index[idx++] = static_cast(i) / size * maxChildIdx; 65 | } 66 | 67 | // train the lr model 68 | long double t1 = 0, t2 = 0, t3 = 0, t4 = 0; 69 | for (int i = 0; i < size; i++) { 70 | t1 += static_cast(dataset[i].first - minValue) * 71 | static_cast(dataset[i].first - minValue); 72 | t2 += static_cast(dataset[i].first - minValue); 73 | t3 += static_cast(dataset[i].first - minValue) * 74 | static_cast(index[i]); 75 | t4 += static_cast(index[i]); 76 | } 77 | if (t1 * size - t2 * t2) { 78 | slope = (t3 * size - t2 * t4) / (t1 * size - t2 * t2); 79 | intercept = (t1 * t4 - t2 * t3) / (t1 * size - t2 * t2); 80 | } else { 81 | slope = 1.0; 82 | intercept = 1.0; 83 | } 84 | } 85 | 86 | /** 87 | * @brief output the unrounded index of the next node of the given key value 88 | * 89 | * @param[in] key the given key value 90 | * @return double: the unrounded index 91 | */ 92 | inline double Predict(KeyType key) const { 93 | // predict the index of the next node using the lr model 94 | double p = slope * static_cast(key - minValue) + intercept; 95 | // boundary processing 96 | if (p < 0) 97 | p = 0; 98 | else if (p > maxChildIdx) 99 | p = maxChildIdx; 100 | return p; 101 | } 102 | 103 | public: 104 | //*** Public Data Members of LR Model Objects 105 | 106 | /** 107 | * @brief The number of the child nodes 108 | */ 109 | 
int maxChildIdx; 110 | 111 | private: 112 | //*** Private Data Members of LR Model Objects 113 | 114 | /** 115 | * @brief The linear regression parameter: the slope 116 | */ 117 | double slope; 118 | 119 | /** 120 | * @brief The linear regression parameter: the intercept 121 | */ 122 | double intercept; 123 | 124 | /** 125 | * @brief The minimum value. 126 | */ 127 | KeyType minValue; 128 | }; 129 | #endif // NODES_ROOTNODE_TRAINMODEL_LINEAR_REGRESSION_H_ 130 | -------------------------------------------------------------------------------- /src/include/func/split_function.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file split_function.h 3 | * @author Jiaoyi 4 | * @brief the split function for insert function 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef FUNC_SPLIT_FUNCTION_H_ 12 | #define FUNC_SPLIT_FUNCTION_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | 19 | #include "../carmi.h" 20 | #include "../construct/minor_function.h" 21 | #include "../params.h" 22 | 23 | template 25 | template 26 | inline void CARMI::Split(int idx) { 27 | // get the parameters of this leaf node 28 | int previousIdx = node.nodeArray[idx].cfArray.previousLeaf; 29 | int nextIdx = node.nodeArray[idx].cfArray.nextLeaf; 30 | 31 | DataVectorType tmpDataset; 32 | int leftIdx; 33 | // extract pure data points 34 | if (isPrimary) { 35 | leftIdx = node.nodeArray[idx].externalArray.m_left; 36 | int rightIdx = 37 | leftIdx + (node.nodeArray[idx].externalArray.flagNumber & 0x00FFFFFF); 38 | tmpDataset = ExternalArray::ExtractDataset( 39 | external_data, leftIdx, rightIdx, recordLength); 40 | } else { 41 | leftIdx = node.nodeArray[idx].cfArray.m_left; 42 | int rightIdx = 43 | leftIdx + (node.nodeArray[idx].cfArray.flagNumber & 0x00FFFFFF); 44 | tmpDataset = 45 | CFArrayType::ExtractDataset( 46 | data, leftIdx, rightIdx); 47 | } 48 | int actualSize = tmpDataset.size(); 49 | 
50 | // create a new inner node and store it in the node[idx] 51 | auto currnode = LRModel(kInsertNewChildNumber); 52 | currnode.Train(0, actualSize, tmpDataset); 53 | 54 | std::vector perSize(kInsertNewChildNumber, emptyRange); 55 | IndexPair range{0, actualSize}; 56 | NodePartition>(currnode, range, tmpDataset, 57 | &perSize); 58 | currnode.childLeft = node.AllocateNodeMemory(kInsertNewChildNumber); 59 | node.nodeArray[idx].lr = currnode; 60 | 61 | int tmpLeft = leftIdx; 62 | // create kInsertNewChildNumber new leaf nodes and store them in the node 63 | // array 64 | for (int i = 0; i < kInsertNewChildNumber; i++) { 65 | LeafNodeType tmpLeaf; 66 | std::vector prefetchIndex(perSize[i].size); 67 | int s = perSize[i].left; 68 | int e = perSize[i].left + perSize[i].size; 69 | for (int j = s; j < e; j++) { 70 | double predictLeafIdx = root.model.Predict(tmpDataset[j].first); 71 | int p = root.fetch_model.PrefetchPredict(predictLeafIdx); 72 | prefetchIndex[j - s] = p; 73 | } 74 | tmpLeaf.Init(tmpDataset, prefetchIndex, s, &data); 75 | if (isPrimary) { 76 | tmpLeaf.m_left = tmpLeft; 77 | tmpLeft += perSize[i].size; 78 | } 79 | node.nodeArray[currnode.childLeft + i].cfArray = 80 | *(reinterpret_cast *>( 81 | &tmpLeaf)); 82 | } 83 | if (idx == lastLeaf) { 84 | lastLeaf = currnode.childLeft + kInsertNewChildNumber - 1; 85 | } 86 | if (idx == firstLeaf) { 87 | firstLeaf = currnode.childLeft; 88 | } 89 | 90 | // if the original leaf node is the cf array leaf node, we need to update the 91 | // pointer to the siblings of the new leaf nodes 92 | if (!isPrimary) { 93 | if (previousIdx >= 0) { 94 | node.nodeArray[previousIdx].cfArray.nextLeaf = currnode.childLeft; 95 | } 96 | node.nodeArray[currnode.childLeft].cfArray.previousLeaf = previousIdx; 97 | node.nodeArray[currnode.childLeft].cfArray.nextLeaf = 98 | currnode.childLeft + 1; 99 | int end = currnode.childLeft + kInsertNewChildNumber - 1; 100 | for (int i = currnode.childLeft + 1; i < end; i++) { 101 | 
node.nodeArray[i].cfArray.previousLeaf = i - 1; 102 | node.nodeArray[i].cfArray.nextLeaf = i + 1; 103 | } 104 | node.nodeArray[end].cfArray.previousLeaf = end - 1; 105 | if (nextIdx != -1) { 106 | node.nodeArray[end].cfArray.nextLeaf = nextIdx; 107 | node.nodeArray[nextIdx].cfArray.previousLeaf = end; 108 | } 109 | } 110 | } 111 | 112 | #endif // FUNC_SPLIT_FUNCTION_H_ 113 | -------------------------------------------------------------------------------- /src/include/func/insert_function.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file insert_function.h 3 | * @author Jiaoyi 4 | * @brief insert a record 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef FUNC_INSERT_FUNCTION_H_ 12 | #define FUNC_INSERT_FUNCTION_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "../carmi.h" 22 | #include "../construct/minor_function.h" 23 | #include "./split_function.h" 24 | 25 | template 27 | std::pair *, bool> 28 | CARMI::Insert(const DataType &datapoint, 29 | int *currblock, 30 | int *currslot) { 31 | int idx = 0; // idx in the node array 32 | int type = root.flagNumber; 33 | while (1) { 34 | switch (type) { 35 | case PLR_ROOT_NODE: 36 | // Case 0: this node is the plr root node 37 | // use the plr root node to find the index of the next node 38 | idx = root.PLRType::model.Predict( 39 | datapoint.first); 40 | break; 41 | case LR_INNER_NODE: 42 | // Case 1: this node is the lr inner node 43 | // use the predict function of lr inner node to obtain the index of the 44 | // next node 45 | idx = node.nodeArray[idx].lr.Predict(datapoint.first); 46 | break; 47 | case PLR_INNER_NODE: 48 | // Case 2: this node is the plr inner node 49 | // use the predict function of plr inner node to obtain the index of the 50 | // next node 51 | idx = node.nodeArray[idx].plr.Predict(datapoint.first); 52 | break; 53 | case HIS_INNER_NODE: 54 
| // Case 3: this node is the his inner node 55 | // use the predict function of his inner node to obtain the index of the 56 | // next node 57 | idx = node.nodeArray[idx].his.Predict(datapoint.first); 58 | break; 59 | case BS_INNER_NODE: 60 | // Case 4: this node is the bs inner node 61 | // use the predict function of bs inner node to obtain the index of the 62 | // next node 63 | idx = node.nodeArray[idx].bs.Predict(datapoint.first); 64 | break; 65 | case ARRAY_LEAF_NODE: { 66 | // Case 5: this node is the cache-friendly array leaf node 67 | // insert the data point in the cf leaf node 68 | bool isSuccess = node.nodeArray[idx].cfArray.Insert( 69 | datapoint, currblock, currslot, &data); 70 | if (isSuccess) { 71 | if (datapoint.first > lastKey) { 72 | lastLeaf = idx; 73 | lastKey = datapoint.first; 74 | } 75 | if (datapoint.first < firstKey) { 76 | firstLeaf = idx; 77 | firstKey = datapoint.first; 78 | } 79 | currsize++; 80 | return {&node.nodeArray[idx], true}; 81 | } else { 82 | // if this leaf node cannot accomodate more data points, we need to 83 | // split it and replace it with a new inner node and several new leaf 84 | // nodes 85 | Split>(idx); 86 | idx = node.nodeArray[idx].lr.Predict(datapoint.first); 87 | } 88 | break; 89 | } 90 | case EXTERNAL_ARRAY_LEAF_NODE: { 91 | // Case 6: this node is the external array leaf node 92 | // insert the key value of the data point in the external leaf node 93 | bool isSuccess = 94 | node.nodeArray[idx].externalArray.Insert(datapoint, &currsize); 95 | 96 | if (isSuccess) { 97 | *currslot = currsize - 1; 98 | return {&node.nodeArray[idx], true}; 99 | } else { 100 | // if this leaf node cannot accomodate more data points, we need to 101 | // split it and replace it with a new inner node and several new leaf 102 | // nodes 103 | Split>(idx); 104 | idx = node.nodeArray[idx].lr.Predict(datapoint.first); 105 | } 106 | } 107 | } 108 | 109 | type = node.nodeArray[idx].lr.flagNumber >> 24; 110 | } 111 | } 112 | 113 | #endif // 
FUNC_INSERT_FUNCTION_H_ 114 | -------------------------------------------------------------------------------- /src/include/params.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file params.h 3 | * @author Jiaoyi 4 | * @brief parameters in carmi_params space 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef PARAMS_H_ 12 | #define PARAMS_H_ 13 | 14 | #define DEBUG 15 | 16 | #ifdef __APPLE__ 17 | #include 18 | #if TARGET_OS_OSX == 1 19 | #define CATCH_PLATFORM_MAC 20 | #elif TARGET_OS_IPHONE == 1 21 | #define CATCH_PLATFORM_IPHONE 22 | #endif 23 | 24 | #elif defined(linux) || defined(__linux) || defined(__linux__) 25 | #define CATCH_PLATFORM_LINUX 26 | 27 | #elif defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || \ 28 | defined(_MSC_VER) || defined(__MINGW32__) 29 | #define CATCH_PLATFORM_WINDOWS 30 | #endif 31 | 32 | /** 33 | * @brief These are parameters in the CARMI framework. The first three 34 | * parameters are the maximum capacity of the cf array leaf node and external 35 | * array leaf node we have provided in the source code, and the boundary value 36 | * for switching between the dynamic programming algorithm and the greedy node 37 | * selection algorithm. Users can change their values according to their actual 38 | * needs. As for the parameters of the time costs of different nodes, users can 39 | * use our profiler to obtain them on their machine. 40 | */ 41 | namespace carmi_params { 42 | /** 43 | * @brief bytes, the size of a data block in cf array leaf nodes. 44 | * The value must be an integer multiple of the size of the cache line, the 45 | * reference values are: 64, 128, 256, 512, etc. 46 | */ 47 | static constexpr int kMaxLeafNodeSize = 256; 48 | 49 | /** 50 | * @brief The maximum number of data points in an external leaf node. 51 | * This value is generally an integer multiple of 2. 
Since the external dataset 52 | * is not stored in our index structure, the value can be larger to reduce the 53 | * space cost. Reference values are 512, 1024, 2048, and so on. 54 | */ 55 | static constexpr int kMaxLeafNodeSizeExternal = 1024; 56 | 57 | /** 58 | * @brief The maximum number of data points which can use the DP algorithm to 59 | * construct an inner node. If the size of the sub-dataset exceeds this 60 | * parameter, greedy node selection algorithm is used to construct the inner 61 | * node. 62 | * This value needs to be no less than the first two parameters. 63 | */ 64 | static constexpr int kAlgorithmThreshold = 60000; 65 | 66 | /** 67 | * @brief The latency of a memory access 68 | */ 69 | static constexpr double kMemoryAccessTime = 80.09; 70 | 71 | /** 72 | * @brief The time cost of the lr root node including the latency of 73 | * accessing the cache (8.29 ns) and the CPU time (3.25 ns) 74 | */ 75 | static constexpr double kLRRootTime = 11.54; 76 | 77 | /** 78 | * @brief The time cost of the plr root node including the latency of 79 | * accessing the cache (11.24 ns) and the CPU time (18.38 ns) 80 | */ 81 | static constexpr double kPLRRootTime = 29.62; 82 | 83 | /** 84 | * @brief The time cost of the lr inner node including the latency of 85 | * memory access and the CPU time 86 | */ 87 | static constexpr double kLRInnerTime = kMemoryAccessTime + 5.23; 88 | 89 | /** 90 | * @brief The time cost of the plr inner node including the latency of 91 | * memory access and the CPU time 92 | */ 93 | static constexpr double kPLRInnerTime = kMemoryAccessTime + 22.8; 94 | 95 | /** 96 | * @brief The time cost of the his inner node including the latency of 97 | * memory access and the CPU time 98 | */ 99 | static constexpr double kHisInnerTime = kMemoryAccessTime + 18.44; 100 | 101 | /** 102 | * @brief The time cost of the bs inner node including the latency of 103 | * memory access and the CPU time 104 | */ 105 | static constexpr double kBSInnerTime = 
kMemoryAccessTime + 26.38; 106 | 107 | /** 108 | * @brief The time cost of moving a data point 109 | */ 110 | static constexpr double kCostMoveTime = 6.25; 111 | 112 | /** 113 | * @brief The basic time cost of a leaf node including the latency of accessing 114 | * the leaf node in the memory and the time cost of searching in the leaf node 115 | * (25.4 ns). 116 | */ 117 | static constexpr double kLeafBaseTime = kMemoryAccessTime + 25.4; 118 | 119 | /** 120 | * @brief The average time cost of a binary search 121 | */ 122 | static constexpr double kCostBSTime = 10.9438; 123 | } // namespace carmi_params 124 | 125 | #endif // PARAMS_H_ 126 | -------------------------------------------------------------------------------- /src/include/construct/dp_inner.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file dp_inner.h 3 | * @author Jiaoyi 4 | * @brief use dynamic programming algorithm to construct inner nodes 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_DP_INNER_H_ 12 | #define CONSTRUCT_DP_INNER_H_ 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | #include "../carmi.h" 19 | 20 | template 22 | template 23 | void CARMI::UpdateDPOptSetting( 24 | const DataRange &dataRange, int c, double frequency_weight, 25 | NodeCost *optimalCost, InnerNodeType *optimal_node_struct) { 26 | double space_cost = kBaseNodeSpace * static_cast(c); 27 | double time_cost = 28 | InnerNodeType::kTimeCost * static_cast(frequency_weight); 29 | double RootCost = time_cost + lambda * space_cost; 30 | // Case 1: the cost of the root node has been larger than the optimal cost, 31 | // return directly 32 | if (RootCost > optimalCost->cost) { 33 | return; 34 | } 35 | 36 | // Case 2: construct an inner node and divide the dataset into c sub-datasets 37 | SubDataset subDataset(c); 38 | auto currnode = InnerDivideAll(dataRange, c, &subDataset); 39 | 40 | for (int i = 0; i < c; 
i++) { 41 | // calculate the cost of each child node 42 | DataRange range(subDataset.subInit[i], subDataset.subFind[i], 43 | subDataset.subInsert[i]); 44 | // Case 2.1: if this inner node fails to divide dataset evenly, return 45 | // directly 46 | if (range.initRange.size + range.initRange.size == 47 | dataRange.initRange.size + dataRange.insertRange.size) { 48 | return; 49 | } 50 | 51 | NodeCost res = DP(range); 52 | 53 | space_cost += res.space; 54 | time_cost += res.time; 55 | RootCost += lambda * res.space + res.time; 56 | } 57 | // if the current cost is smaller than the optimal cost, update the optimal 58 | // cost and node setting 59 | if (RootCost <= optimalCost->cost) { 60 | *optimalCost = {time_cost, space_cost, RootCost}; 61 | *optimal_node_struct = currnode; 62 | } 63 | } 64 | 65 | template 67 | NodeCost CARMI::DPInner( 68 | const DataRange &dataRange) { 69 | // the optimal cost of this sub-dataset 70 | NodeCost optimalCost{DBL_MAX, DBL_MAX, DBL_MAX}; 71 | // the optimal node of this sub-dataset 72 | BaseNode optimal_node_struct = emptyNode; 73 | // calculate the weight of the frequency of this sub-dataset (findQuery and 74 | // insertQury) 75 | double frequency_weight = CalculateFrequencyWeight(dataRange); 76 | int tmpEnd = std::min(0x00FFFFFF, dataRange.initRange.size / 16); 77 | tmpEnd = std::max(tmpEnd, kMinChildNumber); 78 | for (int c = kMinChildNumber; c <= tmpEnd; c *= 2) { 79 | // Case 1: construct a LR inner node, if it is better than the current 80 | // optimal setting, then use it to update the optimal setting 81 | UpdateDPOptSetting>( 82 | dataRange, c, frequency_weight, &optimalCost, 83 | &(optimal_node_struct.lr)); 84 | // Case 2: construct a P. 
LR inner node, if it is better than the current 85 | // optimal setting, then use it to update the optimal setting 86 | UpdateDPOptSetting>( 87 | dataRange, c, frequency_weight, &optimalCost, 88 | &(optimal_node_struct.plr)); 89 | // Case 3: construct a His inner node, if it is better than the current 90 | // optimal setting, then use it to update the optimal setting 91 | if (c <= kHisMaxChildNumber) 92 | UpdateDPOptSetting>( 93 | dataRange, c, frequency_weight, &optimalCost, 94 | &(optimal_node_struct.his)); 95 | // Case 4: construct a BS inner node, if it is better than the current 96 | // optimal setting, then use it to update the optimal setting 97 | if (c <= kBSMaxChildNumber) 98 | UpdateDPOptSetting>( 99 | dataRange, c, frequency_weight, &optimalCost, 100 | &(optimal_node_struct.bs)); 101 | } 102 | // store the optimal setting of this sub-dataset 103 | structMap.insert({dataRange.initRange, optimal_node_struct}); 104 | // store the minimum cost of this sub-dataset 105 | COST.insert({dataRange.initRange, optimalCost}); 106 | return optimalCost; 107 | } 108 | 109 | #endif // CONSTRUCT_DP_INNER_H_ 110 | -------------------------------------------------------------------------------- /src/example/example.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file example.cpp 3 | * @author Jiaoyi 4 | * @brief The examples of CARMI 5 | * @version 3.0 6 | * @date 2021-04-07 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "../include/carmi_external_map.h" 18 | #include "../include/carmi_map.h" 19 | const float kWriteHeavy = 0.5; // write-heavy workload 20 | 21 | void TestCarmi() { 22 | // generate datasets 23 | std::vector> initDataset(10, {1, 1}); 24 | for (int i = 0; i < 10; i++) { 25 | initDataset[i].first = i * 2; 26 | } 27 | 28 | CARMIMap carmi(initDataset.begin(), initDataset.end()); 29 | 30 | // find the value of the 
given key 31 | auto it = carmi.find(initDataset[0].first); 32 | std::cout << "1. FIND is successful, the value of the given key is: " 33 | << it.data() << std::endl; 34 | std::cout << " Current and all subsequent key-value pairs:"; 35 | for (; it != carmi.end(); ++it) { 36 | std::cout << "{" << it.key() << ", " << it.data() << "} "; 37 | } 38 | std::cout << std::endl; 39 | 40 | // insert a data point 41 | std::pair data = {5, 500}; 42 | auto res = carmi.insert(data); 43 | std::cout << "2. INSERT is successful!" << std::endl; 44 | 45 | it = carmi.find(data.first); 46 | std::cout 47 | << " FIND after INSERT is successful, the value of the given key is: " 48 | << it.data() << std::endl; 49 | std::cout << " Current and all subsequent key-value pairs:"; 50 | for (; it != carmi.end(); ++it) { 51 | std::cout << "{" << it.key() << ", " << it.data() << "} "; 52 | } 53 | std::cout << std::endl; 54 | 55 | // delete the record of the given key 56 | int cnt = carmi.erase(initDataset[0].first); 57 | if (cnt > 0) 58 | std::cout << "4. DELETE is successful!" << std::endl; 59 | else 60 | std::cout << " DELETE failed!" << std::endl; 61 | it = carmi.find(initDataset[0].first); 62 | if (it == carmi.end() || it.key() != initDataset[0].first) { 63 | std::cout << " FIND after DELETE failed." 
<< std::endl; 64 | } 65 | } 66 | 67 | template 68 | class ExternalDataType { 69 | public: 70 | typedef ValueType ValueType_; 71 | ExternalDataType() { 72 | k = 0; 73 | v = 0; 74 | } 75 | explicit ExternalDataType(KeyType key, ValueType_ value) { 76 | k = key; 77 | v = value; 78 | } 79 | const KeyType &key() const { return k; } 80 | const ValueType_ &data() const { return v; } 81 | 82 | bool operator<(const ExternalDataType &a) const { 83 | if (k == a.k) { 84 | return v < a.v; 85 | } 86 | return k < a.k; 87 | } 88 | 89 | KeyType k; 90 | ValueType_ v; 91 | }; 92 | 93 | void TestExternalCarmi() { 94 | // generate datasets 95 | int initRatio = kWriteHeavy; 96 | int size = 10; 97 | std::vector> initDataset(size, {1, 1}); 98 | for (int i = 0; i < size; i++) { 99 | initDataset[i].first = i * 2; 100 | } 101 | 102 | const int record_size = sizeof(double) * 2; 103 | int extLen = initDataset.size() * 2 + 10; 104 | double *externalDataset = new double[extLen]; 105 | for (int i = 0, j = 0; i < initDataset.size(); i++) { 106 | *(externalDataset + j) = initDataset[i].first; 107 | *(externalDataset + j + 1) = initDataset[i].second; 108 | j += 2; // due to 109 | } 110 | double maxKey = initDataset[initDataset.size() - 1].first; 111 | std::vector futureinsertKey(1, maxKey + 1); 112 | 113 | CARMIExternalMap> carmi( 114 | externalDataset, futureinsertKey, initDataset.size(), record_size); 115 | 116 | // find the value of the given key 117 | auto it = carmi.find(initDataset[4].first); 118 | std::cout << "1. FIND is successful, the given key is: " << it.key() 119 | << ",\tthe value is: " << it.data() << std::endl; 120 | 121 | // insert data into the external array 122 | *(externalDataset + size * 2) = futureinsertKey[0]; 123 | *(externalDataset + size * 2 + 1) = 100; 124 | 125 | // insert a data point 126 | carmi.insert(futureinsertKey[0]); // insert key into carmi 127 | std::cout << "2. INSERT is successful!" 
<< std::endl; 128 | it = carmi.find(futureinsertKey[0]); 129 | std::cout << " FIND is successful, the given key is: " << it.key() 130 | << ",\tthe value is: " << it.data() << std::endl; 131 | } 132 | 133 | int main() { 134 | std::cout << "Test carmi:" << std::endl; 135 | TestCarmi(); 136 | std::cout << "Test external carmi:" << std::endl; 137 | TestExternalCarmi(); 138 | return 0; 139 | } -------------------------------------------------------------------------------- /src/include/construct/dp_leaf.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file dp_leaf.h 3 | * @author Jiaoyi 4 | * @brief use dynamic programming algorithm to construct a leaf node 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_DP_LEAF_H_ 12 | #define CONSTRUCT_DP_LEAF_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #include "../carmi.h" 21 | #include "../params.h" 22 | #include "./structures.h" 23 | 24 | template 26 | NodeCost CARMI::DPLeaf( 27 | const DataRange &dataRange) { 28 | NodeCost nodeCost{DBL_MAX, DBL_MAX, DBL_MAX}; 29 | BaseNode optimal_node_struct; 30 | 31 | nodeCost.time = 0.0; 32 | if (isPrimary) { 33 | // construct an external array leaf node as the current node 34 | nodeCost.space = 0.0; 35 | 36 | ExternalArray tmp; 37 | tmp.Train(initDataset, dataRange.initRange.left, dataRange.initRange.size); 38 | int findEnd = dataRange.findRange.left + dataRange.findRange.size; 39 | // calculate the time cost of this external array leaf node 40 | for (int i = dataRange.findRange.left; i < findEnd; i++) { 41 | int p = tmp.Predict(findQuery[i].first) + dataRange.findRange.left; 42 | int d = abs(i - p); 43 | nodeCost.time += 44 | (carmi_params::kLeafBaseTime * findQuery[i].second) / querySize; 45 | // Case 1: if the data point is within the error range, perform binary 46 | // search over the range of [p - error / 2, p + error / 2] 47 | if 
(d <= tmp.error) 48 | nodeCost.time += log2(tmp.error + 1) * findQuery[i].second * 49 | carmi_params::kCostBSTime / querySize; 50 | // Case 2: the data point is not in the error range, perform binary search 51 | // over the entire sub-dataset 52 | else 53 | nodeCost.time += log2(dataRange.initRange.size) * findQuery[i].second * 54 | carmi_params::kCostBSTime / querySize; 55 | } 56 | optimal_node_struct.externalArray = tmp; 57 | 58 | } else { 59 | // choose a cf array node as the leaf node 60 | int totalDataNum = dataRange.initRange.size + dataRange.insertRange.size; 61 | // calculate the number of needed data blocks 62 | int blockNum = 63 | CFArrayType::CalNeededBlockNum( 64 | totalDataNum); 65 | int avgSlotNum = 66 | std::max(1.0, ceil(static_cast(totalDataNum) / blockNum)); 67 | avgSlotNum = std::min( 68 | avgSlotNum, 69 | CFArrayType::kMaxBlockCapacity); 70 | 71 | nodeCost.space = 72 | blockNum * carmi_params::kMaxLeafNodeSize / 1024.0 / 1024.0; 73 | // calculate the time cost of find operations 74 | int end = dataRange.findRange.left + dataRange.findRange.size; 75 | for (int i = dataRange.findRange.left; i < end; i++) { 76 | nodeCost.time += static_cast(findQuery[i].second) / querySize * 77 | (carmi_params::kLeafBaseTime + 78 | log2(avgSlotNum) * carmi_params::kCostBSTime); 79 | } 80 | // calculate the time cost of insert operations 81 | end = dataRange.insertRange.left + dataRange.insertRange.size; 82 | for (int i = dataRange.insertRange.left; i < end; i++) { 83 | nodeCost.time += 1.0 / static_cast(querySize) * 84 | (carmi_params::kLeafBaseTime + 85 | log2(avgSlotNum) * carmi_params::kCostBSTime + 86 | (1 + avgSlotNum) / 2.0 * carmi_params::kCostMoveTime); 87 | } 88 | 89 | optimal_node_struct.cfArray = 90 | CFArrayType(); 91 | } 92 | nodeCost.cost = nodeCost.time + nodeCost.space * lambda; 93 | 94 | // if dp algorithm also constructs an inner node on this sub-dataset, we need 95 | // to check which one is the better setting 96 | auto it = 
COST.find(dataRange.initRange); 97 | if (it != COST.end()) { 98 | if (it->second.cost < nodeCost.cost) { 99 | // Case 1: the inner node is the better one, return the cost of it 100 | // directly. 101 | return nodeCost; 102 | } else { 103 | // Case 2: the leaf node is the better one, erase the cost and the setting 104 | // of the inner node 105 | COST.erase(dataRange.initRange); 106 | structMap.erase(dataRange.initRange); 107 | } 108 | } 109 | // store the optimal cost and setting 110 | COST.insert({dataRange.initRange, nodeCost}); 111 | structMap.insert({dataRange.initRange, optimal_node_struct}); 112 | return nodeCost; 113 | } 114 | 115 | #endif // CONSTRUCT_DP_LEAF_H_ 116 | -------------------------------------------------------------------------------- /src/unitTest/carmiTest/carmi_map_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file carmi_map_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-14 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #include "../../include/carmi_map.h" 13 | 14 | #include "gtest/gtest.h" 15 | 16 | const int kTestMaxValue = 1000000; 17 | const int kInitSize = 10000; 18 | const int kInsertSize = 100; 19 | const float kRate = 0.1; 20 | unsigned int seed = time(NULL); 21 | 22 | typedef double KeyType; 23 | typedef double ValueType; 24 | typedef std::pair DataType; 25 | typedef std::vector DataVecType; 26 | typedef CARMIMap CarmiType; 27 | 28 | DataVecType initDataset(kInitSize); 29 | DataVecType insertDataset(kInsertSize); 30 | DataVecType testInsertQuery(kInsertSize); 31 | CarmiType carmi; 32 | std::default_random_engine engine(time(0)); 33 | 34 | TEST(TestCarmimapConstructor, CARMIMapConstructor) { 35 | std::uniform_real_distribution dis(0, kTestMaxValue); 36 | for (int i = 0; i < kInitSize; i++) { 37 | KeyType tmpKey = dis(engine); 38 | initDataset[i] = {tmpKey, tmpKey * 10}; 39 | } 40 | std::sort(initDataset.begin(), 
initDataset.end()); 41 | for (int i = 0; i < kInsertSize; i++) { 42 | KeyType tmpKey = dis(engine); 43 | insertDataset[i] = {tmpKey, tmpKey * 10}; 44 | } 45 | std::sort(insertDataset.begin(), insertDataset.end()); 46 | for (int i = 0; i < kInsertSize; i++) { 47 | KeyType tmpKey = dis(engine); 48 | testInsertQuery[i] = {tmpKey, tmpKey * 10}; 49 | } 50 | ASSERT_TRUE(carmi.empty()); 51 | 52 | CarmiType c(initDataset.begin(), initDataset.end(), insertDataset.begin(), 53 | insertDataset.end(), kRate); 54 | carmi.swap(c); 55 | 56 | ASSERT_EQ(carmi.size(), kInitSize); 57 | ASSERT_FALSE(carmi.empty()); 58 | 59 | auto it = carmi.begin(); 60 | for (int i = 0; i < kInitSize; i++) { 61 | EXPECT_EQ(it.key(), initDataset[i].first) << " i:" << i << std::endl; 62 | EXPECT_EQ(it.data(), initDataset[i].second); 63 | it++; 64 | } 65 | } 66 | 67 | TEST(TestCarmimapFind, CARMIMapFind) { 68 | for (int i = 0; i < kInitSize; i++) { 69 | auto it = carmi.find(initDataset[i].first); 70 | EXPECT_EQ(it.key(), initDataset[i].first); 71 | EXPECT_EQ(it.data(), initDataset[i].second); 72 | } 73 | } 74 | 75 | TEST(TestCarmimapLowerbound, CARMIMapLowerbound) { 76 | for (int i = 0; i < kInitSize; i++) { 77 | auto it = carmi.lower_bound(initDataset[i].first); 78 | EXPECT_EQ(it.key(), initDataset[i].first); 79 | } 80 | for (int i = 0; i < kInsertSize; i++) { 81 | if (testInsertQuery[i].first < initDataset[kInitSize - 1].first) { 82 | auto it = carmi.lower_bound(testInsertQuery[i].first); 83 | auto vector_res = std::lower_bound(initDataset.begin(), initDataset.end(), 84 | testInsertQuery[i]) - 85 | initDataset.begin(); 86 | EXPECT_EQ(it.key(), initDataset[vector_res].first); 87 | } 88 | } 89 | } 90 | 91 | TEST(TestCarmimapUpperbound, CARMIMapUpperbound) { 92 | for (int i = 0; i < kInitSize - 1; i++) { 93 | auto it = carmi.upper_bound(initDataset[i].first); 94 | EXPECT_GT(it.key(), initDataset[i].first); 95 | } 96 | } 97 | 98 | TEST(TestCarmimapEqualRange, CARMIMapEqualRange) { 99 | for (int i = 0; i < 
kInitSize; i++) { 100 | auto res = carmi.equal_range(initDataset[i].first); 101 | for (auto it = res.first; it != res.second; it++) { 102 | EXPECT_EQ(it.key(), initDataset[i].first); 103 | } 104 | } 105 | } 106 | 107 | TEST(TestCarmimapCount, CARMIMapCount) { 108 | for (int i = 0; i < kInitSize; i++) { 109 | auto res = carmi.count(initDataset[i].first); 110 | int cnt = 0; 111 | auto vector_res = 112 | std::count(initDataset.begin(), initDataset.end(), initDataset[i]); 113 | EXPECT_EQ(res, vector_res); 114 | } 115 | } 116 | 117 | TEST(TestCarmimapInsert, CARMIMapInsert) { 118 | for (int i = 0; i < kInsertSize; i++) { 119 | auto it = carmi.insert(testInsertQuery[i]); 120 | EXPECT_TRUE(it.second); 121 | EXPECT_EQ(it.first.key(), testInsertQuery[i].first); 122 | EXPECT_EQ(it.first.data(), testInsertQuery[i].second); 123 | for (int j = 0; j < i; j++) { 124 | auto res = carmi.find(testInsertQuery[j].first); 125 | EXPECT_EQ(res.key(), testInsertQuery[j].first); 126 | } 127 | for (int j = 0; j < kInitSize; j++) { 128 | auto res = carmi.find(initDataset[j].first); 129 | EXPECT_EQ(res.key(), initDataset[j].first); 130 | EXPECT_EQ(res.data(), initDataset[j].second); 131 | } 132 | } 133 | } 134 | 135 | TEST(TestCarmimapErase, CARMIMapErase) { 136 | for (int i = 0; i < kInsertSize; i++) { 137 | carmi.erase(testInsertQuery[i].first); 138 | auto it = carmi.find(testInsertQuery[i].first); 139 | EXPECT_EQ(it, carmi.end()); 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # README 2 | # A simple implementation of CARMI 3 | 4 | This is a simple implementation of our paper: **CARMI: A Cache-Aware Learned Index with a Cost-based Construction Algorithm**. 5 | 6 | ## Reproducing the experiment 7 | 8 | If you want to reproduce the experiment in our paper, do the following 9 | 10 | ``` 11 | cd ./src 12 | cmake . 
13 | make 14 | ./CARMI 15 | ``` 16 | 17 | ## Using CARMI 18 | 19 | If you want to use CARMI as an index, then you only need to include the header file respectively: 20 | 21 | Use in-memory index: 22 | ``` 23 | #include "./include/carmi_common.h" 24 | ``` 25 | 26 | Use external index: 27 | ``` 28 | #include "./include/carmi_external.h" 29 | ``` 30 | 31 | ## Instructions 32 | 33 | **Method of constructing an index:** 34 | 35 | 1. **Automatic construction(CARMI)**: prepare the initial dataset, training datasets (historical access and insertion queries), and then create a CARMI object, and the hybrid algorithm will automatically build the index. 36 | 37 | **Main functions:** 38 | 39 | 1. **find**: find the corresponding record of the given key, return the iterator 40 | 41 | ``` 42 | iterator find(const KeyType &key); 43 | ``` 44 | 45 | 2. **lower_bound**: return an iterator pointing to the first element in the container whose key is not less than key. 46 | 47 | ``` 48 | iterator lower_bound(const KeyType &key); 49 | ``` 50 | 51 | 3. **upper_bound**: return an iterator pointing to the first element in the container whose key is larger than key. 52 | 53 | ``` 54 | iterator upper_bound(const KeyType &key); 55 | ``` 56 | 57 | 4. **insert**: insert a data point into the index. 58 | 59 | ``` 60 | std::pair insert(const DataType &datapoint); 61 | ``` 62 | 63 | 5. **erase**: delete the record of the given key and return the number of elements erased. 64 | 65 | ``` 66 | size_t erase(const KeyType &key); 67 | ``` 68 | 69 | 6. **swap**: swap two carmi tree objects. 70 | 71 | ``` 72 | void swap(CARMIMap &other); 73 | ``` 74 | 75 | 7. **size**: return the number of data points in the carmi tree. 76 | 77 | ``` 78 | size_t size(); 79 | ``` 80 | 81 | 8. **CalculateSpace**: return the space of the carmi tree in bytes. 82 | 83 | ``` 84 | long long CalculateSpace(); 85 | ``` 86 | 87 | Only a few commonly used functions are briefly introduced here. 
In fact, we provide all interfaces similar to std::map in the C++11 version, and you can use CARMIMap like std::map. CARMIExternalMap is designed to store the data points externally. It also implements the std::map interfaces, but the template parameters are slightly different. You can check the examples we provide to use. 88 | 89 | ## File structure of CARMI 90 | 91 | In this project, we include the CARMI header files, the source code of the experimental part and the baseline. The description of each file in CARMI's header file is as follows: 92 | 93 | - **include** 94 | - base_node.h *( the union structure of all nodes )* 95 | - carmi.h *( the implementation class of CARMI )* 96 | - carmi_map.h *( the CARMI map class for common use )* 97 | - carmi_external_map.h *( the CARMI map class for the dataset stored in the external position )* 98 | - **construct** *( files used to construct the index )* 99 | - construction.h *( the main function of our algorithm )* 100 | - construct_root.h *( use the optimal root node to construct child nodes )* 101 | - dp.h *( the main dynamic programming algorithm )* 102 | - dp_inner.h *( use DP to construct inner node )* 103 | - dp_leaf.h *( use DP to construct leaf node )* 104 | - greedy.h *( the greedy node selection algorithm )* 105 | - minor_function.h *( minor functions )* 106 | - structures.h *( the structures of CARMI )* 107 | - store_node.h *( use the optimal setting to construct a new node )* 108 | - **memoryLayout** *( manage the two main arrays )* 109 | - data_array.h 110 | - node_array.h 111 | - empty_block.h 112 | - **nodes** *( all nodes we have implemented )* 113 | - **rootNode** 114 | - root_nodes.h *( the classes of root nodes )* 115 | - **trainModel** *( models used to train the root nodes )* 116 | - linear_regression.h 117 | - piecewiseLR.h 118 | - prefetch_plr.h 119 | - **innerNode** 120 | - lr_model.h 121 | - plr_model.h 122 | - his_model.h 123 | - bs_model.h 124 | - candidate_plr.h *( for piecewiseLR )* 125 | - 
**leafNode** 126 | - cfarray_type.h 127 | - external_array_type.h 128 | - **func** *( public functions )* 129 | - find_function.h 130 | - insert_function.h 131 | - delete_function.h 132 | - update_function.h 133 | - split_function.h 134 | - calculate_space.h 135 | - get_node_info.h 136 | 137 | ## Dependencies 138 | 139 | This code is based on C++17. 140 | -------------------------------------------------------------------------------- /src/include/construct/store_node.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file store_node.h 3 | * @author Jiaoyi 4 | * @brief store inner and leaf nodes 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_STORE_NODE_H_ 12 | #define CONSTRUCT_STORE_NODE_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "../carmi.h" 22 | #include "../memoryLayout/node_array.h" 23 | #include "../nodes/innerNode/bs_model.h" 24 | #include "../nodes/innerNode/his_model.h" 25 | #include "../nodes/innerNode/lr_model.h" 26 | #include "../nodes/innerNode/plr_model.h" 27 | #include "../nodes/leafNode/cfarray_type.h" 28 | #include "../nodes/leafNode/external_array_type.h" 29 | #include "../params.h" 30 | #include "./dp_inner.h" 31 | 32 | template 34 | template 35 | void CARMI::StoreInnerNode( 36 | const DataRange &range, InnerNodeType *currnode) { 37 | // get the number of child nodes 38 | int optimalChildNumber = currnode->flagNumber & 0x00FFFFFF; 39 | // divide the initDataset 40 | SubDataset subDataset(optimalChildNumber); 41 | NodePartition(*currnode, range.initRange, initDataset, 42 | &(subDataset.subInit)); 43 | NodePartition(*currnode, range.insertRange, insertQuery, 44 | &(subDataset.subInsert)); 45 | // allocate a block of empty memory for this node in the node array 46 | currnode->childLeft = node.AllocateNodeMemory(optimalChildNumber); 47 | 48 | for (int i = 0; i < 
optimalChildNumber; i++) { 49 | // store each child node 50 | DataRange subRange(subDataset.subInit[i], subDataset.subFind[i], 51 | subDataset.subInsert[i]); 52 | StoreOptimalNode(subRange, currnode->childLeft + i); 53 | } 54 | } 55 | 56 | template 58 | void CARMI::StoreOptimalNode( 59 | const DataRange &range, int storeIdx) { 60 | // find the optimal setting of this sub-dataset 61 | auto it = structMap.find(range.initRange); 62 | 63 | int type = it->second.cfArray.flagNumber >> 24; 64 | switch (type) { 65 | case LR_INNER_NODE: { 66 | // Case 1: the optimal node is the lr inner node, use the StoreInnerNode 67 | // function to store itself and its child nodes. 68 | StoreInnerNode>(range, &(it->second.lr)); 69 | node.nodeArray[storeIdx].lr = it->second.lr; 70 | break; 71 | } 72 | case PLR_INNER_NODE: { 73 | // Case 2: the optimal node is the p. lr inner node, use the 74 | // StoreInnerNode function to store itself and its child nodes. 75 | StoreInnerNode>(range, &(it->second.plr)); 76 | node.nodeArray[storeIdx].plr = it->second.plr; 77 | break; 78 | } 79 | case HIS_INNER_NODE: { 80 | // Case 3: the optimal node is the his inner node, use the StoreInnerNode 81 | // function to store itself and its child nodes. 82 | StoreInnerNode>(range, &(it->second.his)); 83 | node.nodeArray[storeIdx].his = it->second.his; 84 | break; 85 | } 86 | case BS_INNER_NODE: { 87 | // Case 4: the optimal node is the bs inner node, use the StoreInnerNode 88 | // function to store itself and its child nodes. 
/**
 * @brief Walk the tree from the root to the leaf that covers the given key
 * and report the position of the matching record.
 *
 * NOTE(review): the text dump this chunk came from stripped all
 * angle-bracket template arguments; they have been reconstructed here and
 * should be confirmed against the original source tree.
 *
 * @param[in] key the key value to look up
 * @param[out] currblock the index of the data block inside the leaf node
 *             (written by the leaf's Find)
 * @param[out] currslot the slot of the record inside the block (or the
 *             external-array position for external leaves)
 * @return BaseNode*: the leaf node that manages the key
 */
template <typename KeyType, typename ValueType, typename Compare,
          typename Alloc>
BaseNode<KeyType, ValueType, Compare, Alloc> *
CARMI<KeyType, ValueType, Compare, Alloc>::Find(const KeyType &key,
                                                int *currblock,
                                                int *currslot) {
  // index of the current node in the node array; the root is handled
  // through the root object itself, so idx starts at 0
  int idx = 0;
  int type = root.flagNumber;
  int fetch_start = 0;
  double fetch_leafIdx;
  // descend one level per iteration until a leaf case returns
  while (1) {
    switch (type) {
      case PLR_ROOT_NODE:
        // Case 0: this node is the plr root node
        // use the plr root node to find the index of the next node and prefetch
        // the data block
        if (isPrimary == false) {
          fetch_leafIdx =
              root.PLRType<DataVectorType, KeyType>::model.Predict(key);
          idx = fetch_leafIdx;
          fetch_start = root.PLRType<DataVectorType, KeyType>::fetch_model
                            .PrefetchPredict(fetch_leafIdx);
#if defined(CATCH_PLATFORM_LINUX) || defined(CATCH_PLATFORM_MAC)
          // the instructions of prefetching in Ubuntu
          __builtin_prefetch(&data.dataArray[fetch_start], 0, 3);
          // __builtin_prefetch(&data.dataArray[fetch_start] + 64, 0, 3);
          // __builtin_prefetch(&data.dataArray[fetch_start] + 128, 0, 3);
          // __builtin_prefetch(&data.dataArray[fetch_start] + 192, 0, 3);
#elif defined(CATCH_PLATFORM_WINDOWS)
          // the instructions of prefetching in Windows
          _mm_prefetch(static_cast<const char *>(static_cast<const void *>(
                           &data.dataArray[fetch_start])),
                       _MM_HINT_T1);
          _mm_prefetch(static_cast<const char *>(static_cast<const void *>(
                           &data.dataArray[fetch_start])) +
                           64,
                       _MM_HINT_T1);
          _mm_prefetch(static_cast<const char *>(static_cast<const void *>(
                           &data.dataArray[fetch_start])) +
                           128,
                       _MM_HINT_T1);
          _mm_prefetch(static_cast<const char *>(static_cast<const void *>(
                           &data.dataArray[fetch_start])) +
                           192,
                       _MM_HINT_T1);
#endif
        } else {
          // external (primary-key) mode: no data-block prefetch is needed
          idx = root.PLRType<DataVectorType, KeyType>::model.Predict(key);
        }
        // NOTE(review): ".lr.flagNumber" is read regardless of the actual
        // child type — the code relies on flagNumber occupying the same
        // offset in every member of the BaseNode union; verify against the
        // union definition in base_node.h
        type = node.nodeArray[idx].lr.flagNumber >> 24;
        break;
      case LR_INNER_NODE:
        // Case 1: this node is the lr inner node
        // use the predict function of lr inner node to obtain the index of the
        // next node
        idx = node.nodeArray[idx].lr.Predict(key);
        type = node.nodeArray[idx].lr.flagNumber >> 24;
        break;
      case PLR_INNER_NODE:
        // Case 2: this node is the plr inner node
        // use the predict function of plr inner node to obtain the index of the
        // next node
        idx = node.nodeArray[idx].plr.Predict(key);
        type = node.nodeArray[idx].lr.flagNumber >> 24;
        break;
      case HIS_INNER_NODE:
        // Case 3: this node is the his inner node
        // use the predict function of his inner node to obtain the index of the
        // next node
        idx = node.nodeArray[idx].his.Predict(key);
        type = node.nodeArray[idx].lr.flagNumber >> 24;
        break;
      case BS_INNER_NODE:
        // Case 4: this node is the bs inner node
        // use the predict function of bs inner node to obtain the index of the
        // next node
        idx = node.nodeArray[idx].bs.Predict(key);
        type = node.nodeArray[idx].lr.flagNumber >> 24;
        break;
      case ARRAY_LEAF_NODE: {
        // Case 5: this node is the cache-friendly array leaf node
        // find the data point in the cf leaf node and return its position
        *currslot = node.nodeArray[idx].cfArray.Find(data, key, currblock);
        return &node.nodeArray[idx];
      }
      case EXTERNAL_ARRAY_LEAF_NODE: {
        // Case 6: this node is the external array leaf node
        // find the data point in the external leaf node and return its position
        *currslot = node.nodeArray[idx].externalArray.Find(key, recordLength,
                                                           external_data);
        return &node.nodeArray[idx];
      }
    }
  }
}
KeyType; 23 | typedef double ValueType; 24 | typedef std::pair DataType; 25 | typedef std::vector DataVecType; 26 | 27 | const int record_size = sizeof(KeyType) + sizeof(ValueType); 28 | 29 | DataVecType initDataset(kInitSize); 30 | DataVecType insertDataset(kInsertSize); 31 | 32 | template 33 | class ExternalDataType { 34 | public: 35 | typedef ValueType ValueType_; 36 | ExternalDataType() { 37 | k = 0; 38 | v = 0; 39 | } 40 | explicit ExternalDataType(KeyType key, ValueType_ value) { 41 | k = key; 42 | v = value; 43 | } 44 | const KeyType &key() const { return k; } 45 | const ValueType_ &data() const { return v; } 46 | 47 | bool operator<(const ExternalDataType &a) const { 48 | if (k == a.k) { 49 | return v < a.v; 50 | } 51 | return k < a.k; 52 | } 53 | 54 | KeyType k; 55 | ValueType_ v; 56 | }; 57 | 58 | typedef CARMIExternalMap> 59 | CarmiType; 60 | CarmiType carmi; 61 | KeyType *externalDataset; 62 | std::default_random_engine engine(time(0)); 63 | 64 | TEST(TestCarmiExtmapConstructor, CARMIExtMapConstructor) { 65 | std::uniform_real_distribution dis(0, kTestMaxValue); 66 | for (int i = 0; i < kInitSize; i++) { 67 | KeyType tmpKey = dis(engine); 68 | initDataset[i] = {tmpKey, tmpKey * 10}; 69 | } 70 | std::sort(initDataset.begin(), initDataset.end()); 71 | KeyType lastKey = initDataset[kInitSize - 1].first; 72 | std::vector futureInsertKey(kInsertSize); 73 | for (int i = 0; i < kInsertSize; i++) { 74 | lastKey += 1; 75 | insertDataset[i] = {lastKey, lastKey * 10}; 76 | futureInsertKey[i] = insertDataset[i].first; 77 | } 78 | std::sort(insertDataset.begin(), insertDataset.end()); 79 | ASSERT_TRUE(carmi.empty()); 80 | int extLen = initDataset.size() * 2 + kInsertSize * 2; 81 | externalDataset = new KeyType[extLen]; 82 | for (int i = 0, j = 0; i < static_cast(initDataset.size()); i++) { 83 | *(externalDataset + j) = initDataset[i].first; 84 | *(externalDataset + j + 1) = initDataset[i].second; 85 | j += 2; // due to 86 | } 87 | 88 | CarmiType c(externalDataset, 
futureInsertKey, initDataset.size(), record_size, 89 | kRate); 90 | 91 | carmi.swap(c); 92 | 93 | ASSERT_EQ(carmi.size(), kInitSize); 94 | ASSERT_FALSE(carmi.empty()); 95 | 96 | auto it = carmi.begin(); 97 | for (int i = 0; i < kInitSize; i++) { 98 | EXPECT_EQ(it.key(), initDataset[i].first) << " i:" << i << std::endl; 99 | EXPECT_EQ(it.data(), initDataset[i].second); 100 | it++; 101 | } 102 | } 103 | 104 | TEST(TestCarmiExtmapFind, CARMIExtMapFind) { 105 | for (int i = 0; i < kInitSize; i++) { 106 | auto it = carmi.find(initDataset[i].first); 107 | EXPECT_EQ(it.key(), initDataset[i].first); 108 | EXPECT_EQ(it.data(), initDataset[i].second); 109 | } 110 | } 111 | 112 | TEST(TestCarmiExtmapLowerbound, CARMIExtMapLowerbound) { 113 | for (int i = 0; i < kInitSize; i++) { 114 | auto it = carmi.lower_bound(initDataset[i].first); 115 | EXPECT_EQ(it.key(), initDataset[i].first); 116 | } 117 | for (int i = 0; i < kInsertSize; i++) { 118 | if (insertDataset[i].first < initDataset[kInitSize - 1].first) { 119 | auto it = carmi.lower_bound(insertDataset[i].first); 120 | EXPECT_GE(it.key(), insertDataset[i].first); 121 | } 122 | } 123 | } 124 | 125 | TEST(TestCarmiExtmapUpperbound, CARMIExtMapUpperbound) { 126 | for (int i = 0; i < kInitSize - 1; i++) { 127 | auto it = carmi.upper_bound(initDataset[i].first); 128 | EXPECT_GT(it.key(), initDataset[i].first); 129 | } 130 | } 131 | 132 | TEST(TestCarmiExtmapEqualRange, CARMIExtMapEqualRange) { 133 | for (int i = 0; i < kInitSize; i++) { 134 | auto res = carmi.equal_range(initDataset[i].first); 135 | for (auto it = res.first; it != res.second; it++) { 136 | EXPECT_EQ(it.key(), initDataset[i].first); 137 | } 138 | } 139 | } 140 | 141 | TEST(TestCarmiExtmapCount, CARMIExtMapCount) { 142 | for (int i = 0; i < kInitSize; i++) { 143 | auto res = carmi.count(initDataset[i].first); 144 | auto vector_res = 145 | std::count(initDataset.begin(), initDataset.end(), initDataset[i]); 146 | EXPECT_EQ(res, vector_res); 147 | } 148 | } 149 | 150 | 
TEST(TestCarmiExtmapInsert, CARMIExtMapInsert) { 151 | int cnt = 2 * kInitSize; 152 | for (int i = 0; i < kInsertSize; i++, cnt += 2) { 153 | auto it = carmi.insert(insertDataset[i].first); 154 | *(externalDataset + cnt) = insertDataset[i].first; 155 | *(externalDataset + cnt + 1) = insertDataset[i].second; 156 | EXPECT_TRUE(it.second); 157 | EXPECT_EQ(it.first.key(), insertDataset[i].first); 158 | EXPECT_EQ(it.first.data(), insertDataset[i].second); 159 | for (int j = 0; j < i; j++) { 160 | auto res = carmi.find(insertDataset[j].first); 161 | EXPECT_EQ(res.key(), insertDataset[j].first); 162 | } 163 | for (int j = 0; j < kInitSize; j++) { 164 | auto res = carmi.find(initDataset[j].first); 165 | EXPECT_EQ(res.key(), initDataset[j].first); 166 | EXPECT_EQ(res.data(), initDataset[j].second); 167 | } 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /src/include/nodes/innerNode/bs_model.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file bs_model.h 3 | * @author Jiaoyi 4 | * @brief binary search inner node 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef NODES_INNERNODE_BS_MODEL_H_ 12 | #define NODES_INNERNODE_BS_MODEL_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "../../construct/structures.h" 21 | 22 | /** 23 | * @brief binary search inner node 24 | * 25 | * This class is the binary search inner node. Due to the size limit of 64 26 | * bytes, we can only store the kMaxKeyNum key values. Thus, this type is not 27 | * suitable for nodes with a large number of child nodes. However, the bs node 28 | * can divide the dataset evenly, thus dealing with the uneven dataset. 
29 | * 30 | * @tparam KeyType the type of the keyword 31 | * @tparam ValueType the type of the value 32 | */ 33 | template 34 | class BSModel { 35 | public: 36 | // *** Constructed Types and Constructor 37 | 38 | /** 39 | * @brief the pair of data points 40 | */ 41 | typedef std::pair DataType; 42 | 43 | /** 44 | * @brief the vector of data points, which is the type of dataset 45 | */ 46 | typedef std::vector DataVectorType; 47 | 48 | /** 49 | * @brief Construct a new BS Model object and use c to set its child number 50 | * 51 | * This model is a binary search model, which performs a binary search between 52 | * the index vector to find the index of the given key value, and the size of 53 | * the index must be less than 14 due to the limit of 64 bytes. 54 | * 55 | * @param[in] c the number of its child nodes 56 | */ 57 | explicit BSModel(int c) { 58 | childLeft = 0; 59 | flagNumber = 60 | (BS_INNER_NODE << 24) + std::max(2, std::min(c, kMaxKeyNum + 1)); 61 | for (int i = 0; i < kMaxKeyNum; i++) { 62 | keys[i] = 0; 63 | } 64 | } 65 | 66 | public: 67 | // *** Basic Functions of BS Inner Node Objects 68 | 69 | /** 70 | * @brief train the binary search model 71 | * 72 | * The training data points are stored in dataset[left, left + size]. 73 | * 74 | * @param[in] left the starting index of data points 75 | * @param[in] size the size of data points 76 | * @param[in] dataset used to train the model 77 | */ 78 | void Train(int left, int size, const DataVectorType &dataset); 79 | 80 | /** 81 | * @brief predict the next node which manages the data point corresponding to 82 | * the given key value 83 | * 84 | * @param[in] key the given key value 85 | * @return int: the predicted index of next node 86 | */ 87 | int Predict(KeyType key) const; 88 | 89 | public: 90 | // *** Static Constant Options and Values of BS Inner Node Objects 91 | 92 | /** 93 | * @brief The time cost of the bs inner node. 
94 | */ 95 | static constexpr double kTimeCost = carmi_params::kBSInnerTime; 96 | 97 | /** 98 | * @brief The maximum number of stored keys. 99 | */ 100 | static constexpr int kMaxKeyNum = 56 / sizeof(KeyType); 101 | 102 | public: 103 | //*** Public Data Members of BS Inner Node Objects 104 | 105 | /** 106 | * @brief A combined integer, composed of the flag of bs inner node 107 | * (BS_INNER_NODE, 1 byte) and the number of its child nodes (3 bytes). (This 108 | * member is 4 bytes) 109 | */ 110 | int flagNumber; 111 | 112 | /** 113 | * @brief The index of its first child node in the node array. All the child 114 | * nodes are stored in node[childLeft, childLeft + size]. Through this member 115 | * and the right three bytes of flagNumber, all the child nodes can be 116 | * accessed. (4 bytes) 117 | */ 118 | int childLeft; 119 | 120 | /** 121 | * @brief store at most kMaxKeyNum key values 122 | * This bs model divides the key range into kMaxKeyNum + 1 intervals. To 123 | * determine which branch to go through, perform a binary search among the 124 | * kMaxKeyNum key values to locate the corresponding key value interval 125 | * covering the input key. 
(56 bytes) 126 | */ 127 | KeyType keys[kMaxKeyNum]; 128 | }; 129 | 130 | template 131 | inline void BSModel::Train(int left, int size, 132 | const DataVectorType &dataset) { 133 | if (size == 0) return; 134 | if (left < 0 || size < 0 || left + size > dataset.size()) { 135 | throw std::out_of_range( 136 | "BSModel::Train: the range of training dataset is invalid."); 137 | } 138 | 139 | int childNumber = flagNumber & 0x00FFFFFF; 140 | // calculate the value of the segment 141 | float value = static_cast(size) / childNumber; 142 | int cnt = 1; 143 | int start = left + value; 144 | int end = left + size; 145 | // store the minimum value of each segment 146 | for (int i = start; i < end; i += value) { 147 | if (cnt >= childNumber) { 148 | break; 149 | } 150 | keys[cnt - 1] = dataset[i].first; 151 | cnt++; 152 | } 153 | } 154 | 155 | template 156 | inline int BSModel::Predict(KeyType key) const { 157 | int start_idx = 0; 158 | // get the maximum index 159 | int end_idx = (flagNumber & 0x00FFFFFF) - 2; 160 | if (key > keys[end_idx]) { 161 | return childLeft + end_idx + 1; 162 | } 163 | int mid; 164 | // perform binary search between the index vector 165 | while (start_idx < end_idx) { 166 | mid = (start_idx + end_idx) >> 1; 167 | if (keys[mid] < key) 168 | start_idx = mid + 1; 169 | else 170 | end_idx = mid; 171 | } 172 | return start_idx + childLeft; 173 | } 174 | 175 | #endif // NODES_INNERNODE_BS_MODEL_H_ 176 | -------------------------------------------------------------------------------- /src/include/nodes/innerNode/lr_model.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file lr_model.h 3 | * @author Jiaoyi 4 | * @brief linear regression inner node 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef NODES_INNERNODE_LR_MODEL_H_ 12 | #define NODES_INNERNODE_LR_MODEL_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 
#include 21 | 22 | #include "../../construct/structures.h" 23 | 24 | /** 25 | * @brief linear regression inner node 26 | * 27 | * This class is the LR inner node, which uses linear regression to train the 28 | * model and predict the index of the next node. The CPU time cost of this node 29 | * is less than the other nodes. 30 | * 31 | * @tparam KeyType the type of the keyword 32 | * @tparam ValueType the type of the value 33 | */ 34 | template 35 | class LRModel { 36 | public: 37 | // *** Constructed Types and Constructor 38 | 39 | /** 40 | * @brief the pair of data points 41 | */ 42 | typedef std::pair DataType; 43 | 44 | /** 45 | * @brief the vector of data points, which is the type of dataset 46 | */ 47 | typedef std::vector DataVectorType; 48 | 49 | /** 50 | * @brief Construct a new LRModel object and use c to set its child number 51 | * 52 | * @param[in] c the number of its child nodes 53 | */ 54 | explicit LRModel(int c) { 55 | childLeft = 0; 56 | slope = 0; 57 | intercept = 0; 58 | minValue = 0; 59 | flagNumber = (LR_INNER_NODE << 24) + std::max(std::min(c, 0x00FFFFFF), 2); 60 | } 61 | 62 | public: 63 | // *** Basic Functions of LR Inner Node Objects 64 | 65 | /** 66 | * @brief train the linear regression model 67 | * 68 | * The training data points are stored in dataset[left, left + size]. 69 | * 70 | * @param[in] left the starting index of data points 71 | * @param[in] size the size of data points 72 | * @param[in] dataset used to train the model 73 | */ 74 | void Train(int left, int size, const DataVectorType &dataset); 75 | 76 | /** 77 | * @brief predict the next node which manages the data point corresponding to 78 | * the given key value 79 | * 80 | * @param[in] key the given key value 81 | * @return int: the predicted index of next node 82 | */ 83 | int Predict(KeyType key) const; 84 | 85 | public: 86 | // *** Static Constant Options and Values of LR Inner Node Objects 87 | 88 | /** 89 | * @brief The time cost of the lr inner node. 
90 | */ 91 | static constexpr double kTimeCost = carmi_params::kLRInnerTime; 92 | 93 | /** 94 | * @brief The bytes of placeholder. 95 | */ 96 | static constexpr int kPlaceHolderLen = 48 - sizeof(KeyType); 97 | 98 | public: 99 | //*** Public Data Members of LR Inner Node Objects 100 | 101 | /** 102 | * @brief A combined integer, composed of the flag of lr inner node 103 | * (LR_INNER_NODE, 1 byte) and the number of its child nodes (3 bytes). (This 104 | * member is 4 bytes) 105 | */ 106 | int flagNumber; 107 | 108 | /** 109 | * @brief The index of its first child node in the node array. All the child 110 | * nodes are stored in node[childLeft, childLeft + size]. Through this member 111 | * and the right three bytes of flagNumber, all the child nodes can be 112 | * accessed. (4 bytes) 113 | */ 114 | int childLeft; 115 | 116 | /** 117 | * @brief The slope parameter of the linear regression model. (4 bytes) 118 | */ 119 | float slope; 120 | 121 | /** 122 | * @brief The intercept parameter of the linear regression model. (4 bytes) 123 | */ 124 | float intercept; 125 | 126 | /** 127 | * @brief The minimum value. 128 | */ 129 | KeyType minValue; 130 | 131 | /** 132 | * @brief Placeholder, used to make sure that the size of this node is 64 133 | * bytes. 
(kPlaceHolderLen bytes) 134 | */ 135 | char Placeholder[kPlaceHolderLen]; 136 | }; 137 | 138 | template 139 | inline void LRModel::Train(int left, int size, 140 | const DataVectorType &dataset) { 141 | // Case 1: the dataset is empty, return directly 142 | if (size == 0) return; 143 | if (left < 0 || size < 0 || left + size > dataset.size()) { 144 | throw std::out_of_range( 145 | "LRModel::Train: the range of training dataset is invalid."); 146 | } 147 | 148 | // Case 2: use the dataset to train the model 149 | // extract data points from dataset[left, left + size] and use their processed 150 | // relative index as y to train 151 | int childNumber = flagNumber & 0x00FFFFFF; 152 | minValue = dataset[left].first; 153 | std::vector> currdata(size); 154 | for (int i = 0, j = left; i < size; i++, j++) { 155 | currdata[i].first = dataset[j].first - minValue; 156 | currdata[i].second = i * 1.0 / size * childNumber; 157 | } 158 | 159 | // train the lr model 160 | double t1 = 0, t2 = 0, t3 = 0, t4 = 0; 161 | for (int i = 0; i < size; i++) { 162 | t1 += static_cast(currdata[i].first) * 163 | static_cast(currdata[i].first); 164 | t2 += static_cast(currdata[i].first); 165 | t3 += static_cast(currdata[i].first) * currdata[i].second; 166 | t4 += currdata[i].second; 167 | } 168 | if (t1 * size - t2 * t2) { 169 | slope = (t3 * size - t2 * t4) / (t1 * size - t2 * t2); 170 | intercept = (t1 * t4 - t2 * t3) / (t1 * size - t2 * t2); 171 | } else { 172 | slope = 0; 173 | intercept = 0; 174 | } 175 | } 176 | 177 | template 178 | inline int LRModel::Predict(KeyType key) const { 179 | // use the lr model to predict the index of the next node 180 | int p = slope * static_cast(key - minValue) + intercept; 181 | // get its child number 182 | int bound = flagNumber & 0x00FFFFFF; 183 | // check whether p exceeds the boundaries 184 | if (p < 0) 185 | p = 0; 186 | else if (p >= bound) 187 | p = bound - 1; 188 | return p + childLeft; 189 | } 190 | #endif // NODES_INNERNODE_LR_MODEL_H_ 191 | 
-------------------------------------------------------------------------------- /src/include/construct/construct_root.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file construct_root.h 3 | * @author Jiaoyi 4 | * @brief functions for constructing the root 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_CONSTRUCT_ROOT_H_ 12 | #define CONSTRUCT_CONSTRUCT_ROOT_H_ 13 | #include 14 | #include 15 | 16 | #include "../carmi.h" 17 | #include "../nodes/rootNode/trainModel/linear_regression.h" 18 | #include "./dp.h" 19 | #include "./store_node.h" 20 | #include "./structures.h" 21 | 22 | template 24 | template 25 | void CARMI::UpdateRootOptSetting( 26 | int c, double *optimalCost, RootStruct *rootStruct) { 27 | // calculate the basic space cost of the c child nodes of the root node 28 | double space_cost = kBaseNodeSpace * static_cast(c); 29 | // calculate the time cost of the root node 30 | double time_cost = RootNodeType::kTimeCost; 31 | 32 | // train this type of the root node 33 | RootNodeType tmpRoot(c, initDataset); 34 | IndexPair range{0, static_cast(initDataset.size())}; 35 | IndexPair insertRange{0, static_cast(insertQuery.size())}; 36 | // initialize the variables that store the range of each sub-dataset 37 | std::vector perSize(c, emptyRange); 38 | std::vector perInsertSize(c, emptyRange); 39 | // split initDataset into c sub-datasets 40 | NodePartition(tmpRoot.model, range, 41 | initDataset, &perSize); 42 | // split insertDataset into c sub-datasets 43 | NodePartition(tmpRoot.model, insertRange, 44 | insertQuery, &perInsertSize); 45 | 46 | int maxLeafCapacity = carmi_params::kMaxLeafNodeSizeExternal; 47 | if (!isPrimary) { 48 | maxLeafCapacity = 49 | CFArrayType::kMaxLeafCapacity; 50 | } 51 | for (int i = 0; i < c; i++) { 52 | if (perSize[i].size == static_cast(initDataset.size())) { 53 | return; 54 | } 55 | int totalDataNum = 
perSize[i].size + perInsertSize[i].size; 56 | // if leaf nodes are cf array leaf nodes, add the space cost of data 57 | // blocks to the total space cost 58 | if (!isPrimary) { 59 | int tmpBlockNum = 60 | CFArrayType::CalNeededBlockNum( 61 | totalDataNum); 62 | space_cost += 63 | tmpBlockNum * carmi_params::kMaxLeafNodeSize / 1024.0 / 1024.0; 64 | } 65 | // if the total number of data points exceeds the maximum capacity of the 66 | // leaf node, the current node needs at least kMinChildNumber inner nodes to 67 | // manage the data points together 68 | if (totalDataNum > maxLeafCapacity) { 69 | space_cost += kBaseNodeSpace * kMinChildNumber; 70 | time_cost += carmi_params::kMemoryAccessTime * 71 | static_cast(perSize[i].size) / 72 | static_cast(initDataset.size()); 73 | } 74 | } 75 | 76 | // calculate the entropy of the root node 77 | double entropy = CalculateEntropy(perSize); 78 | double cost = 79 | (time_cost + lambda * static_cast(space_cost)) / entropy; 80 | 81 | // if the current cost is smaller than the optimal cost, update the optimal 82 | // cost and root setting 83 | if (cost <= *optimalCost) { 84 | *optimalCost = cost; 85 | rootStruct->rootChildNum = c; 86 | rootStruct->rootType = tmpRoot.flagNumber; 87 | } 88 | } 89 | 90 | template 92 | RootStruct CARMI::ChooseRoot() { 93 | double OptimalValue = DBL_MAX; 94 | RootStruct rootStruct(PLR_ROOT_NODE, kMinChildNumber); 95 | int minNum = 96 | std::max(kMinChildNumber, static_cast(initDataset.size() / 1024)); 97 | int maxNum = 98 | std::max(kMinChildNumber, static_cast(initDataset.size() / 2)); 99 | 100 | // Calculate the cost of different settings and choose the optimal setting 101 | for (int c = minNum; c <= maxNum; c *= 1.3) { 102 | UpdateRootOptSetting>( 103 | c * 1.001, &OptimalValue, &rootStruct); 104 | } 105 | // return the optimal root setting 106 | return rootStruct; 107 | } 108 | 109 | template 111 | SubDataset CARMI::StoreRoot( 112 | const RootStruct &rootStruct) { 113 | SubDataset 
subDataset(rootStruct.rootChildNum); 114 | // allocate a block of empty memory for these child nodes 115 | node.AllocateNodeMemory(rootStruct.rootChildNum); 116 | DataRange range({0, static_cast(initDataset.size())}, 117 | {0, static_cast(findQuery.size())}, 118 | {0, static_cast(insertQuery.size())}); 119 | switch (rootStruct.rootType) { 120 | case PLR_ROOT_NODE: { 121 | // construct the root node and train the model 122 | root = PLRType(rootStruct.rootChildNum, 123 | initDataset); 124 | // split the dataset 125 | NodePartition::ModelType>( 126 | root.model, range.initRange, initDataset, &(subDataset.subInit)); 127 | subDataset.subFind = subDataset.subInit; 128 | NodePartition::ModelType>( 129 | root.model, range.insertRange, insertQuery, &(subDataset.subInsert)); 130 | break; 131 | } 132 | } 133 | // roughly calculate the number of needed data blocks 134 | int blockNum = 0; 135 | for (int i = 0; i < rootStruct.rootChildNum; i++) { 136 | if (subDataset.subInit[i].size + subDataset.subInsert[i].size < 137 | CFArrayType::kMaxLeafCapacity) 138 | blockNum += 139 | CFArrayType::CalNeededBlockNum( 140 | subDataset.subInit[i].size + subDataset.subInsert[i].size); 141 | } 142 | 143 | // update the block number of the prefetch prediction model 144 | root.fetch_model.SetBlockNumber(blockNum); 145 | // update the size of the data array 146 | data.dataArray.resize(blockNum, LeafSlots()); 147 | return subDataset; 148 | } 149 | #endif // CONSTRUCT_CONSTRUCT_ROOT_H_ 150 | -------------------------------------------------------------------------------- /src/experiment/core.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file core.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | 13 | #include "../include/construct/construction.h" 14 | #include "../include/func/calculate_space.h" 15 | #include 
"../include/func/get_node_info.h" 16 | #include "./experiment_params.h" 17 | #include "./functions.h" 18 | 19 | extern std::ofstream outRes; 20 | 21 | /** 22 | * @brief the function of using CARMI 23 | * 24 | * @param[in] isZipfian whether to use zipfian access during the test 25 | * @param[in] initRatio the workload type 26 | * @param[in] rate the weight of space 27 | * @param[in] length the length of range scan 28 | * @param[in] initDataset 29 | * @param[in] testInsertQuery 30 | */ 31 | void CoreCARMI(bool isZipfian, double initRatio, double rate, 32 | const std::vector &length, const DataVecType &initDataset, 33 | const DataVecType &insertDataset, 34 | const DataVecType &testInsertQuery) { 35 | #ifdef DEBUG 36 | std::cout << std::endl; 37 | std::cout << "-------------------------------" << std::endl; 38 | std::cout << "kRate: " << rate << std::endl; 39 | std::cout << "Start construction!" << std::endl; 40 | time_t timep; 41 | time(&timep); 42 | char tmpTime[64]; 43 | strftime(tmpTime, sizeof(tmpTime), "%Y-%m-%d %H:%M:%S", localtime(&timep)); 44 | std::cout << "\nTEST time: " << tmpTime << std::endl; 45 | #endif 46 | 47 | typedef CARMIMap CarmiType; 48 | CarmiType carmi(initDataset.begin(), initDataset.end(), insertDataset.begin(), 49 | insertDataset.end(), rate); 50 | 51 | #ifdef DEBUG 52 | time(&timep); 53 | char tmpTime1[64]; 54 | strftime(tmpTime1, sizeof(tmpTime1), "%Y-%m-%d %H:%M:%S", localtime(&timep)); 55 | std::cout << "finish time: " << tmpTime1 << std::endl; 56 | 57 | std::cout << "\nprint the space:" << std::endl; 58 | auto space = carmi.CalculateSpace() / 1024.0 / 1024.0; 59 | outRes << space << ","; 60 | std::cout << space << " MB\n"; 61 | 62 | #endif 63 | 64 | if (initRatio == kWriteHeavy) 65 | WorkloadA(isZipfian, initDataset, testInsertQuery, 66 | &carmi); // write-heavy 67 | else if (initRatio == kReadHeavy) 68 | WorkloadB(isZipfian, initDataset, testInsertQuery, 69 | &carmi); // read-heavy 70 | else if (initRatio == kReadOnly) 71 | 
WorkloadC(isZipfian, initDataset, 72 | &carmi); // read-only 73 | else if (initRatio == kWritePartial) 74 | WorkloadD(isZipfian, initDataset, testInsertQuery, 75 | &carmi); // write-partial 76 | else if (initRatio == kRangeScan) 77 | WorkloadE(isZipfian, initDataset, testInsertQuery, 78 | length, 79 | &carmi); // range scan 80 | } 81 | 82 | template 83 | class ExternalDataType { 84 | public: 85 | typedef ValueType ValueType_; 86 | ExternalDataType() { 87 | k = 0; 88 | v = 0; 89 | } 90 | explicit ExternalDataType(KeyType key, ValueType_ value) { 91 | k = key; 92 | v = value; 93 | } 94 | const KeyType &key() const { return k; } 95 | const ValueType_ &data() const { return v; } 96 | 97 | bool operator<(const ExternalDataType &a) const { 98 | if (k == a.k) { 99 | return v < a.v; 100 | } 101 | return k < a.k; 102 | } 103 | 104 | KeyType k; 105 | ValueType_ v; 106 | }; 107 | 108 | /** 109 | * @brief the function of using external CARMI 110 | * 111 | * @param[in] isZipfian whether to use zipfian access during the test 112 | * @param[in] initRatio the workload type 113 | * @param[in] rate the weight of space 114 | * @param[in] length the length of range scan 115 | * @param[in] initDataset 116 | * @param[in] testInsertQuery 117 | */ 118 | void CoreExternalCARMI(bool isZipfian, double initRatio, double rate, 119 | const std::vector &length, 120 | const DataVecType &initDataset, 121 | const DataVecType &testInsertQuery) { 122 | DataVecType init = initDataset; 123 | 124 | #ifdef DEBUG 125 | std::cout << std::endl; 126 | std::cout << "-------------------------------" << std::endl; 127 | std::cout << "Start construction!" 
<< std::endl; 128 | time_t timep; 129 | time(&timep); 130 | char tmpTime[64]; 131 | strftime(tmpTime, sizeof(tmpTime), "%Y-%m-%d %H:%M:%S", localtime(&timep)); 132 | std::cout << "\nTEST time: " << tmpTime << std::endl; 133 | #endif 134 | 135 | KeyType *externalDataset; 136 | const int record_size = sizeof(KeyType) + sizeof(ValueType); 137 | typedef CARMIExternalMap> 138 | CarmiType; 139 | int extLen = initDataset.size() * 2 + kTestSize * 2; 140 | externalDataset = new KeyType[extLen]; 141 | for (int i = 0, j = 0; i < static_cast(initDataset.size()); i++) { 142 | *(externalDataset + j) = initDataset[i].first; 143 | *(externalDataset + j + 1) = initDataset[i].second; 144 | j += 2; // due to 145 | } 146 | std::vector futureInsertKey(testInsertQuery.size(), 0); 147 | for (int i = 0; i < static_cast(testInsertQuery.size()); i++) { 148 | futureInsertKey[i] = testInsertQuery[i].first; 149 | } 150 | // initDataset -> only includes the findQuery 151 | CarmiType carmi(externalDataset, futureInsertKey, initDataset.size(), 152 | record_size, rate); 153 | 154 | #ifdef DEBUG 155 | time(&timep); 156 | char tmpTime1[64]; 157 | strftime(tmpTime1, sizeof(tmpTime1), "%Y-%m-%d %H:%M:%S", localtime(&timep)); 158 | std::cout << "finish time: " << tmpTime1 << std::endl; 159 | #endif 160 | 161 | if (initRatio == kWriteHeavy) 162 | WorkloadA>( 163 | isZipfian, init, testInsertQuery, 164 | &carmi); // write-heavy 165 | else if (initRatio == kReadHeavy) 166 | WorkloadB>( 167 | isZipfian, init, testInsertQuery, 168 | &carmi); // read-heavy 169 | else if (initRatio == kReadOnly) 170 | WorkloadC>( 171 | isZipfian, init, 172 | &carmi); // read-only 173 | else if (initRatio == kRangeScan) 174 | WorkloadE>( 175 | isZipfian, init, testInsertQuery, length, 176 | &carmi); // range scan 177 | } 178 | -------------------------------------------------------------------------------- /src/experiment/main_experiment.cpp: -------------------------------------------------------------------------------- 1 | 
/** 2 | * @file main_experiment.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #include 13 | #include 14 | 15 | #include "./experiment_params.h" 16 | #include "./functions.h" 17 | extern std::ofstream outRes; 18 | 19 | /** 20 | * @brief test all datasets and workloads 21 | */ 22 | void mainExperiment() { 23 | // for range scan 24 | std::vector length; 25 | 26 | // read-only 27 | mainSynthetic(kReadOnly, length); 28 | mainYCSB(kReadOnly, length); 29 | mainMap(kReadOnly, length); 30 | 31 | // write-heavy 32 | mainSynthetic(kWriteHeavy, length); 33 | mainYCSB(kWriteHeavy, length); 34 | mainMap(kWriteHeavy, length); 35 | 36 | // read-heavy 37 | mainSynthetic(kReadHeavy, length); 38 | mainYCSB(kReadHeavy, length); 39 | mainMap(kReadHeavy, length); 40 | 41 | // write-partial 42 | mainSynthetic(kWritePartial, length); 43 | mainYCSB(kWritePartial, length); 44 | mainMap(kWritePartial, length); 45 | 46 | // range scan 47 | std::default_random_engine e(time(0)); 48 | std::uniform_int_distribution dis(0, 100); 49 | for (int i = 0; i < kDatasetSize; i++) { 50 | length.push_back(std::min(dis(e), kDatasetSize) - i); 51 | } 52 | mainSynthetic(kRangeScan, length); 53 | mainYCSB(kRangeScan, length); 54 | mainMap(kRangeScan, length); 55 | } 56 | 57 | /** 58 | * @brief test the synthetic datasets 59 | * 60 | * @param[in] initRatio the workload type 61 | * @param[in] length the length of range scan 62 | */ 63 | void mainSynthetic(double initRatio, const std::vector &length) { 64 | std::cout << "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" 65 | "&&&&&&&" 66 | << std::endl; 67 | std::cout << "initRatio is: " << initRatio << std::endl; 68 | outRes << "initRatio," << initRatio << std::endl; 69 | double init = initRatio; 70 | if (init == kRangeScan) { 71 | init = kReadHeavy; 72 | } 73 | LognormalDataset logData(init); 74 | UniformDataset uniData(init); 75 | NormalDataset 
norData(init); 76 | ExponentialDataset expData(init); 77 | 78 | DataVecType initData; 79 | DataVecType insertData; 80 | DataVecType testInsert; 81 | 82 | for (int r = 0; r < static_cast(rate.size()); r++) { 83 | double kRate = rate[r]; 84 | outRes << "kRate:" << kRate << std::endl; 85 | std::cout << "+++++++++++ uniform dataset ++++++++++++++++++++++++++" 86 | << std::endl; 87 | uniData.GenerateDataset(&initData, &insertData, &testInsert); 88 | CoreCARMI(false, initRatio, kRate, length, initData, insertData, 89 | testInsert); 90 | CoreCARMI(true, initRatio, kRate, length, initData, insertData, testInsert); 91 | 92 | // std::cout << "+++++++++++ exponential dataset +++++++++++++++++++" 93 | // << std::endl; 94 | // expData.GenerateDataset(&initData, &insertData, &testInsert); 95 | // CoreCARMI(false, initRatio, kRate, length, initData, insertData, 96 | // testInsert); 97 | // // CoreCARMI(true, initRatio, kRate, length, initData, insertData, 98 | // // testInsert); 99 | 100 | std::cout << "+++++++++++ normal dataset ++++++++++++++++++++++++++" 101 | << std::endl; 102 | norData.GenerateDataset(&initData, &insertData, &testInsert); 103 | CoreCARMI(false, initRatio, kRate, length, initData, insertData, 104 | testInsert); 105 | CoreCARMI(true, initRatio, kRate, length, initData, insertData, testInsert); 106 | 107 | std::cout << "+++++++++++ lognormal dataset ++++++++++++++++++++++++++" 108 | << std::endl; 109 | logData.GenerateDataset(&initData, &insertData, &testInsert); 110 | CoreCARMI(false, initRatio, kRate, length, initData, insertData, 111 | testInsert); 112 | CoreCARMI(true, initRatio, kRate, length, initData, insertData, testInsert); 113 | 114 | outRes << std::endl; 115 | } 116 | } 117 | 118 | /** 119 | * @brief test the map datasets 120 | * 121 | * @param[in] initRatio the workload type 122 | * @param[in] length the length of range scan 123 | */ 124 | void mainMap(double initRatio, const std::vector &length) { 125 | std::cout << 
"&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" 126 | "&&&&&&&" 127 | << std::endl; 128 | std::cout << "initRatio is: " << initRatio << std::endl; 129 | outRes << "initRatio," << initRatio << std::endl; 130 | std::cout << "construct map" << std::endl; 131 | outRes << "construct map" << std::endl; 132 | double init = initRatio; 133 | if (init == kRangeScan) { 134 | init = kReadHeavy; 135 | } 136 | OsmcDataset osmcData(init); 137 | 138 | DataVecType initData; 139 | DataVecType insertData; 140 | DataVecType testInsert; 141 | 142 | for (int r = 0; r < static_cast(rate.size()); r++) { 143 | double kRate = rate[r]; 144 | outRes << "kRate:" << kRate << std::endl; 145 | 146 | std::cout << "+++++++++++ osmc dataset ++++++++++++++++++++++++++" 147 | << std::endl; 148 | osmcData.GenerateDataset(&initData, &insertData, &testInsert); 149 | CoreCARMI(true, initRatio, kRate, length, initData, insertData, testInsert); 150 | CoreCARMI(false, initRatio, kRate, length, initData, insertData, 151 | testInsert); 152 | 153 | outRes << std::endl; 154 | } 155 | } 156 | 157 | /** 158 | * @brief test the YCSB datasets 159 | * 160 | * @param[in] initRatio the workload type 161 | * @param[in] length the length of range scan 162 | */ 163 | void mainYCSB(double initRatio, const std::vector &length) { 164 | kPrimaryIndex = true; 165 | std::cout << "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" 166 | "&&&&&&&" 167 | << std::endl; 168 | std::cout << "initRatio is: " << initRatio << std::endl; 169 | outRes << "initRatio," << initRatio << std::endl; 170 | std::cout << "construct ycsb" << std::endl; 171 | outRes << "construct ycsb" << std::endl; 172 | double init = initRatio; 173 | if (init == kRangeScan) { 174 | init = kReadHeavy; 175 | } 176 | YCSBDataset ycsbData(init); 177 | 178 | DataVecType initData; 179 | DataVecType insertData; 180 | DataVecType testInsert; 181 | 182 | for (int r = 0; r < static_cast(rate.size()); r++) { 183 | double kRate = rate[r]; 184 | 
outRes << "kRate:" << kRate << std::endl; 185 | std::cout << "+++++++++++ ycsb dataset ++++++++++++++++++++++++++" 186 | << std::endl; 187 | ycsbData.GenerateDataset(&initData, &insertData, &testInsert); 188 | CoreExternalCARMI(true, initRatio, kRate, length, initData, testInsert); 189 | 190 | outRes << std::endl; 191 | } 192 | kPrimaryIndex = false; 193 | } 194 | -------------------------------------------------------------------------------- /src/include/nodes/innerNode/candidate_plr.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file candidate_plr.h 3 | * @author Jiaoyi 4 | * @brief class for piecewise linear regression model 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | #include "../../construct/structures.h" 18 | #include "../../params.h" 19 | 20 | #ifndef NODES_INNERNODE_CANDIDATE_PLR_H_ 21 | #define NODES_INNERNODE_CANDIDATE_PLR_H_ 22 | 23 | /** 24 | * @brief Designed for the piecewise linear regression model. This structure 25 | * records all the contents that need to be stored in the training process of 26 | * the piecewise linear function, which is the item in the dp table. 27 | */ 28 | template 29 | struct SegmentPoint { 30 | /** 31 | * @brief the current cost 32 | */ 33 | float cost = -DBL_MAX; 34 | 35 | /** 36 | * @brief the key values 37 | */ 38 | KeyType key[12] = {KeyType(), KeyType(), KeyType(), KeyType(), 39 | KeyType(), KeyType(), KeyType(), KeyType(), 40 | KeyType(), KeyType(), KeyType(), KeyType()}; 41 | 42 | /** 43 | * @brief the corresponding indexes 44 | */ 45 | int idx[12] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 46 | 47 | /** 48 | * @brief the number of blocks for the dp table in the prefetch prediction 49 | * model 50 | */ 51 | int blockNum[12] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 52 | }; 53 | 54 | /** 55 | * @brief designed for piecewise linear regression model. 
56 | * 57 | * This class stores the cost between each candidate point, the parameters of 58 | * the line segment between two points, entropy, and so on in the process of 59 | * dynamic programming algorithm in P. LR model, which is used to assist and 60 | * accelerate the DP algorithm. 61 | * 62 | * @tparam DataVectorType the vector type of the dataset, each element is a 63 | * pair: {key, value} 64 | */ 65 | template 66 | class CandidateCost { 67 | public: 68 | /** 69 | * @brief Construct a new empty Candidate Cost object 70 | */ 71 | CandidateCost() {} 72 | 73 | /** 74 | * @brief store the slope and intercept of each segment 75 | * 76 | * @param[in] dataset the given dataset, each element is: {key value, y} 77 | * @param[in] index the indexes of candidates 78 | */ 79 | void StoreTheta(const DataVectorType &dataset, 80 | const std::vector &index) { 81 | // store the value of each segment for least squares, used to speed up the 82 | // training process of the linear regression 83 | std::vector xx(index.size(), 0); 84 | std::vector x(index.size(), 0); 85 | std::vector px(index.size(), 0); 86 | std::vector p(index.size(), 0); 87 | xx[0] = 0.0; 88 | x[0] = 0.0; 89 | px[0] = 0.0; 90 | p[0] = 0.0; 91 | for (int i = 1; i < static_cast(index.size()); i++) { 92 | for (int j = index[i - 1]; j < index[i]; j++) { 93 | xx[i] += static_cast(dataset[j].first) * 94 | static_cast(dataset[j].first); 95 | x[i] += static_cast(dataset[j].first); 96 | px[i] += static_cast(dataset[j].first) * 97 | static_cast(dataset[j].second); 98 | p[i] += static_cast(dataset[j].second); 99 | } 100 | xx[i] += xx[i - 1]; 101 | x[i] += x[i - 1]; 102 | px[i] += px[i - 1]; 103 | p[i] += p[i - 1]; 104 | } 105 | xx[index.size() - 1] += 106 | static_cast(dataset[index[index.size() - 1]].first) * 107 | static_cast(dataset[index[index.size() - 1]].first); 108 | x[index.size() - 1] += 109 | static_cast(dataset[index[index.size() - 1]].first); 110 | px[index.size() - 1] += 111 | 
static_cast(dataset[index[index.size() - 1]].first) * 112 | static_cast(dataset[index[index.size() - 1]].second); 113 | p[index.size() - 1] += 114 | static_cast(dataset[index[index.size() - 1]].second); 115 | 116 | // store the parameters of each segment 117 | for (int i = 0; i < index.size() - 1; i++) { 118 | for (int j = i + 1; j < index.size(); j++) { 119 | int tmpSize = index[j] - index[i]; 120 | 121 | double theta1 = 0.0001, theta2 = 0.666; 122 | long double t1 = 0, t2 = 0, t3 = 0, t4 = 0; 123 | t1 = xx[j] - xx[i]; 124 | t2 = x[j] - x[i]; 125 | t3 = px[j] - px[i]; 126 | t4 = p[j] - p[i]; 127 | if (t1 * tmpSize - t2 * t2 == 0) { 128 | if (dataset[index[j]].first - dataset[index[i]].first == 0) { 129 | theta1 = 0; 130 | theta2 = dataset[index[j]].second; 131 | } else { 132 | theta1 = (static_cast(dataset[index[j]].second) - 133 | static_cast(dataset[index[i]].second)) / 134 | (static_cast(dataset[index[j]].first) - 135 | static_cast(dataset[index[i]].first)); 136 | theta2 = static_cast(dataset[index[j]].second) - 137 | theta1 * static_cast(dataset[index[j]].first); 138 | } 139 | } else { 140 | theta1 = (t3 * tmpSize - t2 * t4) / (t1 * tmpSize - t2 * t2); 141 | theta2 = (t1 * t4 - t2 * t3) / (t1 * tmpSize - t2 * t2); 142 | } 143 | if (theta1 <= 0) { 144 | theta1 = std::abs(theta1); 145 | } 146 | 147 | theta.insert({{index[i], index[j]}, {theta1, theta2}}); 148 | } 149 | } 150 | } 151 | 152 | /** 153 | * @brief calculate the entropy of each segment 154 | * 155 | * @param[in] leftIdx the left index of the sub-dataset 156 | * @param[in] rightIdx the right-index of the sub-dataset 157 | * @param[in] y1 158 | * @param[in] y2 159 | * @return double: entropy 160 | */ 161 | double Entropy(int leftIdx, int rightIdx, double y1, double y2) { 162 | auto tmp_theta = theta.find({leftIdx, rightIdx}); 163 | double a = tmp_theta->second.first; 164 | double entropy = -DBL_MAX; 165 | if (a > 0) { 166 | entropy = log2(a) * (y2 - y1); 167 | } 168 | return entropy; 169 | } 170 | 171 | 
public: 172 | //*** Private Data Members of CandidatePLR Objects 173 | /** 174 | * @brief params for the corresponding segment, each element is {{the index of 175 | * the left candidate points in the dataset, the index of the right candidate 176 | * points in the dataset}, {the slope, the intercept}} 177 | */ 178 | std::map, std::pair> theta; 179 | }; 180 | 181 | #endif // NODES_INNERNODE_CANDIDATE_PLR_H_ 182 | -------------------------------------------------------------------------------- /src/include/construct/structures.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file structures.h 3 | * @author Jiaoyi 4 | * @brief structures for CARMI 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_STRUCTURES_H_ 12 | #define CONSTRUCT_STRUCTURES_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #include "../params.h" 21 | 22 | /** 23 | * @brief Root node settings: the type of the root node and the number of child 24 | * nodes 25 | */ 26 | struct RootStruct { 27 | /** 28 | * @brief the type identifier of the root node 29 | */ 30 | int rootType; 31 | 32 | /** 33 | * @brief the number of its child nodes 34 | */ 35 | int rootChildNum; 36 | 37 | /** 38 | * @brief Construct a new Root Struct object and set the values 39 | * 40 | * @param[in] type the type of the root node 41 | * @param[in] c the number of child nodes 42 | */ 43 | RootStruct(int type, int c) { 44 | rootType = type; 45 | rootChildNum = c; 46 | } 47 | }; 48 | 49 | /** 50 | * @brief three parts of the cost: time cost, space cost and the total cost. 51 | * 52 | * The total cost = time cost + lambda * space cost. 53 | */ 54 | struct NodeCost { 55 | /** 56 | * @brief the time cost 57 | */ 58 | double time; 59 | 60 | /** 61 | * @brief the space cost 62 | */ 63 | double space; 64 | 65 | /** 66 | * @brief the total cost: time cost + lambda * space cost. 
67 | */ 68 | double cost; 69 | }; 70 | 71 | /** 72 | * @brief the index range of data points: [left, left + size) 73 | */ 74 | struct IndexPair { 75 | /** 76 | * @brief the left index of data points in the dataset 77 | */ 78 | int left; 79 | 80 | /** 81 | * @brief the size of data points 82 | */ 83 | int size; 84 | 85 | bool operator<(const IndexPair& a) const { 86 | if (left == a.left) 87 | return size < a.size; 88 | else 89 | return left < a.left; 90 | } 91 | }; 92 | 93 | /** 94 | * @brief the index ranges of sub-initDataset, sub-findQuery and 95 | * sub-insertQuery: {initDataset: {left, size}, findQuery: {left, size}, 96 | * insertQuery: {left, size}} 97 | */ 98 | class DataRange { 99 | public: 100 | /** 101 | * @brief the index range of initDataset: {the left index of the sub-dataset 102 | * in the initDataset, the size of the sub-dataset} 103 | */ 104 | IndexPair initRange; 105 | 106 | /** 107 | * @brief the index range of findQuery: {the left index of the sub-dataset 108 | * in the findQuery, the size of the sub-dataset} 109 | */ 110 | IndexPair findRange; 111 | 112 | /** 113 | * @brief the index range of insertQuery: {the left index of the sub-dataset 114 | * in the insertQuery, the size of the sub-dataset} 115 | */ 116 | IndexPair insertRange; 117 | 118 | /** 119 | * @brief Construct a new Data Range object 120 | * 121 | * @param[in] init the index range of sub-initDataset: {the left index of the 122 | * sub-dataset in the initDataset, the size of the sub-dataset} 123 | * @param[in] find the index range of sub-findQuery: {the left index of the 124 | * sub-dataset in the findQuery, the size of the sub-dataset} 125 | * @param[in] insert the index range of sub-insertQuery: {the left index of 126 | * the sub-dataset in the insertQuery, the size of the sub-dataset} 127 | */ 128 | DataRange(IndexPair init, IndexPair find, IndexPair insert) 129 | : initRange(init), findRange(find), insertRange(insert) {} 130 | }; 131 | 132 | /** 133 | * @brief the starting index 
and size of sub-dataset in each child node, each 134 | * element is: {the vector of the sub-initDataset, the vector of the 135 | * sub-findDataset, the vector of sub-insertDataset}. Each sub-dataset is 136 | * represented by: {left, size}, which means the range of it in the dataset is 137 | * [left, left + size). 138 | */ 139 | class SubDataset { 140 | public: 141 | /** 142 | * @brief the IndexPair vector of sub-initDataset, each element is: {the left 143 | * index of the sub-dataset in the initDataset, the size of the sub-dataset} 144 | */ 145 | std::vector subInit; 146 | 147 | /** 148 | * @brief the IndexPair vector of sub-findDataset, each element is: {the left 149 | * index of the sub-dataset in the findDataset, the size of the sub-dataset} 150 | */ 151 | std::vector subFind; 152 | 153 | /** 154 | * @brief the IndexPair vector of sub-insertDataset, each element is: {the 155 | * left index of the sub-dataset in the insertDataset, the size of the 156 | * sub-dataset} 157 | */ 158 | std::vector subInsert; 159 | 160 | /** 161 | * @brief Construct a new SubDataset object and the size of the vector is c 162 | * 163 | * @param[in] c the size of the vector 164 | */ 165 | explicit SubDataset(int c) 166 | : subInit(std::vector(c, {-1, 0})), 167 | subFind(std::vector(c, {-1, 0})), 168 | subInsert(std::vector(c, {-1, 0})) {} 169 | ~SubDataset() {} 170 | }; 171 | 172 | /** 173 | * @brief enumerate type of all node types 174 | */ 175 | enum NodeType { 176 | PLR_ROOT_NODE, 177 | 178 | LR_INNER_NODE, 179 | PLR_INNER_NODE, 180 | HIS_INNER_NODE, 181 | BS_INNER_NODE, 182 | 183 | ARRAY_LEAF_NODE, 184 | EXTERNAL_ARRAY_LEAF_NODE 185 | }; 186 | 187 | /** 188 | * @brief the structure of a data block 189 | * 190 | * This structure is designed for the CF array leaf nodes, so as to make better 191 | * use of the cache mechanism to speed up data access. The size of this class is 192 | * fixed as kMaxLeafNodeSize. 
193 | * 194 | * @tparam KeyType the type of the given key value 195 | * @tparam ValueType the type of the value 196 | */ 197 | template 198 | class LeafSlots { 199 | public: 200 | /** 201 | * @brief the structure of a data block which actually stores the data points, 202 | * its size is determined by the kMaxLeafNodeSize and the type of the data 203 | * point. Each element in slots is: {key value, value}. 204 | */ 205 | std::pair slots[carmi_params::kMaxLeafNodeSize / 206 | sizeof(std::pair)]; 207 | 208 | /** 209 | * @brief Construct a new Leaf Slots object and set the default value of each 210 | * element to the pair of {DBL_MAX, DBL_MAX} 211 | */ 212 | LeafSlots() { 213 | int len = 214 | carmi_params::kMaxLeafNodeSize / sizeof(std::pair); 215 | for (int i = 0; i < len; i++) { 216 | slots[i] = {DBL_MAX, DBL_MAX}; 217 | } 218 | } 219 | 220 | LeafSlots& operator=(const LeafSlots& currnode) { 221 | if (this != &currnode) { 222 | int len = carmi_params::kMaxLeafNodeSize / 223 | sizeof(std::pair); 224 | for (int i = 0; i < len; i++) { 225 | this->slots[i] = currnode.slots[i]; 226 | } 227 | } 228 | return *this; 229 | } 230 | }; 231 | #endif // CONSTRUCT_STRUCTURES_H_ 232 | -------------------------------------------------------------------------------- /src/include/construct/minor_function.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file minor_function.h 3 | * @author Jiaoyi 4 | * @brief the minor functions for constructing CARMI 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_MINOR_FUNCTION_H_ 12 | #define CONSTRUCT_MINOR_FUNCTION_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #include "../carmi.h" 19 | 20 | template 22 | double CARMI::CalculateFrequencyWeight( 23 | const DataRange &dataRange) { 24 | float frequency = 0.0; 25 | // count the frequency of findQuery 26 | int findEnd = dataRange.findRange.left + 
dataRange.findRange.size; 27 | for (int i = dataRange.findRange.left; i < findEnd; i++) 28 | frequency += findQuery[i].second; 29 | // count the frequency of insertQuery 30 | frequency += dataRange.insertRange.size; 31 | // calculate the weighted frequency of this sub-dataset 32 | double frequency_weight = frequency / querySize; 33 | return frequency_weight; 34 | } 35 | 36 | template 38 | double CARMI::CalculateEntropy( 39 | const std::vector &perSize) const { 40 | // the sum of -size(i)*log(size(i)) 41 | double slogs = 0.0; 42 | // the total size of the dataset 43 | int n = 0; 44 | for (int i = 0; i < perSize.size(); i++) { 45 | n += perSize[i].size; 46 | if (perSize[i].size != 0) 47 | slogs += static_cast(perSize[i].size) * (-log2(perSize[i].size)); 48 | } 49 | if (n == 0) { 50 | return -DBL_MAX; 51 | } 52 | 53 | double entropy = slogs / n + log2(n); 54 | return entropy; 55 | } 56 | 57 | template 59 | std::vector CARMI::CalculateCFArrayCost(int size, 61 | int totalPrefetchedNum) { 62 | std::vector cost( 63 | CFArrayType::kMaxBlockNum, 0); 64 | for (int k = 0; 65 | k < CFArrayType::kMaxBlockNum; k++) { 66 | double space = kBaseNodeSpace; 67 | double time = carmi_params::kLeafBaseTime; 68 | if ((k + 1) * CFArrayType::kMaxBlockCapacity >= 70 | size) { 71 | // Case 1: these data points can be prefetched, then the space cost is the 72 | // space cost of allocated data blocks, and the time cost does not 73 | // increase 74 | space += (k + 1) * carmi_params::kMaxLeafNodeSize / 1024.0 / 1024.0; 75 | } else { 76 | // Case 2: these data points cannot be prefetched, then the space cost is 77 | // the space cost of actually needed data blocks and the time cost should 78 | // include the latency of a memory access 79 | int neededBlock = 80 | CFArrayType::CalNeededBlockNum( 81 | size); 82 | space += static_cast(neededBlock) * 83 | carmi_params::kMaxLeafNodeSize / 1024.0 / 1024.0; 84 | time += carmi_params::kMemoryAccessTime; 85 | } 86 | time *= static_cast(size) / 
totalPrefetchedNum; 87 | cost[k] = time + lambda * space; 88 | } 89 | return cost; 90 | } 91 | 92 | template 94 | template 95 | void CARMI::NodePartition( 96 | const InnerNodeType &currnode, const IndexPair &range, 97 | const DataVectorType &dataset, std::vector *subData) const { 98 | int end = range.left + range.size; 99 | for (int i = range.left; i < end; i++) { 100 | int p = currnode.Predict(dataset[i].first); 101 | if (p < 0 || p >= (*subData).size()) { 102 | throw std::out_of_range( 103 | "CARMI::NodePartition: the output of the model is out of range."); 104 | } 105 | 106 | // if this sub-dataset is newly divided, store its leaf index in the dataset 107 | if ((*subData)[p].left == -1) { 108 | (*subData)[p].left = i; 109 | } 110 | // count the size of this sub-dataset 111 | (*subData)[p].size++; 112 | } 113 | } 114 | 115 | template 117 | template 118 | void CARMI::NodePartition( 119 | const InnerNodeType &currnode, const IndexPair &range, 120 | const KeyVectorType &dataset, std::vector *subData) const { 121 | int end = range.left + range.size; 122 | for (int i = range.left; i < end; i++) { 123 | int p = currnode.Predict(dataset[i]); 124 | 125 | // if this sub-dataset is newly divided, store its leaf index in the dataset 126 | if ((*subData)[p].left == -1) { 127 | (*subData)[p].left = i; 128 | } 129 | // count the size of this sub-dataset 130 | (*subData)[p].size++; 131 | } 132 | } 133 | 134 | template 136 | template 137 | InnerNodeType CARMI::InnerDivideAll( 138 | const DataRange &range, int c, SubDataset *subDataset) { 139 | InnerNodeType currnode(c); 140 | int s = range.initRange.left; 141 | int e = range.initRange.size + s; 142 | DataVectorType tmpDataset(initDataset.begin() + s, initDataset.begin() + e); 143 | if (range.insertRange.size > 0) { 144 | s = range.insertRange.left; 145 | e = s + range.insertRange.size; 146 | for (int j = s; j < e; j++) { 147 | tmpDataset.push_back({insertQuery[j], static_cast(DBL_MAX)}); 148 | } 149 | 
std::sort(tmpDataset.begin(), tmpDataset.end()); 150 | } 151 | currnode.Train(0, tmpDataset.size(), tmpDataset); 152 | // split initDataset into c sub-datasets 153 | NodePartition(currnode, range.initRange, initDataset, 154 | &(subDataset->subInit)); 155 | // split findQuery into c sub-datasets 156 | subDataset->subFind = subDataset->subInit; 157 | // split insertQuery into c sub-datasets 158 | NodePartition(currnode, range.insertRange, insertQuery, 159 | &(subDataset->subInsert)); 160 | return currnode; 161 | } 162 | 163 | template 165 | void CARMI::UpdateLeaf() { 166 | if (isPrimary) return; 167 | node.nodeArray[scanLeaf[0]].cfArray.nextLeaf = scanLeaf[1]; 168 | int end = scanLeaf.size() - 1; 169 | node.nodeArray[scanLeaf[end]].cfArray.nextLeaf = -1; 170 | node.nodeArray[scanLeaf[end]].cfArray.previousLeaf = scanLeaf[end - 1]; 171 | for (int i = 1; i < end; i++) { 172 | node.nodeArray[scanLeaf[i]].cfArray.nextLeaf = scanLeaf[i + 1]; 173 | node.nodeArray[scanLeaf[i]].cfArray.previousLeaf = scanLeaf[i - 1]; 174 | } 175 | 176 | std::vector().swap(scanLeaf); 177 | } 178 | 179 | #endif // CONSTRUCT_MINOR_FUNCTION_H_ 180 | -------------------------------------------------------------------------------- /src/unitTest/leafNodeTest/cfarray_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file cfarray_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-03 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #include 13 | 14 | #include "../../include/nodes/leafNode/cfarray_type.h" 15 | #include "gtest/gtest.h" 16 | 17 | const int kTestMaxValue = 10000; 18 | typedef double KeyType; 19 | typedef double ValueType; 20 | typedef CFArrayType CFType; 21 | std::default_random_engine engine(time(0)); 22 | std::uniform_real_distribution dis(0, kTestMaxValue); 23 | 24 | TEST(TestCalNeededBlockNum, CalNeededBlockNum) { 25 | int CFSize = sizeof(CFType); 26 | ASSERT_EQ(64, CFSize); 27 | 
int maxBlockNum = CFType::kMaxBlockNum; 28 | for (int i = 0; i < CFType::kMaxLeafCapacity; i++) { 29 | int needBlockNum = CFType::CalNeededBlockNum(i); 30 | ASSERT_GE(needBlockNum, 0); 31 | ASSERT_LE(needBlockNum, maxBlockNum); 32 | } 33 | } 34 | 35 | TEST(TestSearchDataBlock, CheckSearchBlockRes) { 36 | for (int i = 0; i < CFType::kMaxBlockCapacity; i++) { 37 | std::vector> testTrainData(i); 38 | LeafSlots currblock; 39 | CFType tmpCFNode; 40 | for (int j = 0; j < i; j++) { 41 | KeyType tmpKey = dis(engine); 42 | testTrainData[j] = {tmpKey, tmpKey * 10}; 43 | } 44 | std::sort(testTrainData.begin(), testTrainData.end()); 45 | for (int j = 0; j < i; j++) { 46 | currblock.slots[j] = testTrainData[j]; 47 | } 48 | for (int j = 0; j < i; j++) { 49 | int res = tmpCFNode.SearchDataBlock(currblock, testTrainData[j].first, i); 50 | EXPECT_EQ(testTrainData[res].first, testTrainData[j].first) 51 | << "j:" << j << ",\tres:" << res << ",\ti:" << i << std::endl; 52 | } 53 | } 54 | } 55 | 56 | TEST(TestNormalStoreData, CheckStoreData) { 57 | int maxBlockNum = CFType::kMaxBlockNum; 58 | for (int i = 0; i < CFType::kMaxLeafCapacity; i++) { 59 | std::vector> testTrainData(i); 60 | DataArrayStructure data(maxBlockNum, i); 61 | CFType tmpCFNode; 62 | for (int t = 0; t < CFType::kMaxPerSizeNum; t++) { 63 | ASSERT_EQ(static_cast(tmpCFNode.perSize[t]), 0); 64 | } 65 | for (int j = 0; j < i; j++) { 66 | KeyType tmpKey = dis(engine); 67 | testTrainData[j] = {tmpKey, tmpKey * 10}; 68 | } 69 | std::sort(testTrainData.begin(), testTrainData.end()); 70 | int needBlockNum = CFType::CalNeededBlockNum(i); 71 | int tmpEnd = -1; 72 | auto isSuccess = 73 | tmpCFNode.StoreData(testTrainData, std::vector(i), false, 74 | needBlockNum, 0, &data, &tmpEnd); 75 | ASSERT_TRUE(isSuccess); 76 | for (int j = 0; j < CFType::kMaxBlockNum - 2; j++) { 77 | ASSERT_LE(tmpCFNode.slotkeys[j], tmpCFNode.slotkeys[j + 1]); 78 | } 79 | for (int t = 0; t < CFType::kMaxPerSizeNum; t++) { 80 | 
// Find must return the (block, slot) position of every stored key.
TEST(TestFind, CFArrayFindData) {
  int maxBlockNum = CFType::kMaxBlockNum;
  for (int i = 0; i < CFType::kMaxLeafCapacity; i++) {
    std::vector<std::pair<KeyType, ValueType>> testTrainData(i);
    DataArrayStructure<KeyType, ValueType> data(maxBlockNum, i);
    CFType tmpCFNode;
    for (int j = 0; j < i; j++) {
      KeyType tmpKey = dis(engine);
      testTrainData[j] = {tmpKey, tmpKey * 10};
    }
    std::sort(testTrainData.begin(), testTrainData.end());
    int needBlockNum = CFType::CalNeededBlockNum(i);
    int tmpEnd = -1;
    auto isSuccess =
        tmpCFNode.StoreData(testTrainData, std::vector<int>(i), false,
                            needBlockNum, 0, &data, &tmpEnd);
    // FIX: the original assigned isSuccess but never checked it, so a failed
    // StoreData let the test read uninitialized blocks instead of failing
    // with a clear message.
    ASSERT_TRUE(isSuccess);
    for (int j = 0; j < i; j++) {
      int currblock = 0;
      int currslot = tmpCFNode.Find(data, testTrainData[j].first, &currblock);
      KeyType res =
          data.dataArray[tmpCFNode.m_left + currblock].slots[currslot].first;
      ASSERT_EQ(res, testTrainData[j].first)
          << "j:" << j << ",\tres:" << res << ",\ti:" << i;
    }
  }
}

// Insert must grow the data count by one and keep every block sorted.
TEST(TestInsert, InsertData) {
  int maxBlockNum = CFType::kMaxBlockNum;
  for (int i = 0; i < CFType::kMaxLeafCapacity; i++) {
    std::vector<std::pair<KeyType, ValueType>> testTrainData(i);
    DataArrayStructure<KeyType, ValueType> data(maxBlockNum, i);
    CFType tmpCFNode;
    for (int j = 0; j < i; j++) {
      KeyType tmpKey = dis(engine);
      testTrainData[j] = {tmpKey, tmpKey * 10};
    }
    std::sort(testTrainData.begin(), testTrainData.end());
    int needBlockNum = CFType::CalNeededBlockNum(i);
    int tmpEnd = -1;
    tmpCFNode.StoreData(testTrainData, std::vector<int>(i), false, needBlockNum,
                        0, &data, &tmpEnd);
    KeyType tmpKey = dis(engine);
    std::pair<KeyType, ValueType> datapoint = {tmpKey, tmpKey};
    int currblock = 0, currslot = 0;
    auto isSuccess = tmpCFNode.Insert(datapoint, &currblock, &currslot, &data);
    // Insert may legitimately fail (e.g. a full leaf), so only verify the
    // post-conditions when it reports success.
    if (isSuccess) {
      int m_left = tmpCFNode.m_left;
      // low 24 bits of flagNumber hold the number of blocks of this leaf
      int blockNum = tmpCFNode.flagNumber & 0x00FFFFFF;
      int nowDataNum = CFType::GetDataNum(data, m_left, m_left + blockNum);

      ASSERT_EQ(i + 1, nowDataNum);
      for (int j = m_left; j < m_left + blockNum; j++) {
        for (int k = 0; k < CFType::kMaxBlockCapacity - 1; k++) {
          KeyType l = data.dataArray[j].slots[k].first;
          KeyType r = data.dataArray[j].slots[k + 1].first;
          ASSERT_LE(l, r);
        }
      }
    }
  }
}
0, &data, &tmpEnd); 135 | KeyType tmpKey = dis(engine); 136 | std::pair datapoint = {tmpKey, tmpKey}; 137 | int currblock = 0, currslot = 0; 138 | auto isSuccess = tmpCFNode.Insert(datapoint, &currblock, &currslot, &data); 139 | if (isSuccess) { 140 | int m_left = tmpCFNode.m_left; 141 | int blockNum = tmpCFNode.flagNumber & 0x00FFFFFF; 142 | int nowDataNum = CFType::GetDataNum(data, m_left, m_left + blockNum); 143 | 144 | ASSERT_EQ(i + 1, nowDataNum); 145 | for (int j = m_left; j < m_left + blockNum; j++) { 146 | for (int k = 0; k < CFType::kMaxBlockCapacity - 1; k++) { 147 | KeyType l = data.dataArray[j].slots[k].first; 148 | KeyType r = data.dataArray[j].slots[k + 1].first; 149 | ASSERT_LE(l, r); 150 | } 151 | } 152 | } 153 | } 154 | } 155 | 156 | TEST(TestDelete, DeleteData) { 157 | int maxBlockNum = CFType::kMaxBlockNum; 158 | int size = 90; 159 | std::vector> testTrainData(size); 160 | DataArrayStructure data(maxBlockNum, size); 161 | CFType tmpCFNode; 162 | for (int j = 0; j < size; j++) { 163 | KeyType tmpKey = dis(engine); 164 | testTrainData[j] = {tmpKey, tmpKey * 10}; 165 | } 166 | std::sort(testTrainData.begin(), testTrainData.end()); 167 | int needBlockNum = CFType::CalNeededBlockNum(size); 168 | int tmpEnd = -1; 169 | tmpCFNode.StoreData(testTrainData, std::vector(size), false, 170 | needBlockNum, 0, &data, &tmpEnd); 171 | 172 | for (int j = 0; j < size; j += 5) { 173 | size_t cnt = 0; 174 | auto isSuccess = tmpCFNode.Delete(testTrainData[j].first, &cnt, &data); 175 | ASSERT_TRUE(isSuccess); 176 | ASSERT_GT(cnt, 0); 177 | int m_left = tmpCFNode.m_left; 178 | int blockNum = tmpCFNode.flagNumber & 0x00FFFFFF; 179 | int nowDataNum = CFType::GetDataNum(data, m_left, m_left + blockNum); 180 | for (int j = m_left; j < m_left + blockNum; j++) { 181 | for (int k = 0; k < CFType::kMaxBlockCapacity - 1; k++) { 182 | KeyType l = data.dataArray[j].slots[k].first; 183 | KeyType r = data.dataArray[j].slots[k + 1].first; 184 | ASSERT_LE(l, r); 185 | } 186 | } 187 | 
} 188 | } -------------------------------------------------------------------------------- /src/include/construct/greedy.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file greedy.h 3 | * @author Jiaoyi 4 | * @brief use the greedy node selection algorithm to construct inner nodes 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_GREEDY_H_ 12 | #define CONSTRUCT_GREEDY_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #include "../carmi.h" 21 | #include "../params.h" 22 | #include "./dp_inner.h" 23 | #include "./minor_function.h" 24 | #include "./structures.h" 25 | 26 | template 28 | template 29 | void CARMI::UpdateGreedyOptSetting( 30 | const DataRange &range, int c, double frequency_weight, 31 | NodeCost *optimalCost, InnerNodeType *optimal_node_struct) { 32 | // calculate the basic space cost of the c child nodes of the inner node 33 | double space_cost = kBaseNodeSpace * static_cast(c); 34 | // calculate the time cost of the inner node 35 | double time_cost = InnerNodeType::kTimeCost; 36 | 37 | SubDataset subDataset(c); 38 | InnerNodeType currnode = InnerDivideAll(range, c, &subDataset); 39 | int maxLeafCapacity = carmi_params::kMaxLeafNodeSizeExternal; 40 | if (!isPrimary) { 41 | maxLeafCapacity = 42 | CFArrayType::kMaxLeafCapacity; 43 | } 44 | for (int i = 0; i < c; i++) { 45 | int totalDataNum = 46 | subDataset.subInit[i].size + subDataset.subInsert[i].size; 47 | if (totalDataNum == range.initRange.size + range.insertRange.size) { 48 | return; 49 | } 50 | // if leaf nodes are cf array leaf nodes, add the space cost of data 51 | // blocks to the total space cost 52 | if (!isPrimary) { 53 | int tmpBlockNum = 54 | CFArrayType::CalNeededBlockNum( 55 | totalDataNum); 56 | space_cost += static_cast(tmpBlockNum) * 57 | carmi_params::kMaxLeafNodeSize / 1024.0 / 1024.0; 58 | } 59 | // if the total number of data points 
exceeds the maximum capacity of the 60 | // leaf node, the current node needs at least kMinChildNumber inner nodes to 61 | // manage the data points together 62 | if (totalDataNum > maxLeafCapacity) { 63 | space_cost += kBaseNodeSpace * kMinChildNumber; 64 | time_cost += carmi_params::kMemoryAccessTime * 65 | static_cast(subDataset.subInit[i].size) / 66 | static_cast(range.initRange.size); 67 | } 68 | } 69 | // calculate the entropy of the inner node 70 | double entropy = CalculateEntropy(subDataset.subInit); 71 | double cost = (time_cost + lambda * space_cost / frequency_weight) / entropy; 72 | 73 | // if the current cost is smaller than the optimal cost, update the optimal 74 | // cost and node setting 75 | if (cost <= optimalCost->cost) { 76 | *optimal_node_struct = currnode; 77 | *optimalCost = {time_cost, space_cost, cost}; 78 | } 79 | } 80 | 81 | template 83 | NodeCost CARMI::GreedyAlgorithm( 84 | const DataRange &dataRange) { 85 | // the optimal cost of this sub-dataset 86 | NodeCost optimalCost{DBL_MAX, DBL_MAX, DBL_MAX}; 87 | // the optimal node of this sub-dataset 88 | BaseNode opt_struct; 89 | // calculate the weight of the frequency of this sub-dataset (findQuery and 90 | // insertQury) 91 | double frequency_weight = CalculateFrequencyWeight(dataRange); 92 | int tmpEnd = std::min(0x00FFFFFF, dataRange.initRange.size / 16); 93 | tmpEnd = std::max(tmpEnd, kMinChildNumber); 94 | for (int c = kMinChildNumber; c <= tmpEnd; c *= 2) { 95 | // Case 1: construct a LR inner node, if it is better than the current 96 | // optimal setting, then use it to update the optimal setting 97 | UpdateGreedyOptSetting>( 98 | dataRange, c, frequency_weight, &optimalCost, &(opt_struct.lr)); 99 | // Case 2: construct a P. 
LR inner node, if it is better than the current 100 | // optimal setting, then use it to update the optimal setting 101 | UpdateGreedyOptSetting>( 102 | dataRange, c, frequency_weight, &optimalCost, &(opt_struct.plr)); 103 | // Case 3: construct a His inner node, if it is better than the current 104 | // optimal setting, then use it to update the optimal setting 105 | if (c <= kHisMaxChildNumber) 106 | UpdateGreedyOptSetting>( 107 | dataRange, c, frequency_weight, &optimalCost, &(opt_struct.his)); 108 | // Case 4: construct a BS inner node, if it is better than the current 109 | // optimal setting, then use it to update the optimal setting 110 | if (c <= kBSMaxChildNumber) 111 | UpdateGreedyOptSetting>( 112 | dataRange, c, frequency_weight, &optimalCost, &(opt_struct.bs)); 113 | } 114 | 115 | // use the optimal inner node to divide dataset into childNum sub-datasets 116 | int childNum = opt_struct.lr.flagNumber & 0x00FFFFFF; 117 | int type = opt_struct.lr.flagNumber >> 24; 118 | SubDataset subDataset(childNum); 119 | switch (type) { 120 | case LR_INNER_NODE: { 121 | InnerDivideAll>(dataRange, childNum, 122 | &subDataset); 123 | break; 124 | } 125 | case PLR_INNER_NODE: { 126 | InnerDivideAll>(dataRange, childNum, 127 | &subDataset); 128 | break; 129 | } 130 | case HIS_INNER_NODE: { 131 | InnerDivideAll>(dataRange, childNum, 132 | &subDataset); 133 | break; 134 | } 135 | case BS_INNER_NODE: { 136 | InnerDivideAll>(dataRange, childNum, 137 | &subDataset); 138 | break; 139 | } 140 | } 141 | 142 | // recursively calculate the cost of the child nodes 143 | for (int i = 0; i < childNum; i++) { 144 | NodeCost res = emptyCost; 145 | DataRange range(subDataset.subInit[i], subDataset.subFind[i], 146 | subDataset.subInsert[i]); 147 | // choose the suitable algorithm to construct the sub-tree according to the 148 | // size of the sub-dataset 149 | double minRatio = 0.95; 150 | // record the maximum capacity of the leaf node 151 | int maxStoredNum = 152 | 
CFArrayType::kMaxLeafCapacity; 153 | if (isPrimary) { 154 | maxStoredNum = carmi_params::kMaxLeafNodeSizeExternal; 155 | } 156 | if (range.initRange.size + range.insertRange.size <= 157 | minRatio * maxStoredNum) { 158 | // Case 3: if the size is smaller than the threshold, directly construct a 159 | // leaf node 160 | res = DPLeaf(range); 161 | } else if (subDataset.subInit[i].size + subDataset.subInsert[i].size > 162 | carmi_params::kAlgorithmThreshold) { 163 | res = GreedyAlgorithm(range); 164 | } else { 165 | res = DP(range); 166 | } 167 | optimalCost.cost += res.cost; 168 | optimalCost.time += res.time; 169 | optimalCost.space += res.space; 170 | } 171 | 172 | // store the optimal setting of this sub-dataset 173 | structMap.insert({dataRange.initRange, opt_struct}); 174 | // store the minimum cost of this sub-dataset 175 | COST.insert({dataRange.initRange, optimalCost}); 176 | return optimalCost; 177 | } 178 | #endif // CONSTRUCT_GREEDY_H_ 179 | -------------------------------------------------------------------------------- /src/experiment/workload/workloads_external.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file workloads_external.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-26 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #ifndef EXPERIMENT_WORKLOAD_WORKLOADS_EXTERNAL_H_ 13 | #define EXPERIMENT_WORKLOAD_WORKLOADS_EXTERNAL_H_ 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include "../../include/carmi_external_map.h" 20 | #include "../functions.h" 21 | #include "./public_functions.h" 22 | #include "./zipfian.h" 23 | 24 | extern std::ofstream outRes; 25 | 26 | /** 27 | * @brief write heavy workload for external CARMI, 28 | * a mix of 50/50 reads and writes 29 | * 30 | * @tparam KeyType 31 | * @param[in] isZipfian whether to use zipfian access during the test 32 | * @param[in] findDataset 33 | * @param[in] insertDataset 34 | * @param[inout] 
carmi 35 | */ 36 | template 37 | void WorkloadA(bool isZipfian, const DataVecType &findDataset, 38 | const DataVecType &insertDataset, 39 | CARMIExternalMap *carmi) { 40 | DataVecType findQuery; 41 | DataVecType insertQuery; 42 | std::vector index; 43 | int end = kTestSize * kWriteHeavy; 44 | InitTestSet(findDataset, insertDataset, isZipfian, &findQuery, &insertQuery, 45 | &index); 46 | 47 | std::clock_t s, e; 48 | double tmp; 49 | auto resIte = carmi->end(); 50 | KeyType res = 0; 51 | s = std::clock(); 52 | if (isZipfian) { 53 | for (int i = 0; i < end; i++) { 54 | resIte = carmi->find(findQuery[index[i]].first); 55 | res += resIte.data(); 56 | carmi->insert(insertQuery[i].first); 57 | } 58 | } else { 59 | for (int i = 0; i < end; i++) { 60 | resIte = carmi->find(findQuery[i].first); 61 | res += resIte.data(); 62 | carmi->insert(insertQuery[i].first); 63 | } 64 | } 65 | e = std::clock(); 66 | tmp = (e - s) / static_cast(CLOCKS_PER_SEC); 67 | std::cout << " res: " << res << std::endl; 68 | 69 | PrintAvgTime(tmp); 70 | } 71 | 72 | /** 73 | * @brief read heavy workload for external CARMI, 74 | * a mix of 95/5 reads and writes 75 | * 76 | * @tparam KeyType 77 | * @param[in] isZipfian whether to use zipfian access during the test 78 | * @param[in] findDataset 79 | * @param[in] insertDataset 80 | * @param[inout] carmi 81 | */ 82 | template 83 | void WorkloadB(bool isZipfian, const DataVecType &findDataset, 84 | const DataVecType &insertDataset, 85 | CARMIExternalMap *carmi) { 86 | DataVecType findQuery; 87 | DataVecType insertQuery; 88 | std::vector index; 89 | InitTestSet(findDataset, insertDataset, isZipfian, &findQuery, &insertQuery, 90 | &index); 91 | 92 | int end = round(kTestSize * (1 - kReadHeavy)); 93 | int findCnt = 0; 94 | 95 | std::clock_t s, e; 96 | auto resIte = carmi->end(); 97 | KeyType res = 0; 98 | double tmp; 99 | s = std::clock(); 100 | if (isZipfian) { 101 | for (int i = 0; i < end; i++) { 102 | for (int j = 0; j < 19; j++) { 103 | resIte = 
carmi->find(findQuery[index[findCnt]].first); 104 | res += resIte.data(); 105 | findCnt++; 106 | } 107 | carmi->insert(insertQuery[i].first); 108 | } 109 | } else { 110 | for (int i = 0; i < end; i++) { 111 | for (int j = 0; j < 19 && findCnt < static_cast(findQuery.size()); 112 | j++) { 113 | resIte = carmi->find(findQuery[findCnt++].first); 114 | res += resIte.data(); 115 | } 116 | carmi->insert(insertQuery[i].first); 117 | } 118 | } 119 | e = std::clock(); 120 | tmp = (e - s) / static_cast(CLOCKS_PER_SEC); 121 | 122 | std::cout << " res: " << res << std::endl; 123 | PrintAvgTime(tmp); 124 | } 125 | 126 | /** 127 | * @brief read only workload for external CARMI, 100% read 128 | * 129 | * @tparam KeyType 130 | * @param[in] isZipfian whether to use zipfian access during the test 131 | * @param[in] findDataset 132 | * @param[inout] carmi 133 | */ 134 | template 135 | void WorkloadC(bool isZipfian, const DataVecType &findDataset, 136 | CARMIExternalMap *carmi) { 137 | DataVecType findQuery; 138 | DataVecType insertQuery; 139 | std::vector index; 140 | int end = kTestSize * kReadOnly; 141 | InitTestSet(findDataset, DataVecType(), isZipfian, &findQuery, &insertQuery, 142 | &index); 143 | 144 | std::clock_t s, e; 145 | double tmp; 146 | auto resIte = carmi->end(); 147 | KeyType res = 0; 148 | s = std::clock(); 149 | if (isZipfian) { 150 | for (int i = 0; i < end; i++) { 151 | resIte = carmi->find(findQuery[index[i]].first); 152 | res += resIte.data(); 153 | } 154 | } else { 155 | for (int i = 0; i < end; i++) { 156 | resIte = carmi->find(findQuery[i].first); 157 | res += resIte.data(); 158 | } 159 | } 160 | e = std::clock(); 161 | tmp = (e - s) / static_cast(CLOCKS_PER_SEC); 162 | 163 | std::cout << " res: " << res << std::endl; 164 | 165 | PrintAvgTime(tmp); 166 | } 167 | 168 | /** 169 | * @brief read mostly workload (range scan) for external CARMI, 170 | * a mix of 95/5 reads and writes 171 | * 172 | * @tparam KeyType 173 | * @param[in] isZipfian whether to use 
zipfian access during the test 174 | * @param[in] findDataset 175 | * @param[in] insertDataset 176 | * @param[in] length 177 | * @param[inout] carmi 178 | */ 179 | template 180 | void WorkloadE(bool isZipfian, const DataVecType &findDataset, 181 | const DataVecType &insertDataset, const std::vector &length, 182 | CARMIExternalMap *carmi) { 183 | DataVecType findQuery; 184 | DataVecType insertQuery; 185 | std::vector index; 186 | InitTestSet(findDataset, insertDataset, isZipfian, &findQuery, &insertQuery, 187 | &index); 188 | 189 | int end = round(kTestSize * (1 - kReadHeavy)); 190 | int findCnt = 0; 191 | 192 | std::vector>> ret( 193 | 100, {KeyType(), {KeyType()}}); 194 | std::clock_t s, e; 195 | double tmp; 196 | s = std::clock(); 197 | if (isZipfian) { 198 | for (int i = 0; i < end; i++) { 199 | for (int j = 0; j < 19 && findCnt < static_cast(index.size()); j++) { 200 | auto it = carmi->find(findQuery[index[findCnt]].first); 201 | 202 | for (int l = 0; l < length[index[findCnt]]; l++) { 203 | // ret[l] = *it; 204 | it++; 205 | } 206 | findCnt++; 207 | } 208 | carmi->insert(insertQuery[i].first); 209 | } 210 | } else { 211 | for (int i = 0; i < end; i++) { 212 | for (int j = 0; j < 19 && findCnt < static_cast(findQuery.size()); 213 | j++) { 214 | auto it = carmi->find(findQuery[findCnt].first); 215 | for (int l = 0; l < length[findCnt]; l++) { 216 | // ret[l] = *it; 217 | it++; 218 | } 219 | findCnt++; 220 | } 221 | carmi->insert(insertQuery[i].first); 222 | } 223 | } 224 | e = std::clock(); 225 | tmp = (e - s) / static_cast(CLOCKS_PER_SEC); 226 | 227 | findCnt = 0; 228 | s = std::clock(); 229 | if (isZipfian) { 230 | for (int i = 0; i < end; i++) { 231 | for (int j = 0; j < 19 && findCnt < static_cast(index.size()); j++) { 232 | for (int l = 0; l < length[index[findCnt]]; l++) { 233 | } 234 | findCnt++; 235 | } 236 | } 237 | } else { 238 | for (int i = 0; i < end; i++) { 239 | for (int j = 0; j < 19 && findCnt < static_cast(findQuery.size()); 240 | j++) { 241 | 
for (int l = 0; l < length[findCnt]; l++) { 242 | } 243 | findCnt++; 244 | } 245 | } 246 | } 247 | e = std::clock(); 248 | double tmp0 = (e - s) / static_cast(CLOCKS_PER_SEC); 249 | tmp -= tmp0; 250 | 251 | PrintAvgTime(tmp); 252 | } 253 | #endif // EXPERIMENT_WORKLOAD_WORKLOADS_EXTERNAL_H_ 254 | --------------------------------------------------------------------------------