├── alex └── src.txt ├── rs └── src.txt ├── stx_btree ├── src.txt ├── README.md ├── btree ├── btree_set ├── btree_map ├── btree_multiset └── btree_multimap ├── .DS_Store ├── .gitignore ├── .vscode ├── tasks.json ├── launch.json └── settings.json ├── src ├── experiment.cpp ├── include │ ├── func │ │ ├── get_node_info.h │ │ ├── calculate_space.h │ │ ├── delete_function.h │ │ ├── split_function.h │ │ ├── insert_function.h │ │ └── find_function.h │ ├── construct │ │ ├── dp.h │ │ ├── dp_inner.h │ │ ├── dp_leaf.h │ │ ├── store_node.h │ │ ├── construct_root.h │ │ ├── structures.h │ │ ├── minor_function.h │ │ └── greedy.h │ ├── nodes │ │ ├── rootNode │ │ │ ├── root_nodes.h │ │ │ └── trainModel │ │ │ │ └── linear_regression.h │ │ └── innerNode │ │ │ ├── bs_model.h │ │ │ ├── lr_model.h │ │ │ └── candidate_plr.h │ ├── memoryLayout │ │ ├── empty_block.h │ │ └── node_array.h │ ├── base_node.h │ └── params.h ├── experiment │ ├── dataset │ │ ├── normal_distribution.h │ │ ├── exponential_distribution.h │ │ ├── uniform_distribution.h │ │ ├── lognormal_distribution.h │ │ ├── longlat.h │ │ ├── longitudes.h │ │ ├── osmc.h │ │ ├── ycsb.h │ │ └── base_dataset.h │ ├── workload │ │ ├── public_functions.h │ │ ├── zipfian.h │ │ ├── public_functions.cpp │ │ └── workloads_external.h │ ├── experiment_params.h │ ├── functions.h │ ├── core.cpp │ └── main_experiment.cpp ├── unitTest │ ├── rootNodeTest │ │ ├── lr_test.cpp │ │ └── piecewiseLR_test.cpp │ ├── innerNodeTest │ │ ├── binary_search_test.cpp │ │ ├── linear_regression_test.cpp │ │ ├── piecewise_lr_test.cpp │ │ └── histogram_test.cpp │ ├── leafNodeTest │ │ ├── external_array_test.cpp │ │ └── cfarray_test.cpp │ └── carmiTest │ │ ├── carmi_map_test.cpp │ │ └── carmi_external_map_test.cpp ├── CMakeLists.txt ├── profiler │ ├── binary_search.cpp │ └── inner_node_time.cpp └── example │ └── example.cpp ├── LICENSE └── README.md /alex/src.txt: -------------------------------------------------------------------------------- 1 | 
https://github.com/microsoft/ALEX -------------------------------------------------------------------------------- /rs/src.txt: -------------------------------------------------------------------------------- 1 | https://github.com/learnedsystems/RadixSpline -------------------------------------------------------------------------------- /stx_btree/src.txt: -------------------------------------------------------------------------------- 1 | https://panthema.net/2007/stx-btree/. -------------------------------------------------------------------------------- /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/embryo-labs/CARMI/HEAD/.DS_Store -------------------------------------------------------------------------------- /stx_btree/README.md: -------------------------------------------------------------------------------- 1 | URL: https://github.com/bingmann/stx-btree.git 2 | 3 | Commit ID: 68db9cc6c7bdbc145f99ef323ea3ef031dda4425 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/settings.json 2 | build/* 3 | CMakeFiles/* 4 | .vscode/* 5 | *.csv 6 | Doxyfile 7 | src/.vscode/* 8 | src/build/* 9 | doc/* 10 | alex/alex_base.h 11 | alex/alex_fanout_tree.h 12 | alex/alex_map.h 13 | alex/alex_multimap.h 14 | alex/alex_nodes.h 15 | alex/alex.h 16 | rs/builder.h 17 | rs/common.h 18 | rs/multi_map.h 19 | rs/radix_spline.h 20 | src/include/baseNode 21 | src/include/nodes/innerNode/candidate_plr -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "tasks": [ 3 | { 4 | "type": "shell", 5 | "label": "g++.exe build active file", 6 | "command": "g++", 7 | "args": [ 8 | "-g", 9 | "${file}", 10 | "-std=c++11", 11 | "-o", 12 | // 
"${fileDirname}\\${fileBasenameNoExtension}.exe" 13 | "${fileBasenameNoExtension}.out" 14 | ] 15 | // , 16 | // "options": { 17 | // "cwd": "F:\\TDM-GCC-64\\bin" 18 | // } 19 | } 20 | ], 21 | "version": "2.0.0" 22 | } -------------------------------------------------------------------------------- /src/experiment.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file main.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include "./experiment/functions.h" 17 | 18 | std::ofstream outRes; 19 | 20 | int main() { 21 | kPrimaryIndex = false; 22 | outRes.open("res_1122.csv", std::ios::app); 23 | 24 | time_t timep; 25 | time(&timep); 26 | char tmpTime[64]; 27 | strftime(tmpTime, sizeof(tmpTime), "%Y-%m-%d %H:%M:%S", localtime(&timep)); 28 | std::cout << "\nTest time: " << tmpTime << std::endl; 29 | outRes << "\nTest time: " << tmpTime << std::endl; 30 | 31 | mainExperiment(); 32 | 33 | outRes << "----------------------------------------------" << std::endl; 34 | 35 | return 0; 36 | } 37 | -------------------------------------------------------------------------------- /src/include/func/get_node_info.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file get_node_info.h 3 | * @author Jiaoyi 4 | * @brief get the information of the node in CARMI 5 | * @version 3.0 6 | * @date 2021-10-24 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef FUNC_GET_NODE_INFO_H_ 12 | #define FUNC_GET_NODE_INFO_H_ 13 | #include 14 | 15 | #include "../carmi.h" 16 | 17 | template 19 | int CARMI::GetNodeInfo( 20 | int idx, int *childNumber, int *childStartIndex) { 21 | // Case 1: the index of the node is invalid 22 | if (idx < 0 || idx >= node.nowNodeNumber) { 23 | return -1; 24 | } 25 | // Case 2: the node is valid 26 | int type = 
node.nodeArray[idx].lr.flagNumber >> 24; 27 | *childNumber = node.nodeArray[idx].lr.flagNumber & 0xFFFFFF; 28 | *childStartIndex = node.nodeArray[idx].lr.childLeft; 29 | return type; 30 | } 31 | 32 | #endif // FUNC_GET_NODE_INFO_H_ 33 | -------------------------------------------------------------------------------- /src/experiment/dataset/normal_distribution.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file normal_distribution.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_NORMAL_DISTRIBUTION_H_ 12 | #define EXPERIMENT_DATASET_NORMAL_DISTRIBUTION_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "./base_dataset.h" 21 | 22 | class NormalDataset : public BaseDataset { 23 | public: 24 | explicit NormalDataset(float initRatio) : BaseDataset(initRatio) {} 25 | 26 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 27 | DataVecType *testInsertQuery) { 28 | // create dataset randomly 29 | std::default_random_engine generator; 30 | std::normal_distribution distribution(0.0, 1.0); 31 | 32 | SplitInitTest>( 33 | distribution, initDataset, insertDataset, testInsertQuery); 34 | } 35 | }; 36 | 37 | #endif // EXPERIMENT_DATASET_NORMAL_DISTRIBUTION_H_ 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 JiaoyiZhang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to 
permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/experiment/dataset/exponential_distribution.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file exponential_distribution.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_EXPONENTIAL_DISTRIBUTION_H_ 12 | #define EXPERIMENT_DATASET_EXPONENTIAL_DISTRIBUTION_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "./base_dataset.h" 21 | 22 | class ExponentialDataset : public BaseDataset { 23 | public: 24 | explicit ExponentialDataset(float initRatio) : BaseDataset(initRatio) {} 25 | 26 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 27 | DataVecType *testInsertQuery) { 28 | // create dataset randomly 29 | std::default_random_engine generator; 30 | std::exponential_distribution distribution(0.25); 31 | 32 | SplitInitTest>( 33 | distribution, initDataset, insertDataset, testInsertQuery); 34 | } 35 | }; 36 | 37 | #endif // EXPERIMENT_DATASET_EXPONENTIAL_DISTRIBUTION_H_ 38 | 
-------------------------------------------------------------------------------- /src/experiment/dataset/uniform_distribution.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file uniform_distribution.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-15 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_UNIFORM_DISTRIBUTION_H_ 12 | #define EXPERIMENT_DATASET_UNIFORM_DISTRIBUTION_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "./base_dataset.h" 22 | 23 | class UniformDataset : public BaseDataset { 24 | public: 25 | explicit UniformDataset(float initRatio) : BaseDataset(initRatio) {} 26 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 27 | DataVecType *testInsertQuery) { 28 | std::default_random_engine generator; 29 | std::uniform_real_distribution distribution(0.0, 1.0); 30 | 31 | SplitInitTest>( 32 | distribution, initDataset, insertDataset, testInsertQuery); 33 | return; 34 | } 35 | }; 36 | 37 | #endif // EXPERIMENT_DATASET_UNIFORM_DISTRIBUTION_H_ 38 | -------------------------------------------------------------------------------- /src/experiment/workload/public_functions.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file public_functions.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-04-07 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_WORKLOAD_PUBLIC_FUNCTIONS_H_ 12 | #define EXPERIMENT_WORKLOAD_PUBLIC_FUNCTIONS_H_ 13 | 14 | #include 15 | #include 16 | 17 | #include "../../include/carmi_map.h" 18 | #include "../experiment_params.h" 19 | #include "./zipfian.h" 20 | 21 | extern std::ofstream outRes; 22 | 23 | /** 24 | * @brief prepare query workloads 25 | * 26 | * @param[in] findQueryset 27 | * @param[in] insertDataset 28 | * @param[inout] findQuery 29 | 
* @param[inout] insertQuery 30 | * @param[inout] index 31 | */ 32 | void InitTestSet(const DataVecType &findQueryset, 33 | const DataVecType &insertDataset, bool isZipfian, 34 | DataVecType *findQuery, DataVecType *insertQuery, 35 | std::vector *index); 36 | 37 | /** 38 | * @brief print the average time of the workload 39 | * 40 | * @param[in] time 41 | */ 42 | void PrintAvgTime(double time); 43 | 44 | #endif // EXPERIMENT_WORKLOAD_PUBLIC_FUNCTIONS_H_ 45 | -------------------------------------------------------------------------------- /src/experiment/dataset/lognormal_distribution.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file lognormal_distribution.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_LOGNORMAL_DISTRIBUTION_H_ 12 | #define EXPERIMENT_DATASET_LOGNORMAL_DISTRIBUTION_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "./base_dataset.h" 21 | 22 | class LognormalDataset : public BaseDataset { 23 | public: 24 | explicit LognormalDataset(float initRatio) : BaseDataset(initRatio) {} 25 | 26 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 27 | DataVecType *testInsertQuery) { 28 | // create dataset randomly 29 | std::default_random_engine generator; 30 | std::lognormal_distribution distribution(0.0, 1.0); 31 | 32 | SplitInitTest>( 33 | distribution, initDataset, insertDataset, testInsertQuery); 34 | } 35 | }; 36 | 37 | #endif // EXPERIMENT_DATASET_LOGNORMAL_DISTRIBUTION_H_ 38 | -------------------------------------------------------------------------------- /src/experiment/workload/zipfian.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file zipfian.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 
9 | * 10 | */ 11 | #ifndef EXPERIMENT_WORKLOAD_ZIPFIAN_H_ 12 | #define EXPERIMENT_WORKLOAD_ZIPFIAN_H_ 13 | 14 | #include 15 | #include 16 | 17 | #include 18 | #include 19 | class Zipfian { 20 | public: 21 | double *pf; 22 | void InitZipfian(double A, int num) { 23 | pf = new double[num]; 24 | double sum = 0.0; 25 | for (int i = 0; i < num; i++) { 26 | sum += 1 / pow(static_cast(i + 2), A); 27 | } 28 | for (int i = 0; i < num; i++) { 29 | if (i == 0) 30 | pf[i] = 1 / pow(static_cast(i + 2), A) / sum; 31 | else 32 | pf[i] = pf[i - 1] + 1 / pow(static_cast(i + 2), A) / sum; 33 | } 34 | } 35 | 36 | int GenerateNextIndex() { 37 | int index = 0; 38 | std::default_random_engine e(time(0)); 39 | std::uniform_real_distribution dis(0, 1); 40 | double data = dis(e); // 0-1 41 | while (data > pf[index]) index++; 42 | return index; 43 | } 44 | }; 45 | 46 | #endif // EXPERIMENT_WORKLOAD_ZIPFIAN_H_ 47 | -------------------------------------------------------------------------------- /src/unitTest/rootNodeTest/lr_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file lr_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-03 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #include "../../experiment/dataset/lognormal_distribution.h" 13 | #include "../../include/nodes/rootNode/trainModel/linear_regression.h" 14 | #include "gtest/gtest.h" 15 | 16 | std::vector> initData; 17 | std::vector> insertData; 18 | std::vector> testInsert; 19 | 20 | const int kChildNum = 512; 21 | const int kTestMaxValue = kMaxValue; 22 | 23 | LognormalDataset logData(0.9); 24 | LinearRegression model; 25 | 26 | TEST(TestTrain, TrainLRModel) { 27 | logData.GenerateDataset(&initData, &insertData, &testInsert); 28 | model.maxChildIdx = kChildNum - 1; 29 | model.Train(initData); 30 | EXPECT_EQ(kChildNum - 1, model.maxChildIdx); 31 | } 32 | 33 | TEST(TestPredictInitData, PredictInitData) { 34 | for (int i = 0; i 
< initData.size(); i++) { 35 | int p = model.Predict(initData[i].first); 36 | EXPECT_GE(p, 0); 37 | EXPECT_LT(p, kChildNum); 38 | } 39 | } 40 | 41 | TEST(TestPredictInsertData, PredictInsertData) { 42 | for (int i = 0; i < insertData.size(); i++) { 43 | int p = model.Predict(insertData[i].first); 44 | EXPECT_GE(p, 0); 45 | EXPECT_LT(p, kChildNum); 46 | } 47 | } -------------------------------------------------------------------------------- /src/experiment/experiment_params.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file experiment_params.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-05-19 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #ifndef EXPERIMENT_EXPERIMENT_PARAMS_H_ 13 | #define EXPERIMENT_EXPERIMENT_PARAMS_H_ 14 | 15 | #define PARAM_ZIPFIAN 0.99 16 | #define DEBUG 17 | // #define TEST_UINT64 18 | 19 | #include 20 | #include 21 | 22 | #ifdef TEST_UINT64 23 | typedef uint64_t KeyType; 24 | typedef uint64_t ValueType; 25 | #else 26 | typedef double KeyType; 27 | typedef double ValueType; 28 | #endif // TEST_UINT 29 | 30 | typedef std::pair DataType; 31 | typedef std::vector DataVecType; 32 | 33 | static bool kPrimaryIndex = false; 34 | 35 | const int kDatasetSize = 36 | 1024.0 / sizeof(DataType) * 1024 * 1024; // 1 GB / 16 byte 37 | const float kTestSize = 100000.0; 38 | const float kMaxValue = 100000000; 39 | 40 | const float kReadOnly = 1; 41 | const float kWriteHeavy = 0.5; 42 | const float kReadHeavy = 0.95; 43 | const float kWritePartial = 0.85; 44 | const float kRangeScan = 2; 45 | 46 | const float kSecondToNanosecond = 1000000000.0; 47 | 48 | #ifdef DEBUG 49 | const std::vector rate = {0.025}; 50 | #else 51 | const std::vector rate = {0.01, 0.02, 0.025, 0.03, 0.05, 0.1}; 52 | #endif // !DEBUG 53 | 54 | #endif // EXPERIMENT_EXPERIMENT_PARAMS_H_ 55 | -------------------------------------------------------------------------------- 
/src/unitTest/rootNodeTest/piecewiseLR_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file piecewiseLR_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-03 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #include "../../include/nodes/rootNode/trainModel/piecewiseLR.h" 13 | 14 | #include "../../experiment/dataset/lognormal_distribution.h" 15 | #include "gtest/gtest.h" 16 | 17 | std::vector> initData; 18 | std::vector> insertData; 19 | std::vector> testInsert; 20 | 21 | const int kChildNum = 512; 22 | const int kTestMaxValue = kMaxValue; 23 | 24 | LognormalDataset logData(0.9); 25 | PiecewiseLR model; 26 | 27 | TEST(TestTrain, TrainPLRModel) { 28 | logData.GenerateDataset(&initData, &insertData, &testInsert); 29 | model.maxChildIdx = kChildNum - 1; 30 | model.Train(initData); 31 | EXPECT_EQ(kChildNum - 1, model.maxChildIdx); 32 | } 33 | 34 | TEST(TestPredictInitData, PredictInitData) { 35 | for (int i = 0; i < initData.size(); i++) { 36 | int p = model.Predict(initData[i].first); 37 | EXPECT_GE(p, 0); 38 | EXPECT_LT(p, kChildNum); 39 | } 40 | } 41 | 42 | TEST(TestPredictInsertData, PredictInsertData) { 43 | for (int i = 0; i < insertData.size(); i++) { 44 | int p = model.Predict(insertData[i].first); 45 | EXPECT_GE(p, 0); 46 | EXPECT_LT(p, kChildNum); 47 | } 48 | } -------------------------------------------------------------------------------- /src/include/func/calculate_space.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file calculate_space.h 3 | * @author Jiaoyi 4 | * @brief calculate the space of CARMI 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef FUNC_CALCULATE_SPACE_H_ 12 | #define FUNC_CALCULATE_SPACE_H_ 13 | 14 | #include 15 | 16 | #include "../carmi.h" 17 | #include "../params.h" 18 | 19 | template 21 | long long 
CARMI::CalculateSpace() const { 22 | // calculate the space of the plr root node 23 | long long space_cost = kPLRRootSpace * 1024.0 * 1024.0; 24 | // calculate the space of the node array 25 | space_cost += kBaseNodeSpace * node.nowNodeNumber * 1024.0 * 1024.0; 26 | #ifdef DEBUG 27 | std::cout << "node.size(), " << node.nodeArray.size() << ",\tnowChildNumber," 28 | << node.nowNodeNumber << std::endl; 29 | std::cout << "data.size(), " << data.dataArray.size() 30 | << ",\tkMaxLeafNodeSize," << carmi_params::kMaxLeafNodeSize 31 | << std::endl; 32 | #endif // DEBUG 33 | 34 | if (!isPrimary) { 35 | // calculate the space of the data array 36 | space_cost += static_cast(data.dataArray.size()) * 37 | carmi_params::kMaxLeafNodeSize; 38 | } 39 | return space_cost; 40 | } 41 | 42 | #endif // FUNC_CALCULATE_SPACE_H_ 43 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | # Ubuntu 3 | 4 | cmake_minimum_required(VERSION 3.0) 5 | project(CARMI) 6 | set(CMAKE_CXX_STANDARD 17) 7 | 8 | aux_source_directory(experiment/workload source_list_workload) 9 | SET(CMAKE_BUILD_TYPE "Release") 10 | 11 | # add_executable(CARMI profiler/inner_node_time.cpp) 12 | # add_executable(CARMI profiler/leaf_node_time.cpp) 13 | add_executable(CARMI experiment.cpp experiment/core.cpp experiment/main_experiment.cpp ${source_list_workload} ) 14 | target_link_libraries(CARMI) 15 | 16 | ## Test 17 | # add_executable(CARMI unitTest/carmiTest/carmi_map_test.cpp) 18 | # add_executable(CARMI unitTest/carmiTest/carmi_external_map_test.cpp) 19 | # add_executable(CARMI unitTest/carmiTest/map_test.cpp) 20 | # add_executable(CARMI unitTest/carmiTest/externalmap_test.cpp) 21 | 22 | ## rootNode 23 | # add_executable(CARMI unitTest/rootNodeTest/piecewiseLR_test.cpp) 24 | # add_executable(CARMI unitTest/rootNodeTest/lr_test.cpp) 25 | 26 | ## innerNode 27 | # add_executable(CARMI 
unitTest/innerNodeTest/linear_regression_test.cpp) 28 | # add_executable(CARMI unitTest/innerNodeTest/piecewise_lr_test.cpp) 29 | # add_executable(CARMI unitTest/innerNodeTest/histogram_test.cpp) 30 | # add_executable(CARMI unitTest/innerNodeTest/binary_search_test.cpp) 31 | 32 | ## leafNode 33 | # add_executable(CARMI unitTest/leafNodeTest/cfarray_test.cpp) 34 | # add_executable(CARMI unitTest/leafNodeTest/external_array_test.cpp) 35 | 36 | 37 | # target_link_libraries(CARMI gtest_main gtest pthread) -------------------------------------------------------------------------------- /src/experiment/dataset/longlat.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file longlat.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_LONGLAT_H_ 12 | #define EXPERIMENT_DATASET_LONGLAT_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "./base_dataset.h" 25 | class LonglatDataset : public BaseDataset { 26 | public: 27 | explicit LonglatDataset(float initRatio) : BaseDataset(initRatio) {} 28 | 29 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 30 | DataVecType *testInsertQuery) { 31 | DataVecType ds; 32 | std::ifstream inFile("../experiment/dataset/longlat.csv", std::ios::in); 33 | if (!inFile) { 34 | std::cout << "open longlat.csv failed" << std::endl; 35 | exit(1); 36 | } 37 | std::string line; 38 | while (getline(inFile, line)) { 39 | if (line.empty()) continue; 40 | std::istringstream sin(line); 41 | std::vector fields; 42 | std::string field; 43 | while (getline(sin, field, ',')) fields.push_back(field); 44 | std::string key = fields[0]; 45 | std::string value = fields[1]; 46 | double k = stod(key); 47 | double v = stod(value); 48 | ds.push_back({k, v}); 49 | if (ds.size() == 
kDatasetSize + round(kTestSize * (1 - proportion))) { 50 | break; 51 | } 52 | } 53 | 54 | SplitInitTest(&ds, initDataset, insertDataset, testInsertQuery); 55 | } 56 | }; 57 | 58 | #endif // EXPERIMENT_DATASET_LONGLAT_H_ 59 | -------------------------------------------------------------------------------- /src/experiment/dataset/longitudes.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file longitudes.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_LONGITUDES_H_ 12 | #define EXPERIMENT_DATASET_LONGITUDES_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "./base_dataset.h" 25 | class LongitudesDataset : public BaseDataset { 26 | public: 27 | explicit LongitudesDataset(float initRatio) : BaseDataset(initRatio) {} 28 | 29 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 30 | DataVecType *testInsertQuery) { 31 | DataVecType ds; 32 | std::ifstream inFile("../experiment/dataset/longitude.csv", std::ios::in); 33 | if (!inFile) { 34 | std::cout << "open longitude.csv failed" << std::endl; 35 | exit(1); 36 | } 37 | std::string line; 38 | while (getline(inFile, line)) { 39 | if (line.empty()) continue; 40 | std::istringstream sin(line); 41 | std::vector fields; 42 | std::string field; 43 | while (getline(sin, field, ',')) fields.push_back(field); 44 | std::string key = fields[0]; 45 | std::string value = fields[1]; 46 | double k = stod(key); 47 | double v = stod(value); 48 | ds.push_back({k, v}); 49 | if (ds.size() == kDatasetSize + round(kTestSize * (1 - proportion))) { 50 | break; 51 | } 52 | } 53 | 54 | SplitInitTest(&ds, initDataset, insertDataset, testInsertQuery); 55 | } 56 | }; 57 | 58 | #endif // EXPERIMENT_DATASET_LONGITUDES_H_ 59 | 
-------------------------------------------------------------------------------- /src/experiment/dataset/osmc.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file osmc.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-12-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_OSMC_H_ 12 | #define EXPERIMENT_DATASET_OSMC_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "./base_dataset.h" 25 | class OsmcDataset : public BaseDataset { 26 | public: 27 | explicit OsmcDataset(float initRatio) : BaseDataset(initRatio) {} 28 | 29 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 30 | DataVecType *testInsertQuery) { 31 | DataVecType ds; 32 | std::ifstream inFile("../experiment/dataset/osmc.csv", std::ios::in); 33 | if (!inFile) { 34 | std::cout << "open osmc.csv failed" << std::endl; 35 | exit(1); 36 | } 37 | std::string line; 38 | while (getline(inFile, line)) { 39 | if (line.empty()) continue; 40 | std::istringstream sin(line); 41 | std::vector fields; 42 | std::string field; 43 | while (getline(sin, field, ',')) fields.push_back(field); 44 | std::string key = fields[0]; 45 | std::string value = fields[1]; 46 | uint64_t k, v; 47 | std::stringstream strK, strV; 48 | strK << key; 49 | strK >> k; 50 | strV << value; 51 | strV >> v; 52 | ds.push_back({k, v}); 53 | if (ds.size() == kDatasetSize + round(kTestSize * (1 - proportion))) { 54 | break; 55 | } 56 | } 57 | 58 | SplitInitTest(&ds, initDataset, insertDataset, testInsertQuery); 59 | } 60 | }; 61 | 62 | #endif // EXPERIMENT_DATASET_OSMC_H_ 63 | -------------------------------------------------------------------------------- /src/experiment/workload/public_functions.cpp: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file 
public_functions.cpp 4 | * @author Jiaoyi 5 | * @brief 6 | * @version 3.0 7 | * @date 2021-04-07 8 | * 9 | * @copyright Copyright (c) 2021 10 | * 11 | */ 12 | 13 | #include "public_functions.h" 14 | 15 | #include 16 | 17 | #include "../experiment_params.h" 18 | 19 | /** 20 | * @brief prepare query workloads 21 | * 22 | * @param[in] findQueryset 23 | * @param[in] insertDataset 24 | * @param[inout] findQuery 25 | * @param[inout] insertQuery 26 | * @param[inout] index 27 | */ 28 | void InitTestSet(const DataVecType &findQueryset, 29 | const DataVecType &insertDataset, bool isZipfian, 30 | DataVecType *findQuery, DataVecType *insertQuery, 31 | std::vector *index) { 32 | (*findQuery) = findQueryset; 33 | (*insertQuery) = insertDataset; 34 | 35 | std::default_random_engine engine; 36 | 37 | unsigned seed = std::clock(); 38 | engine = std::default_random_engine(seed); 39 | shuffle((*findQuery).begin(), (*findQuery).end(), engine); 40 | 41 | if (!kPrimaryIndex) { 42 | unsigned seed1 = std::clock(); 43 | engine = std::default_random_engine(seed1); 44 | shuffle((*insertQuery).begin(), (*insertQuery).end(), engine); 45 | } 46 | 47 | if (isZipfian) { 48 | Zipfian zip; 49 | zip.InitZipfian(PARAM_ZIPFIAN, (*findQuery).size()); 50 | *index = std::vector(kTestSize, 0); 51 | for (int i = 0; i < kTestSize; i++) { 52 | int idx = zip.GenerateNextIndex(); 53 | (*index)[i] = idx; 54 | } 55 | } 56 | } 57 | 58 | /** 59 | * @brief print the average time of the workload 60 | * 61 | * @param[in] time 62 | */ 63 | void PrintAvgTime(double time) { 64 | std::cout << "average time," << time * kSecondToNanosecond / kTestSize 65 | << std::endl; 66 | outRes << time * kSecondToNanosecond / kTestSize << ","; 67 | } 68 | -------------------------------------------------------------------------------- /stx_btree/btree: -------------------------------------------------------------------------------- 1 | // -*- mode: c++ -*- 2 | 
/******************************************************************************* 3 | * include/stx/btree 4 | * 5 | * STX B+ Tree Template Classes v0.9 6 | * Copyright (C) 2008-2013 Timo Bingmann 7 | * 8 | * Boost Software License - Version 1.0 - August 17th, 2003 9 | * 10 | * Permission is hereby granted, free of charge, to any person or organization 11 | * obtaining a copy of the software and accompanying documentation covered by 12 | * this license (the "Software") to use, reproduce, display, distribute, 13 | * execute, and transmit the Software, and to prepare derivative works of the 14 | * Software, and to permit third-parties to whom the Software is furnished to 15 | * do so, all subject to the following: 16 | * 17 | * The copyright notices in the Software and this entire statement, including 18 | * the above license grant, this restriction and the following disclaimer, must 19 | * be included in all copies of the Software, in whole or in part, and all 20 | * derivative works of the Software, unless such copies or derivative works are 21 | * solely in the form of machine-executable object code generated by a source 22 | * language processor. 23 | * 24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 27 | * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 28 | * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 29 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 30 | * DEALINGS IN THE SOFTWARE. 
31 | ******************************************************************************/ 32 | 33 | #ifndef _STX_BTREE_ 34 | #define _STX_BTREE_ 35 | 36 | /** \file btree 37 | * Forwarder header to btree.h 38 | */ 39 | 40 | #include 41 | 42 | #endif // _STX_BTREE_ 43 | 44 | /******************************************************************************/ 45 | -------------------------------------------------------------------------------- /stx_btree/btree_set: -------------------------------------------------------------------------------- 1 | // -*- mode: c++ -*- 2 | /******************************************************************************* 3 | * include/stx/btree_set 4 | * 5 | * STX B+ Tree Template Classes v0.9 6 | * Copyright (C) 2008-2013 Timo Bingmann 7 | * 8 | * Boost Software License - Version 1.0 - August 17th, 2003 9 | * 10 | * Permission is hereby granted, free of charge, to any person or organization 11 | * obtaining a copy of the software and accompanying documentation covered by 12 | * this license (the "Software") to use, reproduce, display, distribute, 13 | * execute, and transmit the Software, and to prepare derivative works of the 14 | * Software, and to permit third-parties to whom the Software is furnished to 15 | * do so, all subject to the following: 16 | * 17 | * The copyright notices in the Software and this entire statement, including 18 | * the above license grant, this restriction and the following disclaimer, must 19 | * be included in all copies of the Software, in whole or in part, and all 20 | * derivative works of the Software, unless such copies or derivative works are 21 | * solely in the form of machine-executable object code generated by a source 22 | * language processor. 23 | * 24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT 27 | * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 28 | * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 29 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 30 | * DEALINGS IN THE SOFTWARE. 31 | ******************************************************************************/ 32 | 33 | #ifndef _STX_BTREE_SET_ 34 | #define _STX_BTREE_SET_ 35 | 36 | /** \file btree_set 37 | * Forwarder header to btree_set.h 38 | */ 39 | 40 | #include 41 | 42 | #endif // _STX_BTREE_SET_ 43 | 44 | /******************************************************************************/ 45 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // 使用 IntelliSense 了解相关属性。 3 | // 悬停以查看现有属性的描述。 4 | // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "(gdb) 启动", 9 | "type": "cppdbg", 10 | "request": "launch", 11 | // "program": "${workspaceFolder}/${fileBasenameNoExtension}.out", 12 | "program": "${workspaceFolder}/build/CARMI", 13 | "args": [], 14 | "stopAtEntry": false, 15 | "cwd": "${workspaceFolder}", 16 | "environment": [], 17 | "externalConsole": true, 18 | "MIMode": "gdb", 19 | "preLaunchTask": "CARMI", 20 | "setupCommands": [ 21 | { 22 | "description": "Enable pretty-printing for gdb", 23 | "text": "-enable-pretty-printing", 24 | "ignoreFailures": true 25 | } 26 | ], 27 | "sourceFileMap": { 28 | "/build/glibc-ZN95T4": "/usr/src/glibc" 29 | } 30 | } 31 | // , 32 | // { 33 | // "name": "g++.exe build and debug active file", 34 | // "type": "cppdbg", 35 | // "request": "launch", 36 | // "program": "${fileDirname}\\${fileBasenameNoExtension}.exe", 37 | // "args": [], 38 | // "stopAtEntry": false, 39 | // "cwd": "${workspaceFolder}", 40 | // "environment": [], 41 | // 
"externalConsole": false, 42 | // "MIMode": "gdb", 43 | // "miDebuggerPath": "F:\\TDM-GCC-64\\bin\\gdb.exe", 44 | // "setupCommands": [ 45 | // { 46 | // "description": "为 gdb 启用整齐打印", 47 | // "text": "-enable-pretty-printing", 48 | // "ignoreFailures": true 49 | // } 50 | // ], 51 | // "preLaunchTask": "g++.exe build active file" 52 | // } 53 | ] 54 | } -------------------------------------------------------------------------------- /stx_btree/btree_map: -------------------------------------------------------------------------------- 1 | // -*- mode: c++ -*- 2 | /******************************************************************************* 3 | * include/stx/btree_map 4 | * 5 | * STX B+ Tree Template Classes v0.9 6 | * Copyright (C) 2008-2013 Timo Bingmann 7 | * 8 | * Boost Software License - Version 1.0 - August 17th, 2003 9 | * 10 | * Permission is hereby granted, free of charge, to any person or organization 11 | * obtaining a copy of the software and accompanying documentation covered by 12 | * this license (the "Software") to use, reproduce, display, distribute, 13 | * execute, and transmit the Software, and to prepare derivative works of the 14 | * Software, and to permit third-parties to whom the Software is furnished to 15 | * do so, all subject to the following: 16 | * 17 | * The copyright notices in the Software and this entire statement, including 18 | * the above license grant, this restriction and the following disclaimer, must 19 | * be included in all copies of the Software, in whole or in part, and all 20 | * derivative works of the Software, unless such copies or derivative works are 21 | * solely in the form of machine-executable object code generated by a source 22 | * language processor. 23 | * 24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT 27 | * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 28 | * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 29 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 30 | * DEALINGS IN THE SOFTWARE. 31 | ******************************************************************************/ 32 | 33 | #ifndef _STX_BTREE_MAP_ 34 | #define _STX_BTREE_MAP_ 35 | 36 | /** \file btree_map 37 | * Forwarder header to btree_map.h 38 | */ 39 | 40 | #include 41 | 42 | #endif // _STX_BTREE_MAP_ 43 | 44 | /******************************************************************************/ 45 | -------------------------------------------------------------------------------- /stx_btree/btree_multiset: -------------------------------------------------------------------------------- 1 | // -*- mode: c++ -*- 2 | /******************************************************************************* 3 | * include/stx/btree_multiset 4 | * 5 | * STX B+ Tree Template Classes v0.9 6 | * Copyright (C) 2008-2013 Timo Bingmann 7 | * 8 | * Boost Software License - Version 1.0 - August 17th, 2003 9 | * 10 | * Permission is hereby granted, free of charge, to any person or organization 11 | * obtaining a copy of the software and accompanying documentation covered by 12 | * this license (the "Software") to use, reproduce, display, distribute, 13 | * execute, and transmit the Software, and to prepare derivative works of the 14 | * Software, and to permit third-parties to whom the Software is furnished to 15 | * do so, all subject to the following: 16 | * 17 | * The copyright notices in the Software and this entire statement, including 18 | * the above license grant, this restriction and the following disclaimer, must 19 | * be included in all copies of the Software, in whole or in part, and all 20 | * derivative works of the Software, unless such copies or derivative works are 21 | * solely in the form of machine-executable 
object code generated by a source 22 | * language processor. 23 | * 24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 27 | * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 28 | * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 29 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 30 | * DEALINGS IN THE SOFTWARE. 31 | ******************************************************************************/ 32 | 33 | #ifndef _STX_BTREE_MULTISET_ 34 | #define _STX_BTREE_MULTISET_ 35 | 36 | /** \file btree_multiset 37 | * Forwarder header to btree_multiset.h 38 | */ 39 | 40 | #include 41 | 42 | #endif // _STX_BTREE_MULTISET_ 43 | 44 | /******************************************************************************/ 45 | -------------------------------------------------------------------------------- /stx_btree/btree_multimap: -------------------------------------------------------------------------------- 1 | // -*- mode: c++ -*- 2 | /******************************************************************************* 3 | * include/stx/btree_multimap 4 | * 5 | * STX B+ Tree Template Classes v0.9 6 | * Copyright (C) 2008-2013 Timo Bingmann 7 | * 8 | * Boost Software License - Version 1.0 - August 17th, 2003 9 | * 10 | * Permission is hereby granted, free of charge, to any person or organization 11 | * obtaining a copy of the software and accompanying documentation covered by 12 | * this license (the "Software") to use, reproduce, display, distribute, 13 | * execute, and transmit the Software, and to prepare derivative works of the 14 | * Software, and to permit third-parties to whom the Software is furnished to 15 | * do so, all subject to the following: 16 | * 17 | * The copyright notices in the Software and this 
entire statement, including 18 | * the above license grant, this restriction and the following disclaimer, must 19 | * be included in all copies of the Software, in whole or in part, and all 20 | * derivative works of the Software, unless such copies or derivative works are 21 | * solely in the form of machine-executable object code generated by a source 22 | * language processor. 23 | * 24 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 27 | * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 28 | * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 29 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 30 | * DEALINGS IN THE SOFTWARE. 31 | ******************************************************************************/ 32 | 33 | #ifndef _STX_BTREE_MULTIMAP_ 34 | #define _STX_BTREE_MULTIMAP_ 35 | 36 | /** \file btree_multimap 37 | * Forwarder header to btree_multimap.h 38 | */ 39 | 40 | #include 41 | 42 | #endif // _STX_BTREE_MULTIMAP_ 43 | 44 | /******************************************************************************/ 45 | -------------------------------------------------------------------------------- /src/profiler/binary_search.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file binary_search.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-05-24 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | const int kSize = 100000000; 19 | const float kSecondToNanosecond = 1000000000.0; 20 | std::vector data; 21 | std::vector idx; 22 | const int end = kSize / 64 - 8; 23 | 24 | inline int BinarySearch(double key, int start, int 
end) { 25 | while (start < end) { 26 | int mid = (start + end) / 2; 27 | if (data[mid] < key) 28 | start = mid + 1; 29 | else 30 | end = mid; 31 | } 32 | return start; 33 | } 34 | 35 | void GetBinarySearchTime(int nodeSize) { 36 | unsigned seed = std::clock(); 37 | std::default_random_engine engine(seed); 38 | shuffle(idx.begin(), idx.end(), engine); 39 | 40 | int start, endidx; 41 | double value; 42 | std::clock_t s, e; 43 | double tmp; 44 | int c; 45 | s = std::clock(); 46 | for (int i = 0, j = 0; i < end; i++, j++) { 47 | start = idx[i]; 48 | endidx = start + nodeSize - 1; 49 | 50 | value = start + j; 51 | c = BinarySearch(value, start, endidx); 52 | 53 | j &= nodeSize - 1; 54 | } 55 | e = std::clock(); 56 | tmp = (e - s) / static_cast(CLOCKS_PER_SEC); 57 | 58 | s = std::clock(); 59 | for (int i = 0, j = 0; i < end; i++, j++) { 60 | start = idx[i]; 61 | endidx = start + nodeSize - 1; 62 | value = start + j; 63 | j &= nodeSize - 1; 64 | } 65 | e = std::clock(); 66 | double tmp1 = (e - s) / static_cast(CLOCKS_PER_SEC); 67 | std::cout << nodeSize * 8 68 | << " bs average time:" << (tmp - tmp1) * kSecondToNanosecond / end 69 | << std::endl; 70 | } 71 | 72 | int main() { 73 | data = std::vector(kSize, 0); 74 | idx = std::vector(end); 75 | for (int i = 0; i < kSize; i++) { 76 | data[i] = i; 77 | } 78 | for (int i = 0; i < end; i++) { 79 | idx[i] = i * 64; 80 | } 81 | GetBinarySearchTime(64 / 8); 82 | GetBinarySearchTime(128 / 8); 83 | GetBinarySearchTime(256 / 8); 84 | GetBinarySearchTime(512 / 8); 85 | } 86 | -------------------------------------------------------------------------------- /src/unitTest/innerNodeTest/binary_search_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file binary_search_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-03 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include "../../experiment/dataset/lognormal_distribution.h" 12 | 
#include "../../include/nodes/innerNode/bs_model.h" 13 | #include "gtest/gtest.h" 14 | 15 | typedef double KeyType; 16 | typedef double ValueType; 17 | typedef std::pair DataType; 18 | 19 | std::vector initData; 20 | std::vector insertData; 21 | std::vector testInsert; 22 | 23 | const int kChildNum = 15; 24 | const int kTestMaxValue = kMaxValue; 25 | 26 | LognormalDataset logData(0.9); 27 | BSModel model(kChildNum); 28 | std::default_random_engine engine(time(0)); 29 | 30 | TEST(TestMultiTrain, MultiTrainBSModel) { 31 | std::uniform_real_distribution dis(0, kTestMaxValue); 32 | std::vector testTrainData; 33 | unsigned int seed = time(NULL); 34 | for (int i = 0; i < 9; i++) { 35 | int tmpSize = std::pow(10, i) - 1; 36 | testTrainData = std::vector(tmpSize); 37 | for (int j = 0; j < tmpSize; j++) { 38 | KeyType tmpKey = dis(engine); 39 | testTrainData[j] = {tmpKey, tmpKey}; 40 | } 41 | std::sort(testTrainData.begin(), testTrainData.end()); 42 | BSModel tmpModel(kChildNum); 43 | tmpModel.Train(0, testTrainData.size(), testTrainData); 44 | EXPECT_EQ(kChildNum, tmpModel.flagNumber & 0x00FFFFFF); 45 | for (int j = 0; j < 13; j++) { 46 | EXPECT_LE(tmpModel.keys[j], tmpModel.keys[j + 1]); 47 | } 48 | } 49 | } 50 | 51 | TEST(TestTrain, TrainBSModel) { 52 | logData.GenerateDataset(&initData, &insertData, &testInsert); 53 | model.Train(0, initData.size(), initData); 54 | EXPECT_EQ(kChildNum, model.flagNumber & 0x00FFFFFF); 55 | EXPECT_EQ(4, model.flagNumber >> 24); 56 | } 57 | 58 | TEST(TestPredictInitData, PredictInitData) { 59 | for (int i = 0; i < initData.size(); i++) { 60 | int p = model.Predict(initData[i].first); 61 | EXPECT_GE(p, 0); 62 | EXPECT_LT(p, kChildNum); 63 | } 64 | } 65 | 66 | TEST(TestPredictInsertData, PredictInsertData) { 67 | for (int i = 0; i < insertData.size(); i++) { 68 | int p = model.Predict(insertData[i].first); 69 | EXPECT_GE(p, 0); 70 | EXPECT_LT(p, kChildNum); 71 | } 72 | } 
-------------------------------------------------------------------------------- /src/experiment/dataset/ycsb.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file ycsb.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-22 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_DATASET_YCSB_H_ 12 | #define EXPERIMENT_DATASET_YCSB_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #include "./base_dataset.h" 24 | 25 | class YCSBDataset : public BaseDataset { 26 | public: 27 | explicit YCSBDataset(float initRatio) : BaseDataset(initRatio) {} 28 | 29 | void GenerateDataset(DataVecType *initDataset, DataVecType *insertDataset, 30 | DataVecType *testInsertQuery) { 31 | (*initDataset) = std::vector(kDatasetSize); 32 | int end = round(kTestSize * (1 - proportion)); 33 | (*testInsertQuery) = std::vector(end); 34 | 35 | DataVecType ds; 36 | std::ifstream inFile("../experiment/dataset/newycsbdata.csv", std::ios::in); 37 | if (!inFile) { 38 | std::cout << "open ycsb.csv failed" << std::endl; 39 | exit(1); 40 | } 41 | std::string line; 42 | while (getline(inFile, line)) { 43 | if (line.empty()) continue; 44 | std::istringstream sin(line); 45 | std::vector fields; 46 | std::string field; 47 | while (getline(sin, field, ',')) fields.push_back(field); 48 | std::string key = fields[0]; 49 | key.erase(0, 4); 50 | double k = stod(key); 51 | double v = k / 10; 52 | ds.push_back({k, v}); 53 | if (ds.size() == kDatasetSize + end) { 54 | break; 55 | } 56 | } 57 | 58 | std::sort(ds.begin(), ds.end()); 59 | for (int i = 0; i < kDatasetSize; i++) { 60 | (*initDataset)[i] = ds[i]; 61 | } 62 | double lastKey = ds[ds.size() - 1].first; 63 | if (ds.size() < kDatasetSize + end) { 64 | for (int i = 0; i < end; i++) { 65 | (*testInsertQuery)[i] = {lastKey + i, lastKey + i}; 66 | } 67 | } else { 68 | for (int i = 0; i < 
end; i++) { 69 | (*testInsertQuery)[i] = ds[i + kDatasetSize]; 70 | } 71 | } 72 | 73 | std::cout << "YCSB: init size:" << (*initDataset).size() 74 | << "\tWrite size:" << (*testInsertQuery).size() << std::endl; 75 | } 76 | }; 77 | 78 | #endif // EXPERIMENT_DATASET_YCSB_H_ 79 | -------------------------------------------------------------------------------- /src/include/func/delete_function.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file delete_function.h 3 | * @author Jiaoyi 4 | * @brief delete a record 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef FUNC_DELETE_FUNCTION_H_ 12 | #define FUNC_DELETE_FUNCTION_H_ 13 | 14 | #include 15 | 16 | #include "../carmi.h" 17 | 18 | template 20 | bool CARMI::Delete(const KeyType &key, 21 | size_t *cnt) { 22 | int idx = 0; // idx in the node array 23 | int type = root.flagNumber; 24 | while (1) { 25 | switch (type) { 26 | case PLR_ROOT_NODE: 27 | // Case 0: this node is the plr root node 28 | // use the plr root node to find the index of the next node 29 | idx = root.PLRType::model.Predict(key); 30 | break; 31 | case LR_INNER_NODE: 32 | // Case 1: this node is the lr inner node 33 | // use the predict function of lr inner node to obtain the index of the 34 | // next node 35 | idx = node.nodeArray[idx].lr.Predict(key); 36 | break; 37 | case PLR_INNER_NODE: 38 | // Case 2: this node is the plr inner node 39 | // use the predict function of plr inner node to obtain the index of the 40 | // next node 41 | idx = node.nodeArray[idx].plr.Predict(key); 42 | break; 43 | case HIS_INNER_NODE: 44 | // Case 3: this node is the his inner node 45 | // use the predict function of his inner node to obtain the index of the 46 | // next node 47 | idx = node.nodeArray[idx].his.Predict(key); 48 | break; 49 | case BS_INNER_NODE: 50 | // Case 4: this node is the bs inner node 51 | // use the predict function of bs inner node to obtain 
the index of the 52 | // next node 53 | idx = node.nodeArray[idx].bs.Predict(key); 54 | break; 55 | case ARRAY_LEAF_NODE: { 56 | // Case 5: this node is the cache-friendly array leaf node 57 | // Delete the data point in the cf leaf node 58 | return node.nodeArray[idx].cfArray.Delete(key, cnt, &data); 59 | } 60 | } 61 | 62 | type = node.nodeArray[idx].lr.flagNumber >> 24; 63 | } 64 | } 65 | 66 | #endif // FUNC_DELETE_FUNCTION_H_ 67 | -------------------------------------------------------------------------------- /src/unitTest/innerNodeTest/linear_regression_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file linear_regression_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-03 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | 13 | #include "../../experiment/dataset/lognormal_distribution.h" 14 | #include "../../include/nodes/innerNode/lr_model.h" 15 | #include "gtest/gtest.h" 16 | 17 | typedef double KeyType; 18 | typedef double ValueType; 19 | typedef std::pair DataType; 20 | 21 | std::vector initData; 22 | std::vector insertData; 23 | std::vector testInsert; 24 | 25 | const int kChildNum = 512; 26 | const int kTestMaxValue = kMaxValue; 27 | 28 | LognormalDataset logData(0.9); 29 | LRModel model(kChildNum); 30 | std::default_random_engine engine(time(0)); 31 | 32 | TEST(TestMultiTrain, MultiTrainLRModel) { 33 | std::vector testTrainData; 34 | std::uniform_real_distribution dis(0, kTestMaxValue); 35 | for (int i = 0; i < 9; i++) { 36 | int tmpSize = std::pow(10, i) - 1; 37 | std::cout << "Start test size: " << tmpSize << std::endl; 38 | testTrainData = std::vector(tmpSize); 39 | for (int j = 0; j < tmpSize; j++) { 40 | KeyType tmpKey = dis(engine); 41 | testTrainData[j] = {tmpKey, tmpKey}; 42 | } 43 | std::sort(testTrainData.begin(), testTrainData.end()); 44 | std::cout << "Dataset is ready, start to test." 
<< std::endl; 45 | LRModel tmpModel(kChildNum); 46 | tmpModel.Train(0, testTrainData.size(), testTrainData); 47 | EXPECT_EQ(kChildNum, tmpModel.flagNumber & 0x00FFFFFF); 48 | EXPECT_GE(tmpModel.slope, 0); 49 | } 50 | } 51 | 52 | TEST(TestTrain, TrainLRModel) { 53 | logData.GenerateDataset(&initData, &insertData, &testInsert); 54 | model.Train(0, initData.size(), initData); 55 | EXPECT_EQ(kChildNum, model.flagNumber & 0x00FFFFFF); 56 | EXPECT_EQ(1, model.flagNumber >> 24); 57 | } 58 | 59 | TEST(TestPredictInitData, PredictInitData) { 60 | for (int i = 0; i < initData.size(); i++) { 61 | int p = model.Predict(initData[i].first); 62 | EXPECT_GE(p, 0); 63 | EXPECT_LT(p, kChildNum); 64 | } 65 | } 66 | 67 | TEST(TestPredictInsertData, PredictInsertData) { 68 | for (int i = 0; i < insertData.size(); i++) { 69 | int p = model.Predict(insertData[i].first); 70 | EXPECT_GE(p, 0); 71 | EXPECT_LT(p, kChildNum); 72 | } 73 | } -------------------------------------------------------------------------------- /src/unitTest/innerNodeTest/piecewise_lr_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file piecewise_lr_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-03 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | 13 | #include "../../experiment/dataset/lognormal_distribution.h" 14 | #include "../../include/nodes/innerNode/plr_model.h" 15 | #include "gtest/gtest.h" 16 | 17 | typedef double KeyType; 18 | typedef double ValueType; 19 | typedef std::pair DataType; 20 | 21 | std::vector initData; 22 | std::vector insertData; 23 | std::vector testInsert; 24 | 25 | const int kChildNum = 512; 26 | const int kTestMaxValue = kMaxValue; 27 | 28 | LognormalDataset logData(0.9); 29 | PLRModel model(kChildNum); 30 | std::default_random_engine engine(time(0)); 31 | 32 | TEST(TestMultiTrain, MultiTrainPLRModel) { 33 | std::vector testTrainData; 34 | std::uniform_real_distribution 
dis(0, kTestMaxValue); 35 | for (int i = 0; i < 9; i++) { 36 | int tmpSize = std::pow(10, i) - 1; 37 | std::cout << "Start test size: " << tmpSize << std::endl; 38 | testTrainData = std::vector(tmpSize); 39 | for (int j = 0; j < tmpSize; j++) { 40 | KeyType tmpKey = dis(engine); 41 | testTrainData[j] = {tmpKey, tmpKey}; 42 | } 43 | std::sort(testTrainData.begin(), testTrainData.end()); 44 | std::cout << "Dataset is ready, start to test." << std::endl; 45 | PLRModel tmpModel(kChildNum); 46 | tmpModel.Train(0, testTrainData.size(), testTrainData); 47 | EXPECT_EQ(kChildNum, tmpModel.flagNumber & 0x00FFFFFF); 48 | for (int j = 0; j < 5; j++) { 49 | EXPECT_LE(tmpModel.index[j], tmpModel.index[j + 1]); 50 | } 51 | for (int j = 0; j < 7; j++) { 52 | EXPECT_LT(tmpModel.keys[j], tmpModel.keys[j + 1]); 53 | } 54 | std::cout << "Subtest " << i << " over!" << std::endl; 55 | } 56 | } 57 | 58 | TEST(TestTrain, TrainPLRModel) { 59 | logData.GenerateDataset(&initData, &insertData, &testInsert); 60 | model.Train(0, initData.size(), initData); 61 | EXPECT_EQ(kChildNum, model.flagNumber & 0x00FFFFFF); 62 | EXPECT_EQ(2, model.flagNumber >> 24); 63 | } 64 | 65 | TEST(TestPredictInitData, PredictInitData) { 66 | for (int i = 0; i < initData.size(); i++) { 67 | int p = model.Predict(initData[i].first); 68 | EXPECT_GE(p, 0); 69 | EXPECT_LT(p, kChildNum); 70 | } 71 | } 72 | 73 | TEST(TestPredictInsertData, PredictInsertData) { 74 | for (int i = 0; i < insertData.size(); i++) { 75 | int p = model.Predict(insertData[i].first); 76 | EXPECT_GE(p, 0); 77 | EXPECT_LT(p, kChildNum); 78 | } 79 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.associations": { 3 | "random": "cpp", 4 | "array": "cpp", 5 | "memory": "cpp", 6 | "tuple": "cpp", 7 | "type_traits": "cpp", 8 | "utility": "cpp", 9 | "xmemory0": "cpp", 10 | "xstddef": "cpp", 11 | 
"xtr1common": "cpp", 12 | "xutility": "cpp", 13 | "algorithm": "cpp", 14 | "chrono": "cpp", 15 | "cmath": "cpp", 16 | "cstddef": "cpp", 17 | "cstdint": "cpp", 18 | "cstdio": "cpp", 19 | "cstdlib": "cpp", 20 | "cstring": "cpp", 21 | "cwchar": "cpp", 22 | "exception": "cpp", 23 | "functional": "cpp", 24 | "initializer_list": "cpp", 25 | "ios": "cpp", 26 | "iosfwd": "cpp", 27 | "iostream": "cpp", 28 | "istream": "cpp", 29 | "iterator": "cpp", 30 | "limits": "cpp", 31 | "list": "cpp", 32 | "map": "cpp", 33 | "new": "cpp", 34 | "ostream": "cpp", 35 | "ratio": "cpp", 36 | "set": "cpp", 37 | "sstream": "cpp", 38 | "stdexcept": "cpp", 39 | "streambuf": "cpp", 40 | "string": "cpp", 41 | "system_error": "cpp", 42 | "typeinfo": "cpp", 43 | "unordered_set": "cpp", 44 | "vector": "cpp", 45 | "xfacet": "cpp", 46 | "xfunctional": "cpp", 47 | "xhash": "cpp", 48 | "xiosbase": "cpp", 49 | "xlocale": "cpp", 50 | "xlocinfo": "cpp", 51 | "xlocnum": "cpp", 52 | "xmemory": "cpp", 53 | "xstring": "cpp", 54 | "xtree": "cpp", 55 | "ctime": "cpp", 56 | "iomanip": "cpp", 57 | "stack": "cpp", 58 | "__locale": "cpp", 59 | "__bit_reference": "cpp", 60 | "__split_buffer": "cpp", 61 | "filesystem": "cpp", 62 | "deque": "cpp", 63 | "__functional_base": "cpp", 64 | "__functional_base_03": "cpp", 65 | "__hash_table": "cpp", 66 | "__tree": "cpp", 67 | "__tuple": "cpp", 68 | "any": "cpp", 69 | "__node_handle": "cpp", 70 | "atomic": "cpp", 71 | "*.tcc": "cpp", 72 | "cctype": "cpp", 73 | "clocale": "cpp", 74 | "cstdarg": "cpp", 75 | "cwctype": "cpp", 76 | "unordered_map": "cpp", 77 | "optional": "cpp", 78 | "string_view": "cpp", 79 | "fstream": "cpp", 80 | "numeric": "cpp", 81 | "bit": "cpp", 82 | "memory_resource": "cpp", 83 | "variant": "cpp", 84 | "codecvt": "cpp", 85 | "bitset": "cpp" 86 | }, 87 | "C_Cpp.default.configurationProvider": "go2sh.cmake-integration" 88 | } -------------------------------------------------------------------------------- /src/experiment/functions.h: 
-------------------------------------------------------------------------------- 1 | /** 2 | * @file functions.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-04-07 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef EXPERIMENT_FUNCTIONS_H_ 12 | #define EXPERIMENT_FUNCTIONS_H_ 13 | 14 | #include 15 | 16 | #include "../include/carmi_external_map.h" 17 | #include "../include/carmi_map.h" 18 | #include "./workload/workloads.h" 19 | #include "./workload/workloads_external.h" 20 | #include "dataset/exponential_distribution.h" 21 | #include "dataset/lognormal_distribution.h" 22 | #include "dataset/longitudes.h" 23 | #include "dataset/longlat.h" 24 | #include "dataset/normal_distribution.h" 25 | #include "dataset/osmc.h" 26 | #include "dataset/uniform_distribution.h" 27 | #include "dataset/ycsb.h" 28 | 29 | /** 30 | * @brief prepare query workloads 31 | * 32 | * @param[in] Ratio the ratio of find queries 33 | * @param[in] findQueryset 34 | * @param[in] insertDataset 35 | * @param[inout] findQuery 36 | * @param[inout] insertQuery 37 | * @param[inout] index 38 | */ 39 | void InitTestSet(double Ratio, const DataVecType &findQueryset, 40 | const DataVecType &insertDataset, DataVecType *findQuery, 41 | DataVecType *insertQuery, std::vector *index); 42 | 43 | /** 44 | * @brief print the average time of the workload 45 | * 46 | * @param[in] time 47 | */ 48 | void PrintAvgTime(double time); 49 | 50 | /** 51 | * @brief the function of using CARMI 52 | * 53 | * @param[in] isZipfian whether to use zipfian access during the test 54 | * @param[in] initRatio the workload type 55 | * @param[in] rate the weight of space 56 | * @param[in] length the length of range scan 57 | * @param[in] initDataset 58 | * @param[in] insertDataset 59 | * @param[in] testInsertQuery 60 | */ 61 | void CoreCARMI(bool isZipfian, double initRatio, double rate, 62 | const std::vector &length, const DataVecType &initDataset, 63 | const DataVecType &insertDataset, 64 | 
const DataVecType &testInsertQuery); 65 | 66 | /** 67 | * @brief the function of using external CARMI 68 | * 69 | * @param[in] isZipfian whether to use zipfian access during the test 70 | * @param[in] initRatio the workload type 71 | * @param[in] rate the weight of space 72 | * @param[in] length the length of range scan 73 | * @param[in] initDataset 74 | * @param[in] testInsertQuery 75 | */ 76 | void CoreExternalCARMI(bool isZipfian, double initRatio, double rate, 77 | const std::vector &length, 78 | const DataVecType &initDataset, 79 | const DataVecType &testInsertQuery); 80 | 81 | void mainSynthetic(double initRatio, const std::vector &length); 82 | void mainYCSB(double initRatio, const std::vector &length); 83 | void mainMap(double initRatio, const std::vector &length); 84 | void mainExperiment(); 85 | 86 | #endif // EXPERIMENT_FUNCTIONS_H_ 87 | -------------------------------------------------------------------------------- /src/unitTest/leafNodeTest/external_array_test.cpp: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @file external_array_test.cpp 4 | * @author Jiaoyi 5 | * @brief 6 | * @version 0.1 7 | * @date 2021-11-04 8 | * 9 | * @copyright Copyright (c) 2021 10 | * 11 | */ 12 | #include 13 | 14 | #include "../../include/nodes/leafNode/external_array_type.h" 15 | #include "gtest/gtest.h" 16 | 17 | typedef double KeyType; 18 | typedef double ValueType; 19 | 20 | const int kTestMaxValue = 10000; 21 | unsigned int seed = time(NULL); 22 | std::default_random_engine engine(time(0)); 23 | std::uniform_real_distribution dis(0, kTestMaxValue); 24 | 25 | template 26 | class DataType { 27 | public: 28 | typedef ValueType ValueType_; 29 | DataType() { 30 | k = 0; 31 | v = 0; 32 | } 33 | explicit DataType(KeyType key, ValueType_ value) { 34 | k = key; 35 | v = value; 36 | } 37 | const KeyType& key() const { return k; } 38 | const ValueType_& data() const { return v; } 39 | 40 | bool operator<(const DataType& a) 
const { 41 | if (k == a.k) { 42 | return v < a.v; 43 | } 44 | return k < a.k; 45 | } 46 | 47 | KeyType k; 48 | ValueType_ v; 49 | }; 50 | 51 | TEST(TestTrain, TrainExternalArrayNode) { 52 | for (int i = 0; i < carmi_params::kMaxLeafNodeSizeExternal; i++) { 53 | std::vector> testTrainData(i); 54 | ExternalArray externalNode; 55 | for (int j = 0; j < i; j++) { 56 | KeyType tmpKey = dis(engine); 57 | testTrainData[j] = {tmpKey, tmpKey * 10}; 58 | } 59 | std::sort(testTrainData.begin(), testTrainData.end()); 60 | externalNode.Train(testTrainData, 0, i); 61 | EXPECT_GE(externalNode.error, 0); 62 | } 63 | } 64 | 65 | TEST(TestFind, ExternalArrayNodeFind) { 66 | for (int i = 0; i < carmi_params::kMaxLeafNodeSizeExternal; i++) { 67 | std::vector> testTrainData(i); 68 | ExternalArray externalNode; 69 | KeyType* externalDataset = new KeyType[i * 2]; 70 | for (int j = 0, k = 0; j < i; j++, k += 2) { 71 | KeyType tmpKey = dis(engine); 72 | testTrainData[j] = {tmpKey, tmpKey * 10}; 73 | } 74 | std::sort(testTrainData.begin(), testTrainData.end()); 75 | for (int j = 0, k = 0; j < i; j++, k += 2) { 76 | *(externalDataset + k) = testTrainData[j].first; 77 | *(externalDataset + k + 1) = testTrainData[j].second; 78 | } 79 | externalNode.m_left = 0; 80 | externalNode.Train(testTrainData, 0, i); 81 | for (int j = 0; j < i; j++) { 82 | int currslot = 83 | externalNode.Find(testTrainData[j].first, 16, externalDataset); 84 | KeyType res = testTrainData[currslot].first; 85 | if (res != testTrainData[j].first) { 86 | currslot = 87 | externalNode.Find(testTrainData[j].first, 16, externalDataset); 88 | } 89 | ASSERT_EQ(res, testTrainData[j].first); 90 | } 91 | } 92 | } -------------------------------------------------------------------------------- /src/include/construct/dp.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file dp.h 3 | * @author Jiaoyi 4 | * @brief the main function of dynamic programming algorithm 5 | * @version 3.0 6 | * 
@date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_DP_H_ 12 | #define CONSTRUCT_DP_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #include "../params.h" 21 | #include "./dp_inner.h" 22 | #include "./dp_leaf.h" 23 | #include "./greedy.h" 24 | #include "./structures.h" 25 | 26 | template 28 | NodeCost CARMI::DP(const DataRange &range) { 29 | NodeCost nodeCost; 30 | // Case 1: the dataset is empty, construct an empty node and return directly 31 | if (range.initRange.size == 0) { 32 | nodeCost = emptyCost; 33 | // Construct an empty leaf node when the sub-dataset is empty and store 34 | // it in the structMap. The type of this leaf node depends on the isPrimary 35 | // parameter, if it is true, construct an external array leaf node, 36 | // otherwise, construct a cache-friendly array leaf node. 37 | BaseNode optimal_node_struct; 38 | if (isPrimary) { 39 | optimal_node_struct.externalArray = 40 | ExternalArray(); 41 | } else { 42 | optimal_node_struct.cfArray = 43 | CFArrayType(); 44 | } 45 | structMap.insert({range.initRange, optimal_node_struct}); 46 | return nodeCost; 47 | } 48 | 49 | // Case 2: this sub-dataset has been solved before, return the minimum cost 50 | // directly 51 | auto it = COST.find(range.initRange); 52 | if (it != COST.end()) { 53 | nodeCost = it->second; 54 | return nodeCost; 55 | } 56 | 57 | double minRatio = 0.95; 58 | // record the maximum capacity of the leaf node 59 | int maxStoredNum = 60 | CFArrayType::kMaxLeafCapacity; 61 | if (isPrimary) { 62 | maxStoredNum = carmi_params::kMaxLeafNodeSizeExternal; 63 | } 64 | if (range.initRange.size + range.insertRange.size <= 65 | minRatio * maxStoredNum) { 66 | // Case 3: if the size is smaller than the threshold, directly construct a 67 | // leaf node 68 | return DPLeaf(range); 69 | } else if (range.initRange.size + range.insertRange.size > maxStoredNum) { 70 | // Case 4: if the size is larger than the maximum capacity of a 
leaf node, 71 | // directly construct an inner node 72 | return DPInner(range); 73 | } else { 74 | // Case 5: construct a leaf node and an inner node respectively, and choose 75 | // the setting with a lower cost 76 | auto resInner = DPInner(range); 77 | auto resLeaf = DPLeaf(range); 78 | if (resInner.cost > resLeaf.cost) 79 | return resLeaf; 80 | else 81 | return resInner; 82 | } 83 | } 84 | 85 | #endif // CONSTRUCT_DP_H_ 86 | -------------------------------------------------------------------------------- /src/profiler/inner_node_time.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file inner_node_time.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-05-25 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "../include/carmi.h" 19 | #include "../include/nodes/innerNode/bs_model.h" 20 | #include "../include/nodes/innerNode/his_model.h" 21 | #include "../include/nodes/innerNode/lr_model.h" 22 | #include "../include/nodes/innerNode/plr_model.h" 23 | 24 | const int kSize = 1024; 25 | const float kSecondToNanosecond = 1000000000.0; 26 | const int kModelNumber = 100000000; 27 | const int block = 512; 28 | const int end = kModelNumber / block; 29 | std::vector> data(kSize); 30 | std::vector idx(end); 31 | 32 | template 33 | double GetNodePredictTime() { 34 | std::vector node(kModelNumber, TYPE(20)); 35 | node[0].Train(0, kSize, data); 36 | for (int i = 0; i < end; i++) { 37 | node[i * block] = node[0]; 38 | } 39 | std::vector keys(kSize); 40 | for (int i = 0; i < kSize; i++) { 41 | keys[i] = i; 42 | } 43 | 44 | std::default_random_engine engine(std::clock()); 45 | shuffle(idx.begin(), idx.end(), engine); 46 | shuffle(keys.begin(), keys.end(), engine); 47 | 48 | int start, endidx, tmpIdx, type, key; 49 | int nodeSize = 8; 50 | double res; 51 | std::clock_t s, e; 52 | double tmp, tmp1 = 0; 
53 | 54 | std::uniform_int_distribution dis_idx(0, end); 55 | std::uniform_int_distribution dis_key(0, kSize); 56 | s = std::clock(); 57 | for (int i = 0; i < end; i++) { 58 | tmpIdx = idx[dis_idx(engine)]; 59 | key = keys[dis_key(engine)]; 60 | res = node[tmpIdx].Predict(key); 61 | } 62 | e = std::clock(); 63 | tmp = (e - s) / static_cast(CLOCKS_PER_SEC); 64 | s = std::clock(); 65 | for (int i = 0; i < end; i++) { 66 | tmpIdx = idx[dis_idx(engine)]; 67 | key = keys[dis_key(engine)]; 68 | res = node[tmpIdx].flagNumber + node[tmpIdx].childLeft; 69 | } 70 | e = std::clock(); 71 | tmp1 = (e - s) / static_cast(CLOCKS_PER_SEC); 72 | return (tmp - tmp1) * kSecondToNanosecond / end; 73 | } 74 | 75 | int main() { 76 | for (int i = 0; i < kSize; i++) { 77 | data[i] = {i, i * 10}; 78 | } 79 | for (int i = 0; i < end; i++) { 80 | idx[i] = i * block; 81 | } 82 | double lr = 0, plr = 0, bs = 0, his = 0; 83 | float times = 1.0; 84 | for (int i = 0; i < times; i++) { 85 | lr += GetNodePredictTime>(); 86 | plr += GetNodePredictTime>(); 87 | his += GetNodePredictTime>(); 88 | bs += GetNodePredictTime>(); 89 | } 90 | 91 | std::cout << "lr average time:" << lr / times << std::endl; 92 | std::cout << "plr average time:" << plr / times << std::endl; 93 | std::cout << "his average time:" << his / times << std::endl; 94 | std::cout << "bs average time:" << bs / times << std::endl; 95 | } 96 | -------------------------------------------------------------------------------- /src/unitTest/innerNodeTest/histogram_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file histogram_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-03 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | 13 | #include "../../experiment/dataset/lognormal_distribution.h" 14 | #include "../../include/nodes/innerNode/his_model.h" 15 | #include "gtest/gtest.h" 16 | 17 | typedef double KeyType; 18 | typedef 
double ValueType; 19 | typedef std::pair DataType; 20 | 21 | std::vector initData; 22 | std::vector insertData; 23 | std::vector testInsert; 24 | 25 | const int kChildNum = 256; 26 | const int kTestMaxValue = kMaxValue; 27 | 28 | LognormalDataset logData(0.9); 29 | HisModel model(kChildNum); 30 | std::default_random_engine engine(time(0)); 31 | 32 | TEST(TestMultiTrain, MultiTrainHisModel) { 33 | std::vector testTrainData; 34 | std::uniform_real_distribution dis(0, kTestMaxValue); 35 | for (int i = 0; i < 9; i++) { 36 | int tmpSize = std::pow(10, i) - 1; 37 | std::cout << "Start test size: " << tmpSize << std::endl; 38 | testTrainData = std::vector(tmpSize); 39 | for (int j = 0; j < tmpSize; j++) { 40 | KeyType tmpKey = dis(engine); 41 | testTrainData[j] = {tmpKey, tmpKey}; 42 | } 43 | std::sort(testTrainData.begin(), testTrainData.end()); 44 | HisModel tmpModel(kChildNum); 45 | tmpModel.Train(0, testTrainData.size(), testTrainData); 46 | EXPECT_EQ(kChildNum, tmpModel.flagNumber & 0x00FFFFFF); 47 | EXPECT_NE(0, tmpModel.divisor); 48 | for (int j = 0; j < 16; j++) { 49 | EXPECT_GE(tmpModel.base[j], 0); 50 | EXPECT_LT(tmpModel.base[j], kChildNum); 51 | } 52 | for (int j = 0; j < 255; j++) { 53 | int l = tmpModel.offset[(j >> 4)] >> (15 - (j & 0x0000000F)); 54 | l = (l & 0x55555555) + ((l >> 1) & 0x55555555); 55 | l = (l & 0x33333333) + ((l >> 2) & 0x33333333); 56 | l = (l & 0x0f0f0f0f) + ((l >> 4) & 0x0f0f0f0f); 57 | l = (l & 0x00ff00ff) + ((l >> 8) & 0x00ff00ff); 58 | l += tmpModel.base[(j >> 4)]; 59 | 60 | int r = tmpModel.offset[((j + 1) >> 4)] >> (15 - ((j + 1) & 0x0000000F)); 61 | r = (r & 0x55555555) + ((r >> 1) & 0x55555555); 62 | r = (r & 0x33333333) + ((r >> 2) & 0x33333333); 63 | r = (r & 0x0f0f0f0f) + ((r >> 4) & 0x0f0f0f0f); 64 | r = (r & 0x00ff00ff) + ((r >> 8) & 0x00ff00ff); 65 | r += tmpModel.base[((j + 1) >> 4)]; 66 | EXPECT_LE(l, r); 67 | } 68 | std::cout << "Subtest " << i << " over!" 
<< std::endl; 69 | } 70 | } 71 | 72 | TEST(TestTrain, TrainHisModel) { 73 | logData.GenerateDataset(&initData, &insertData, &testInsert); 74 | model.Train(0, initData.size(), initData); 75 | EXPECT_EQ(kChildNum, model.flagNumber & 0x00FFFFFF); 76 | EXPECT_EQ(3, model.flagNumber >> 24); 77 | } 78 | 79 | TEST(TestPredictInitData, PredictInitData) { 80 | for (int i = 0; i < initData.size(); i++) { 81 | int p = model.Predict(initData[i].first); 82 | EXPECT_GE(p, 0); 83 | EXPECT_LT(p, kChildNum); 84 | } 85 | } 86 | 87 | TEST(TestPredictInsertData, PredictInsertData) { 88 | for (int i = 0; i < insertData.size(); i++) { 89 | int p = model.Predict(insertData[i].first); 90 | EXPECT_GE(p, 0); 91 | EXPECT_LT(p, kChildNum); 92 | } 93 | } -------------------------------------------------------------------------------- /src/experiment/dataset/base_dataset.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file base_dataset.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-26 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #ifndef EXPERIMENT_DATASET_BASE_DATASET_H_ 13 | #define EXPERIMENT_DATASET_BASE_DATASET_H_ 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | #include "../experiment_params.h" 23 | 24 | class BaseDataset { 25 | public: 26 | float proportion; 27 | 28 | explicit BaseDataset(float init) { proportion = init; } 29 | virtual void GenerateDataset(DataVecType *initDataset, 30 | DataVecType *insertDataset, 31 | DataVecType *testInsertQuery) = 0; 32 | template 33 | void SplitInitTest(DistributionType &distribution, DataVecType *initDataset, 34 | DataVecType *insertDataset, DataVecType *testInsertQuery) { 35 | (*initDataset) = std::vector(kDatasetSize); 36 | int end = round(kTestSize * (1 - proportion)); 37 | (*testInsertQuery) = std::vector(end); 38 | std::default_random_engine generator; 39 | 40 | // generate initDataset 41 | for 
(int i = 0; i < kDatasetSize; i++) { 42 | double tmp = distribution(generator) * kMaxValue; 43 | (*initDataset)[i] = {tmp, tmp * 10}; 44 | } 45 | 46 | // generate testInsertQuery 47 | for (int i = 0; i < end; i++) { 48 | double tmp = distribution(generator) * kMaxValue; 49 | (*testInsertQuery)[i] = {tmp, tmp * 10}; 50 | } 51 | 52 | std::sort(initDataset->begin(), initDataset->end()); 53 | // generate insertQuery 54 | if (testInsertQuery->size() > 0) { 55 | for (int i = 10; i < kDatasetSize - 1; i += 10) { 56 | double tmp = 57 | ((*initDataset)[i].first + (*initDataset)[i + 1].first) / 2; 58 | (*insertDataset).push_back({tmp, tmp * 10}); 59 | } 60 | } 61 | std::sort(insertDataset->begin(), insertDataset->end()); 62 | 63 | std::cout << "generate dataset over! init size:" << initDataset->size() 64 | << "\tWrite size:" << testInsertQuery->size() << std::endl; 65 | } 66 | 67 | void SplitInitTest(DataVecType *dataset, DataVecType *initDataset, 68 | DataVecType *insertDataset, DataVecType *testInsertQuery) { 69 | (*initDataset) = std::vector(kDatasetSize); 70 | int end = round(kTestSize * (1 - proportion)); 71 | (*testInsertQuery) = std::vector(end); 72 | 73 | unsigned seed = std::clock(); 74 | std::default_random_engine engine(seed); 75 | shuffle((*dataset).begin(), (*dataset).end(), engine); 76 | 77 | int i = 0; 78 | for (int j = 0; i < end; i++, j++) { 79 | (*testInsertQuery)[j] = (*dataset)[i]; 80 | } 81 | end = (*dataset).size(); 82 | for (int j = 0; i < end; i++, j++) { 83 | (*initDataset)[j] = (*dataset)[i]; 84 | } 85 | 86 | std::sort(initDataset->begin(), initDataset->end()); 87 | if (testInsertQuery->size() > 0) { 88 | for (int i = 10; i < kDatasetSize - 1; i += 10) { 89 | double tmp = 90 | ((*initDataset)[i].first + (*initDataset)[i + 1].first) / 2; 91 | (*insertDataset).push_back({tmp, tmp * 10}); 92 | } 93 | } 94 | std::sort(insertDataset->begin(), insertDataset->end()); 95 | 96 | std::cout << " init size:" << (*initDataset).size() 97 | << "\tWrite size:" << 
(*testInsertQuery).size() << std::endl; 98 | } 99 | }; 100 | 101 | #endif // EXPERIMENT_DATASET_BASE_DATASET_H_ 102 | -------------------------------------------------------------------------------- /src/include/nodes/rootNode/root_nodes.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file root_nodes.h 3 | * @author Jiaoyi 4 | * @brief the details of root nodes 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef NODES_ROOTNODE_ROOT_NODES_H_ 12 | #define NODES_ROOTNODE_ROOT_NODES_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #include "../../construct/structures.h" 19 | #include "../../params.h" 20 | #include "trainModel/linear_regression.h" 21 | #include "trainModel/piecewiseLR.h" 22 | #include "trainModel/prefetch_plr.h" 23 | 24 | /** 25 | * @brief piecewise linear regression root node 26 | * 27 | * The piecewise linear regression model with five segments can allocate data 28 | * points more evenly. 29 | * 30 | * Since the root node is always in the cache, we do not limit its size here. We 31 | * use a five-segment P. LR model, occupying 76 bytes. In addition, to support 32 | * the prefetch function, we add a prefetch prediction model to speed up the 33 | * process of accessing a data point. 34 | * 35 | * @tparam DataVectorType the type of dataset 36 | * @tparam KeyType the type of the given key value 37 | */ 38 | template 39 | class PLRType { 40 | public: 41 | // *** Constructed Types and Constructor 42 | 43 | /** 44 | * @brief The type of the model: piecewise linear regression 45 | */ 46 | typedef PiecewiseLR ModelType; 47 | 48 | /** 49 | * @brief Construct a new PLRType object with the default constructor 50 | */ 51 | PLRType() = default; 52 | 53 | /** 54 | * @brief Construct a new PLRType object and train the plr model with the 55 | * given dataset. 
56 | * 57 | * PLR root node uses a piecewise linear regression model to predict the index 58 | * of the next node. When finding the position of the data point, we first 59 | * find the first breakpoint greater than or equal to the given key value, and 60 | * then use the corresponding model parameters for the calculation and 61 | * boundary processing. 62 | * 63 | * @param[in] childNum the number of the child nodes in the root node 64 | * @param[in] dataset the dataset used to train the plr model of the root node 65 | */ 66 | PLRType(int childNum, const DataVectorType &dataset) { 67 | flagNumber = PLR_ROOT_NODE; 68 | model.maxChildIdx = std::max(2, childNum - 1); 69 | model.Train(dataset); 70 | } 71 | 72 | public: 73 | // *** Static Constant Options and Values of P. LR Root Node Objects 74 | 75 | /** 76 | * @brief The time cost of the plr root node. 77 | */ 78 | static constexpr double kTimeCost = carmi_params::kPLRRootTime; 79 | 80 | public: 81 | //*** Public Data Members of P. LR Root Node Objects 82 | 83 | /** 84 | * @brief the main root model: piecewise linear regression model with five 85 | * segments to allocate the dataset to the child nodes. 86 | * 87 | * We use this model to predict the index of the next node, and use the raw 88 | * output of this model (leaf index before rounding down) as the input to the 89 | * prefetch prediction model. (72 bytes) 90 | */ 91 | ModelType model; 92 | 93 | /** 94 | * @brief the prefetch prediction model. 95 | * 96 | * This model is also a piecewise linear regression model, which uses the 97 | * output of the main root model to compute a block index. In this model, 98 | * the slope and intercept of each segment are forced to be integers, so that 99 | * within each segment, each leaf node is mapped to the same number of data 100 | * blocks. 
101 | */ 102 | PrefetchPLR fetch_model; 103 | 104 | /** 105 | * @brief the type of the root node: PLR_ROOT_NODE (4 bytes) 106 | */ 107 | int flagNumber; 108 | }; 109 | 110 | #endif // NODES_ROOTNODE_ROOT_NODES_H_ 111 | -------------------------------------------------------------------------------- /src/include/memoryLayout/empty_block.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file empty_block.h 3 | * @author Jiaoyi 4 | * @brief the class of empty memory blocks 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef MEMORYLAYOUT_EMPTY_BLOCK_H_ 12 | #define MEMORYLAYOUT_EMPTY_BLOCK_H_ 13 | 14 | #include 15 | #include 16 | 17 | /** 18 | * @brief Basic class used to manage empty memory blocks. 19 | * 20 | * This class is used to manage empty memory blocks with the size m_width. 21 | * In CARMI, this class can speed up the process of memory allocation, which 22 | * only needs to return the first element in m_block. 23 | * 24 | * This class is used as a member type of the vector of the DataArrayStructure 25 | * in data_array.h. Users can customize the granularity of the width of the 26 | * empty memory blocks according to the node type they implement. For example, 27 | * these can be 1~7 for the CF array leaf node. At the same time, it can also 28 | * have a coarser granularity:2, 4, 8, ..., 512, 1024, 2048. 29 | */ 30 | class EmptyMemoryBlock { 31 | public: 32 | //*** Constructor 33 | 34 | /** 35 | * @brief Construct a new EmptyMemoryBlock object, set the width of the empty 36 | * memory block 37 | * 38 | * @param[in] width the width of this type of empty memory block 39 | */ 40 | explicit EmptyMemoryBlock(int width) { m_width = width; } 41 | 42 | public: 43 | //*** Public Functions of EmptyMemoryBlock Objects 44 | 45 | /** 46 | * @brief Allocate a block of empty memory. 
If the set of memory blocks of 47 | * size m_width has empty blocks available for allocation, which means there 48 | * are still elements in m_block, then return the empty memory block index 49 | * with the smallest index among all the empty blocks. If there are no empty 50 | * blocks, allocation fails, and this function returns -1. 51 | * 52 | * @return int: if allocation is successful, return the smallest element in 53 | * m_block, otherwise return -1. 54 | * @retval -1 allocation fails 55 | */ 56 | int Allocate() { 57 | // Case 1: if the set is empty, allocation fails 58 | if (m_block.empty()) { 59 | return -1; 60 | } 61 | // Case 2: allocation succeeds, return the smallest element of m_block and 62 | // erase this block from the empty set 63 | int res = *m_block.begin(); 64 | m_block.erase(m_block.begin()); 65 | return res; 66 | } 67 | 68 | /** 69 | * @brief add the corresponding empty blocks (insert the left index of the 70 | * block into the m_block set) 71 | * 72 | * @param[in] idx the index of blocks 73 | * @param[in] size the size of blocks 74 | * @return int: the size of the empty block after this action 75 | */ 76 | int AddBlock(int idx, int size) { 77 | if (size < m_width) return -1; 78 | int newIdx = idx + size - m_width; 79 | m_block.insert(newIdx); 80 | return size - m_width; 81 | } 82 | 83 | /** 84 | * @brief check whether the memory block with the beginning index idx is 85 | * empty, return the check result 86 | * 87 | * @param[in] idx the beginning index of this block 88 | * @retval true this block is empty 89 | * @retval false this block is not empty and has been allocated 90 | */ 91 | bool IsEmpty(int idx) { 92 | std::set::iterator it = m_block.find(idx); 93 | if (it != m_block.end()) 94 | return true; 95 | else 96 | return false; 97 | } 98 | 99 | public: 100 | //*** Public Data Members of EmptyMemoryBlock Objects 101 | 102 | /** 103 | * @brief used to store the beginning indexes of all empty memory blocks with 104 | * m_width 105 | */ 106 | 
std::set m_block; 107 | 108 | /** 109 | * @brief the width of this empty memory block 110 | */ 111 | int m_width; 112 | }; 113 | 114 | #endif // MEMORYLAYOUT_EMPTY_BLOCK_H_ 115 | -------------------------------------------------------------------------------- /src/include/base_node.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file base_node.h 3 | * @author Jiaoyi 4 | * @brief the main structures of nodes 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef BASE_NODE_H_ 12 | #define BASE_NODE_H_ 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "nodes/innerNode/bs_model.h" 22 | #include "nodes/innerNode/his_model.h" 23 | #include "nodes/innerNode/lr_model.h" 24 | #include "nodes/innerNode/plr_model.h" 25 | #include "nodes/leafNode/cfarray_type.h" 26 | #include "nodes/leafNode/external_array_type.h" 27 | #include "nodes/rootNode/root_nodes.h" 28 | 29 | /** 30 | * @brief the root type of CARMI 31 | * 32 | * This class inherits PLRType as the root node. When accessing a data point, we 33 | * first use the root node's model to compute the next node's index. In the 34 | * CARMI framework, the object of this class serves as one of its private 35 | * members. 36 | * 37 | * @tparam DataVectorType the type of data vector 38 | * @tparam KeyType the type of the given key value 39 | */ 40 | template 41 | class CARMIRoot : public PLRType { 42 | public: 43 | // *** Constructed Types and Constructor 44 | 45 | /** 46 | * @brief Construct a new CARMIRoot object with the default constructor 47 | */ 48 | CARMIRoot() = default; 49 | 50 | /** 51 | * @brief Copy from a PLRType object to an object of the current object. 
52 | * 53 | * @param[in] currnode the PLRType object 54 | * @return CARMIRoot& the object of the current class 55 | */ 56 | CARMIRoot& operator=(const PLRType& currnode) { 57 | this->PLRType::model = currnode.model; 58 | this->PLRType::fetch_model = currnode.fetch_model; 59 | this->flagNumber = currnode.flagNumber; 60 | return *this; 61 | } 62 | }; 63 | 64 | /** 65 | * @brief the 64 bytes structure for all types of nodes to support the 66 | * cache-aware design 67 | * 68 | * The first byte is always the node type identifier, and the next three bytes 69 | * are used to store the number of child nodes (the number of data blocks for 70 | * leaf nodes). For inner nodes, the following 4 bytes represent the starting 71 | * index of the child nodes in the node array. For leaf nodes, they store the 72 | * starting index of data blocks in the data array instead. The remaining 56 73 | * bytes store additional information depending on the tree node type. 74 | * 75 | * @tparam KeyType the type of the given key value 76 | * @tparam ValueType the type of the value 77 | */ 78 | template , 80 | typename Alloc = std::allocator>> 81 | union BaseNode { 82 | /** 83 | * @brief the linear regression inner node 84 | */ 85 | LRModel lr; 86 | 87 | /** 88 | * @brief the piecewise linear regression inner node 89 | */ 90 | PLRModel plr; 91 | 92 | /** 93 | * @brief the histogram inner node 94 | */ 95 | HisModel his; 96 | 97 | /** 98 | * @brief the binary search inner node 99 | */ 100 | BSModel bs; 101 | 102 | /** 103 | * @brief the cache-friendly array leaf node 104 | */ 105 | CFArrayType cfArray; 106 | 107 | /** 108 | * @brief the external array leaf node 109 | */ 110 | ExternalArray externalArray; 111 | 112 | BaseNode() {} 113 | ~BaseNode() {} 114 | 115 | BaseNode& operator=(const BaseNode& currnode) { 116 | if (this != &currnode) { 117 | this->lr = currnode.lr; 118 | } 119 | return *this; 120 | } 121 | }; 122 | 123 | #endif // BASE_NODE_H_ 124 | 
-------------------------------------------------------------------------------- /src/include/memoryLayout/node_array.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file node_array.h 3 | * @author Jiaoyi 4 | * @brief manage the node array 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef MEMORYLAYOUT_NODE_ARRAY_H_ 12 | #define MEMORYLAYOUT_NODE_ARRAY_H_ 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "../base_node.h" 20 | 21 | /** 22 | * @brief the structure of node array 23 | * 24 | * @tparam KeyType the type of the keyword 25 | * @tparam ValueType the type of the value 26 | * @tparam Compare A binary predicate that takes two element keys as arguments 27 | * and returns a bool. 28 | * @tparam Alloc Type of the allocator object used to define the storage 29 | * allocation model. 30 | */ 31 | template , 33 | typename Alloc = std::allocator>> 34 | class NodeArrayStructure { 35 | public: 36 | // *** Constructed Types and Constructor 37 | 38 | /** 39 | * @brief Construct a new Node Array Structure object 40 | * Initialize the nowNodeNumber with 0 and construct 4096 empty members of 41 | * BaseNode vector 42 | */ 43 | NodeArrayStructure() { 44 | nowNodeNumber = 0; 45 | std::vector>( 46 | 4096, BaseNode()) 47 | .swap(nodeArray); 48 | } 49 | 50 | public: 51 | //*** Public Functions of NodeArrayStructure 52 | 53 | /** 54 | * @brief allocate a block of empty memory to store the nodes 55 | * 56 | * @param[in] size the size of nodes needed to be stored 57 | * @return int: the beginning index of this allocated memory 58 | */ 59 | int AllocateNodeMemory(int size); 60 | 61 | /** 62 | * @brief After the construction of CARMI is completed, the useless memory 63 | * exceeding the needed size will be released. 
64 | * 65 | * @param[in] neededSize the size of needed node blocks 66 | */ 67 | void ReleaseUselessMemory(int neededSize); 68 | 69 | public: 70 | //*** Public Data Member of Node Array Structure Objects 71 | 72 | /** 73 | * @brief the node array mentioned in the paper. 74 | * 75 | * All tree nodes, including both inner nodes and leaf nodes, are stored in 76 | * this node array. Each member occupies a fixed size according to the 77 | * BaseNode class. 78 | */ 79 | std::vector> nodeArray; 80 | 81 | /** 82 | * @brief the used size of nodeArray 83 | */ 84 | int nowNodeNumber; 85 | }; 86 | 87 | template 89 | int NodeArrayStructure::AllocateNodeMemory( 90 | int size) { 91 | if (size < 0) { 92 | throw std::invalid_argument( 93 | "NodeArrayStructure::AllocateNodeMemory: the size is less than 0."); 94 | } 95 | int newLeft = -1; 96 | unsigned int tmpSize = nodeArray.size(); 97 | 98 | // allocation fails, need to expand the nodeArray 99 | while (nowNodeNumber + size > tmpSize) { 100 | BaseNode t; 101 | tmpSize *= 1.25; 102 | nodeArray.resize(tmpSize, t); 103 | } 104 | newLeft = nowNodeNumber; 105 | nowNodeNumber += size; 106 | return newLeft; 107 | } 108 | 109 | template 111 | void NodeArrayStructure::ReleaseUselessMemory(int neededSize) { 113 | if (neededSize < 0) { 114 | throw std::invalid_argument( 115 | "NodeArrayStructure::ReleaseUselessMemory: the size is less than 0."); 116 | } 117 | std::vector> tmp( 118 | nodeArray.begin(), nodeArray.begin() + neededSize); 119 | std::vector>().swap(nodeArray); 120 | nodeArray = tmp; 121 | } 122 | #endif // MEMORYLAYOUT_NODE_ARRAY_H_ 123 | -------------------------------------------------------------------------------- /src/include/nodes/rootNode/trainModel/linear_regression.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file linear_regression.h 3 | * @author Jiaoyi 4 | * @brief linear regression model 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 
2021 9 | * 10 | */ 11 | #ifndef NODES_ROOTNODE_TRAINMODEL_LINEAR_REGRESSION_H_ 12 | #define NODES_ROOTNODE_TRAINMODEL_LINEAR_REGRESSION_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "../../../params.h" 22 | 23 | /** 24 | * @brief linear regression model for root node 25 | * 26 | * This model is a very simple root node consisting of a linear regression 27 | * model. In addition to the number of child nodes, only two linear regression 28 | * model parameters need to be stored. We only need one model prediction and the 29 | * boundary condition processing to get the index of the next child node. 30 | * 31 | * @tparam DataVectorType the vector type of the dataset 32 | * @tparam KeyType the type of the key value 33 | */ 34 | template 35 | class LinearRegression { 36 | public: 37 | // *** Constructor 38 | 39 | /** 40 | * @brief Construct a new Linear Regression object and set the default value 41 | * of the linear regression model parameters. 
42 | */ 43 | LinearRegression() { 44 | slope = 0.0001; 45 | intercept = 0.666; 46 | maxChildIdx = 2; 47 | minValue = 0; 48 | } 49 | 50 | /** 51 | * @brief use the given dataset to train the lr model 52 | * 53 | * @param[in] dataset the original dataset, each data point is: {key, value} 54 | */ 55 | void Train(const DataVectorType &dataset) { 56 | int idx = 0; 57 | int size = dataset.size(); 58 | if (size == 0) return; 59 | minValue = dataset[0].first; 60 | std::vector index(size, 0); 61 | // construct the training dataset, x is the key value in the dataset, y is 62 | // the corresponding ratio of index in the maxChildIdx 63 | for (int i = 0; i < size; i++) { 64 | index[idx++] = static_cast(i) / size * maxChildIdx; 65 | } 66 | 67 | // train the lr model 68 | long double t1 = 0, t2 = 0, t3 = 0, t4 = 0; 69 | for (int i = 0; i < size; i++) { 70 | t1 += static_cast(dataset[i].first - minValue) * 71 | static_cast(dataset[i].first - minValue); 72 | t2 += static_cast(dataset[i].first - minValue); 73 | t3 += static_cast(dataset[i].first - minValue) * 74 | static_cast(index[i]); 75 | t4 += static_cast(index[i]); 76 | } 77 | if (t1 * size - t2 * t2) { 78 | slope = (t3 * size - t2 * t4) / (t1 * size - t2 * t2); 79 | intercept = (t1 * t4 - t2 * t3) / (t1 * size - t2 * t2); 80 | } else { 81 | slope = 1.0; 82 | intercept = 1.0; 83 | } 84 | } 85 | 86 | /** 87 | * @brief output the unrounded index of the next node of the given key value 88 | * 89 | * @param[in] key the given key value 90 | * @return double: the unrounded index 91 | */ 92 | inline double Predict(KeyType key) const { 93 | // predict the index of the next node using the lr model 94 | double p = slope * static_cast(key - minValue) + intercept; 95 | // boundary processing 96 | if (p < 0) 97 | p = 0; 98 | else if (p > maxChildIdx) 99 | p = maxChildIdx; 100 | return p; 101 | } 102 | 103 | public: 104 | //*** Public Data Members of LR Model Objects 105 | 106 | /** 107 | * @brief The number of the child nodes 108 | */ 109 | 
int maxChildIdx; 110 | 111 | private: 112 | //*** Private Data Members of LR Model Objects 113 | 114 | /** 115 | * @brief The linear regression parameter: the slope 116 | */ 117 | double slope; 118 | 119 | /** 120 | * @brief The linear regression parameter: the intercept 121 | */ 122 | double intercept; 123 | 124 | /** 125 | * @brief The minimum value. 126 | */ 127 | KeyType minValue; 128 | }; 129 | #endif // NODES_ROOTNODE_TRAINMODEL_LINEAR_REGRESSION_H_ 130 | -------------------------------------------------------------------------------- /src/include/func/split_function.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file split_function.h 3 | * @author Jiaoyi 4 | * @brief the split function for insert function 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef FUNC_SPLIT_FUNCTION_H_ 12 | #define FUNC_SPLIT_FUNCTION_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | 19 | #include "../carmi.h" 20 | #include "../construct/minor_function.h" 21 | #include "../params.h" 22 | 23 | template 25 | template 26 | inline void CARMI::Split(int idx) { 27 | // get the parameters of this leaf node 28 | int previousIdx = node.nodeArray[idx].cfArray.previousLeaf; 29 | int nextIdx = node.nodeArray[idx].cfArray.nextLeaf; 30 | 31 | DataVectorType tmpDataset; 32 | int leftIdx; 33 | // extract pure data points 34 | if (isPrimary) { 35 | leftIdx = node.nodeArray[idx].externalArray.m_left; 36 | int rightIdx = 37 | leftIdx + (node.nodeArray[idx].externalArray.flagNumber & 0x00FFFFFF); 38 | tmpDataset = ExternalArray::ExtractDataset( 39 | external_data, leftIdx, rightIdx, recordLength); 40 | } else { 41 | leftIdx = node.nodeArray[idx].cfArray.m_left; 42 | int rightIdx = 43 | leftIdx + (node.nodeArray[idx].cfArray.flagNumber & 0x00FFFFFF); 44 | tmpDataset = 45 | CFArrayType::ExtractDataset( 46 | data, leftIdx, rightIdx); 47 | } 48 | int actualSize = tmpDataset.size(); 49 | 
50 | // create a new inner node and store it in the node[idx] 51 | auto currnode = LRModel(kInsertNewChildNumber); 52 | currnode.Train(0, actualSize, tmpDataset); 53 | 54 | std::vector perSize(kInsertNewChildNumber, emptyRange); 55 | IndexPair range{0, actualSize}; 56 | NodePartition>(currnode, range, tmpDataset, 57 | &perSize); 58 | currnode.childLeft = node.AllocateNodeMemory(kInsertNewChildNumber); 59 | node.nodeArray[idx].lr = currnode; 60 | 61 | int tmpLeft = leftIdx; 62 | // create kInsertNewChildNumber new leaf nodes and store them in the node 63 | // array 64 | for (int i = 0; i < kInsertNewChildNumber; i++) { 65 | LeafNodeType tmpLeaf; 66 | std::vector prefetchIndex(perSize[i].size); 67 | int s = perSize[i].left; 68 | int e = perSize[i].left + perSize[i].size; 69 | for (int j = s; j < e; j++) { 70 | double predictLeafIdx = root.model.Predict(tmpDataset[j].first); 71 | int p = root.fetch_model.PrefetchPredict(predictLeafIdx); 72 | prefetchIndex[j - s] = p; 73 | } 74 | tmpLeaf.Init(tmpDataset, prefetchIndex, s, &data); 75 | if (isPrimary) { 76 | tmpLeaf.m_left = tmpLeft; 77 | tmpLeft += perSize[i].size; 78 | } 79 | node.nodeArray[currnode.childLeft + i].cfArray = 80 | *(reinterpret_cast *>( 81 | &tmpLeaf)); 82 | } 83 | if (idx == lastLeaf) { 84 | lastLeaf = currnode.childLeft + kInsertNewChildNumber - 1; 85 | } 86 | if (idx == firstLeaf) { 87 | firstLeaf = currnode.childLeft; 88 | } 89 | 90 | // if the original leaf node is the cf array leaf node, we need to update the 91 | // pointer to the siblings of the new leaf nodes 92 | if (!isPrimary) { 93 | if (previousIdx >= 0) { 94 | node.nodeArray[previousIdx].cfArray.nextLeaf = currnode.childLeft; 95 | } 96 | node.nodeArray[currnode.childLeft].cfArray.previousLeaf = previousIdx; 97 | node.nodeArray[currnode.childLeft].cfArray.nextLeaf = 98 | currnode.childLeft + 1; 99 | int end = currnode.childLeft + kInsertNewChildNumber - 1; 100 | for (int i = currnode.childLeft + 1; i < end; i++) { 101 | 
node.nodeArray[i].cfArray.previousLeaf = i - 1; 102 | node.nodeArray[i].cfArray.nextLeaf = i + 1; 103 | } 104 | node.nodeArray[end].cfArray.previousLeaf = end - 1; 105 | if (nextIdx != -1) { 106 | node.nodeArray[end].cfArray.nextLeaf = nextIdx; 107 | node.nodeArray[nextIdx].cfArray.previousLeaf = end; 108 | } 109 | } 110 | } 111 | 112 | #endif // FUNC_SPLIT_FUNCTION_H_ 113 | -------------------------------------------------------------------------------- /src/include/func/insert_function.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file insert_function.h 3 | * @author Jiaoyi 4 | * @brief insert a record 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef FUNC_INSERT_FUNCTION_H_ 12 | #define FUNC_INSERT_FUNCTION_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "../carmi.h" 22 | #include "../construct/minor_function.h" 23 | #include "./split_function.h" 24 | 25 | template 27 | std::pair *, bool> 28 | CARMI::Insert(const DataType &datapoint, 29 | int *currblock, 30 | int *currslot) { 31 | int idx = 0; // idx in the node array 32 | int type = root.flagNumber; 33 | while (1) { 34 | switch (type) { 35 | case PLR_ROOT_NODE: 36 | // Case 0: this node is the plr root node 37 | // use the plr root node to find the index of the next node 38 | idx = root.PLRType::model.Predict( 39 | datapoint.first); 40 | break; 41 | case LR_INNER_NODE: 42 | // Case 1: this node is the lr inner node 43 | // use the predict function of lr inner node to obtain the index of the 44 | // next node 45 | idx = node.nodeArray[idx].lr.Predict(datapoint.first); 46 | break; 47 | case PLR_INNER_NODE: 48 | // Case 2: this node is the plr inner node 49 | // use the predict function of plr inner node to obtain the index of the 50 | // next node 51 | idx = node.nodeArray[idx].plr.Predict(datapoint.first); 52 | break; 53 | case HIS_INNER_NODE: 54 
| // Case 3: this node is the his inner node 55 | // use the predict function of his inner node to obtain the index of the 56 | // next node 57 | idx = node.nodeArray[idx].his.Predict(datapoint.first); 58 | break; 59 | case BS_INNER_NODE: 60 | // Case 4: this node is the bs inner node 61 | // use the predict function of bs inner node to obtain the index of the 62 | // next node 63 | idx = node.nodeArray[idx].bs.Predict(datapoint.first); 64 | break; 65 | case ARRAY_LEAF_NODE: { 66 | // Case 5: this node is the cache-friendly array leaf node 67 | // insert the data point in the cf leaf node 68 | bool isSuccess = node.nodeArray[idx].cfArray.Insert( 69 | datapoint, currblock, currslot, &data); 70 | if (isSuccess) { 71 | if (datapoint.first > lastKey) { 72 | lastLeaf = idx; 73 | lastKey = datapoint.first; 74 | } 75 | if (datapoint.first < firstKey) { 76 | firstLeaf = idx; 77 | firstKey = datapoint.first; 78 | } 79 | currsize++; 80 | return {&node.nodeArray[idx], true}; 81 | } else { 82 | // if this leaf node cannot accomodate more data points, we need to 83 | // split it and replace it with a new inner node and several new leaf 84 | // nodes 85 | Split>(idx); 86 | idx = node.nodeArray[idx].lr.Predict(datapoint.first); 87 | } 88 | break; 89 | } 90 | case EXTERNAL_ARRAY_LEAF_NODE: { 91 | // Case 6: this node is the external array leaf node 92 | // insert the key value of the data point in the external leaf node 93 | bool isSuccess = 94 | node.nodeArray[idx].externalArray.Insert(datapoint, &currsize); 95 | 96 | if (isSuccess) { 97 | *currslot = currsize - 1; 98 | return {&node.nodeArray[idx], true}; 99 | } else { 100 | // if this leaf node cannot accomodate more data points, we need to 101 | // split it and replace it with a new inner node and several new leaf 102 | // nodes 103 | Split>(idx); 104 | idx = node.nodeArray[idx].lr.Predict(datapoint.first); 105 | } 106 | } 107 | } 108 | 109 | type = node.nodeArray[idx].lr.flagNumber >> 24; 110 | } 111 | } 112 | 113 | #endif // 
FUNC_INSERT_FUNCTION_H_ 114 | -------------------------------------------------------------------------------- /src/include/params.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file params.h 3 | * @author Jiaoyi 4 | * @brief parameters in carmi_params space 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef PARAMS_H_ 12 | #define PARAMS_H_ 13 | 14 | #define DEBUG 15 | 16 | #ifdef __APPLE__ 17 | #include 18 | #if TARGET_OS_OSX == 1 19 | #define CATCH_PLATFORM_MAC 20 | #elif TARGET_OS_IPHONE == 1 21 | #define CATCH_PLATFORM_IPHONE 22 | #endif 23 | 24 | #elif defined(linux) || defined(__linux) || defined(__linux__) 25 | #define CATCH_PLATFORM_LINUX 26 | 27 | #elif defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || \ 28 | defined(_MSC_VER) || defined(__MINGW32__) 29 | #define CATCH_PLATFORM_WINDOWS 30 | #endif 31 | 32 | /** 33 | * @brief These are parameters in the CARMI framework. The first three 34 | * parameters are the maximum capacity of the cf array leaf node and external 35 | * array leaf node we have provided in the source code, and the boundary value 36 | * for switching between the dynamic programming algorithm and the greedy node 37 | * selection algorithm. Users can change their values according to their actual 38 | * needs. As for the parameters of the time costs of different nodes, users can 39 | * use our profiler to obtain them on their machine. 40 | */ 41 | namespace carmi_params { 42 | /** 43 | * @brief bytes, the size of a data block in cf array leaf nodes. 44 | * The value must be an integer multiple of the size of the cache line, the 45 | * reference values are: 64, 128, 256, 512, etc. 46 | */ 47 | static constexpr int kMaxLeafNodeSize = 256; 48 | 49 | /** 50 | * @brief The maximum number of data points in an external leaf node. 51 | * This value is generally an integer multiple of 2. 
Since the external dataset 52 | * is not stored in our index structure, the value can be larger to reduce the 53 | * space cost. Reference values are 512, 1024, 2048, and so on. 54 | */ 55 | static constexpr int kMaxLeafNodeSizeExternal = 1024; 56 | 57 | /** 58 | * @brief The maximum number of data points which can use the DP algorithm to 59 | * construct an inner node. If the size of the sub-dataset exceeds this 60 | * parameter, greedy node selection algorithm is used to construct the inner 61 | * node. 62 | * This value needs to be no less than the first two parameters. 63 | */ 64 | static constexpr int kAlgorithmThreshold = 60000; 65 | 66 | /** 67 | * @brief The latency of a memory access 68 | */ 69 | static constexpr double kMemoryAccessTime = 80.09; 70 | 71 | /** 72 | * @brief The time cost of the lr root node including the latency of 73 | * accessing the cache (8.29 ns) and the CPU time (3.25 ns) 74 | */ 75 | static constexpr double kLRRootTime = 11.54; 76 | 77 | /** 78 | * @brief The time cost of the plr root node including the latency of 79 | * accessing the cache (11.24 ns) and the CPU time (18.38 ns) 80 | */ 81 | static constexpr double kPLRRootTime = 29.62; 82 | 83 | /** 84 | * @brief The time cost of the lr inner node including the latency of 85 | * memory access and the CPU time 86 | */ 87 | static constexpr double kLRInnerTime = kMemoryAccessTime + 5.23; 88 | 89 | /** 90 | * @brief The time cost of the plr inner node including the latency of 91 | * memory access and the CPU time 92 | */ 93 | static constexpr double kPLRInnerTime = kMemoryAccessTime + 22.8; 94 | 95 | /** 96 | * @brief The time cost of the his inner node including the latency of 97 | * memory access and the CPU time 98 | */ 99 | static constexpr double kHisInnerTime = kMemoryAccessTime + 18.44; 100 | 101 | /** 102 | * @brief The time cost of the bs inner node including the latency of 103 | * memory access and the CPU time 104 | */ 105 | static constexpr double kBSInnerTime = 
kMemoryAccessTime + 26.38; 106 | 107 | /** 108 | * @brief The time cost of moving a data point 109 | */ 110 | static constexpr double kCostMoveTime = 6.25; 111 | 112 | /** 113 | * @brief The basic time cost of a leaf node including the latency of accessing 114 | * the leaf node in the memory and the time cost of searching in the leaf node 115 | * (25.4 ns). 116 | */ 117 | static constexpr double kLeafBaseTime = kMemoryAccessTime + 25.4; 118 | 119 | /** 120 | * @brief The average time cost of a binary search 121 | */ 122 | static constexpr double kCostBSTime = 10.9438; 123 | } // namespace carmi_params 124 | 125 | #endif // PARAMS_H_ 126 | -------------------------------------------------------------------------------- /src/include/construct/dp_inner.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file dp_inner.h 3 | * @author Jiaoyi 4 | * @brief use dynamic programming algorithm to construct inner nodes 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_DP_INNER_H_ 12 | #define CONSTRUCT_DP_INNER_H_ 13 | #include 14 | 15 | #include 16 | #include 17 | 18 | #include "../carmi.h" 19 | 20 | template 22 | template 23 | void CARMI::UpdateDPOptSetting( 24 | const DataRange &dataRange, int c, double frequency_weight, 25 | NodeCost *optimalCost, InnerNodeType *optimal_node_struct) { 26 | double space_cost = kBaseNodeSpace * static_cast(c); 27 | double time_cost = 28 | InnerNodeType::kTimeCost * static_cast(frequency_weight); 29 | double RootCost = time_cost + lambda * space_cost; 30 | // Case 1: the cost of the root node has been larger than the optimal cost, 31 | // return directly 32 | if (RootCost > optimalCost->cost) { 33 | return; 34 | } 35 | 36 | // Case 2: construct an inner node and divide the dataset into c sub-datasets 37 | SubDataset subDataset(c); 38 | auto currnode = InnerDivideAll(dataRange, c, &subDataset); 39 | 40 | for (int i = 0; i < c; 
i++) { 41 | // calculate the cost of each child node 42 | DataRange range(subDataset.subInit[i], subDataset.subFind[i], 43 | subDataset.subInsert[i]); 44 | // Case 2.1: if this inner node fails to divide dataset evenly, return 45 | // directly 46 | if (range.initRange.size + range.initRange.size == 47 | dataRange.initRange.size + dataRange.insertRange.size) { 48 | return; 49 | } 50 | 51 | NodeCost res = DP(range); 52 | 53 | space_cost += res.space; 54 | time_cost += res.time; 55 | RootCost += lambda * res.space + res.time; 56 | } 57 | // if the current cost is smaller than the optimal cost, update the optimal 58 | // cost and node setting 59 | if (RootCost <= optimalCost->cost) { 60 | *optimalCost = {time_cost, space_cost, RootCost}; 61 | *optimal_node_struct = currnode; 62 | } 63 | } 64 | 65 | template 67 | NodeCost CARMI::DPInner( 68 | const DataRange &dataRange) { 69 | // the optimal cost of this sub-dataset 70 | NodeCost optimalCost{DBL_MAX, DBL_MAX, DBL_MAX}; 71 | // the optimal node of this sub-dataset 72 | BaseNode optimal_node_struct = emptyNode; 73 | // calculate the weight of the frequency of this sub-dataset (findQuery and 74 | // insertQury) 75 | double frequency_weight = CalculateFrequencyWeight(dataRange); 76 | int tmpEnd = std::min(0x00FFFFFF, dataRange.initRange.size / 16); 77 | tmpEnd = std::max(tmpEnd, kMinChildNumber); 78 | for (int c = kMinChildNumber; c <= tmpEnd; c *= 2) { 79 | // Case 1: construct a LR inner node, if it is better than the current 80 | // optimal setting, then use it to update the optimal setting 81 | UpdateDPOptSetting>( 82 | dataRange, c, frequency_weight, &optimalCost, 83 | &(optimal_node_struct.lr)); 84 | // Case 2: construct a P. 
LR inner node, if it is better than the current 85 | // optimal setting, then use it to update the optimal setting 86 | UpdateDPOptSetting>( 87 | dataRange, c, frequency_weight, &optimalCost, 88 | &(optimal_node_struct.plr)); 89 | // Case 3: construct a His inner node, if it is better than the current 90 | // optimal setting, then use it to update the optimal setting 91 | if (c <= kHisMaxChildNumber) 92 | UpdateDPOptSetting>( 93 | dataRange, c, frequency_weight, &optimalCost, 94 | &(optimal_node_struct.his)); 95 | // Case 4: construct a BS inner node, if it is better than the current 96 | // optimal setting, then use it to update the optimal setting 97 | if (c <= kBSMaxChildNumber) 98 | UpdateDPOptSetting>( 99 | dataRange, c, frequency_weight, &optimalCost, 100 | &(optimal_node_struct.bs)); 101 | } 102 | // store the optimal setting of this sub-dataset 103 | structMap.insert({dataRange.initRange, optimal_node_struct}); 104 | // store the minimum cost of this sub-dataset 105 | COST.insert({dataRange.initRange, optimalCost}); 106 | return optimalCost; 107 | } 108 | 109 | #endif // CONSTRUCT_DP_INNER_H_ 110 | -------------------------------------------------------------------------------- /src/example/example.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file example.cpp 3 | * @author Jiaoyi 4 | * @brief The examples of CARMI 5 | * @version 3.0 6 | * @date 2021-04-07 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #include "../include/carmi_external_map.h" 18 | #include "../include/carmi_map.h" 19 | const float kWriteHeavy = 0.5; // write-heavy workload 20 | 21 | void TestCarmi() { 22 | // generate datasets 23 | std::vector> initDataset(10, {1, 1}); 24 | for (int i = 0; i < 10; i++) { 25 | initDataset[i].first = i * 2; 26 | } 27 | 28 | CARMIMap carmi(initDataset.begin(), initDataset.end()); 29 | 30 | // find the value of the 
given key 31 | auto it = carmi.find(initDataset[0].first); 32 | std::cout << "1. FIND is successful, the value of the given key is: " 33 | << it.data() << std::endl; 34 | std::cout << " Current and all subsequent key-value pairs:"; 35 | for (; it != carmi.end(); ++it) { 36 | std::cout << "{" << it.key() << ", " << it.data() << "} "; 37 | } 38 | std::cout << std::endl; 39 | 40 | // insert a data point 41 | std::pair data = {5, 500}; 42 | auto res = carmi.insert(data); 43 | std::cout << "2. INSERT is successful!" << std::endl; 44 | 45 | it = carmi.find(data.first); 46 | std::cout 47 | << " FIND after INSERT is successful, the value of the given key is: " 48 | << it.data() << std::endl; 49 | std::cout << " Current and all subsequent key-value pairs:"; 50 | for (; it != carmi.end(); ++it) { 51 | std::cout << "{" << it.key() << ", " << it.data() << "} "; 52 | } 53 | std::cout << std::endl; 54 | 55 | // delete the record of the given key 56 | int cnt = carmi.erase(initDataset[0].first); 57 | if (cnt > 0) 58 | std::cout << "4. DELETE is successful!" << std::endl; 59 | else 60 | std::cout << " DELETE failed!" << std::endl; 61 | it = carmi.find(initDataset[0].first); 62 | if (it == carmi.end() || it.key() != initDataset[0].first) { 63 | std::cout << " FIND after DELETE failed." 
<< std::endl; 64 | } 65 | } 66 | 67 | template 68 | class ExternalDataType { 69 | public: 70 | typedef ValueType ValueType_; 71 | ExternalDataType() { 72 | k = 0; 73 | v = 0; 74 | } 75 | explicit ExternalDataType(KeyType key, ValueType_ value) { 76 | k = key; 77 | v = value; 78 | } 79 | const KeyType &key() const { return k; } 80 | const ValueType_ &data() const { return v; } 81 | 82 | bool operator<(const ExternalDataType &a) const { 83 | if (k == a.k) { 84 | return v < a.v; 85 | } 86 | return k < a.k; 87 | } 88 | 89 | KeyType k; 90 | ValueType_ v; 91 | }; 92 | 93 | void TestExternalCarmi() { 94 | // generate datasets 95 | int initRatio = kWriteHeavy; 96 | int size = 10; 97 | std::vector> initDataset(size, {1, 1}); 98 | for (int i = 0; i < size; i++) { 99 | initDataset[i].first = i * 2; 100 | } 101 | 102 | const int record_size = sizeof(double) * 2; 103 | int extLen = initDataset.size() * 2 + 10; 104 | double *externalDataset = new double[extLen]; 105 | for (int i = 0, j = 0; i < initDataset.size(); i++) { 106 | *(externalDataset + j) = initDataset[i].first; 107 | *(externalDataset + j + 1) = initDataset[i].second; 108 | j += 2; // due to 109 | } 110 | double maxKey = initDataset[initDataset.size() - 1].first; 111 | std::vector futureinsertKey(1, maxKey + 1); 112 | 113 | CARMIExternalMap> carmi( 114 | externalDataset, futureinsertKey, initDataset.size(), record_size); 115 | 116 | // find the value of the given key 117 | auto it = carmi.find(initDataset[4].first); 118 | std::cout << "1. FIND is successful, the given key is: " << it.key() 119 | << ",\tthe value is: " << it.data() << std::endl; 120 | 121 | // insert data into the external array 122 | *(externalDataset + size * 2) = futureinsertKey[0]; 123 | *(externalDataset + size * 2 + 1) = 100; 124 | 125 | // insert a data point 126 | carmi.insert(futureinsertKey[0]); // insert key into carmi 127 | std::cout << "2. INSERT is successful!" 
<< std::endl; 128 | it = carmi.find(futureinsertKey[0]); 129 | std::cout << " FIND is successful, the given key is: " << it.key() 130 | << ",\tthe value is: " << it.data() << std::endl; 131 | } 132 | 133 | int main() { 134 | std::cout << "Test carmi:" << std::endl; 135 | TestCarmi(); 136 | std::cout << "Test external carmi:" << std::endl; 137 | TestExternalCarmi(); 138 | return 0; 139 | } -------------------------------------------------------------------------------- /src/include/construct/dp_leaf.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file dp_leaf.h 3 | * @author Jiaoyi 4 | * @brief use dynamic programming algorithm to construct a leaf node 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_DP_LEAF_H_ 12 | #define CONSTRUCT_DP_LEAF_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #include "../carmi.h" 21 | #include "../params.h" 22 | #include "./structures.h" 23 | 24 | template 26 | NodeCost CARMI::DPLeaf( 27 | const DataRange &dataRange) { 28 | NodeCost nodeCost{DBL_MAX, DBL_MAX, DBL_MAX}; 29 | BaseNode optimal_node_struct; 30 | 31 | nodeCost.time = 0.0; 32 | if (isPrimary) { 33 | // construct an external array leaf node as the current node 34 | nodeCost.space = 0.0; 35 | 36 | ExternalArray tmp; 37 | tmp.Train(initDataset, dataRange.initRange.left, dataRange.initRange.size); 38 | int findEnd = dataRange.findRange.left + dataRange.findRange.size; 39 | // calculate the time cost of this external array leaf node 40 | for (int i = dataRange.findRange.left; i < findEnd; i++) { 41 | int p = tmp.Predict(findQuery[i].first) + dataRange.findRange.left; 42 | int d = abs(i - p); 43 | nodeCost.time += 44 | (carmi_params::kLeafBaseTime * findQuery[i].second) / querySize; 45 | // Case 1: if the data point is within the error range, perform binary 46 | // search over the range of [p - error / 2, p + error / 2] 47 | if 
(d <= tmp.error) 48 | nodeCost.time += log2(tmp.error + 1) * findQuery[i].second * 49 | carmi_params::kCostBSTime / querySize; 50 | // Case 2: the data point is not in the error range, perform binary search 51 | // over the entire sub-dataset 52 | else 53 | nodeCost.time += log2(dataRange.initRange.size) * findQuery[i].second * 54 | carmi_params::kCostBSTime / querySize; 55 | } 56 | optimal_node_struct.externalArray = tmp; 57 | 58 | } else { 59 | // choose a cf array node as the leaf node 60 | int totalDataNum = dataRange.initRange.size + dataRange.insertRange.size; 61 | // calculate the number of needed data blocks 62 | int blockNum = 63 | CFArrayType::CalNeededBlockNum( 64 | totalDataNum); 65 | int avgSlotNum = 66 | std::max(1.0, ceil(static_cast(totalDataNum) / blockNum)); 67 | avgSlotNum = std::min( 68 | avgSlotNum, 69 | CFArrayType::kMaxBlockCapacity); 70 | 71 | nodeCost.space = 72 | blockNum * carmi_params::kMaxLeafNodeSize / 1024.0 / 1024.0; 73 | // calculate the time cost of find operations 74 | int end = dataRange.findRange.left + dataRange.findRange.size; 75 | for (int i = dataRange.findRange.left; i < end; i++) { 76 | nodeCost.time += static_cast(findQuery[i].second) / querySize * 77 | (carmi_params::kLeafBaseTime + 78 | log2(avgSlotNum) * carmi_params::kCostBSTime); 79 | } 80 | // calculate the time cost of insert operations 81 | end = dataRange.insertRange.left + dataRange.insertRange.size; 82 | for (int i = dataRange.insertRange.left; i < end; i++) { 83 | nodeCost.time += 1.0 / static_cast(querySize) * 84 | (carmi_params::kLeafBaseTime + 85 | log2(avgSlotNum) * carmi_params::kCostBSTime + 86 | (1 + avgSlotNum) / 2.0 * carmi_params::kCostMoveTime); 87 | } 88 | 89 | optimal_node_struct.cfArray = 90 | CFArrayType(); 91 | } 92 | nodeCost.cost = nodeCost.time + nodeCost.space * lambda; 93 | 94 | // if dp algorithm also constructs an inner node on this sub-dataset, we need 95 | // to check which one is the better setting 96 | auto it = 
COST.find(dataRange.initRange); 97 | if (it != COST.end()) { 98 | if (it->second.cost < nodeCost.cost) { 99 | // Case 1: the inner node is the better one, return the cost of it 100 | // directly. 101 | return nodeCost; 102 | } else { 103 | // Case 2: the leaf node is the better one, erase the cost and the setting 104 | // of the inner node 105 | COST.erase(dataRange.initRange); 106 | structMap.erase(dataRange.initRange); 107 | } 108 | } 109 | // store the optimal cost and setting 110 | COST.insert({dataRange.initRange, nodeCost}); 111 | structMap.insert({dataRange.initRange, optimal_node_struct}); 112 | return nodeCost; 113 | } 114 | 115 | #endif // CONSTRUCT_DP_LEAF_H_ 116 | -------------------------------------------------------------------------------- /src/unitTest/carmiTest/carmi_map_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file carmi_map_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-14 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #include "../../include/carmi_map.h" 13 | 14 | #include "gtest/gtest.h" 15 | 16 | const int kTestMaxValue = 1000000; 17 | const int kInitSize = 10000; 18 | const int kInsertSize = 100; 19 | const float kRate = 0.1; 20 | unsigned int seed = time(NULL); 21 | 22 | typedef double KeyType; 23 | typedef double ValueType; 24 | typedef std::pair DataType; 25 | typedef std::vector DataVecType; 26 | typedef CARMIMap CarmiType; 27 | 28 | DataVecType initDataset(kInitSize); 29 | DataVecType insertDataset(kInsertSize); 30 | DataVecType testInsertQuery(kInsertSize); 31 | CarmiType carmi; 32 | std::default_random_engine engine(time(0)); 33 | 34 | TEST(TestCarmimapConstructor, CARMIMapConstructor) { 35 | std::uniform_real_distribution dis(0, kTestMaxValue); 36 | for (int i = 0; i < kInitSize; i++) { 37 | KeyType tmpKey = dis(engine); 38 | initDataset[i] = {tmpKey, tmpKey * 10}; 39 | } 40 | std::sort(initDataset.begin(), 
initDataset.end()); 41 | for (int i = 0; i < kInsertSize; i++) { 42 | KeyType tmpKey = dis(engine); 43 | insertDataset[i] = {tmpKey, tmpKey * 10}; 44 | } 45 | std::sort(insertDataset.begin(), insertDataset.end()); 46 | for (int i = 0; i < kInsertSize; i++) { 47 | KeyType tmpKey = dis(engine); 48 | testInsertQuery[i] = {tmpKey, tmpKey * 10}; 49 | } 50 | ASSERT_TRUE(carmi.empty()); 51 | 52 | CarmiType c(initDataset.begin(), initDataset.end(), insertDataset.begin(), 53 | insertDataset.end(), kRate); 54 | carmi.swap(c); 55 | 56 | ASSERT_EQ(carmi.size(), kInitSize); 57 | ASSERT_FALSE(carmi.empty()); 58 | 59 | auto it = carmi.begin(); 60 | for (int i = 0; i < kInitSize; i++) { 61 | EXPECT_EQ(it.key(), initDataset[i].first) << " i:" << i << std::endl; 62 | EXPECT_EQ(it.data(), initDataset[i].second); 63 | it++; 64 | } 65 | } 66 | 67 | TEST(TestCarmimapFind, CARMIMapFind) { 68 | for (int i = 0; i < kInitSize; i++) { 69 | auto it = carmi.find(initDataset[i].first); 70 | EXPECT_EQ(it.key(), initDataset[i].first); 71 | EXPECT_EQ(it.data(), initDataset[i].second); 72 | } 73 | } 74 | 75 | TEST(TestCarmimapLowerbound, CARMIMapLowerbound) { 76 | for (int i = 0; i < kInitSize; i++) { 77 | auto it = carmi.lower_bound(initDataset[i].first); 78 | EXPECT_EQ(it.key(), initDataset[i].first); 79 | } 80 | for (int i = 0; i < kInsertSize; i++) { 81 | if (testInsertQuery[i].first < initDataset[kInitSize - 1].first) { 82 | auto it = carmi.lower_bound(testInsertQuery[i].first); 83 | auto vector_res = std::lower_bound(initDataset.begin(), initDataset.end(), 84 | testInsertQuery[i]) - 85 | initDataset.begin(); 86 | EXPECT_EQ(it.key(), initDataset[vector_res].first); 87 | } 88 | } 89 | } 90 | 91 | TEST(TestCarmimapUpperbound, CARMIMapUpperbound) { 92 | for (int i = 0; i < kInitSize - 1; i++) { 93 | auto it = carmi.upper_bound(initDataset[i].first); 94 | EXPECT_GT(it.key(), initDataset[i].first); 95 | } 96 | } 97 | 98 | TEST(TestCarmimapEqualRange, CARMIMapEqualRange) { 99 | for (int i = 0; i < 
kInitSize; i++) { 100 | auto res = carmi.equal_range(initDataset[i].first); 101 | for (auto it = res.first; it != res.second; it++) { 102 | EXPECT_EQ(it.key(), initDataset[i].first); 103 | } 104 | } 105 | } 106 | 107 | TEST(TestCarmimapCount, CARMIMapCount) { 108 | for (int i = 0; i < kInitSize; i++) { 109 | auto res = carmi.count(initDataset[i].first); 110 | int cnt = 0; 111 | auto vector_res = 112 | std::count(initDataset.begin(), initDataset.end(), initDataset[i]); 113 | EXPECT_EQ(res, vector_res); 114 | } 115 | } 116 | 117 | TEST(TestCarmimapInsert, CARMIMapInsert) { 118 | for (int i = 0; i < kInsertSize; i++) { 119 | auto it = carmi.insert(testInsertQuery[i]); 120 | EXPECT_TRUE(it.second); 121 | EXPECT_EQ(it.first.key(), testInsertQuery[i].first); 122 | EXPECT_EQ(it.first.data(), testInsertQuery[i].second); 123 | for (int j = 0; j < i; j++) { 124 | auto res = carmi.find(testInsertQuery[j].first); 125 | EXPECT_EQ(res.key(), testInsertQuery[j].first); 126 | } 127 | for (int j = 0; j < kInitSize; j++) { 128 | auto res = carmi.find(initDataset[j].first); 129 | EXPECT_EQ(res.key(), initDataset[j].first); 130 | EXPECT_EQ(res.data(), initDataset[j].second); 131 | } 132 | } 133 | } 134 | 135 | TEST(TestCarmimapErase, CARMIMapErase) { 136 | for (int i = 0; i < kInsertSize; i++) { 137 | carmi.erase(testInsertQuery[i].first); 138 | auto it = carmi.find(testInsertQuery[i].first); 139 | EXPECT_EQ(it, carmi.end()); 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # README 2 | # A simple implementation of CARMI 3 | 4 | This is a simple implementation of our paper: **CARMI: A Cache-Aware Learned Index with a Cost-based Construction Algorithm**. 5 | 6 | ## Reproducing the experiment 7 | 8 | If you want to reproduce the experiment in our paper, do the following 9 | 10 | ``` 11 | cd ./src 12 | cmake . 
13 | make 14 | ./CARMI 15 | ``` 16 | 17 | ## Using CARMI 18 | 19 | If you want to use CARMI as an index, then you only need to include the header file respectively: 20 | 21 | Use in-memory index: 22 | ``` 23 | #include "./include/carmi_common.h" 24 | ``` 25 | 26 | Use external index: 27 | ``` 28 | #include "./include/carmi_external.h" 29 | ``` 30 | 31 | ## Instructions 32 | 33 | **Method of constructing an index:** 34 | 35 | 1. **Automatic construction(CARMI)**: prepare the initial dataset, training datasets (historical access and insertion queries), and then create a CARMI object, and the hybrid algorithm will automatically build the index. 36 | 37 | **Main functions:** 38 | 39 | 1. **find**: find the corresponding record of the given key, return the iterator 40 | 41 | ``` 42 | iterator find(const KeyType &key); 43 | ``` 44 | 45 | 2. **lower_bound**: return an iterator pointing to the first element in the container whose key is not less than key. 46 | 47 | ``` 48 | iterator lower_bound(const KeyType &key); 49 | ``` 50 | 51 | 3. **upper_bound**: return an iterator pointing to the first element in the container whose key is larger than key. 52 | 53 | ``` 54 | iterator upper_bound(const KeyType &key); 55 | ``` 56 | 57 | 4. **insert**: insert a data point into the index. 58 | 59 | ``` 60 | std::pair insert(const DataType &datapoint); 61 | ``` 62 | 63 | 5. **erase**: delete the record of the given key and return the number of elements erased. 64 | 65 | ``` 66 | size_t erase(const KeyType &key); 67 | ``` 68 | 69 | 6. **swap**: swap two carmi tree objects. 70 | 71 | ``` 72 | void swap(CARMIMap &other); 73 | ``` 74 | 75 | 7. **size**: return the number of data points in the carmi tree. 76 | 77 | ``` 78 | size_t size(); 79 | ``` 80 | 81 | 8. **CalculateSpace**: return the space of the carmi tree in bytes. 82 | 83 | ``` 84 | long long CalculateSpace(); 85 | ``` 86 | 87 | Only a few commonly used functions are briefly introduced here. 
In fact, we provide all interfaces similar to std::map in the C++11 version, and you can use CARMIMap like std::map. CARMIExternalMap is designed to store the data points externally. It also implements the std::map interfaces, but the template parameters are slightly different. You can check the examples we provide to use. 88 | 89 | ## File structure of CARMI 90 | 91 | In this project, we include the CARMI header files, the source code of the experimental part and the baseline. The description of each file in CARMI's header file is as follows: 92 | 93 | - **include** 94 | - base_node.h *( the union structure of all nodes )* 95 | - carmi.h *( the implementation class of CARMI )* 96 | - carmi_map.h *( the CARMI map class for common use )* 97 | - carmi_external_map.h *( the CARMI map class for the dataset stored in the external position )* 98 | - **construct** *( files used to construct the index )* 99 | - construction.h *( the main function of our algorithm )* 100 | - construct_root.h *( use the optimal root node to construct child nodes )* 101 | - dp.h *( the main dynamic programming algorithm )* 102 | - dp_inner.h *( use DP to construct inner node )* 103 | - dp_leaf.h *( use DP to construct leaf node )* 104 | - greedy.h *( the greedy node selection algorithm )* 105 | - minor_function.h *( minor functions )* 106 | - structures.h *( the structures of CARMI )* 107 | - store_node.h *( use the optimal setting to construct a new node )* 108 | - **memoryLayout** *( manage the two main arrays )* 109 | - data_array.h 110 | - node_array.h 111 | - empty_block.h 112 | - **nodes** *( all nodes we have implemented )* 113 | - **rootNode** 114 | - root_nodes.h *( the classes of root nodes )* 115 | - **trainModel** *( models used to train the root nodes )* 116 | - linear_regression.h 117 | - piecewiseLR.h 118 | - prefetch_plr.h 119 | - **innerNode** 120 | - lr_model.h 121 | - plr_model.h 122 | - his_model.h 123 | - bs_model.h 124 | - candidate_plr.h *( for piecewiseLR )* 125 | - 
**leafNode** 126 | - cfarray_type.h 127 | - external_array_type.h 128 | - **func** *( public functions )* 129 | - find_function.h 130 | - insert_function.h 131 | - delete_function.h 132 | - update_function.h 133 | - split_function.h 134 | - calculate_space.h 135 | - get_node_info.h 136 | 137 | ## Dependencies 138 | 139 | This code is based on C++17. 140 | -------------------------------------------------------------------------------- /src/include/construct/store_node.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file store_node.h 3 | * @author Jiaoyi 4 | * @brief store inner and leaf nodes 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_STORE_NODE_H_ 12 | #define CONSTRUCT_STORE_NODE_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "../carmi.h" 22 | #include "../memoryLayout/node_array.h" 23 | #include "../nodes/innerNode/bs_model.h" 24 | #include "../nodes/innerNode/his_model.h" 25 | #include "../nodes/innerNode/lr_model.h" 26 | #include "../nodes/innerNode/plr_model.h" 27 | #include "../nodes/leafNode/cfarray_type.h" 28 | #include "../nodes/leafNode/external_array_type.h" 29 | #include "../params.h" 30 | #include "./dp_inner.h" 31 | 32 | template 34 | template 35 | void CARMI::StoreInnerNode( 36 | const DataRange &range, InnerNodeType *currnode) { 37 | // get the number of child nodes 38 | int optimalChildNumber = currnode->flagNumber & 0x00FFFFFF; 39 | // divide the initDataset 40 | SubDataset subDataset(optimalChildNumber); 41 | NodePartition(*currnode, range.initRange, initDataset, 42 | &(subDataset.subInit)); 43 | NodePartition(*currnode, range.insertRange, insertQuery, 44 | &(subDataset.subInsert)); 45 | // allocate a block of empty memory for this node in the node array 46 | currnode->childLeft = node.AllocateNodeMemory(optimalChildNumber); 47 | 48 | for (int i = 0; i < 
optimalChildNumber; i++) { 49 | // store each child node 50 | DataRange subRange(subDataset.subInit[i], subDataset.subFind[i], 51 | subDataset.subInsert[i]); 52 | StoreOptimalNode(subRange, currnode->childLeft + i); 53 | } 54 | } 55 | 56 | template 58 | void CARMI::StoreOptimalNode( 59 | const DataRange &range, int storeIdx) { 60 | // find the optimal setting of this sub-dataset 61 | auto it = structMap.find(range.initRange); 62 | 63 | int type = it->second.cfArray.flagNumber >> 24; 64 | switch (type) { 65 | case LR_INNER_NODE: { 66 | // Case 1: the optimal node is the lr inner node, use the StoreInnerNode 67 | // function to store itself and its child nodes. 68 | StoreInnerNode>(range, &(it->second.lr)); 69 | node.nodeArray[storeIdx].lr = it->second.lr; 70 | break; 71 | } 72 | case PLR_INNER_NODE: { 73 | // Case 2: the optimal node is the p. lr inner node, use the 74 | // StoreInnerNode function to store itself and its child nodes. 75 | StoreInnerNode>(range, &(it->second.plr)); 76 | node.nodeArray[storeIdx].plr = it->second.plr; 77 | break; 78 | } 79 | case HIS_INNER_NODE: { 80 | // Case 3: the optimal node is the his inner node, use the StoreInnerNode 81 | // function to store itself and its child nodes. 82 | StoreInnerNode>(range, &(it->second.his)); 83 | node.nodeArray[storeIdx].his = it->second.his; 84 | break; 85 | } 86 | case BS_INNER_NODE: { 87 | // Case 4: the optimal node is the bs inner node, use the StoreInnerNode 88 | // function to store itself and its child nodes. 
/**
 * @brief Walk the tree from the root to the leaf that covers the given key
 * and report the position of the matching record.
 *
 * NOTE(review): the text dump this chunk came from stripped all
 * angle-bracket template arguments; they have been reconstructed here and
 * should be confirmed against the original source tree.
 *
 * @param[in] key the key value to look up
 * @param[out] currblock the index of the data block inside the leaf node
 *             (written by the leaf's Find)
 * @param[out] currslot the slot of the record inside the block (or the
 *             external-array position for external leaves)
 * @return BaseNode*: the leaf node that manages the key
 */
template <typename KeyType, typename ValueType, typename Compare,
          typename Alloc>
BaseNode<KeyType, ValueType, Compare, Alloc> *
CARMI<KeyType, ValueType, Compare, Alloc>::Find(const KeyType &key,
                                                int *currblock,
                                                int *currslot) {
  // index of the current node in the node array; the root is handled
  // through the root object itself, so idx starts at 0
  int idx = 0;
  int type = root.flagNumber;
  int fetch_start = 0;
  double fetch_leafIdx;
  // descend one level per iteration until a leaf case returns
  while (1) {
    switch (type) {
      case PLR_ROOT_NODE:
        // Case 0: this node is the plr root node
        // use the plr root node to find the index of the next node and prefetch
        // the data block
        if (isPrimary == false) {
          fetch_leafIdx =
              root.PLRType<DataVectorType, KeyType>::model.Predict(key);
          idx = fetch_leafIdx;
          fetch_start = root.PLRType<DataVectorType, KeyType>::fetch_model
                            .PrefetchPredict(fetch_leafIdx);
#if defined(CATCH_PLATFORM_LINUX) || defined(CATCH_PLATFORM_MAC)
          // the instructions of prefetching in Ubuntu
          __builtin_prefetch(&data.dataArray[fetch_start], 0, 3);
          // __builtin_prefetch(&data.dataArray[fetch_start] + 64, 0, 3);
          // __builtin_prefetch(&data.dataArray[fetch_start] + 128, 0, 3);
          // __builtin_prefetch(&data.dataArray[fetch_start] + 192, 0, 3);
#elif defined(CATCH_PLATFORM_WINDOWS)
          // the instructions of prefetching in Windows
          _mm_prefetch(static_cast<const char *>(static_cast<const void *>(
                           &data.dataArray[fetch_start])),
                       _MM_HINT_T1);
          _mm_prefetch(static_cast<const char *>(static_cast<const void *>(
                           &data.dataArray[fetch_start])) +
                           64,
                       _MM_HINT_T1);
          _mm_prefetch(static_cast<const char *>(static_cast<const void *>(
                           &data.dataArray[fetch_start])) +
                           128,
                       _MM_HINT_T1);
          _mm_prefetch(static_cast<const char *>(static_cast<const void *>(
                           &data.dataArray[fetch_start])) +
                           192,
                       _MM_HINT_T1);
#endif
        } else {
          // external (primary-key) mode: no data-block prefetch is needed
          idx = root.PLRType<DataVectorType, KeyType>::model.Predict(key);
        }
        // NOTE(review): ".lr.flagNumber" is read regardless of the actual
        // child type — the code relies on flagNumber occupying the same
        // offset in every member of the BaseNode union; verify against the
        // union definition in base_node.h
        type = node.nodeArray[idx].lr.flagNumber >> 24;
        break;
      case LR_INNER_NODE:
        // Case 1: this node is the lr inner node
        // use the predict function of lr inner node to obtain the index of the
        // next node
        idx = node.nodeArray[idx].lr.Predict(key);
        type = node.nodeArray[idx].lr.flagNumber >> 24;
        break;
      case PLR_INNER_NODE:
        // Case 2: this node is the plr inner node
        // use the predict function of plr inner node to obtain the index of the
        // next node
        idx = node.nodeArray[idx].plr.Predict(key);
        type = node.nodeArray[idx].lr.flagNumber >> 24;
        break;
      case HIS_INNER_NODE:
        // Case 3: this node is the his inner node
        // use the predict function of his inner node to obtain the index of the
        // next node
        idx = node.nodeArray[idx].his.Predict(key);
        type = node.nodeArray[idx].lr.flagNumber >> 24;
        break;
      case BS_INNER_NODE:
        // Case 4: this node is the bs inner node
        // use the predict function of bs inner node to obtain the index of the
        // next node
        idx = node.nodeArray[idx].bs.Predict(key);
        type = node.nodeArray[idx].lr.flagNumber >> 24;
        break;
      case ARRAY_LEAF_NODE: {
        // Case 5: this node is the cache-friendly array leaf node
        // find the data point in the cf leaf node and return its position
        *currslot = node.nodeArray[idx].cfArray.Find(data, key, currblock);
        return &node.nodeArray[idx];
      }
      case EXTERNAL_ARRAY_LEAF_NODE: {
        // Case 6: this node is the external array leaf node
        // find the data point in the external leaf node and return its position
        *currslot = node.nodeArray[idx].externalArray.Find(key, recordLength,
                                                           external_data);
        return &node.nodeArray[idx];
      }
    }
  }
}
KeyType; 23 | typedef double ValueType; 24 | typedef std::pair DataType; 25 | typedef std::vector DataVecType; 26 | 27 | const int record_size = sizeof(KeyType) + sizeof(ValueType); 28 | 29 | DataVecType initDataset(kInitSize); 30 | DataVecType insertDataset(kInsertSize); 31 | 32 | template 33 | class ExternalDataType { 34 | public: 35 | typedef ValueType ValueType_; 36 | ExternalDataType() { 37 | k = 0; 38 | v = 0; 39 | } 40 | explicit ExternalDataType(KeyType key, ValueType_ value) { 41 | k = key; 42 | v = value; 43 | } 44 | const KeyType &key() const { return k; } 45 | const ValueType_ &data() const { return v; } 46 | 47 | bool operator<(const ExternalDataType &a) const { 48 | if (k == a.k) { 49 | return v < a.v; 50 | } 51 | return k < a.k; 52 | } 53 | 54 | KeyType k; 55 | ValueType_ v; 56 | }; 57 | 58 | typedef CARMIExternalMap> 59 | CarmiType; 60 | CarmiType carmi; 61 | KeyType *externalDataset; 62 | std::default_random_engine engine(time(0)); 63 | 64 | TEST(TestCarmiExtmapConstructor, CARMIExtMapConstructor) { 65 | std::uniform_real_distribution dis(0, kTestMaxValue); 66 | for (int i = 0; i < kInitSize; i++) { 67 | KeyType tmpKey = dis(engine); 68 | initDataset[i] = {tmpKey, tmpKey * 10}; 69 | } 70 | std::sort(initDataset.begin(), initDataset.end()); 71 | KeyType lastKey = initDataset[kInitSize - 1].first; 72 | std::vector futureInsertKey(kInsertSize); 73 | for (int i = 0; i < kInsertSize; i++) { 74 | lastKey += 1; 75 | insertDataset[i] = {lastKey, lastKey * 10}; 76 | futureInsertKey[i] = insertDataset[i].first; 77 | } 78 | std::sort(insertDataset.begin(), insertDataset.end()); 79 | ASSERT_TRUE(carmi.empty()); 80 | int extLen = initDataset.size() * 2 + kInsertSize * 2; 81 | externalDataset = new KeyType[extLen]; 82 | for (int i = 0, j = 0; i < static_cast(initDataset.size()); i++) { 83 | *(externalDataset + j) = initDataset[i].first; 84 | *(externalDataset + j + 1) = initDataset[i].second; 85 | j += 2; // due to 86 | } 87 | 88 | CarmiType c(externalDataset, 
futureInsertKey, initDataset.size(), record_size, 89 | kRate); 90 | 91 | carmi.swap(c); 92 | 93 | ASSERT_EQ(carmi.size(), kInitSize); 94 | ASSERT_FALSE(carmi.empty()); 95 | 96 | auto it = carmi.begin(); 97 | for (int i = 0; i < kInitSize; i++) { 98 | EXPECT_EQ(it.key(), initDataset[i].first) << " i:" << i << std::endl; 99 | EXPECT_EQ(it.data(), initDataset[i].second); 100 | it++; 101 | } 102 | } 103 | 104 | TEST(TestCarmiExtmapFind, CARMIExtMapFind) { 105 | for (int i = 0; i < kInitSize; i++) { 106 | auto it = carmi.find(initDataset[i].first); 107 | EXPECT_EQ(it.key(), initDataset[i].first); 108 | EXPECT_EQ(it.data(), initDataset[i].second); 109 | } 110 | } 111 | 112 | TEST(TestCarmiExtmapLowerbound, CARMIExtMapLowerbound) { 113 | for (int i = 0; i < kInitSize; i++) { 114 | auto it = carmi.lower_bound(initDataset[i].first); 115 | EXPECT_EQ(it.key(), initDataset[i].first); 116 | } 117 | for (int i = 0; i < kInsertSize; i++) { 118 | if (insertDataset[i].first < initDataset[kInitSize - 1].first) { 119 | auto it = carmi.lower_bound(insertDataset[i].first); 120 | EXPECT_GE(it.key(), insertDataset[i].first); 121 | } 122 | } 123 | } 124 | 125 | TEST(TestCarmiExtmapUpperbound, CARMIExtMapUpperbound) { 126 | for (int i = 0; i < kInitSize - 1; i++) { 127 | auto it = carmi.upper_bound(initDataset[i].first); 128 | EXPECT_GT(it.key(), initDataset[i].first); 129 | } 130 | } 131 | 132 | TEST(TestCarmiExtmapEqualRange, CARMIExtMapEqualRange) { 133 | for (int i = 0; i < kInitSize; i++) { 134 | auto res = carmi.equal_range(initDataset[i].first); 135 | for (auto it = res.first; it != res.second; it++) { 136 | EXPECT_EQ(it.key(), initDataset[i].first); 137 | } 138 | } 139 | } 140 | 141 | TEST(TestCarmiExtmapCount, CARMIExtMapCount) { 142 | for (int i = 0; i < kInitSize; i++) { 143 | auto res = carmi.count(initDataset[i].first); 144 | auto vector_res = 145 | std::count(initDataset.begin(), initDataset.end(), initDataset[i]); 146 | EXPECT_EQ(res, vector_res); 147 | } 148 | } 149 | 150 | 
TEST(TestCarmiExtmapInsert, CARMIExtMapInsert) { 151 | int cnt = 2 * kInitSize; 152 | for (int i = 0; i < kInsertSize; i++, cnt += 2) { 153 | auto it = carmi.insert(insertDataset[i].first); 154 | *(externalDataset + cnt) = insertDataset[i].first; 155 | *(externalDataset + cnt + 1) = insertDataset[i].second; 156 | EXPECT_TRUE(it.second); 157 | EXPECT_EQ(it.first.key(), insertDataset[i].first); 158 | EXPECT_EQ(it.first.data(), insertDataset[i].second); 159 | for (int j = 0; j < i; j++) { 160 | auto res = carmi.find(insertDataset[j].first); 161 | EXPECT_EQ(res.key(), insertDataset[j].first); 162 | } 163 | for (int j = 0; j < kInitSize; j++) { 164 | auto res = carmi.find(initDataset[j].first); 165 | EXPECT_EQ(res.key(), initDataset[j].first); 166 | EXPECT_EQ(res.data(), initDataset[j].second); 167 | } 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /src/include/nodes/innerNode/bs_model.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file bs_model.h 3 | * @author Jiaoyi 4 | * @brief binary search inner node 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef NODES_INNERNODE_BS_MODEL_H_ 12 | #define NODES_INNERNODE_BS_MODEL_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | #include "../../construct/structures.h" 21 | 22 | /** 23 | * @brief binary search inner node 24 | * 25 | * This class is the binary search inner node. Due to the size limit of 64 26 | * bytes, we can only store the kMaxKeyNum key values. Thus, this type is not 27 | * suitable for nodes with a large number of child nodes. However, the bs node 28 | * can divide the dataset evenly, thus dealing with the uneven dataset. 
29 | * 30 | * @tparam KeyType the type of the keyword 31 | * @tparam ValueType the type of the value 32 | */ 33 | template 34 | class BSModel { 35 | public: 36 | // *** Constructed Types and Constructor 37 | 38 | /** 39 | * @brief the pair of data points 40 | */ 41 | typedef std::pair DataType; 42 | 43 | /** 44 | * @brief the vector of data points, which is the type of dataset 45 | */ 46 | typedef std::vector DataVectorType; 47 | 48 | /** 49 | * @brief Construct a new BS Model object and use c to set its child number 50 | * 51 | * This model is a binary search model, which performs a binary search between 52 | * the index vector to find the index of the given key value, and the size of 53 | * the index must be less than 14 due to the limit of 64 bytes. 54 | * 55 | * @param[in] c the number of its child nodes 56 | */ 57 | explicit BSModel(int c) { 58 | childLeft = 0; 59 | flagNumber = 60 | (BS_INNER_NODE << 24) + std::max(2, std::min(c, kMaxKeyNum + 1)); 61 | for (int i = 0; i < kMaxKeyNum; i++) { 62 | keys[i] = 0; 63 | } 64 | } 65 | 66 | public: 67 | // *** Basic Functions of BS Inner Node Objects 68 | 69 | /** 70 | * @brief train the binary search model 71 | * 72 | * The training data points are stored in dataset[left, left + size]. 73 | * 74 | * @param[in] left the starting index of data points 75 | * @param[in] size the size of data points 76 | * @param[in] dataset used to train the model 77 | */ 78 | void Train(int left, int size, const DataVectorType &dataset); 79 | 80 | /** 81 | * @brief predict the next node which manages the data point corresponding to 82 | * the given key value 83 | * 84 | * @param[in] key the given key value 85 | * @return int: the predicted index of next node 86 | */ 87 | int Predict(KeyType key) const; 88 | 89 | public: 90 | // *** Static Constant Options and Values of BS Inner Node Objects 91 | 92 | /** 93 | * @brief The time cost of the bs inner node. 
94 | */ 95 | static constexpr double kTimeCost = carmi_params::kBSInnerTime; 96 | 97 | /** 98 | * @brief The maximum number of stored keys. 99 | */ 100 | static constexpr int kMaxKeyNum = 56 / sizeof(KeyType); 101 | 102 | public: 103 | //*** Public Data Members of BS Inner Node Objects 104 | 105 | /** 106 | * @brief A combined integer, composed of the flag of bs inner node 107 | * (BS_INNER_NODE, 1 byte) and the number of its child nodes (3 bytes). (This 108 | * member is 4 bytes) 109 | */ 110 | int flagNumber; 111 | 112 | /** 113 | * @brief The index of its first child node in the node array. All the child 114 | * nodes are stored in node[childLeft, childLeft + size]. Through this member 115 | * and the right three bytes of flagNumber, all the child nodes can be 116 | * accessed. (4 bytes) 117 | */ 118 | int childLeft; 119 | 120 | /** 121 | * @brief store at most kMaxKeyNum key values 122 | * This bs model divides the key range into kMaxKeyNum + 1 intervals. To 123 | * determine which branch to go through, perform a binary search among the 124 | * kMaxKeyNum key values to locate the corresponding key value interval 125 | * covering the input key. 
(56 bytes) 126 | */ 127 | KeyType keys[kMaxKeyNum]; 128 | }; 129 | 130 | template 131 | inline void BSModel::Train(int left, int size, 132 | const DataVectorType &dataset) { 133 | if (size == 0) return; 134 | if (left < 0 || size < 0 || left + size > dataset.size()) { 135 | throw std::out_of_range( 136 | "BSModel::Train: the range of training dataset is invalid."); 137 | } 138 | 139 | int childNumber = flagNumber & 0x00FFFFFF; 140 | // calculate the value of the segment 141 | float value = static_cast(size) / childNumber; 142 | int cnt = 1; 143 | int start = left + value; 144 | int end = left + size; 145 | // store the minimum value of each segment 146 | for (int i = start; i < end; i += value) { 147 | if (cnt >= childNumber) { 148 | break; 149 | } 150 | keys[cnt - 1] = dataset[i].first; 151 | cnt++; 152 | } 153 | } 154 | 155 | template 156 | inline int BSModel::Predict(KeyType key) const { 157 | int start_idx = 0; 158 | // get the maximum index 159 | int end_idx = (flagNumber & 0x00FFFFFF) - 2; 160 | if (key > keys[end_idx]) { 161 | return childLeft + end_idx + 1; 162 | } 163 | int mid; 164 | // perform binary search between the index vector 165 | while (start_idx < end_idx) { 166 | mid = (start_idx + end_idx) >> 1; 167 | if (keys[mid] < key) 168 | start_idx = mid + 1; 169 | else 170 | end_idx = mid; 171 | } 172 | return start_idx + childLeft; 173 | } 174 | 175 | #endif // NODES_INNERNODE_BS_MODEL_H_ 176 | -------------------------------------------------------------------------------- /src/include/nodes/innerNode/lr_model.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file lr_model.h 3 | * @author Jiaoyi 4 | * @brief linear regression inner node 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef NODES_INNERNODE_LR_MODEL_H_ 12 | #define NODES_INNERNODE_LR_MODEL_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 
#include 21 | 22 | #include "../../construct/structures.h" 23 | 24 | /** 25 | * @brief linear regression inner node 26 | * 27 | * This class is the LR inner node, which uses linear regression to train the 28 | * model and predict the index of the next node. The CPU time cost of this node 29 | * is less than the other nodes. 30 | * 31 | * @tparam KeyType the type of the keyword 32 | * @tparam ValueType the type of the value 33 | */ 34 | template 35 | class LRModel { 36 | public: 37 | // *** Constructed Types and Constructor 38 | 39 | /** 40 | * @brief the pair of data points 41 | */ 42 | typedef std::pair DataType; 43 | 44 | /** 45 | * @brief the vector of data points, which is the type of dataset 46 | */ 47 | typedef std::vector DataVectorType; 48 | 49 | /** 50 | * @brief Construct a new LRModel object and use c to set its child number 51 | * 52 | * @param[in] c the number of its child nodes 53 | */ 54 | explicit LRModel(int c) { 55 | childLeft = 0; 56 | slope = 0; 57 | intercept = 0; 58 | minValue = 0; 59 | flagNumber = (LR_INNER_NODE << 24) + std::max(std::min(c, 0x00FFFFFF), 2); 60 | } 61 | 62 | public: 63 | // *** Basic Functions of LR Inner Node Objects 64 | 65 | /** 66 | * @brief train the linear regression model 67 | * 68 | * The training data points are stored in dataset[left, left + size]. 69 | * 70 | * @param[in] left the starting index of data points 71 | * @param[in] size the size of data points 72 | * @param[in] dataset used to train the model 73 | */ 74 | void Train(int left, int size, const DataVectorType &dataset); 75 | 76 | /** 77 | * @brief predict the next node which manages the data point corresponding to 78 | * the given key value 79 | * 80 | * @param[in] key the given key value 81 | * @return int: the predicted index of next node 82 | */ 83 | int Predict(KeyType key) const; 84 | 85 | public: 86 | // *** Static Constant Options and Values of LR Inner Node Objects 87 | 88 | /** 89 | * @brief The time cost of the lr inner node. 
90 | */ 91 | static constexpr double kTimeCost = carmi_params::kLRInnerTime; 92 | 93 | /** 94 | * @brief The bytes of placeholder. 95 | */ 96 | static constexpr int kPlaceHolderLen = 48 - sizeof(KeyType); 97 | 98 | public: 99 | //*** Public Data Members of LR Inner Node Objects 100 | 101 | /** 102 | * @brief A combined integer, composed of the flag of lr inner node 103 | * (LR_INNER_NODE, 1 byte) and the number of its child nodes (3 bytes). (This 104 | * member is 4 bytes) 105 | */ 106 | int flagNumber; 107 | 108 | /** 109 | * @brief The index of its first child node in the node array. All the child 110 | * nodes are stored in node[childLeft, childLeft + size]. Through this member 111 | * and the right three bytes of flagNumber, all the child nodes can be 112 | * accessed. (4 bytes) 113 | */ 114 | int childLeft; 115 | 116 | /** 117 | * @brief The slope parameter of the linear regression model. (4 bytes) 118 | */ 119 | float slope; 120 | 121 | /** 122 | * @brief The intercept parameter of the linear regression model. (4 bytes) 123 | */ 124 | float intercept; 125 | 126 | /** 127 | * @brief The minimum value. 128 | */ 129 | KeyType minValue; 130 | 131 | /** 132 | * @brief Placeholder, used to make sure that the size of this node is 64 133 | * bytes. 
(kPlaceHolderLen bytes) 134 | */ 135 | char Placeholder[kPlaceHolderLen]; 136 | }; 137 | 138 | template 139 | inline void LRModel::Train(int left, int size, 140 | const DataVectorType &dataset) { 141 | // Case 1: the dataset is empty, return directly 142 | if (size == 0) return; 143 | if (left < 0 || size < 0 || left + size > dataset.size()) { 144 | throw std::out_of_range( 145 | "LRModel::Train: the range of training dataset is invalid."); 146 | } 147 | 148 | // Case 2: use the dataset to train the model 149 | // extract data points from dataset[left, left + size] and use their processed 150 | // relative index as y to train 151 | int childNumber = flagNumber & 0x00FFFFFF; 152 | minValue = dataset[left].first; 153 | std::vector> currdata(size); 154 | for (int i = 0, j = left; i < size; i++, j++) { 155 | currdata[i].first = dataset[j].first - minValue; 156 | currdata[i].second = i * 1.0 / size * childNumber; 157 | } 158 | 159 | // train the lr model 160 | double t1 = 0, t2 = 0, t3 = 0, t4 = 0; 161 | for (int i = 0; i < size; i++) { 162 | t1 += static_cast(currdata[i].first) * 163 | static_cast(currdata[i].first); 164 | t2 += static_cast(currdata[i].first); 165 | t3 += static_cast(currdata[i].first) * currdata[i].second; 166 | t4 += currdata[i].second; 167 | } 168 | if (t1 * size - t2 * t2) { 169 | slope = (t3 * size - t2 * t4) / (t1 * size - t2 * t2); 170 | intercept = (t1 * t4 - t2 * t3) / (t1 * size - t2 * t2); 171 | } else { 172 | slope = 0; 173 | intercept = 0; 174 | } 175 | } 176 | 177 | template 178 | inline int LRModel::Predict(KeyType key) const { 179 | // use the lr model to predict the index of the next node 180 | int p = slope * static_cast(key - minValue) + intercept; 181 | // get its child number 182 | int bound = flagNumber & 0x00FFFFFF; 183 | // check whether p exceeds the boundaries 184 | if (p < 0) 185 | p = 0; 186 | else if (p >= bound) 187 | p = bound - 1; 188 | return p + childLeft; 189 | } 190 | #endif // NODES_INNERNODE_LR_MODEL_H_ 191 | 
-------------------------------------------------------------------------------- /src/include/construct/construct_root.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file construct_root.h 3 | * @author Jiaoyi 4 | * @brief functions for constructing the root 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_CONSTRUCT_ROOT_H_ 12 | #define CONSTRUCT_CONSTRUCT_ROOT_H_ 13 | #include 14 | #include 15 | 16 | #include "../carmi.h" 17 | #include "../nodes/rootNode/trainModel/linear_regression.h" 18 | #include "./dp.h" 19 | #include "./store_node.h" 20 | #include "./structures.h" 21 | 22 | template 24 | template 25 | void CARMI::UpdateRootOptSetting( 26 | int c, double *optimalCost, RootStruct *rootStruct) { 27 | // calculate the basic space cost of the c child nodes of the root node 28 | double space_cost = kBaseNodeSpace * static_cast(c); 29 | // calculate the time cost of the root node 30 | double time_cost = RootNodeType::kTimeCost; 31 | 32 | // train this type of the root node 33 | RootNodeType tmpRoot(c, initDataset); 34 | IndexPair range{0, static_cast(initDataset.size())}; 35 | IndexPair insertRange{0, static_cast(insertQuery.size())}; 36 | // initialize the variables that store the range of each sub-dataset 37 | std::vector perSize(c, emptyRange); 38 | std::vector perInsertSize(c, emptyRange); 39 | // split initDataset into c sub-datasets 40 | NodePartition(tmpRoot.model, range, 41 | initDataset, &perSize); 42 | // split insertDataset into c sub-datasets 43 | NodePartition(tmpRoot.model, insertRange, 44 | insertQuery, &perInsertSize); 45 | 46 | int maxLeafCapacity = carmi_params::kMaxLeafNodeSizeExternal; 47 | if (!isPrimary) { 48 | maxLeafCapacity = 49 | CFArrayType::kMaxLeafCapacity; 50 | } 51 | for (int i = 0; i < c; i++) { 52 | if (perSize[i].size == static_cast(initDataset.size())) { 53 | return; 54 | } 55 | int totalDataNum = 
perSize[i].size + perInsertSize[i].size; 56 | // if leaf nodes are cf array leaf nodes, add the space cost of data 57 | // blocks to the total space cost 58 | if (!isPrimary) { 59 | int tmpBlockNum = 60 | CFArrayType::CalNeededBlockNum( 61 | totalDataNum); 62 | space_cost += 63 | tmpBlockNum * carmi_params::kMaxLeafNodeSize / 1024.0 / 1024.0; 64 | } 65 | // if the total number of data points exceeds the maximum capacity of the 66 | // leaf node, the current node needs at least kMinChildNumber inner nodes to 67 | // manage the data points together 68 | if (totalDataNum > maxLeafCapacity) { 69 | space_cost += kBaseNodeSpace * kMinChildNumber; 70 | time_cost += carmi_params::kMemoryAccessTime * 71 | static_cast(perSize[i].size) / 72 | static_cast(initDataset.size()); 73 | } 74 | } 75 | 76 | // calculate the entropy of the root node 77 | double entropy = CalculateEntropy(perSize); 78 | double cost = 79 | (time_cost + lambda * static_cast(space_cost)) / entropy; 80 | 81 | // if the current cost is smaller than the optimal cost, update the optimal 82 | // cost and root setting 83 | if (cost <= *optimalCost) { 84 | *optimalCost = cost; 85 | rootStruct->rootChildNum = c; 86 | rootStruct->rootType = tmpRoot.flagNumber; 87 | } 88 | } 89 | 90 | template 92 | RootStruct CARMI::ChooseRoot() { 93 | double OptimalValue = DBL_MAX; 94 | RootStruct rootStruct(PLR_ROOT_NODE, kMinChildNumber); 95 | int minNum = 96 | std::max(kMinChildNumber, static_cast(initDataset.size() / 1024)); 97 | int maxNum = 98 | std::max(kMinChildNumber, static_cast(initDataset.size() / 2)); 99 | 100 | // Calculate the cost of different settings and choose the optimal setting 101 | for (int c = minNum; c <= maxNum; c *= 1.3) { 102 | UpdateRootOptSetting>( 103 | c * 1.001, &OptimalValue, &rootStruct); 104 | } 105 | // return the optimal root setting 106 | return rootStruct; 107 | } 108 | 109 | template 111 | SubDataset CARMI::StoreRoot( 112 | const RootStruct &rootStruct) { 113 | SubDataset 
subDataset(rootStruct.rootChildNum); 114 | // allocate a block of empty memory for these child nodes 115 | node.AllocateNodeMemory(rootStruct.rootChildNum); 116 | DataRange range({0, static_cast(initDataset.size())}, 117 | {0, static_cast(findQuery.size())}, 118 | {0, static_cast(insertQuery.size())}); 119 | switch (rootStruct.rootType) { 120 | case PLR_ROOT_NODE: { 121 | // construct the root node and train the model 122 | root = PLRType(rootStruct.rootChildNum, 123 | initDataset); 124 | // split the dataset 125 | NodePartition::ModelType>( 126 | root.model, range.initRange, initDataset, &(subDataset.subInit)); 127 | subDataset.subFind = subDataset.subInit; 128 | NodePartition::ModelType>( 129 | root.model, range.insertRange, insertQuery, &(subDataset.subInsert)); 130 | break; 131 | } 132 | } 133 | // roughly calculate the number of needed data blocks 134 | int blockNum = 0; 135 | for (int i = 0; i < rootStruct.rootChildNum; i++) { 136 | if (subDataset.subInit[i].size + subDataset.subInsert[i].size < 137 | CFArrayType::kMaxLeafCapacity) 138 | blockNum += 139 | CFArrayType::CalNeededBlockNum( 140 | subDataset.subInit[i].size + subDataset.subInsert[i].size); 141 | } 142 | 143 | // update the block number of the prefetch prediction model 144 | root.fetch_model.SetBlockNumber(blockNum); 145 | // update the size of the data array 146 | data.dataArray.resize(blockNum, LeafSlots()); 147 | return subDataset; 148 | } 149 | #endif // CONSTRUCT_CONSTRUCT_ROOT_H_ 150 | -------------------------------------------------------------------------------- /src/experiment/core.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file core.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | 13 | #include "../include/construct/construction.h" 14 | #include "../include/func/calculate_space.h" 15 | #include 
"../include/func/get_node_info.h" 16 | #include "./experiment_params.h" 17 | #include "./functions.h" 18 | 19 | extern std::ofstream outRes; 20 | 21 | /** 22 | * @brief the function of using CARMI 23 | * 24 | * @param[in] isZipfian whether to use zipfian access during the test 25 | * @param[in] initRatio the workload type 26 | * @param[in] rate the weight of space 27 | * @param[in] length the length of range scan 28 | * @param[in] initDataset 29 | * @param[in] testInsertQuery 30 | */ 31 | void CoreCARMI(bool isZipfian, double initRatio, double rate, 32 | const std::vector &length, const DataVecType &initDataset, 33 | const DataVecType &insertDataset, 34 | const DataVecType &testInsertQuery) { 35 | #ifdef DEBUG 36 | std::cout << std::endl; 37 | std::cout << "-------------------------------" << std::endl; 38 | std::cout << "kRate: " << rate << std::endl; 39 | std::cout << "Start construction!" << std::endl; 40 | time_t timep; 41 | time(&timep); 42 | char tmpTime[64]; 43 | strftime(tmpTime, sizeof(tmpTime), "%Y-%m-%d %H:%M:%S", localtime(&timep)); 44 | std::cout << "\nTEST time: " << tmpTime << std::endl; 45 | #endif 46 | 47 | typedef CARMIMap CarmiType; 48 | CarmiType carmi(initDataset.begin(), initDataset.end(), insertDataset.begin(), 49 | insertDataset.end(), rate); 50 | 51 | #ifdef DEBUG 52 | time(&timep); 53 | char tmpTime1[64]; 54 | strftime(tmpTime1, sizeof(tmpTime1), "%Y-%m-%d %H:%M:%S", localtime(&timep)); 55 | std::cout << "finish time: " << tmpTime1 << std::endl; 56 | 57 | std::cout << "\nprint the space:" << std::endl; 58 | auto space = carmi.CalculateSpace() / 1024.0 / 1024.0; 59 | outRes << space << ","; 60 | std::cout << space << " MB\n"; 61 | 62 | #endif 63 | 64 | if (initRatio == kWriteHeavy) 65 | WorkloadA(isZipfian, initDataset, testInsertQuery, 66 | &carmi); // write-heavy 67 | else if (initRatio == kReadHeavy) 68 | WorkloadB(isZipfian, initDataset, testInsertQuery, 69 | &carmi); // read-heavy 70 | else if (initRatio == kReadOnly) 71 | 
WorkloadC(isZipfian, initDataset, 72 | &carmi); // read-only 73 | else if (initRatio == kWritePartial) 74 | WorkloadD(isZipfian, initDataset, testInsertQuery, 75 | &carmi); // write-partial 76 | else if (initRatio == kRangeScan) 77 | WorkloadE(isZipfian, initDataset, testInsertQuery, 78 | length, 79 | &carmi); // range scan 80 | } 81 | 82 | template 83 | class ExternalDataType { 84 | public: 85 | typedef ValueType ValueType_; 86 | ExternalDataType() { 87 | k = 0; 88 | v = 0; 89 | } 90 | explicit ExternalDataType(KeyType key, ValueType_ value) { 91 | k = key; 92 | v = value; 93 | } 94 | const KeyType &key() const { return k; } 95 | const ValueType_ &data() const { return v; } 96 | 97 | bool operator<(const ExternalDataType &a) const { 98 | if (k == a.k) { 99 | return v < a.v; 100 | } 101 | return k < a.k; 102 | } 103 | 104 | KeyType k; 105 | ValueType_ v; 106 | }; 107 | 108 | /** 109 | * @brief the function of using external CARMI 110 | * 111 | * @param[in] isZipfian whether to use zipfian access during the test 112 | * @param[in] initRatio the workload type 113 | * @param[in] rate the weight of space 114 | * @param[in] length the length of range scan 115 | * @param[in] initDataset 116 | * @param[in] testInsertQuery 117 | */ 118 | void CoreExternalCARMI(bool isZipfian, double initRatio, double rate, 119 | const std::vector &length, 120 | const DataVecType &initDataset, 121 | const DataVecType &testInsertQuery) { 122 | DataVecType init = initDataset; 123 | 124 | #ifdef DEBUG 125 | std::cout << std::endl; 126 | std::cout << "-------------------------------" << std::endl; 127 | std::cout << "Start construction!" 
<< std::endl; 128 | time_t timep; 129 | time(&timep); 130 | char tmpTime[64]; 131 | strftime(tmpTime, sizeof(tmpTime), "%Y-%m-%d %H:%M:%S", localtime(&timep)); 132 | std::cout << "\nTEST time: " << tmpTime << std::endl; 133 | #endif 134 | 135 | KeyType *externalDataset; 136 | const int record_size = sizeof(KeyType) + sizeof(ValueType); 137 | typedef CARMIExternalMap> 138 | CarmiType; 139 | int extLen = initDataset.size() * 2 + kTestSize * 2; 140 | externalDataset = new KeyType[extLen]; 141 | for (int i = 0, j = 0; i < static_cast(initDataset.size()); i++) { 142 | *(externalDataset + j) = initDataset[i].first; 143 | *(externalDataset + j + 1) = initDataset[i].second; 144 | j += 2; // due to 145 | } 146 | std::vector futureInsertKey(testInsertQuery.size(), 0); 147 | for (int i = 0; i < static_cast(testInsertQuery.size()); i++) { 148 | futureInsertKey[i] = testInsertQuery[i].first; 149 | } 150 | // initDataset -> only includes the findQuery 151 | CarmiType carmi(externalDataset, futureInsertKey, initDataset.size(), 152 | record_size, rate); 153 | 154 | #ifdef DEBUG 155 | time(&timep); 156 | char tmpTime1[64]; 157 | strftime(tmpTime1, sizeof(tmpTime1), "%Y-%m-%d %H:%M:%S", localtime(&timep)); 158 | std::cout << "finish time: " << tmpTime1 << std::endl; 159 | #endif 160 | 161 | if (initRatio == kWriteHeavy) 162 | WorkloadA>( 163 | isZipfian, init, testInsertQuery, 164 | &carmi); // write-heavy 165 | else if (initRatio == kReadHeavy) 166 | WorkloadB>( 167 | isZipfian, init, testInsertQuery, 168 | &carmi); // read-heavy 169 | else if (initRatio == kReadOnly) 170 | WorkloadC>( 171 | isZipfian, init, 172 | &carmi); // read-only 173 | else if (initRatio == kRangeScan) 174 | WorkloadE>( 175 | isZipfian, init, testInsertQuery, length, 176 | &carmi); // range scan 177 | } 178 | -------------------------------------------------------------------------------- /src/experiment/main_experiment.cpp: -------------------------------------------------------------------------------- 1 | 
/** 2 | * @file main_experiment.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #include 13 | #include 14 | 15 | #include "./experiment_params.h" 16 | #include "./functions.h" 17 | extern std::ofstream outRes; 18 | 19 | /** 20 | * @brief test all datasets and workloads 21 | */ 22 | void mainExperiment() { 23 | // for range scan 24 | std::vector length; 25 | 26 | // read-only 27 | mainSynthetic(kReadOnly, length); 28 | mainYCSB(kReadOnly, length); 29 | mainMap(kReadOnly, length); 30 | 31 | // write-heavy 32 | mainSynthetic(kWriteHeavy, length); 33 | mainYCSB(kWriteHeavy, length); 34 | mainMap(kWriteHeavy, length); 35 | 36 | // read-heavy 37 | mainSynthetic(kReadHeavy, length); 38 | mainYCSB(kReadHeavy, length); 39 | mainMap(kReadHeavy, length); 40 | 41 | // write-partial 42 | mainSynthetic(kWritePartial, length); 43 | mainYCSB(kWritePartial, length); 44 | mainMap(kWritePartial, length); 45 | 46 | // range scan 47 | std::default_random_engine e(time(0)); 48 | std::uniform_int_distribution dis(0, 100); 49 | for (int i = 0; i < kDatasetSize; i++) { 50 | length.push_back(std::min(dis(e), kDatasetSize) - i); 51 | } 52 | mainSynthetic(kRangeScan, length); 53 | mainYCSB(kRangeScan, length); 54 | mainMap(kRangeScan, length); 55 | } 56 | 57 | /** 58 | * @brief test the synthetic datasets 59 | * 60 | * @param[in] initRatio the workload type 61 | * @param[in] length the length of range scan 62 | */ 63 | void mainSynthetic(double initRatio, const std::vector &length) { 64 | std::cout << "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" 65 | "&&&&&&&" 66 | << std::endl; 67 | std::cout << "initRatio is: " << initRatio << std::endl; 68 | outRes << "initRatio," << initRatio << std::endl; 69 | double init = initRatio; 70 | if (init == kRangeScan) { 71 | init = kReadHeavy; 72 | } 73 | LognormalDataset logData(init); 74 | UniformDataset uniData(init); 75 | NormalDataset 
norData(init); 76 | ExponentialDataset expData(init); 77 | 78 | DataVecType initData; 79 | DataVecType insertData; 80 | DataVecType testInsert; 81 | 82 | for (int r = 0; r < static_cast(rate.size()); r++) { 83 | double kRate = rate[r]; 84 | outRes << "kRate:" << kRate << std::endl; 85 | std::cout << "+++++++++++ uniform dataset ++++++++++++++++++++++++++" 86 | << std::endl; 87 | uniData.GenerateDataset(&initData, &insertData, &testInsert); 88 | CoreCARMI(false, initRatio, kRate, length, initData, insertData, 89 | testInsert); 90 | CoreCARMI(true, initRatio, kRate, length, initData, insertData, testInsert); 91 | 92 | // std::cout << "+++++++++++ exponential dataset +++++++++++++++++++" 93 | // << std::endl; 94 | // expData.GenerateDataset(&initData, &insertData, &testInsert); 95 | // CoreCARMI(false, initRatio, kRate, length, initData, insertData, 96 | // testInsert); 97 | // // CoreCARMI(true, initRatio, kRate, length, initData, insertData, 98 | // // testInsert); 99 | 100 | std::cout << "+++++++++++ normal dataset ++++++++++++++++++++++++++" 101 | << std::endl; 102 | norData.GenerateDataset(&initData, &insertData, &testInsert); 103 | CoreCARMI(false, initRatio, kRate, length, initData, insertData, 104 | testInsert); 105 | CoreCARMI(true, initRatio, kRate, length, initData, insertData, testInsert); 106 | 107 | std::cout << "+++++++++++ lognormal dataset ++++++++++++++++++++++++++" 108 | << std::endl; 109 | logData.GenerateDataset(&initData, &insertData, &testInsert); 110 | CoreCARMI(false, initRatio, kRate, length, initData, insertData, 111 | testInsert); 112 | CoreCARMI(true, initRatio, kRate, length, initData, insertData, testInsert); 113 | 114 | outRes << std::endl; 115 | } 116 | } 117 | 118 | /** 119 | * @brief test the map datasets 120 | * 121 | * @param[in] initRatio the workload type 122 | * @param[in] length the length of range scan 123 | */ 124 | void mainMap(double initRatio, const std::vector &length) { 125 | std::cout << 
"&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" 126 | "&&&&&&&" 127 | << std::endl; 128 | std::cout << "initRatio is: " << initRatio << std::endl; 129 | outRes << "initRatio," << initRatio << std::endl; 130 | std::cout << "construct map" << std::endl; 131 | outRes << "construct map" << std::endl; 132 | double init = initRatio; 133 | if (init == kRangeScan) { 134 | init = kReadHeavy; 135 | } 136 | OsmcDataset osmcData(init); 137 | 138 | DataVecType initData; 139 | DataVecType insertData; 140 | DataVecType testInsert; 141 | 142 | for (int r = 0; r < static_cast(rate.size()); r++) { 143 | double kRate = rate[r]; 144 | outRes << "kRate:" << kRate << std::endl; 145 | 146 | std::cout << "+++++++++++ osmc dataset ++++++++++++++++++++++++++" 147 | << std::endl; 148 | osmcData.GenerateDataset(&initData, &insertData, &testInsert); 149 | CoreCARMI(true, initRatio, kRate, length, initData, insertData, testInsert); 150 | CoreCARMI(false, initRatio, kRate, length, initData, insertData, 151 | testInsert); 152 | 153 | outRes << std::endl; 154 | } 155 | } 156 | 157 | /** 158 | * @brief test the YCSB datasets 159 | * 160 | * @param[in] initRatio the workload type 161 | * @param[in] length the length of range scan 162 | */ 163 | void mainYCSB(double initRatio, const std::vector &length) { 164 | kPrimaryIndex = true; 165 | std::cout << "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" 166 | "&&&&&&&" 167 | << std::endl; 168 | std::cout << "initRatio is: " << initRatio << std::endl; 169 | outRes << "initRatio," << initRatio << std::endl; 170 | std::cout << "construct ycsb" << std::endl; 171 | outRes << "construct ycsb" << std::endl; 172 | double init = initRatio; 173 | if (init == kRangeScan) { 174 | init = kReadHeavy; 175 | } 176 | YCSBDataset ycsbData(init); 177 | 178 | DataVecType initData; 179 | DataVecType insertData; 180 | DataVecType testInsert; 181 | 182 | for (int r = 0; r < static_cast(rate.size()); r++) { 183 | double kRate = rate[r]; 184 | 
outRes << "kRate:" << kRate << std::endl; 185 | std::cout << "+++++++++++ ycsb dataset ++++++++++++++++++++++++++" 186 | << std::endl; 187 | ycsbData.GenerateDataset(&initData, &insertData, &testInsert); 188 | CoreExternalCARMI(true, initRatio, kRate, length, initData, testInsert); 189 | 190 | outRes << std::endl; 191 | } 192 | kPrimaryIndex = false; 193 | } 194 | -------------------------------------------------------------------------------- /src/include/nodes/innerNode/candidate_plr.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file candidate_plr.h 3 | * @author Jiaoyi 4 | * @brief class for piecewise linear regression model 5 | * @version 3.0 6 | * @date 2021-03-16 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | #include "../../construct/structures.h" 18 | #include "../../params.h" 19 | 20 | #ifndef NODES_INNERNODE_CANDIDATE_PLR_H_ 21 | #define NODES_INNERNODE_CANDIDATE_PLR_H_ 22 | 23 | /** 24 | * @brief Designed for the piecewise linear regression model. This structure 25 | * records all the contents that need to be stored in the training process of 26 | * the piecewise linear function, which is the item in the dp table. 27 | */ 28 | template 29 | struct SegmentPoint { 30 | /** 31 | * @brief the current cost 32 | */ 33 | float cost = -DBL_MAX; 34 | 35 | /** 36 | * @brief the key values 37 | */ 38 | KeyType key[12] = {KeyType(), KeyType(), KeyType(), KeyType(), 39 | KeyType(), KeyType(), KeyType(), KeyType(), 40 | KeyType(), KeyType(), KeyType(), KeyType()}; 41 | 42 | /** 43 | * @brief the corresponding indexes 44 | */ 45 | int idx[12] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 46 | 47 | /** 48 | * @brief the number of blocks for the dp table in the prefetch prediction 49 | * model 50 | */ 51 | int blockNum[12] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; 52 | }; 53 | 54 | /** 55 | * @brief designed for piecewise linear regression model. 
56 | * 57 | * This class stores the cost between each candidate point, the parameters of 58 | * the line segment between two points, entropy, and so on in the process of 59 | * dynamic programming algorithm in P. LR model, which is used to assist and 60 | * accelerate the DP algorithm. 61 | * 62 | * @tparam DataVectorType the vector type of the dataset, each element is a 63 | * pair: {key, value} 64 | */ 65 | template 66 | class CandidateCost { 67 | public: 68 | /** 69 | * @brief Construct a new empty Candidate Cost object 70 | */ 71 | CandidateCost() {} 72 | 73 | /** 74 | * @brief store the slope and intercept of each segment 75 | * 76 | * @param[in] dataset the given dataset, each element is: {key value, y} 77 | * @param[in] index the indexes of candidates 78 | */ 79 | void StoreTheta(const DataVectorType &dataset, 80 | const std::vector &index) { 81 | // store the value of each segment for least squares, used to speed up the 82 | // training process of the linear regression 83 | std::vector xx(index.size(), 0); 84 | std::vector x(index.size(), 0); 85 | std::vector px(index.size(), 0); 86 | std::vector p(index.size(), 0); 87 | xx[0] = 0.0; 88 | x[0] = 0.0; 89 | px[0] = 0.0; 90 | p[0] = 0.0; 91 | for (int i = 1; i < static_cast(index.size()); i++) { 92 | for (int j = index[i - 1]; j < index[i]; j++) { 93 | xx[i] += static_cast(dataset[j].first) * 94 | static_cast(dataset[j].first); 95 | x[i] += static_cast(dataset[j].first); 96 | px[i] += static_cast(dataset[j].first) * 97 | static_cast(dataset[j].second); 98 | p[i] += static_cast(dataset[j].second); 99 | } 100 | xx[i] += xx[i - 1]; 101 | x[i] += x[i - 1]; 102 | px[i] += px[i - 1]; 103 | p[i] += p[i - 1]; 104 | } 105 | xx[index.size() - 1] += 106 | static_cast(dataset[index[index.size() - 1]].first) * 107 | static_cast(dataset[index[index.size() - 1]].first); 108 | x[index.size() - 1] += 109 | static_cast(dataset[index[index.size() - 1]].first); 110 | px[index.size() - 1] += 111 | 
static_cast(dataset[index[index.size() - 1]].first) * 112 | static_cast(dataset[index[index.size() - 1]].second); 113 | p[index.size() - 1] += 114 | static_cast(dataset[index[index.size() - 1]].second); 115 | 116 | // store the parameters of each segment 117 | for (int i = 0; i < index.size() - 1; i++) { 118 | for (int j = i + 1; j < index.size(); j++) { 119 | int tmpSize = index[j] - index[i]; 120 | 121 | double theta1 = 0.0001, theta2 = 0.666; 122 | long double t1 = 0, t2 = 0, t3 = 0, t4 = 0; 123 | t1 = xx[j] - xx[i]; 124 | t2 = x[j] - x[i]; 125 | t3 = px[j] - px[i]; 126 | t4 = p[j] - p[i]; 127 | if (t1 * tmpSize - t2 * t2 == 0) { 128 | if (dataset[index[j]].first - dataset[index[i]].first == 0) { 129 | theta1 = 0; 130 | theta2 = dataset[index[j]].second; 131 | } else { 132 | theta1 = (static_cast(dataset[index[j]].second) - 133 | static_cast(dataset[index[i]].second)) / 134 | (static_cast(dataset[index[j]].first) - 135 | static_cast(dataset[index[i]].first)); 136 | theta2 = static_cast(dataset[index[j]].second) - 137 | theta1 * static_cast(dataset[index[j]].first); 138 | } 139 | } else { 140 | theta1 = (t3 * tmpSize - t2 * t4) / (t1 * tmpSize - t2 * t2); 141 | theta2 = (t1 * t4 - t2 * t3) / (t1 * tmpSize - t2 * t2); 142 | } 143 | if (theta1 <= 0) { 144 | theta1 = std::abs(theta1); 145 | } 146 | 147 | theta.insert({{index[i], index[j]}, {theta1, theta2}}); 148 | } 149 | } 150 | } 151 | 152 | /** 153 | * @brief calculate the entropy of each segment 154 | * 155 | * @param[in] leftIdx the left index of the sub-dataset 156 | * @param[in] rightIdx the right-index of the sub-dataset 157 | * @param[in] y1 158 | * @param[in] y2 159 | * @return double: entropy 160 | */ 161 | double Entropy(int leftIdx, int rightIdx, double y1, double y2) { 162 | auto tmp_theta = theta.find({leftIdx, rightIdx}); 163 | double a = tmp_theta->second.first; 164 | double entropy = -DBL_MAX; 165 | if (a > 0) { 166 | entropy = log2(a) * (y2 - y1); 167 | } 168 | return entropy; 169 | } 170 | 171 | 
public: 172 | //*** Private Data Members of CandidatePLR Objects 173 | /** 174 | * @brief params for the corresponding segment, each element is {{the index of 175 | * the left candidate points in the dataset, the index of the right candidate 176 | * points in the dataset}, {the slope, the intercept}} 177 | */ 178 | std::map, std::pair> theta; 179 | }; 180 | 181 | #endif // NODES_INNERNODE_CANDIDATE_PLR_H_ 182 | -------------------------------------------------------------------------------- /src/include/construct/structures.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file structures.h 3 | * @author Jiaoyi 4 | * @brief structures for CARMI 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_STRUCTURES_H_ 12 | #define CONSTRUCT_STRUCTURES_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #include "../params.h" 21 | 22 | /** 23 | * @brief Root node settings: the type of the root node and the number of child 24 | * nodes 25 | */ 26 | struct RootStruct { 27 | /** 28 | * @brief the type identifier of the root node 29 | */ 30 | int rootType; 31 | 32 | /** 33 | * @brief the number of its child nodes 34 | */ 35 | int rootChildNum; 36 | 37 | /** 38 | * @brief Construct a new Root Struct object and set the values 39 | * 40 | * @param[in] type the type of the root node 41 | * @param[in] c the number of child nodes 42 | */ 43 | RootStruct(int type, int c) { 44 | rootType = type; 45 | rootChildNum = c; 46 | } 47 | }; 48 | 49 | /** 50 | * @brief three parts of the cost: time cost, space cost and the total cost. 51 | * 52 | * The total cost = time cost + lambda * space cost. 53 | */ 54 | struct NodeCost { 55 | /** 56 | * @brief the time cost 57 | */ 58 | double time; 59 | 60 | /** 61 | * @brief the space cost 62 | */ 63 | double space; 64 | 65 | /** 66 | * @brief the total cost: time cost + lambda * space cost. 
67 | */ 68 | double cost; 69 | }; 70 | 71 | /** 72 | * @brief the index range of data points: [left, left + size) 73 | */ 74 | struct IndexPair { 75 | /** 76 | * @brief the left index of data points in the dataset 77 | */ 78 | int left; 79 | 80 | /** 81 | * @brief the size of data points 82 | */ 83 | int size; 84 | 85 | bool operator<(const IndexPair& a) const { 86 | if (left == a.left) 87 | return size < a.size; 88 | else 89 | return left < a.left; 90 | } 91 | }; 92 | 93 | /** 94 | * @brief the index ranges of sub-initDataset, sub-findQuery and 95 | * sub-insertQuery: {initDataset: {left, size}, findQuery: {left, size}, 96 | * insertQuery: {left, size}} 97 | */ 98 | class DataRange { 99 | public: 100 | /** 101 | * @brief the index range of initDataset: {the left index of the sub-dataset 102 | * in the initDataset, the size of the sub-dataset} 103 | */ 104 | IndexPair initRange; 105 | 106 | /** 107 | * @brief the index range of findQuery: {the left index of the sub-dataset 108 | * in the findQuery, the size of the sub-dataset} 109 | */ 110 | IndexPair findRange; 111 | 112 | /** 113 | * @brief the index range of insertQuery: {the left index of the sub-dataset 114 | * in the insertQuery, the size of the sub-dataset} 115 | */ 116 | IndexPair insertRange; 117 | 118 | /** 119 | * @brief Construct a new Data Range object 120 | * 121 | * @param[in] init the index range of sub-initDataset: {the left index of the 122 | * sub-dataset in the initDataset, the size of the sub-dataset} 123 | * @param[in] find the index range of sub-findQuery: {the left index of the 124 | * sub-dataset in the findQuery, the size of the sub-dataset} 125 | * @param[in] insert the index range of sub-insertQuery: {the left index of 126 | * the sub-dataset in the insertQuery, the size of the sub-dataset} 127 | */ 128 | DataRange(IndexPair init, IndexPair find, IndexPair insert) 129 | : initRange(init), findRange(find), insertRange(insert) {} 130 | }; 131 | 132 | /** 133 | * @brief the starting index 
and size of sub-dataset in each child node, each 134 | * element is: {the vector of the sub-initDataset, the vector of the 135 | * sub-findDataset, the vector of sub-insertDataset}. Each sub-dataset is 136 | * represented by: {left, size}, which means the range of it in the dataset is 137 | * [left, left + size). 138 | */ 139 | class SubDataset { 140 | public: 141 | /** 142 | * @brief the IndexPair vector of sub-initDataset, each element is: {the left 143 | * index of the sub-dataset in the initDataset, the size of the sub-dataset} 144 | */ 145 | std::vector subInit; 146 | 147 | /** 148 | * @brief the IndexPair vector of sub-findDataset, each element is: {the left 149 | * index of the sub-dataset in the findDataset, the size of the sub-dataset} 150 | */ 151 | std::vector subFind; 152 | 153 | /** 154 | * @brief the IndexPair vector of sub-insertDataset, each element is: {the 155 | * left index of the sub-dataset in the insertDataset, the size of the 156 | * sub-dataset} 157 | */ 158 | std::vector subInsert; 159 | 160 | /** 161 | * @brief Construct a new SubDataset object and the size of the vector is c 162 | * 163 | * @param[in] c the size of the vector 164 | */ 165 | explicit SubDataset(int c) 166 | : subInit(std::vector(c, {-1, 0})), 167 | subFind(std::vector(c, {-1, 0})), 168 | subInsert(std::vector(c, {-1, 0})) {} 169 | ~SubDataset() {} 170 | }; 171 | 172 | /** 173 | * @brief enumerate type of all node types 174 | */ 175 | enum NodeType { 176 | PLR_ROOT_NODE, 177 | 178 | LR_INNER_NODE, 179 | PLR_INNER_NODE, 180 | HIS_INNER_NODE, 181 | BS_INNER_NODE, 182 | 183 | ARRAY_LEAF_NODE, 184 | EXTERNAL_ARRAY_LEAF_NODE 185 | }; 186 | 187 | /** 188 | * @brief the structure of a data block 189 | * 190 | * This structure is designed for the CF array leaf nodes, so as to make better 191 | * use of the cache mechanism to speed up data access. The size of this class is 192 | * fixed as kMaxLeafNodeSize. 
193 | * 194 | * @tparam KeyType the type of the given key value 195 | * @tparam ValueType the type of the value 196 | */ 197 | template 198 | class LeafSlots { 199 | public: 200 | /** 201 | * @brief the structure of a data block which actually stores the data points, 202 | * its size is determined by the kMaxLeafNodeSize and the type of the data 203 | * point. Each element in slots is: {key value, value}. 204 | */ 205 | std::pair slots[carmi_params::kMaxLeafNodeSize / 206 | sizeof(std::pair)]; 207 | 208 | /** 209 | * @brief Construct a new Leaf Slots object and set the default value of each 210 | * element to the pair of {DBL_MAX, DBL_MAX} 211 | */ 212 | LeafSlots() { 213 | int len = 214 | carmi_params::kMaxLeafNodeSize / sizeof(std::pair); 215 | for (int i = 0; i < len; i++) { 216 | slots[i] = {DBL_MAX, DBL_MAX}; 217 | } 218 | } 219 | 220 | LeafSlots& operator=(const LeafSlots& currnode) { 221 | if (this != &currnode) { 222 | int len = carmi_params::kMaxLeafNodeSize / 223 | sizeof(std::pair); 224 | for (int i = 0; i < len; i++) { 225 | this->slots[i] = currnode.slots[i]; 226 | } 227 | } 228 | return *this; 229 | } 230 | }; 231 | #endif // CONSTRUCT_STRUCTURES_H_ 232 | -------------------------------------------------------------------------------- /src/include/construct/minor_function.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file minor_function.h 3 | * @author Jiaoyi 4 | * @brief the minor functions for constructing CARMI 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_MINOR_FUNCTION_H_ 12 | #define CONSTRUCT_MINOR_FUNCTION_H_ 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | #include "../carmi.h" 19 | 20 | template 22 | double CARMI::CalculateFrequencyWeight( 23 | const DataRange &dataRange) { 24 | float frequency = 0.0; 25 | // count the frequency of findQuery 26 | int findEnd = dataRange.findRange.left + 
dataRange.findRange.size; 27 | for (int i = dataRange.findRange.left; i < findEnd; i++) 28 | frequency += findQuery[i].second; 29 | // count the frequency of insertQuery 30 | frequency += dataRange.insertRange.size; 31 | // calculate the weighted frequency of this sub-dataset 32 | double frequency_weight = frequency / querySize; 33 | return frequency_weight; 34 | } 35 | 36 | template 38 | double CARMI::CalculateEntropy( 39 | const std::vector &perSize) const { 40 | // the sum of -size(i)*log(size(i)) 41 | double slogs = 0.0; 42 | // the total size of the dataset 43 | int n = 0; 44 | for (int i = 0; i < perSize.size(); i++) { 45 | n += perSize[i].size; 46 | if (perSize[i].size != 0) 47 | slogs += static_cast(perSize[i].size) * (-log2(perSize[i].size)); 48 | } 49 | if (n == 0) { 50 | return -DBL_MAX; 51 | } 52 | 53 | double entropy = slogs / n + log2(n); 54 | return entropy; 55 | } 56 | 57 | template 59 | std::vector CARMI::CalculateCFArrayCost(int size, 61 | int totalPrefetchedNum) { 62 | std::vector cost( 63 | CFArrayType::kMaxBlockNum, 0); 64 | for (int k = 0; 65 | k < CFArrayType::kMaxBlockNum; k++) { 66 | double space = kBaseNodeSpace; 67 | double time = carmi_params::kLeafBaseTime; 68 | if ((k + 1) * CFArrayType::kMaxBlockCapacity >= 70 | size) { 71 | // Case 1: these data points can be prefetched, then the space cost is the 72 | // space cost of allocated data blocks, and the time cost does not 73 | // increase 74 | space += (k + 1) * carmi_params::kMaxLeafNodeSize / 1024.0 / 1024.0; 75 | } else { 76 | // Case 2: these data points cannot be prefetched, then the space cost is 77 | // the space cost of actually needed data blocks and the time cost should 78 | // include the latency of a memory access 79 | int neededBlock = 80 | CFArrayType::CalNeededBlockNum( 81 | size); 82 | space += static_cast(neededBlock) * 83 | carmi_params::kMaxLeafNodeSize / 1024.0 / 1024.0; 84 | time += carmi_params::kMemoryAccessTime; 85 | } 86 | time *= static_cast(size) / 
totalPrefetchedNum; 87 | cost[k] = time + lambda * space; 88 | } 89 | return cost; 90 | } 91 | 92 | template 94 | template 95 | void CARMI::NodePartition( 96 | const InnerNodeType &currnode, const IndexPair &range, 97 | const DataVectorType &dataset, std::vector *subData) const { 98 | int end = range.left + range.size; 99 | for (int i = range.left; i < end; i++) { 100 | int p = currnode.Predict(dataset[i].first); 101 | if (p < 0 || p >= (*subData).size()) { 102 | throw std::out_of_range( 103 | "CARMI::NodePartition: the output of the model is out of range."); 104 | } 105 | 106 | // if this sub-dataset is newly divided, store its leaf index in the dataset 107 | if ((*subData)[p].left == -1) { 108 | (*subData)[p].left = i; 109 | } 110 | // count the size of this sub-dataset 111 | (*subData)[p].size++; 112 | } 113 | } 114 | 115 | template 117 | template 118 | void CARMI::NodePartition( 119 | const InnerNodeType &currnode, const IndexPair &range, 120 | const KeyVectorType &dataset, std::vector *subData) const { 121 | int end = range.left + range.size; 122 | for (int i = range.left; i < end; i++) { 123 | int p = currnode.Predict(dataset[i]); 124 | 125 | // if this sub-dataset is newly divided, store its leaf index in the dataset 126 | if ((*subData)[p].left == -1) { 127 | (*subData)[p].left = i; 128 | } 129 | // count the size of this sub-dataset 130 | (*subData)[p].size++; 131 | } 132 | } 133 | 134 | template 136 | template 137 | InnerNodeType CARMI::InnerDivideAll( 138 | const DataRange &range, int c, SubDataset *subDataset) { 139 | InnerNodeType currnode(c); 140 | int s = range.initRange.left; 141 | int e = range.initRange.size + s; 142 | DataVectorType tmpDataset(initDataset.begin() + s, initDataset.begin() + e); 143 | if (range.insertRange.size > 0) { 144 | s = range.insertRange.left; 145 | e = s + range.insertRange.size; 146 | for (int j = s; j < e; j++) { 147 | tmpDataset.push_back({insertQuery[j], static_cast(DBL_MAX)}); 148 | } 149 | 
std::sort(tmpDataset.begin(), tmpDataset.end()); 150 | } 151 | currnode.Train(0, tmpDataset.size(), tmpDataset); 152 | // split initDataset into c sub-datasets 153 | NodePartition(currnode, range.initRange, initDataset, 154 | &(subDataset->subInit)); 155 | // split findQuery into c sub-datasets 156 | subDataset->subFind = subDataset->subInit; 157 | // split insertQuery into c sub-datasets 158 | NodePartition(currnode, range.insertRange, insertQuery, 159 | &(subDataset->subInsert)); 160 | return currnode; 161 | } 162 | 163 | template 165 | void CARMI::UpdateLeaf() { 166 | if (isPrimary) return; 167 | node.nodeArray[scanLeaf[0]].cfArray.nextLeaf = scanLeaf[1]; 168 | int end = scanLeaf.size() - 1; 169 | node.nodeArray[scanLeaf[end]].cfArray.nextLeaf = -1; 170 | node.nodeArray[scanLeaf[end]].cfArray.previousLeaf = scanLeaf[end - 1]; 171 | for (int i = 1; i < end; i++) { 172 | node.nodeArray[scanLeaf[i]].cfArray.nextLeaf = scanLeaf[i + 1]; 173 | node.nodeArray[scanLeaf[i]].cfArray.previousLeaf = scanLeaf[i - 1]; 174 | } 175 | 176 | std::vector().swap(scanLeaf); 177 | } 178 | 179 | #endif // CONSTRUCT_MINOR_FUNCTION_H_ 180 | -------------------------------------------------------------------------------- /src/unitTest/leafNodeTest/cfarray_test.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file cfarray_test.cpp 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 0.1 6 | * @date 2021-11-03 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #include 13 | 14 | #include "../../include/nodes/leafNode/cfarray_type.h" 15 | #include "gtest/gtest.h" 16 | 17 | const int kTestMaxValue = 10000; 18 | typedef double KeyType; 19 | typedef double ValueType; 20 | typedef CFArrayType CFType; 21 | std::default_random_engine engine(time(0)); 22 | std::uniform_real_distribution dis(0, kTestMaxValue); 23 | 24 | TEST(TestCalNeededBlockNum, CalNeededBlockNum) { 25 | int CFSize = sizeof(CFType); 26 | ASSERT_EQ(64, CFSize); 27 | 
int maxBlockNum = CFType::kMaxBlockNum; 28 | for (int i = 0; i < CFType::kMaxLeafCapacity; i++) { 29 | int needBlockNum = CFType::CalNeededBlockNum(i); 30 | ASSERT_GE(needBlockNum, 0); 31 | ASSERT_LE(needBlockNum, maxBlockNum); 32 | } 33 | } 34 | 35 | TEST(TestSearchDataBlock, CheckSearchBlockRes) { 36 | for (int i = 0; i < CFType::kMaxBlockCapacity; i++) { 37 | std::vector> testTrainData(i); 38 | LeafSlots currblock; 39 | CFType tmpCFNode; 40 | for (int j = 0; j < i; j++) { 41 | KeyType tmpKey = dis(engine); 42 | testTrainData[j] = {tmpKey, tmpKey * 10}; 43 | } 44 | std::sort(testTrainData.begin(), testTrainData.end()); 45 | for (int j = 0; j < i; j++) { 46 | currblock.slots[j] = testTrainData[j]; 47 | } 48 | for (int j = 0; j < i; j++) { 49 | int res = tmpCFNode.SearchDataBlock(currblock, testTrainData[j].first, i); 50 | EXPECT_EQ(testTrainData[res].first, testTrainData[j].first) 51 | << "j:" << j << ",\tres:" << res << ",\ti:" << i << std::endl; 52 | } 53 | } 54 | } 55 | 56 | TEST(TestNormalStoreData, CheckStoreData) { 57 | int maxBlockNum = CFType::kMaxBlockNum; 58 | for (int i = 0; i < CFType::kMaxLeafCapacity; i++) { 59 | std::vector> testTrainData(i); 60 | DataArrayStructure data(maxBlockNum, i); 61 | CFType tmpCFNode; 62 | for (int t = 0; t < CFType::kMaxPerSizeNum; t++) { 63 | ASSERT_EQ(static_cast(tmpCFNode.perSize[t]), 0); 64 | } 65 | for (int j = 0; j < i; j++) { 66 | KeyType tmpKey = dis(engine); 67 | testTrainData[j] = {tmpKey, tmpKey * 10}; 68 | } 69 | std::sort(testTrainData.begin(), testTrainData.end()); 70 | int needBlockNum = CFType::CalNeededBlockNum(i); 71 | int tmpEnd = -1; 72 | auto isSuccess = 73 | tmpCFNode.StoreData(testTrainData, std::vector(i), false, 74 | needBlockNum, 0, &data, &tmpEnd); 75 | ASSERT_TRUE(isSuccess); 76 | for (int j = 0; j < CFType::kMaxBlockNum - 2; j++) { 77 | ASSERT_LE(tmpCFNode.slotkeys[j], tmpCFNode.slotkeys[j + 1]); 78 | } 79 | for (int t = 0; t < CFType::kMaxPerSizeNum; t++) { 80 | 
// Find must return the (block, slot) position of every stored key.
TEST(TestFind, CFArrayFindData) {
  int maxBlockNum = CFType::kMaxBlockNum;
  for (int i = 0; i < CFType::kMaxLeafCapacity; i++) {
    std::vector<std::pair<KeyType, ValueType>> testTrainData(i);
    DataArrayStructure<KeyType, ValueType> data(maxBlockNum, i);
    CFType tmpCFNode;
    for (int j = 0; j < i; j++) {
      KeyType tmpKey = dis(engine);
      testTrainData[j] = {tmpKey, tmpKey * 10};
    }
    std::sort(testTrainData.begin(), testTrainData.end());
    int needBlockNum = CFType::CalNeededBlockNum(i);
    int tmpEnd = -1;
    auto isSuccess =
        tmpCFNode.StoreData(testTrainData, std::vector<int>(i), false,
                            needBlockNum, 0, &data, &tmpEnd);
    // FIX: the original assigned isSuccess but never checked it, so a failed
    // StoreData let the test read uninitialized blocks instead of failing
    // with a clear message.
    ASSERT_TRUE(isSuccess);
    for (int j = 0; j < i; j++) {
      int currblock = 0;
      int currslot = tmpCFNode.Find(data, testTrainData[j].first, &currblock);
      KeyType res =
          data.dataArray[tmpCFNode.m_left + currblock].slots[currslot].first;
      ASSERT_EQ(res, testTrainData[j].first)
          << "j:" << j << ",\tres:" << res << ",\ti:" << i;
    }
  }
}

// Insert must grow the data count by one and keep every block sorted.
TEST(TestInsert, InsertData) {
  int maxBlockNum = CFType::kMaxBlockNum;
  for (int i = 0; i < CFType::kMaxLeafCapacity; i++) {
    std::vector<std::pair<KeyType, ValueType>> testTrainData(i);
    DataArrayStructure<KeyType, ValueType> data(maxBlockNum, i);
    CFType tmpCFNode;
    for (int j = 0; j < i; j++) {
      KeyType tmpKey = dis(engine);
      testTrainData[j] = {tmpKey, tmpKey * 10};
    }
    std::sort(testTrainData.begin(), testTrainData.end());
    int needBlockNum = CFType::CalNeededBlockNum(i);
    int tmpEnd = -1;
    tmpCFNode.StoreData(testTrainData, std::vector<int>(i), false, needBlockNum,
                        0, &data, &tmpEnd);
    KeyType tmpKey = dis(engine);
    std::pair<KeyType, ValueType> datapoint = {tmpKey, tmpKey};
    int currblock = 0, currslot = 0;
    auto isSuccess = tmpCFNode.Insert(datapoint, &currblock, &currslot, &data);
    // Insert may legitimately fail (e.g. a full leaf), so only verify the
    // post-conditions when it reports success.
    if (isSuccess) {
      int m_left = tmpCFNode.m_left;
      // low 24 bits of flagNumber hold the number of blocks of this leaf
      int blockNum = tmpCFNode.flagNumber & 0x00FFFFFF;
      int nowDataNum = CFType::GetDataNum(data, m_left, m_left + blockNum);

      ASSERT_EQ(i + 1, nowDataNum);
      for (int j = m_left; j < m_left + blockNum; j++) {
        for (int k = 0; k < CFType::kMaxBlockCapacity - 1; k++) {
          KeyType l = data.dataArray[j].slots[k].first;
          KeyType r = data.dataArray[j].slots[k + 1].first;
          ASSERT_LE(l, r);
        }
      }
    }
  }
}
0, &data, &tmpEnd); 135 | KeyType tmpKey = dis(engine); 136 | std::pair datapoint = {tmpKey, tmpKey}; 137 | int currblock = 0, currslot = 0; 138 | auto isSuccess = tmpCFNode.Insert(datapoint, &currblock, &currslot, &data); 139 | if (isSuccess) { 140 | int m_left = tmpCFNode.m_left; 141 | int blockNum = tmpCFNode.flagNumber & 0x00FFFFFF; 142 | int nowDataNum = CFType::GetDataNum(data, m_left, m_left + blockNum); 143 | 144 | ASSERT_EQ(i + 1, nowDataNum); 145 | for (int j = m_left; j < m_left + blockNum; j++) { 146 | for (int k = 0; k < CFType::kMaxBlockCapacity - 1; k++) { 147 | KeyType l = data.dataArray[j].slots[k].first; 148 | KeyType r = data.dataArray[j].slots[k + 1].first; 149 | ASSERT_LE(l, r); 150 | } 151 | } 152 | } 153 | } 154 | } 155 | 156 | TEST(TestDelete, DeleteData) { 157 | int maxBlockNum = CFType::kMaxBlockNum; 158 | int size = 90; 159 | std::vector> testTrainData(size); 160 | DataArrayStructure data(maxBlockNum, size); 161 | CFType tmpCFNode; 162 | for (int j = 0; j < size; j++) { 163 | KeyType tmpKey = dis(engine); 164 | testTrainData[j] = {tmpKey, tmpKey * 10}; 165 | } 166 | std::sort(testTrainData.begin(), testTrainData.end()); 167 | int needBlockNum = CFType::CalNeededBlockNum(size); 168 | int tmpEnd = -1; 169 | tmpCFNode.StoreData(testTrainData, std::vector(size), false, 170 | needBlockNum, 0, &data, &tmpEnd); 171 | 172 | for (int j = 0; j < size; j += 5) { 173 | size_t cnt = 0; 174 | auto isSuccess = tmpCFNode.Delete(testTrainData[j].first, &cnt, &data); 175 | ASSERT_TRUE(isSuccess); 176 | ASSERT_GT(cnt, 0); 177 | int m_left = tmpCFNode.m_left; 178 | int blockNum = tmpCFNode.flagNumber & 0x00FFFFFF; 179 | int nowDataNum = CFType::GetDataNum(data, m_left, m_left + blockNum); 180 | for (int j = m_left; j < m_left + blockNum; j++) { 181 | for (int k = 0; k < CFType::kMaxBlockCapacity - 1; k++) { 182 | KeyType l = data.dataArray[j].slots[k].first; 183 | KeyType r = data.dataArray[j].slots[k + 1].first; 184 | ASSERT_LE(l, r); 185 | } 186 | } 187 | 
} 188 | } -------------------------------------------------------------------------------- /src/include/construct/greedy.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file greedy.h 3 | * @author Jiaoyi 4 | * @brief use the greedy node selection algorithm to construct inner nodes 5 | * @version 3.0 6 | * @date 2021-03-11 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | #ifndef CONSTRUCT_GREEDY_H_ 12 | #define CONSTRUCT_GREEDY_H_ 13 | 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | 20 | #include "../carmi.h" 21 | #include "../params.h" 22 | #include "./dp_inner.h" 23 | #include "./minor_function.h" 24 | #include "./structures.h" 25 | 26 | template 28 | template 29 | void CARMI::UpdateGreedyOptSetting( 30 | const DataRange &range, int c, double frequency_weight, 31 | NodeCost *optimalCost, InnerNodeType *optimal_node_struct) { 32 | // calculate the basic space cost of the c child nodes of the inner node 33 | double space_cost = kBaseNodeSpace * static_cast(c); 34 | // calculate the time cost of the inner node 35 | double time_cost = InnerNodeType::kTimeCost; 36 | 37 | SubDataset subDataset(c); 38 | InnerNodeType currnode = InnerDivideAll(range, c, &subDataset); 39 | int maxLeafCapacity = carmi_params::kMaxLeafNodeSizeExternal; 40 | if (!isPrimary) { 41 | maxLeafCapacity = 42 | CFArrayType::kMaxLeafCapacity; 43 | } 44 | for (int i = 0; i < c; i++) { 45 | int totalDataNum = 46 | subDataset.subInit[i].size + subDataset.subInsert[i].size; 47 | if (totalDataNum == range.initRange.size + range.insertRange.size) { 48 | return; 49 | } 50 | // if leaf nodes are cf array leaf nodes, add the space cost of data 51 | // blocks to the total space cost 52 | if (!isPrimary) { 53 | int tmpBlockNum = 54 | CFArrayType::CalNeededBlockNum( 55 | totalDataNum); 56 | space_cost += static_cast(tmpBlockNum) * 57 | carmi_params::kMaxLeafNodeSize / 1024.0 / 1024.0; 58 | } 59 | // if the total number of data points 
exceeds the maximum capacity of the 60 | // leaf node, the current node needs at least kMinChildNumber inner nodes to 61 | // manage the data points together 62 | if (totalDataNum > maxLeafCapacity) { 63 | space_cost += kBaseNodeSpace * kMinChildNumber; 64 | time_cost += carmi_params::kMemoryAccessTime * 65 | static_cast(subDataset.subInit[i].size) / 66 | static_cast(range.initRange.size); 67 | } 68 | } 69 | // calculate the entropy of the inner node 70 | double entropy = CalculateEntropy(subDataset.subInit); 71 | double cost = (time_cost + lambda * space_cost / frequency_weight) / entropy; 72 | 73 | // if the current cost is smaller than the optimal cost, update the optimal 74 | // cost and node setting 75 | if (cost <= optimalCost->cost) { 76 | *optimal_node_struct = currnode; 77 | *optimalCost = {time_cost, space_cost, cost}; 78 | } 79 | } 80 | 81 | template 83 | NodeCost CARMI::GreedyAlgorithm( 84 | const DataRange &dataRange) { 85 | // the optimal cost of this sub-dataset 86 | NodeCost optimalCost{DBL_MAX, DBL_MAX, DBL_MAX}; 87 | // the optimal node of this sub-dataset 88 | BaseNode opt_struct; 89 | // calculate the weight of the frequency of this sub-dataset (findQuery and 90 | // insertQury) 91 | double frequency_weight = CalculateFrequencyWeight(dataRange); 92 | int tmpEnd = std::min(0x00FFFFFF, dataRange.initRange.size / 16); 93 | tmpEnd = std::max(tmpEnd, kMinChildNumber); 94 | for (int c = kMinChildNumber; c <= tmpEnd; c *= 2) { 95 | // Case 1: construct a LR inner node, if it is better than the current 96 | // optimal setting, then use it to update the optimal setting 97 | UpdateGreedyOptSetting>( 98 | dataRange, c, frequency_weight, &optimalCost, &(opt_struct.lr)); 99 | // Case 2: construct a P. 
LR inner node, if it is better than the current 100 | // optimal setting, then use it to update the optimal setting 101 | UpdateGreedyOptSetting>( 102 | dataRange, c, frequency_weight, &optimalCost, &(opt_struct.plr)); 103 | // Case 3: construct a His inner node, if it is better than the current 104 | // optimal setting, then use it to update the optimal setting 105 | if (c <= kHisMaxChildNumber) 106 | UpdateGreedyOptSetting>( 107 | dataRange, c, frequency_weight, &optimalCost, &(opt_struct.his)); 108 | // Case 4: construct a BS inner node, if it is better than the current 109 | // optimal setting, then use it to update the optimal setting 110 | if (c <= kBSMaxChildNumber) 111 | UpdateGreedyOptSetting>( 112 | dataRange, c, frequency_weight, &optimalCost, &(opt_struct.bs)); 113 | } 114 | 115 | // use the optimal inner node to divide dataset into childNum sub-datasets 116 | int childNum = opt_struct.lr.flagNumber & 0x00FFFFFF; 117 | int type = opt_struct.lr.flagNumber >> 24; 118 | SubDataset subDataset(childNum); 119 | switch (type) { 120 | case LR_INNER_NODE: { 121 | InnerDivideAll>(dataRange, childNum, 122 | &subDataset); 123 | break; 124 | } 125 | case PLR_INNER_NODE: { 126 | InnerDivideAll>(dataRange, childNum, 127 | &subDataset); 128 | break; 129 | } 130 | case HIS_INNER_NODE: { 131 | InnerDivideAll>(dataRange, childNum, 132 | &subDataset); 133 | break; 134 | } 135 | case BS_INNER_NODE: { 136 | InnerDivideAll>(dataRange, childNum, 137 | &subDataset); 138 | break; 139 | } 140 | } 141 | 142 | // recursively calculate the cost of the child nodes 143 | for (int i = 0; i < childNum; i++) { 144 | NodeCost res = emptyCost; 145 | DataRange range(subDataset.subInit[i], subDataset.subFind[i], 146 | subDataset.subInsert[i]); 147 | // choose the suitable algorithm to construct the sub-tree according to the 148 | // size of the sub-dataset 149 | double minRatio = 0.95; 150 | // record the maximum capacity of the leaf node 151 | int maxStoredNum = 152 | 
CFArrayType::kMaxLeafCapacity; 153 | if (isPrimary) { 154 | maxStoredNum = carmi_params::kMaxLeafNodeSizeExternal; 155 | } 156 | if (range.initRange.size + range.insertRange.size <= 157 | minRatio * maxStoredNum) { 158 | // Case 3: if the size is smaller than the threshold, directly construct a 159 | // leaf node 160 | res = DPLeaf(range); 161 | } else if (subDataset.subInit[i].size + subDataset.subInsert[i].size > 162 | carmi_params::kAlgorithmThreshold) { 163 | res = GreedyAlgorithm(range); 164 | } else { 165 | res = DP(range); 166 | } 167 | optimalCost.cost += res.cost; 168 | optimalCost.time += res.time; 169 | optimalCost.space += res.space; 170 | } 171 | 172 | // store the optimal setting of this sub-dataset 173 | structMap.insert({dataRange.initRange, opt_struct}); 174 | // store the minimum cost of this sub-dataset 175 | COST.insert({dataRange.initRange, optimalCost}); 176 | return optimalCost; 177 | } 178 | #endif // CONSTRUCT_GREEDY_H_ 179 | -------------------------------------------------------------------------------- /src/experiment/workload/workloads_external.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file workloads_external.h 3 | * @author Jiaoyi 4 | * @brief 5 | * @version 3.0 6 | * @date 2021-03-26 7 | * 8 | * @copyright Copyright (c) 2021 9 | * 10 | */ 11 | 12 | #ifndef EXPERIMENT_WORKLOAD_WORKLOADS_EXTERNAL_H_ 13 | #define EXPERIMENT_WORKLOAD_WORKLOADS_EXTERNAL_H_ 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include "../../include/carmi_external_map.h" 20 | #include "../functions.h" 21 | #include "./public_functions.h" 22 | #include "./zipfian.h" 23 | 24 | extern std::ofstream outRes; 25 | 26 | /** 27 | * @brief write heavy workload for external CARMI, 28 | * a mix of 50/50 reads and writes 29 | * 30 | * @tparam KeyType 31 | * @param[in] isZipfian whether to use zipfian access during the test 32 | * @param[in] findDataset 33 | * @param[in] insertDataset 34 | * @param[inout] 
carmi 35 | */ 36 | template 37 | void WorkloadA(bool isZipfian, const DataVecType &findDataset, 38 | const DataVecType &insertDataset, 39 | CARMIExternalMap *carmi) { 40 | DataVecType findQuery; 41 | DataVecType insertQuery; 42 | std::vector index; 43 | int end = kTestSize * kWriteHeavy; 44 | InitTestSet(findDataset, insertDataset, isZipfian, &findQuery, &insertQuery, 45 | &index); 46 | 47 | std::clock_t s, e; 48 | double tmp; 49 | auto resIte = carmi->end(); 50 | KeyType res = 0; 51 | s = std::clock(); 52 | if (isZipfian) { 53 | for (int i = 0; i < end; i++) { 54 | resIte = carmi->find(findQuery[index[i]].first); 55 | res += resIte.data(); 56 | carmi->insert(insertQuery[i].first); 57 | } 58 | } else { 59 | for (int i = 0; i < end; i++) { 60 | resIte = carmi->find(findQuery[i].first); 61 | res += resIte.data(); 62 | carmi->insert(insertQuery[i].first); 63 | } 64 | } 65 | e = std::clock(); 66 | tmp = (e - s) / static_cast(CLOCKS_PER_SEC); 67 | std::cout << " res: " << res << std::endl; 68 | 69 | PrintAvgTime(tmp); 70 | } 71 | 72 | /** 73 | * @brief read heavy workload for external CARMI, 74 | * a mix of 95/5 reads and writes 75 | * 76 | * @tparam KeyType 77 | * @param[in] isZipfian whether to use zipfian access during the test 78 | * @param[in] findDataset 79 | * @param[in] insertDataset 80 | * @param[inout] carmi 81 | */ 82 | template 83 | void WorkloadB(bool isZipfian, const DataVecType &findDataset, 84 | const DataVecType &insertDataset, 85 | CARMIExternalMap *carmi) { 86 | DataVecType findQuery; 87 | DataVecType insertQuery; 88 | std::vector index; 89 | InitTestSet(findDataset, insertDataset, isZipfian, &findQuery, &insertQuery, 90 | &index); 91 | 92 | int end = round(kTestSize * (1 - kReadHeavy)); 93 | int findCnt = 0; 94 | 95 | std::clock_t s, e; 96 | auto resIte = carmi->end(); 97 | KeyType res = 0; 98 | double tmp; 99 | s = std::clock(); 100 | if (isZipfian) { 101 | for (int i = 0; i < end; i++) { 102 | for (int j = 0; j < 19; j++) { 103 | resIte = 
carmi->find(findQuery[index[findCnt]].first); 104 | res += resIte.data(); 105 | findCnt++; 106 | } 107 | carmi->insert(insertQuery[i].first); 108 | } 109 | } else { 110 | for (int i = 0; i < end; i++) { 111 | for (int j = 0; j < 19 && findCnt < static_cast(findQuery.size()); 112 | j++) { 113 | resIte = carmi->find(findQuery[findCnt++].first); 114 | res += resIte.data(); 115 | } 116 | carmi->insert(insertQuery[i].first); 117 | } 118 | } 119 | e = std::clock(); 120 | tmp = (e - s) / static_cast(CLOCKS_PER_SEC); 121 | 122 | std::cout << " res: " << res << std::endl; 123 | PrintAvgTime(tmp); 124 | } 125 | 126 | /** 127 | * @brief read only workload for external CARMI, 100% read 128 | * 129 | * @tparam KeyType 130 | * @param[in] isZipfian whether to use zipfian access during the test 131 | * @param[in] findDataset 132 | * @param[inout] carmi 133 | */ 134 | template 135 | void WorkloadC(bool isZipfian, const DataVecType &findDataset, 136 | CARMIExternalMap *carmi) { 137 | DataVecType findQuery; 138 | DataVecType insertQuery; 139 | std::vector index; 140 | int end = kTestSize * kReadOnly; 141 | InitTestSet(findDataset, DataVecType(), isZipfian, &findQuery, &insertQuery, 142 | &index); 143 | 144 | std::clock_t s, e; 145 | double tmp; 146 | auto resIte = carmi->end(); 147 | KeyType res = 0; 148 | s = std::clock(); 149 | if (isZipfian) { 150 | for (int i = 0; i < end; i++) { 151 | resIte = carmi->find(findQuery[index[i]].first); 152 | res += resIte.data(); 153 | } 154 | } else { 155 | for (int i = 0; i < end; i++) { 156 | resIte = carmi->find(findQuery[i].first); 157 | res += resIte.data(); 158 | } 159 | } 160 | e = std::clock(); 161 | tmp = (e - s) / static_cast(CLOCKS_PER_SEC); 162 | 163 | std::cout << " res: " << res << std::endl; 164 | 165 | PrintAvgTime(tmp); 166 | } 167 | 168 | /** 169 | * @brief read mostly workload (range scan) for external CARMI, 170 | * a mix of 95/5 reads and writes 171 | * 172 | * @tparam KeyType 173 | * @param[in] isZipfian whether to use 
zipfian access during the test 174 | * @param[in] findDataset 175 | * @param[in] insertDataset 176 | * @param[in] length 177 | * @param[inout] carmi 178 | */ 179 | template 180 | void WorkloadE(bool isZipfian, const DataVecType &findDataset, 181 | const DataVecType &insertDataset, const std::vector &length, 182 | CARMIExternalMap *carmi) { 183 | DataVecType findQuery; 184 | DataVecType insertQuery; 185 | std::vector index; 186 | InitTestSet(findDataset, insertDataset, isZipfian, &findQuery, &insertQuery, 187 | &index); 188 | 189 | int end = round(kTestSize * (1 - kReadHeavy)); 190 | int findCnt = 0; 191 | 192 | std::vector>> ret( 193 | 100, {KeyType(), {KeyType()}}); 194 | std::clock_t s, e; 195 | double tmp; 196 | s = std::clock(); 197 | if (isZipfian) { 198 | for (int i = 0; i < end; i++) { 199 | for (int j = 0; j < 19 && findCnt < static_cast(index.size()); j++) { 200 | auto it = carmi->find(findQuery[index[findCnt]].first); 201 | 202 | for (int l = 0; l < length[index[findCnt]]; l++) { 203 | // ret[l] = *it; 204 | it++; 205 | } 206 | findCnt++; 207 | } 208 | carmi->insert(insertQuery[i].first); 209 | } 210 | } else { 211 | for (int i = 0; i < end; i++) { 212 | for (int j = 0; j < 19 && findCnt < static_cast(findQuery.size()); 213 | j++) { 214 | auto it = carmi->find(findQuery[findCnt].first); 215 | for (int l = 0; l < length[findCnt]; l++) { 216 | // ret[l] = *it; 217 | it++; 218 | } 219 | findCnt++; 220 | } 221 | carmi->insert(insertQuery[i].first); 222 | } 223 | } 224 | e = std::clock(); 225 | tmp = (e - s) / static_cast(CLOCKS_PER_SEC); 226 | 227 | findCnt = 0; 228 | s = std::clock(); 229 | if (isZipfian) { 230 | for (int i = 0; i < end; i++) { 231 | for (int j = 0; j < 19 && findCnt < static_cast(index.size()); j++) { 232 | for (int l = 0; l < length[index[findCnt]]; l++) { 233 | } 234 | findCnt++; 235 | } 236 | } 237 | } else { 238 | for (int i = 0; i < end; i++) { 239 | for (int j = 0; j < 19 && findCnt < static_cast(findQuery.size()); 240 | j++) { 241 | 
for (int l = 0; l < length[findCnt]; l++) { 242 | } 243 | findCnt++; 244 | } 245 | } 246 | } 247 | e = std::clock(); 248 | double tmp0 = (e - s) / static_cast(CLOCKS_PER_SEC); 249 | tmp -= tmp0; 250 | 251 | PrintAvgTime(tmp); 252 | } 253 | #endif // EXPERIMENT_WORKLOAD_WORKLOADS_EXTERNAL_H_ 254 | --------------------------------------------------------------------------------