├── requirements.txt
├── .gitignore
├── scripts
    ├── run_all.sh
    ├── plot_all.sh
    ├── plot_paper.sh
    ├── run_rmi_segmentation.sh
    ├── run_rmi_guideline.sh
    ├── rmi_ref
    │   ├── prepare_rmi_ref.sh
    │   ├── fb_200M_uint64.json
    │   ├── books_200M_uint64.json
    │   ├── osm_cellids_200M_uint64.json
    │   └── wiki_ts_200M_uint64.json
    ├── run_rmi_errors.sh
    ├── run_index_comparison.sh
    ├── run_rmi_intervals.sh
    ├── run_rmi_lookup.sh
    ├── download_data.sh
    ├── run_rmi_build.sh
    ├── plot_rmi_guideline.py
    ├── plot_rmi_segmentation.py
    ├── plot_rmi_errors.py
    ├── plot_rmi_intervals.py
    ├── plot_rmi_lookup.py
    ├── plot_index_comparison.py
    └── plot_rmi_build.py
├── .gitmodules
├── example.cpp
├── CMakeLists.txt
├── experiments
    ├── CMakeLists.txt
    ├── rmi_segmentation.cpp
    ├── rmi_errors.cpp
    ├── rmi_intervals.cpp
    ├── rmi_build.cpp
    ├── rmi_lookup.cpp
    └── rmi_guideline.cpp
├── README.md
├── include
    └── rmi
    │   ├── util
    │       ├── fn.hpp
    │       └── search.hpp
    │   ├── models.hpp
    │   └── rmi.hpp
└── LICENSE


/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib==3.4.2
2 | numpy>=1.22,<2  # pinned pandas/matplotlib releases are built against the NumPy 1.x ABI
3 | pandas==1.2.4
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Compiled Object files
 5 | *.slo
 6 | *.lo
 7 | *.o
 8 | *.obj
 9 | 
10 | # Precompiled Headers
11 | *.gch
12 | *.pch
13 | 
14 | # Compiled Dynamic libraries
15 | *.so
16 | *.dylib
17 | *.dll
18 | 
19 | # Fortran module files
20 | *.mod
21 | *.smod
22 | 
23 | # Compiled Static libraries
24 | *.lai
25 | *.la
26 | *.a
27 | *.lib
28 | 
29 | # Executables
30 | *.exe
31 | *.out
32 | *.app
33 | 
34 | # Reference optimizer remnants
35 | *.json_results
36 | 
37 | # Plots
38 | *.pdf
39 | 
40 | # Directories
41 | build/
42 | data/
43 | doxy/
44 | 


--------------------------------------------------------------------------------
/scripts/run_all.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Runs every experiment script, one after the other.
 3 | # All paths are relative: invoke this script from the repository root.
 4 | # The experiment scripts are sourced because they leave early via `return`.
 5 | # set -x
 6 | trap "exit" SIGINT
 7 | 
 8 | echo "Running RMI Segmentation (Section 5.1)..."
 9 | source scripts/run_rmi_segmentation.sh
10 | 
11 | echo "Running RMI Errors (Section 5.2)..."
12 | source scripts/run_rmi_errors.sh
13 | 
14 | echo "Running RMI Intervals (Section 5.3)..."
15 | source scripts/run_rmi_intervals.sh
16 | 
17 | echo "Running RMI Lookup (Section 6)..."
18 | source scripts/run_rmi_lookup.sh
19 | 
20 | echo "Running RMI Build (Section 7)..."
21 | source scripts/run_rmi_build.sh
22 | 
23 | echo "Running RMI Guideline (Section 8)..."
24 | source scripts/run_rmi_guideline.sh
25 | 
26 | echo "Running Index Comparison (Section 9)..."
27 | source scripts/run_index_comparison.sh
28 | 


--------------------------------------------------------------------------------
/scripts/plot_all.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Runs every plot script, one after the other.
 3 | # All paths are relative: invoke this script from the repository root.
 4 | # set -x
 5 | trap "exit" SIGINT
 6 | 
 7 | echo "Plotting RMI Segmentation (Section 5.1)..."
 8 | python3 scripts/plot_rmi_segmentation.py
 9 | 
10 | echo "Plotting RMI Errors (Section 5.2)..."
11 | python3 scripts/plot_rmi_errors.py
12 | 
13 | echo "Plotting RMI Intervals (Section 5.3)..."
14 | python3 scripts/plot_rmi_intervals.py
15 | 
16 | echo "Plotting RMI Lookup (Section 6)..."
17 | python3 scripts/plot_rmi_lookup.py
18 | 
19 | echo "Plotting RMI Build (Section 7)..."
20 | python3 scripts/plot_rmi_build.py
21 | 
22 | echo "Plotting RMI Guideline (Section 8)..."
23 | python3 scripts/plot_rmi_guideline.py
24 | 
25 | echo "Plotting Index Comparison (Section 9)..."
26 | python3 scripts/plot_index_comparison.py
27 | 


--------------------------------------------------------------------------------
/scripts/plot_paper.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Runs every plot script with the `--paper` flag, one after the other.
 3 | # All paths are relative: invoke this script from the repository root.
 4 | # set -x
 5 | trap "exit" SIGINT
 6 | 
 7 | echo "Plotting RMI Segmentation (Section 5.1)..."
 8 | python3 scripts/plot_rmi_segmentation.py --paper
 9 | 
10 | echo "Plotting RMI Errors (Section 5.2)..."
11 | python3 scripts/plot_rmi_errors.py --paper
12 | 
13 | echo "Plotting RMI Intervals (Section 5.3)..."
14 | python3 scripts/plot_rmi_intervals.py --paper
15 | 
16 | echo "Plotting RMI Lookup (Section 6)..."
17 | python3 scripts/plot_rmi_lookup.py --paper
18 | 
19 | echo "Plotting RMI Build (Section 7)..."
20 | python3 scripts/plot_rmi_build.py --paper
21 | 
22 | echo "Plotting RMI Guideline (Section 8)..."
23 | python3 scripts/plot_rmi_guideline.py --paper
24 | 
25 | echo "Plotting Index Comparison (Section 9)..."
26 | python3 scripts/plot_index_comparison.py --paper
27 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "third_party/CHT"]
 2 | 	path = third_party/CHT
 3 | 	url = https://github.com/stoianmihail/CHT
 4 | [submodule "third_party/ALEX"]
 5 | 	path = third_party/ALEX
 6 | 	url = https://github.com/microsoft/ALEX
 7 | [submodule "third_party/RadixSpline"]
 8 | 	path = third_party/RadixSpline
 9 | 	url = https://github.com/learnedsystems/RadixSpline.git
10 | [submodule "third_party/PGM-index"]
11 | 	path = third_party/PGM-index
12 | 	url = https://github.com/gvinciguerra/PGM-index.git
13 | [submodule "third_party/tlx"]
14 | 	path = third_party/tlx
15 | 	url = https://github.com/tlx/tlx
16 | [submodule "third_party/argparse"]
17 | 	path = third_party/argparse
18 | 	url = https://github.com/p-ranav/argparse.git
19 | [submodule "third_party/RMI"]
20 | 	path = third_party/RMI
21 | 	url = https://github.com/learnedsystems/RMI
22 | 	ignore = dirty
23 | 


--------------------------------------------------------------------------------
/scripts/run_rmi_segmentation.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Runs build/bin/rmi_segmentation for every dataset, model type and segment
 3 | # count (2^6 .. 2^25) and collects the output in results/rmi_segmentation.csv.
 4 | # All paths are relative: source or execute this script from the repository root.
 5 | # set -x
 6 | trap "exit" SIGINT
 7 | 
 8 | EXPERIMENT="rmi segmentation"
 9 | 
10 | DIR_DATA="data"
11 | DIR_RESULTS="results"
12 | FILE_RESULTS="${DIR_RESULTS}/rmi_segmentation.csv"
13 | 
14 | BIN="build/bin/rmi_segmentation"
15 | 
16 | # Run the binary once and append its output to the results file.
17 | # Arguments: <dataset> <model> <n_segments>
18 | run() {
19 |     DATASET=$1
20 |     MODEL=$2
21 |     N_SEGMENTS=$3
22 |     DATA_FILE="${DIR_DATA}/${DATASET}"
23 |     "${BIN}" "${DATA_FILE}" "${MODEL}" "${N_SEGMENTS}" >> "${FILE_RESULTS}"
24 | }
25 | 
26 | # Create results directory (no-op if it already exists)
27 | mkdir -p "${DIR_RESULTS}"
28 | 
29 | # Check data downloaded
30 | if [ ! -d "${DIR_DATA}" ];
31 | then
32 |     >&2 echo "Please download datasets first."
33 |     # `return` only works when sourced; fall back to `exit` when executed directly.
34 |     return 1 2>/dev/null || exit 1
35 | fi
36 | 
37 | DATASETS="books_200M_uint64 fb_200M_uint64 osm_cellids_200M_uint64 wiki_ts_200M_uint64"
38 | MODELS="linear_spline cubic_spline linear_regression radix"
39 | 
40 | # Run experiments
41 | echo "dataset,n_keys,model,n_segments,mean,stdev,median,min,max,n_empty" > "${FILE_RESULTS}" # Write csv header (truncates old results)
42 | for dataset in ${DATASETS};
43 | do
44 |     echo "Performing ${EXPERIMENT} on '${dataset}'..."
45 |     for model in ${MODELS};
46 |     do
47 |         for ((i=6; i<=25; i += 1));
48 |         do
49 |             n_segments=$((2**i))
50 |             run "${dataset}" "${model}" "${n_segments}"
51 |         done
52 |     done
53 | done
54 | 
55 | 


--------------------------------------------------------------------------------
/scripts/run_rmi_guideline.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Runs build/bin/rmi_guideline for every dataset and size budget
 3 | # (2^1 KiB .. 2^20 KiB) and collects the output in results/rmi_guideline.csv.
 4 | # All paths are relative: source or execute this script from the repository root.
 5 | # set -x
 6 | trap "exit" SIGINT
 7 | 
 8 | EXPERIMENT="rmi guideline"
 9 | 
10 | DIR_DATA="data"
11 | DIR_RESULTS="results"
12 | FILE_RESULTS="${DIR_RESULTS}/rmi_guideline.csv"
13 | 
14 | BIN="build/bin/rmi_guideline"
15 | 
16 | # Set number of repetitions and samples
17 | N_REPS="3"
18 | N_SAMPLES="20000000"
19 | PARAMS="--n_reps ${N_REPS} --n_samples ${N_SAMPLES}"
20 | 
21 | # Run the binary once and append its output to the results file.
22 | # Arguments: <dataset> <budget in bytes>
23 | run() {
24 |     DATASET=$1
25 |     BUDGET=$2
26 |     DATA_FILE="${DIR_DATA}/${DATASET}"
27 |     # PARAMS is intentionally unquoted so that it splits into separate arguments.
28 |     "${BIN}" "${DATA_FILE}" "${BUDGET}" ${PARAMS} >> "${FILE_RESULTS}"
29 | }
30 | 
31 | # Create results directory (no-op if it already exists)
32 | mkdir -p "${DIR_RESULTS}"
33 | 
34 | # Check data downloaded
35 | if [ ! -d "${DIR_DATA}" ];
36 | then
37 |     >&2 echo "Please download datasets first."
38 |     # `return` only works when sourced; fall back to `exit` when executed directly.
39 |     return 1 2>/dev/null || exit 1
40 | fi
41 | 
42 | # Same four datasets as the other experiment scripts.
43 | DATASETS="books_200M_uint64 fb_200M_uint64 osm_cellids_200M_uint64 wiki_ts_200M_uint64"
44 | 
45 | # Run experiments
46 | echo "dataset,n_keys,layer1,layer2,n_models,bounds,search,size_in_bytes,rep,n_samples,budget_in_bytes,is_guideline,lookup_time,lookup_accu" > "${FILE_RESULTS}" # Write csv header (truncates old results)
47 | for dataset in ${DATASETS};
48 | do
49 |     echo "Performing ${EXPERIMENT} on '${dataset}'..."
50 |     for ((i=1; i<=20; i += 1));
51 |     do
52 |         budget=$((2**i * 1024))
53 |         run "${dataset}" "${budget}"
54 |     done
55 | done
56 | 


--------------------------------------------------------------------------------
/example.cpp:
--------------------------------------------------------------------------------
 1 | #include <algorithm>
 2 | #include <cstddef>
 3 | #include <cstdint>
 4 | #include <iostream>
 5 | #include <iterator>
 6 | #include <random>
 7 | #include <vector>
 8 | 
 9 | #include "rmi/models.hpp"
10 | #include "rmi/rmi.hpp"
11 | 
12 | 
13 | /**
14 |  * Minimal usage example: builds a two-layer RMI over sorted random keys and
15 |  * prints the position of one key that is known to be present.
16 |  */
17 | int main()
18 | {
19 |     // Initialize random number generator (fixed seed for reproducible keys).
20 |     using key_type = std::uint64_t;
21 |     std::mt19937 gen(42);
22 |     // Shift a 64-bit value: `1UL << 48` is undefined where `unsigned long` has 32 bits.
23 |     std::uniform_int_distribution<key_type> key_distrib(0, key_type{1} << 48);
24 |     auto rand = [&gen, &key_distrib] { return key_distrib(gen); };
25 | 
26 |     // Create 10M random keys; they must be sorted before building the index.
27 |     std::size_t n_keys = 10'000'000;
28 |     std::vector<key_type> keys(n_keys);
29 |     std::generate(keys.begin(), keys.end(), rand);
30 |     std::sort(keys.begin(), keys.end());
31 | 
32 |     // Build a two-layer RMI.
33 |     using layer1_type = rmi::LinearSpline;
34 |     using layer2_type = rmi::LinearRegression;
35 |     std::size_t layer2_size = 2UL << 16; // 2^17
36 |     rmi::RmiLAbs<key_type, layer1_type, layer2_type> rmi(keys, layer2_size);
37 | 
38 |     // Pick a key that exists in the data.
39 |     std::uniform_int_distribution<std::size_t> uniform_distrib(0, n_keys - 1);
40 |     key_type key = keys[uniform_distrib(gen)];
41 | 
42 |     // Perform a lookup: the RMI yields a range whose `lo`/`hi` offsets bound
43 |     // the binary search for the exact position.
44 |     auto range = rmi.search(key);
45 |     auto pos = std::lower_bound(keys.begin() + range.lo, keys.begin() + range.hi, key);
46 |     std::cout << "Key " << key << " is located at position "
47 |               << std::distance(keys.begin(), pos) << '.' << std::endl;
48 | 
49 |     return 0;
50 | }
51 | 


--------------------------------------------------------------------------------
/scripts/rmi_ref/prepare_rmi_ref.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Trains the reference RMIs (third_party/RMI) for every dataset using the
 3 | # configuration files in scripts/rmi_ref.
 4 | # All paths are resolved against the current directory: run from the repository root.
 5 | # set -x
 6 | trap "exit" SIGINT
 7 | 
 8 | DIR_DATA="data"
 9 | RMI_PATH="third_party/RMI"
10 | CONFIG_PATH="scripts/rmi_ref"
11 | 
12 | DATASETS="books_200M_uint64 fb_200M_uint64 osm_cellids_200M_uint64 wiki_ts_200M_uint64"
13 | 
14 | # Let the reference implementation's optimizer write <dataset>.json.
15 | # Arguments: <dataset>
16 | gen_config_json() {
17 |     DATASET=$1
18 |     CWD=$(pwd)
19 |     DATA_FILE="${CWD}/${DIR_DATA}/${DATASET}"
20 |     CONFIG_FILE="${CWD}/${CONFIG_PATH}/${DATASET}.json"
21 |     MANIFEST_FILE="${CWD}/${RMI_PATH}/Cargo.toml"
22 | 
23 |     echo "Generating reference RMI config json for ${DATASET}..."
24 |     cargo run --manifest-path "${MANIFEST_FILE}" --release -- "${DATA_FILE}" --optimize "${CONFIG_FILE}"
25 | }
26 | 
27 | # Train all configurations listed in <dataset>.json.
28 | # Arguments: <dataset>
29 | train_rmi () {
30 |     DATASET=$1
31 |     CWD=$(pwd)
32 |     DATA_FILE="${CWD}/${DIR_DATA}/${DATASET}"
33 |     CONFIG_FILE="${CWD}/${CONFIG_PATH}/${DATASET}.json"
34 |     MANIFEST_FILE="${CWD}/${RMI_PATH}/Cargo.toml"
35 | 
36 |     # Create include dir
37 |     INCLUDE_PATH="${CWD}/${RMI_PATH}/include/rmi_ref"
38 |     mkdir -p "${INCLUDE_PATH}"
39 | 
40 |     echo "Training reference RMIs on ${DATASET}..."
41 |     # cargo is run from inside the include dir (presumably it writes the
42 |     # generated sources to the working directory -- TODO confirm). The subshell
43 |     # keeps the caller's working directory untouched and skips cargo if the
44 |     # directory change fails.
45 |     (
46 |         cd "${INCLUDE_PATH}" || exit 1
47 |         cargo run --manifest-path "${MANIFEST_FILE}" --release -- "${DATA_FILE}" --param-grid "${CONFIG_FILE}" --disable-parallel-training
48 |     )
49 | }
50 | 
51 | for dataset in ${DATASETS};
52 | do
53 |     # Generate RMI configurations
54 |     # gen_config_json "$dataset" # configs are pre-generated
55 | 
56 |     # Train RMIs
57 |     train_rmi "$dataset"
58 | done
59 | 


--------------------------------------------------------------------------------
/scripts/run_rmi_errors.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Runs build/bin/rmi_errors for every dataset, model count (2^6 .. 2^25) and
 3 | # layer-1/layer-2 model combination; collects the output in results/rmi_errors.csv.
 4 | # All paths are relative: source or execute this script from the repository root.
 5 | # set -x
 6 | trap "exit" SIGINT
 7 | 
 8 | EXPERIMENT="rmi errors"
 9 | 
10 | DIR_DATA="data"
11 | DIR_RESULTS="results"
12 | FILE_RESULTS="${DIR_RESULTS}/rmi_errors.csv"
13 | 
14 | BIN="build/bin/rmi_errors"
15 | 
16 | # Run the binary once and append its output to the results file.
17 | # Arguments: <dataset> <layer1 model> <layer2 model> <n_models>
18 | run() {
19 |     DATASET=$1
20 |     LAYER1=$2
21 |     LAYER2=$3
22 |     N_MODELS=$4
23 |     DATA_FILE="${DIR_DATA}/${DATASET}"
24 |     "${BIN}" "${DATA_FILE}" "${LAYER1}" "${LAYER2}" "${N_MODELS}" >> "${FILE_RESULTS}"
25 | }
26 | 
27 | # Create results directory (no-op if it already exists)
28 | mkdir -p "${DIR_RESULTS}"
29 | 
30 | # Check data downloaded
31 | if [ ! -d "${DIR_DATA}" ];
32 | then
33 |     >&2 echo "Please download datasets first."
34 |     # `return` only works when sourced; fall back to `exit` when executed directly.
35 |     return 1 2>/dev/null || exit 1
36 | fi
37 | 
38 | DATASETS="books_200M_uint64 fb_200M_uint64 osm_cellids_200M_uint64 wiki_ts_200M_uint64"
39 | LAYER1_MODELS="linear_spline cubic_spline linear_regression radix"
40 | LAYER2_MODELS="linear_spline linear_regression"
41 | 
42 | # Run experiments
43 | echo "dataset,n_keys,layer1,layer2,n_models,mean_ae,median_ae,stdev_ae,min_ae,max_ae" > "${FILE_RESULTS}" # Write csv header (truncates old results)
44 | for dataset in ${DATASETS};
45 | do
46 |     echo "Performing ${EXPERIMENT} on '${dataset}'..."
47 |     for ((i=6; i<=25; i += 1));
48 |     do
49 |         n_models=$((2**i))
50 |         for l1 in ${LAYER1_MODELS};
51 |         do
52 |             for l2 in ${LAYER2_MODELS};
53 |             do
54 |                 run "${dataset}" "${l1}" "${l2}" "${n_models}"
55 |             done
56 |         done
57 |     done
58 | done
59 | 
60 | 


--------------------------------------------------------------------------------
/scripts/rmi_ref/fb_200M_uint64.json:
--------------------------------------------------------------------------------
1 | {"configs":[{"layers":"robust_linear,linear","branching factor":16777216,"namespace":"fb_200M_uint64_0","size":402653200,"average log2 error":5.046965439910379,"binary":true},{"layers":"robust_linear,linear","branching factor":8388608,"namespace":"fb_200M_uint64_1","size":201326608,"average log2 error":5.63038432124456,"binary":true},{"layers":"robust_linear,linear","branching factor":4194304,"namespace":"fb_200M_uint64_2","size":100663312,"average log2 error":6.264421574769353,"binary":true},{"layers":"robust_linear,linear","branching factor":1048576,"namespace":"fb_200M_uint64_3","size":25165840,"average log2 error":7.61933264490036,"binary":true},{"layers":"robust_linear,linear","branching factor":524288,"namespace":"fb_200M_uint64_4","size":12582928,"average log2 error":8.309666117218308,"binary":true},{"layers":"robust_linear,linear","branching factor":262144,"namespace":"fb_200M_uint64_5","size":6291472,"average log2 error":8.993034097192816,"binary":true},{"layers":"robust_linear,linear","branching factor":131072,"namespace":"fb_200M_uint64_6","size":3145744,"average log2 error":9.664330303064656,"binary":true},{"layers":"robust_linear,linear","branching factor":32768,"namespace":"fb_200M_uint64_7","size":786448,"average log2 error":10.905758730947948,"binary":true},{"layers":"robust_linear,linear","branching factor":1024,"namespace":"fb_200M_uint64_8","size":24592,"average log2 error":13.674001584650402,"binary":true},{"layers":"robust_linear,linear","branching factor":128,"namespace":"fb_200M_uint64_9","size":3088,"average log2 error":15.312913003960264,"binary":true}]}


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # project(... HOMEPAGE_URL ...) is only understood by CMake >= 3.12.
 2 | cmake_minimum_required(VERSION 3.12)
 3 | 
 4 | project("analysis-rmi"
 5 |     LANGUAGES C CXX
 6 |     HOMEPAGE_URL https://github.com/BigDataAnalyticsGroup/analysis-rmi
 7 | )
 8 | 
 9 | # Set output directories
10 | set(EXECUTABLE_OUTPUT_PATH "${PROJECT_BINARY_DIR}/bin")
11 | 
12 | # Set compilation flags
13 | # NOTE(review): LEVEL1_DCACHE_LINESIZE and PAGESIZE are not set in this file;
14 | # unless passed on the command line (-D...) both macros are defined empty -- confirm.
15 | SET(CMAKE_CXX_STANDARD 17)
16 | SET(CMAKE_COMPILE_FLAGS             "-W -Wall -pedantic -DLEVEL1_DCACHE_LINESIZE=${LEVEL1_DCACHE_LINESIZE} -DPAGESIZE=${PAGESIZE} -march=native -Wno-variadic-macros -Wno-gnu-zero-variadic-macro-arguments -Wno-gnu-label-as-value -Wno-vla-extension")
17 | SET(CMAKE_C_FLAGS                   "${CMAKE_C_FLAGS} ${CMAKE_COMPILE_FLAGS}")
18 | SET(CMAKE_CXX_FLAGS                 "-std=c++17 ${CMAKE_CXX_FLAGS} ${CMAKE_COMPILE_FLAGS}")
19 | SET(CMAKE_CXX_FLAGS_DEBUG           "-ggdb3 -fno-omit-frame-pointer -fno-optimize-sibling-calls -fsanitize=address,undefined -fsanitize-address-use-after-scope")
20 | SET(CMAKE_CXX_FLAGS_RELEASE         "-O2")
21 | SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO  "-O2 -ggdb3")
22 | 
23 | # Directories
24 | include_directories(include)
25 | 
26 | # Third party
27 | include_directories(third_party/ALEX/src)
28 | include_directories(third_party/argparse/include)
29 | include_directories(third_party/ART/include)
30 | include_directories(third_party/CHT/include)
31 | include_directories(third_party/PGM-index/include)
32 | include_directories(third_party/RadixSpline/include)
33 | include_directories(third_party/RMI/include)
34 | include_directories(third_party/tlx)
35 | 
36 | # Executables (binaries end up in <build dir>/bin, see EXECUTABLE_OUTPUT_PATH)
37 | add_executable(example example.cpp)
38 | add_subdirectory(experiments)
39 | 


--------------------------------------------------------------------------------
/scripts/run_index_comparison.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Runs build/bin/index_comparison once per dataset with the index flags
 3 | # configured below; collects the output in results/index_comparison.csv.
 4 | # All paths are relative: source or execute this script from the repository root.
 5 | # set -x
 6 | trap "exit" SIGINT
 7 | 
 8 | EXPERIMENT="index comparison"
 9 | 
10 | DIR_DATA="data"
11 | DIR_RESULTS="results"
12 | FILE_RESULTS="${DIR_RESULTS}/index_comparison.csv"
13 | 
14 | BIN="build/bin/index_comparison"
15 | 
16 | # Set number of repetitions and samples
17 | N_REPS="3"
18 | N_SAMPLES="20000000"
19 | PARAMS="--n_reps ${N_REPS} --n_samples ${N_SAMPLES}"
20 | 
21 | # Set which indexes to run on datasets
22 | declare -A flags
23 | flags['books_200M_uint64']="--rmi --alex --pgm --rs --cht --art --tlx --ref --bin"
24 | flags['fb_200M_uint64']="--rmi --alex --pgm --rs --cht --art --tlx --ref --bin"
25 | flags['osm_cellids_200M_uint64']="--rmi --alex --pgm --rs --cht --art --tlx --ref --bin"
26 | flags['wiki_ts_200M_uint64']="--rmi --alex --pgm --rs --tlx --ref --bin" # ART and CHT do not support duplicates
27 | 
28 | # Run the binary once and append its output to the results file.
29 | # Arguments: <dataset>
30 | run() {
31 |     DATASET=$1
32 |     DATA_FILE="${DIR_DATA}/${DATASET}"
33 |     # PARAMS and the flag string are intentionally unquoted so that they
34 |     # split into separate arguments.
35 |     "${BIN}" ${PARAMS} ${flags[${DATASET}]} "${DATA_FILE}" >> "${FILE_RESULTS}"
36 | }
37 | 
38 | # Create results directory (no-op if it already exists)
39 | mkdir -p "${DIR_RESULTS}"
40 | 
41 | # Check data downloaded
42 | if [ ! -d "${DIR_DATA}" ];
43 | then
44 |     >&2 echo "Please download datasets first."
45 |     # `return` only works when sourced; fall back to `exit` when executed directly.
46 |     return 1 2>/dev/null || exit 1
47 | fi
48 | 
49 | # Run experiments
50 | echo "dataset,n_keys,index,config,size_in_bytes,rep,n_samples,build_time,eval_time,lookup_time,eval_accu,lookup_accu" > "${FILE_RESULTS}" # Write csv header (truncates old results)
51 | # Note: the iteration order over the keys of an associative array is unspecified.
52 | for dataset in "${!flags[@]}";
53 | do
54 |     echo "Performing ${EXPERIMENT} on '${dataset}'..."
55 |     run "${dataset}"
56 | done
57 | 


--------------------------------------------------------------------------------
/scripts/rmi_ref/books_200M_uint64.json:
--------------------------------------------------------------------------------
1 | {"configs":[{"layers":"linear_spline,linear","branching factor":16777216,"namespace":"books_200M_uint64_0","size":402653200,"average log2 error":3.881372461290736,"binary":true},{"layers":"linear_spline,linear","branching factor":8388608,"namespace":"books_200M_uint64_1","size":201326608,"average log2 error":4.157408768582805,"binary":true},{"layers":"linear_spline,linear","branching factor":4194304,"namespace":"books_200M_uint64_2","size":100663312,"average log2 error":4.515015335099573,"binary":true},{"layers":"radix22,linear","branching factor":1048576,"namespace":"books_200M_uint64_3","size":41943040,"average log2 error":5.240585141688408,"binary":true},{"layers":"linear_spline,linear","branching factor":524288,"namespace":"books_200M_uint64_4","size":12582928,"average log2 error":5.779940878504297,"binary":true},{"layers":"linear_spline,linear","branching factor":262144,"namespace":"books_200M_uint64_5","size":6291472,"average log2 error":6.234719007601088,"binary":true},{"layers":"linear_spline,linear","branching factor":131072,"namespace":"books_200M_uint64_6","size":3145744,"average log2 error":6.698291356077185,"binary":true},{"layers":"linear_spline,linear","branching factor":32768,"namespace":"books_200M_uint64_7","size":786448,"average log2 error":7.656127767412821,"binary":true},{"layers":"linear_spline,linear","branching factor":1024,"namespace":"books_200M_uint64_8","size":24592,"average log2 error":10.182451513884163,"binary":true},{"layers":"linear_spline,linear","branching factor":128,"namespace":"books_200M_uint64_9","size":3088,"average log2 error":12.653101078683737,"binary":true}]}


--------------------------------------------------------------------------------
/scripts/rmi_ref/osm_cellids_200M_uint64.json:
--------------------------------------------------------------------------------
1 | {"configs":[{"layers":"cubic,linear","branching factor":16777216,"namespace":"osm_cellids_200M_uint64_0","size":402653216,"average log2 error":7.433883845329264,"binary":true},{"layers":"cubic,linear","branching factor":8388608,"namespace":"osm_cellids_200M_uint64_1","size":201326624,"average log2 error":8.206800487100525,"binary":true},{"layers":"cubic,linear","branching factor":4194304,"namespace":"osm_cellids_200M_uint64_2","size":100663328,"average log2 error":9.020228908629712,"binary":true},{"layers":"radix22,linear","branching factor":1048576,"namespace":"osm_cellids_200M_uint64_3","size":41943040,"average log2 error":9.651565886104518,"binary":true},{"layers":"cubic,linear","branching factor":524288,"namespace":"osm_cellids_200M_uint64_4","size":12582944,"average log2 error":11.587355989718806,"binary":true},{"layers":"radix18,linear","branching factor":32768,"namespace":"osm_cellids_200M_uint64_5","size":1835008,"average log2 error":13.362024384454787,"binary":true},{"layers":"linear,linear","branching factor":32768,"namespace":"osm_cellids_200M_uint64_6","size":786448,"average log2 error":15.128271341741858,"binary":true},{"layers":"robust_linear,cubic","branching factor":1024,"namespace":"osm_cellids_200M_uint64_7","size":40976,"average log2 error":19.590492696796756,"binary":true},{"layers":"robust_linear,linear","branching factor":512,"namespace":"osm_cellids_200M_uint64_8","size":12304,"average log2 error":20.541583565893747,"binary":true},{"layers":"robust_linear,linear","branching factor":128,"namespace":"osm_cellids_200M_uint64_9","size":3088,"average log2 error":22.416249075800633,"binary":true}]}


--------------------------------------------------------------------------------
/scripts/rmi_ref/wiki_ts_200M_uint64.json:
--------------------------------------------------------------------------------
1 | {"configs":[{"layers":"linear_spline,linear","branching factor":16777216,"namespace":"wiki_ts_200M_uint64_0","size":402653200,"average log2 error":4.283256474039042,"binary":true},{"layers":"linear_spline,linear","branching factor":8388608,"namespace":"wiki_ts_200M_uint64_1","size":201326608,"average log2 error":4.473650521507884,"binary":true},{"layers":"linear_spline,linear","branching factor":4194304,"namespace":"wiki_ts_200M_uint64_2","size":100663312,"average log2 error":4.714111817674564,"binary":true},{"layers":"linear_spline,linear","branching factor":1048576,"namespace":"wiki_ts_200M_uint64_3","size":25165840,"average log2 error":5.349370483548578,"binary":true},{"layers":"linear_spline,linear","branching factor":524288,"namespace":"wiki_ts_200M_uint64_4","size":12582928,"average log2 error":5.769176055947262,"binary":true},{"layers":"linear_spline,linear","branching factor":262144,"namespace":"wiki_ts_200M_uint64_5","size":6291472,"average log2 error":6.287461494618375,"binary":true},{"layers":"linear_spline,linear","branching factor":131072,"namespace":"wiki_ts_200M_uint64_6","size":3145744,"average log2 error":6.927357373267273,"binary":true},{"layers":"linear,linear","branching factor":32768,"namespace":"wiki_ts_200M_uint64_7","size":786448,"average log2 error":8.727133637122125,"binary":true},{"layers":"linear_spline,linear","branching factor":1024,"namespace":"wiki_ts_200M_uint64_8","size":24592,"average log2 error":14.509911274780844,"binary":true},{"layers":"linear_spline,linear","branching factor":128,"namespace":"wiki_ts_200M_uint64_9","size":3088,"average log2 error":16.279077272674099,"binary":true}]}


--------------------------------------------------------------------------------
/scripts/run_rmi_intervals.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Runs build/bin/rmi_intervals for every dataset, model count (2^6 .. 2^25),
 3 | # layer-1/layer-2 model combination and bound type; collects the output in
 4 | # results/rmi_intervals.csv.
 5 | # All paths are relative: source or execute this script from the repository root.
 6 | # set -x
 7 | trap "exit" SIGINT
 8 | 
 9 | EXPERIMENT="rmi intervals"
10 | 
11 | DIR_DATA="data"
12 | DIR_RESULTS="results"
13 | FILE_RESULTS="${DIR_RESULTS}/rmi_intervals.csv"
14 | 
15 | BIN="build/bin/rmi_intervals"
16 | 
17 | # Run the binary once and append its output to the results file.
18 | # Arguments: <dataset> <layer1 model> <layer2 model> <n_models> <bound>
19 | run() {
20 |     DATASET=$1
21 |     LAYER1=$2
22 |     LAYER2=$3
23 |     N_MODELS=$4
24 |     BOUND=$5
25 |     DATA_FILE="${DIR_DATA}/${DATASET}"
26 |     "${BIN}" "${DATA_FILE}" "${LAYER1}" "${LAYER2}" "${N_MODELS}" "${BOUND}" >> "${FILE_RESULTS}"
27 | }
28 | 
29 | # Create results directory (no-op if it already exists)
30 | mkdir -p "${DIR_RESULTS}"
31 | 
32 | # Check data downloaded
33 | if [ ! -d "${DIR_DATA}" ];
34 | then
35 |     >&2 echo "Please download datasets first."
36 |     # `return` only works when sourced; fall back to `exit` when executed directly.
37 |     return 1 2>/dev/null || exit 1
38 | fi
39 | 
40 | DATASETS="books_200M_uint64 fb_200M_uint64 osm_cellids_200M_uint64 wiki_ts_200M_uint64"
41 | LAYERS1="linear_spline cubic_spline linear_regression radix"
42 | LAYERS2="linear_spline linear_regression"
43 | BOUNDS="gabs gind labs lind"
44 | 
45 | # Run experiments
46 | echo "dataset,n_keys,layer1,layer2,n_models,bounds,size_in_bytes,mean_interval,median_interval,stdev_interval,min_interval,max_interval" > "${FILE_RESULTS}" # Write csv header (truncates old results)
47 | for dataset in ${DATASETS};
48 | do
49 |     echo "Performing ${EXPERIMENT} on '${dataset}'..."
50 |     for ((i=6; i<=25; i += 1));
51 |     do
52 |         n_models=$((2**i))
53 |         for l1 in ${LAYERS1};
54 |         do
55 |             for l2 in ${LAYERS2};
56 |             do
57 |                 for bound in ${BOUNDS};
58 |                 do
59 |                     run "${dataset}" "${l1}" "${l2}" "${n_models}" "${bound}"
60 |                 done
61 |             done
62 |         done
63 |     done
64 | done
65 | 
66 | 


--------------------------------------------------------------------------------
/experiments/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.2)
 2 | 
 3 | # Stand-alone experiments: target <name> is built from <name>.cpp.
 4 | foreach(experiment
 5 |     rmi_segmentation
 6 |     rmi_errors
 7 |     rmi_intervals
 8 |     rmi_lookup
 9 |     rmi_build
10 |     rmi_guideline
11 | )
12 |     add_executable(${experiment} ${experiment}.cpp)
13 | endforeach()
14 | 
15 | # The index comparison additionally compiles the reference RMI sources
16 | # <dataset>_<0..9>.cpp found in SOSD_PATH (ten per dataset).
17 | set(SOSD_PATH "${PROJECT_SOURCE_DIR}/third_party/RMI/include/rmi_ref")
18 | set(RMI_REF_SOURCES "")
19 | foreach(dataset
20 |     books_200M_uint64
21 |     fb_200M_uint64
22 |     osm_cellids_200M_uint64
23 |     wiki_ts_200M_uint64
24 | )
25 |     foreach(config RANGE 0 9)
26 |         list(APPEND RMI_REF_SOURCES "${SOSD_PATH}/${dataset}_${config}.cpp")
27 |     endforeach()
28 | endforeach()
29 | 
30 | add_executable(index_comparison
31 |     index_comparison.cpp
32 |     ${RMI_REF_SOURCES}
33 | )
34 | 


--------------------------------------------------------------------------------
/scripts/run_rmi_lookup.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Runs build/bin/rmi_lookup for every dataset, layer-1/layer-2 model
 3 | # combination, model count (2^6 .. 2^25) and the bound/search combinations
 4 | # listed below; collects the output in results/rmi_lookup.csv.
 5 | # All paths are relative: source or execute this script from the repository root.
 6 | # set -x
 7 | trap "exit" SIGINT
 8 | 
 9 | EXPERIMENT="rmi lookup"
10 | 
11 | DIR_DATA="data"
12 | DIR_RESULTS="results"
13 | FILE_RESULTS="${DIR_RESULTS}/rmi_lookup.csv"
14 | 
15 | BIN="build/bin/rmi_lookup"
16 | 
17 | # Set number of repetitions and samples
18 | N_REPS="3"
19 | N_SAMPLES="20000000"
20 | PARAMS="--n_reps ${N_REPS} --n_samples ${N_SAMPLES}"
21 | TIMEOUT="90s"
22 | 
23 | DATASETS="books_200M_uint64 fb_200M_uint64 osm_cellids_200M_uint64 wiki_ts_200M_uint64"
24 | LAYER1="cubic_spline linear_spline linear_regression radix"
25 | LAYER2="linear_spline linear_regression"
26 | 
27 | # Run the binary once (aborted by `timeout` after TIMEOUT) and append its
28 | # output to the results file.
29 | # Arguments: <dataset> <layer1 model> <layer2 model> <n_models> <bound> <search>
30 | run() {
31 |     DATASET=$1
32 |     L1=$2
33 |     L2=$3
34 |     N_MODELS=$4
35 |     BOUND=$5
36 |     SEARCH=$6
37 |     DATA_FILE="${DIR_DATA}/${DATASET}"
38 |     # PARAMS is intentionally unquoted so that it splits into separate arguments.
39 |     timeout "${TIMEOUT}" "${BIN}" "${DATA_FILE}" "${L1}" "${L2}" "${N_MODELS}" "${BOUND}" "${SEARCH}" ${PARAMS} >> "${FILE_RESULTS}"
40 | }
41 | 
42 | # Create results directory (no-op if it already exists)
43 | mkdir -p "${DIR_RESULTS}"
44 | 
45 | # Check data downloaded
46 | if [ ! -d "${DIR_DATA}" ];
47 | then
48 |     >&2 echo "Please download datasets first."
49 |     # `return` only works when sourced; fall back to `exit` when executed directly.
50 |     return 1 2>/dev/null || exit 1
51 | fi
52 | 
53 | # Write csv header (truncates old results)
54 | echo "dataset,n_keys,layer1,layer2,n_models,bounds,search,size_in_bytes,rep,n_samples,lookup_time,lookup_accu" > "${FILE_RESULTS}"
55 | 
56 | # Run model type experiment
57 | for dataset in ${DATASETS};
58 | do
59 |     echo "Performing ${EXPERIMENT} on '${dataset}'..."
60 |     for l1 in ${LAYER1};
61 |     do
62 |         for l2 in ${LAYER2};
63 |         do
64 |             for ((i=6; i<=25; i += 1));
65 |             do
66 |                 n_models=$((2**i))
67 |                 run "${dataset}" "${l1}" "${l2}" "${n_models}" none model_biased_linear
68 |                 run "${dataset}" "${l1}" "${l2}" "${n_models}" none model_biased_exponential
69 | 
70 |                 run "${dataset}" "${l1}" "${l2}" "${n_models}" gabs binary
71 | 
72 |                 run "${dataset}" "${l1}" "${l2}" "${n_models}" gind model_biased_binary
73 |                 run "${dataset}" "${l1}" "${l2}" "${n_models}" gind binary
74 | 
75 |                 run "${dataset}" "${l1}" "${l2}" "${n_models}" labs binary
76 | 
77 |                 run "${dataset}" "${l1}" "${l2}" "${n_models}" lind model_biased_binary
78 |                 run "${dataset}" "${l1}" "${l2}" "${n_models}" lind binary
79 |             done
80 |         done
81 |     done
82 | done
83 | 


--------------------------------------------------------------------------------
/scripts/download_data.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Requires bash 4 or newer: the tables below are associative arrays (`declare -A`).
 3 | trap "exit" SIGINT
 4 | 
 5 | DIR_DATA="data"
 6 | 
 7 | # Set download urls
 8 | declare -A urls
 9 | urls["books_200M_uint64"]="https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/JGVF9A/A6HDNT"
10 | urls["fb_200M_uint64"]="https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/JGVF9A/EATHF7"
11 | urls["osm_cellids_200M_uint64"]="https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/JGVF9A/8FX9BV"
12 | urls["wiki_ts_200M_uint64"]="https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/JGVF9A/SVN8PI"
13 | 
14 | # Set md5 for compressed files
15 | declare -A md5zst
16 | md5zst["books_200M_uint64"]="cd1f8bcb0dfd36f9ab08d160b887bf8a"
17 | md5zst["fb_200M_uint64"]="fec241e8b021b198b0849fbd5564c05f"
18 | md5zst["osm_cellids_200M_uint64"]="42575cb58f24bb7ea0a623d422d4c9a6"
19 | md5zst["wiki_ts_200M_uint64"]="6a2b17020959084ce2640177ee4afd5e"
20 | 
21 | # Set md5 for decompressed files
22 | declare -A md5bin
23 | md5bin["books_200M_uint64"]="aeedc7be338399ced89d0bb82287e024"
24 | md5bin["fb_200M_uint64"]="3b0f820caa0d62150e87ce94ec989978"
25 | md5bin["osm_cellids_200M_uint64"]="a7f6b8d2df09fcda5d9cfbc87d765979"
26 | md5bin["wiki_ts_200M_uint64"]="4f1402b1c476d67f77d2da4955432f7d"
27 | 
28 | check_md5() {
29 |     # Succeeds iff the md5 digest of file $1 equals the expected digest $2.
30 |     FILE=$1; MD5_EXPECTED=$2
31 |     echo "Checking '${FILE}'..."
32 |     MD5_ACTUAL=$(md5sum -b "${FILE}" | cut -d ' ' -f 1)
33 |     [ "${MD5_EXPECTED}" == "${MD5_ACTUAL}" ] # quoted: an empty digest must compare unequal, not break `[`
34 | }
35 | 
36 | download() {
37 |     # Fetches the compressed dataset $1 into ${DIR_DATA}/$1.zst; the exit status is that of wget.
38 |     DATASET=$1
39 |     FILE="${DIR_DATA}/${DATASET}.zst"
40 |     URL=${urls[${DATASET}]}
41 |     echo "Downloading '${DATASET}'..."
42 |     wget -q --show-progress -O "${FILE}" "${URL}" # quoted: the urls contain '?', a glob character
43 | }
44 | 
45 | decompress() {
46 |     # Decompresses the zstd archive $1 (-d), forcing overwrite (-f); the exit status is that of zstd.
47 |     FILE=$1
48 |     echo "Decompressing '${FILE}'..."
49 |     zstd -f -d ${FILE}
50 | }
51 | 
52 | # Create data directory
53 | if [ ! -d "${DIR_DATA}" ];
54 | then
55 |     mkdir -p "${DIR_DATA}";
56 | fi
57 | 
58 | # Download datasets, reusing files that already exist in ${DIR_DATA} and pass their md5 check
59 | for dataset in ${!urls[@]};
60 | do
61 |     FILE_BIN=${DIR_DATA}/${dataset}
62 |     if [ -f ${FILE_BIN} ];
63 |     then
64 |         echo "File '${FILE_BIN}' already exists."
65 |         check_md5 ${FILE_BIN} ${md5bin[${dataset}]} && continue # valid decompressed file, nothing to do
66 |     fi
67 |     # An existing valid archive only needs to be decompressed and verified
68 |     FILE_ZST=${DIR_DATA}/${dataset}.zst
69 |     if [ -f ${FILE_ZST} ];
70 |     then
71 |         echo "File '${FILE_ZST}' already exists."
72 |         check_md5 ${FILE_ZST} ${md5zst[${dataset}]} && decompress ${FILE_ZST} && check_md5 ${FILE_BIN} ${md5bin[${dataset}]} && continue
73 |     fi
74 |     # Otherwise: download, verify archive, decompress, verify result; the message below follows any failing step
75 |     download ${dataset} && check_md5 ${FILE_ZST} ${md5zst[${dataset}]} && decompress ${FILE_ZST} && check_md5 ${FILE_BIN} ${md5bin[${dataset}]} && continue
76 |     echo "Download failed. Please try again."
77 | done
78 | 


--------------------------------------------------------------------------------
/scripts/run_rmi_build.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | # Requires bash 4 or newer: associative arrays (`declare -A`) are used further below.
  3 | trap "exit" SIGINT
  4 | 
  5 | EXPERIMENT="rmi build"
  6 | 
  7 | DIR_DATA="data"
  8 | DIR_RESULTS="results"
  9 | FILE_RESULTS="${DIR_RESULTS}/rmi_build.csv"
 10 | 
 11 | BIN="build/bin/rmi_build"
 12 | 
 13 | # Set number of repetitions and the timeout of a single run
 14 | N_REPS="3"
 15 | PARAMS="--n_reps ${N_REPS}"
 16 | TIMEOUT="60s"
 17 | 
 18 | DATASETS="books_200M_uint64 fb_200M_uint64 osm_cellids_200M_uint64 wiki_ts_200M_uint64"
 19 | LAYER1="cubic_spline linear_spline linear_regression radix"
 20 | LAYER2="linear_spline linear_regression"
 21 | BOUNDS="none gabs gind labs lind"
 22 | 
 23 | run() {
 24 |     # Args: dataset, layer-1 model, layer-2 model, number of models, bound type.
 25 |     DATASET=$1; L1=$2; L2=$3
 26 |     N_MODELS=$4; BOUND=$5
 27 |     DATA_FILE="${DIR_DATA}/${DATASET}"
 28 |     # The run is killed after ${TIMEOUT}; everything the binary prints is appended to the results csv.
 29 |     timeout ${TIMEOUT} ${BIN} ${DATA_FILE} ${L1} ${L2} ${N_MODELS} ${BOUND} ${PARAMS} \
 30 |         >> ${FILE_RESULTS}
 31 | }
 32 | 
 33 | # Create results directory
 34 | if [ ! -d "${DIR_RESULTS}" ];
 35 | then
 36 |     mkdir -p "${DIR_RESULTS}";
 37 | fi
 38 | 
 39 | # Check data downloaded
 40 | if [ ! -d "${DIR_DATA}" ];
 41 | then
 42 |     >&2 echo "Please download datasets first."
 43 |     # `return` is only valid when this script is sourced; fall back to `exit` when it is executed.
 44 |     return 1 2>/dev/null || exit 1
 45 | fi
 46 | # Write csv header (this truncates the results of any previous run)
 47 | echo "dataset,n_keys,rmi,layer1,layer2,n_models,bounds,size_in_bytes,rep,build_time,checksum" > ${FILE_RESULTS}
 48 | 
 49 | # Run layer1 and layer 2 model type experiment for 2^6 to 2^25 second-layer models
 50 | for dataset in ${DATASETS};
 51 | do
 52 |     echo "Performing ${EXPERIMENT} (ours) on '${dataset}'..."
 53 |     for ((i=6; i<=25; i += 1));
 54 |     do
 55 |         n_models=$((2**$i))
 56 |         for l1 in ${LAYER1};
 57 |         do
 58 |             for l2 in ${LAYER2};
 59 |             do
 60 |                 for bound in ${BOUNDS};
 61 |                 do
 62 |                     run ${dataset} ${l1} ${l2} ${n_models} ${bound}
 63 |                 done
 64 |             done
 65 |         done
 66 |     done
 67 | done
 68 | 
 69 | 
 70 | # Prepare reference implementation experiment (absolute paths, since the working directory changes below)
 71 | CWD=$(pwd)
 72 | RMI_PATH="third_party/RMI"
 73 | TMP_PATH="${CWD}/${RMI_PATH}/tmp"
 74 | MANIFEST_FILE="${CWD}/${RMI_PATH}/Cargo.toml"
 75 | NAMESPACE="tmp"
 76 | RESULTS_FILE=${CWD}/${FILE_RESULTS}
 77 | mkdir -p ${TMP_PATH}
 78 | cd ${TMP_PATH}
 79 | 
 80 | declare -A l1models
 81 | l1models['linear_spline']="linear_spline"
 82 | l1models['cubic_spline']="cubic"
 83 | l1models['linear_regression']="linear"
 84 | l1models['radix']="radix"
 85 | 
 86 | declare -A l2models
 87 | l2models['linear_spline']="linear_spline"
 88 | l2models['linear_regression']="linear"
 89 | 
 90 | declare -A bounds
 91 | bounds['labs']="" # no extra flag is passed for this bound type
 92 | bounds['none']="--no-errors"
 93 | 
 94 | # Run reference implementation experiment; the array keys are the names written to the csv, the values go to cargo
 95 | for dataset in ${DATASETS};
 96 | do
 97 |     DATA_FILE="${CWD}/${DIR_DATA}/${dataset}"
 98 |     echo "Performing ${EXPERIMENT} (ref) on '${dataset}'..."
 99 |     for ((i=6; i<=25; i += 1));
100 |     do
101 |         n_models=$((2**$i))
102 |         for l1 in ${!l1models[@]};
103 |         do
104 |             for l2 in ${!l2models[@]};
105 |             do
106 |                 for bound in ${!bounds[@]};
107 |                 do
108 |                     for ((rep=0; rep<${N_REPS}; rep += 1));
109 |                     do
110 |                         # Build RMI; the output of cargo is discarded.
111 |                         cargo run --manifest-path ${MANIFEST_FILE} --release -- ${DATA_FILE} ${NAMESPACE} ${l1models[${l1}]},${l2models[${l2}]} ${n_models} ${bounds[${bound}]} > /dev/null
112 | 
113 |                         # Extract results: the digits after '=' on the SIZE and BUILD lines of tmp.h.
114 |                         size=$(cat ${TMP_PATH}/tmp.h | grep SIZE | sed 's/.*=//' | tr -d -c 0-9)
115 |                         build_time=$(cat ${TMP_PATH}/tmp.h | grep BUILD | sed 's/.*=//' | tr -d -c 0-9)
116 | 
117 |                         # Append results to csv; n_keys is hard-coded and the checksum column is written as 0.
118 |                         echo "${dataset},200000000,ref,${l1},${l2},${n_models},${bound},${size},${rep},${build_time},0" >> ${RESULTS_FILE}
119 |                     done
120 |                 done
121 |             done
122 |         done
123 |     done
124 | done
125 | cd $CWD
126 | 


--------------------------------------------------------------------------------
/scripts/plot_rmi_guideline.py:
--------------------------------------------------------------------------------
  1 | #!python3
  2 | import argparse
  3 | import itertools
  4 | import matplotlib.cm as cm
  5 | import matplotlib.pyplot as plt
  6 | import os
  7 | import pandas as pd
  8 | import warnings
  9 | 
 10 | plt.style.use(os.path.join('scripts', 'matplotlibrc'))
 11 | 
 12 | # Ignore warnings
 13 | warnings.filterwarnings( "ignore")
 14 | 
 15 | # Argparse
 16 | parser = argparse.ArgumentParser()
 17 | parser.add_argument('-p', '--paper', help='produce paper plots', action='store_true')
 18 | args = vars(parser.parse_args())
 19 | 
 20 | 
 21 | def plot_guideline(filename='rmi_guideline.pdf', width_fact=5, height_fact=4.2):
 22 |     n_cols = len(datasets)
 23 |     n_rows = 1
 24 | 
 25 |     fig, axs = plt.subplots(n_rows, n_cols, figsize=(width_fact*n_cols, height_fact*n_rows), sharey=False, sharex=True)
 26 |     fig.tight_layout()
 27 | 
 28 |     for col, dataset in enumerate(datasets):
 29 |         ax = axs[col]
 30 |         fast_lookups = list()
 31 |         fast_sizes = list()
 32 |         guide_lookups = list()
 33 |         guide_sizes = list()
 34 |         for budget in budgets:
 35 | 
 36 |             # Fastest configuration
 37 |             fast_confs = df[
 38 |                 (df['dataset']==dataset) &
 39 |                 (df['budget_in_bytes']==budget) &
 40 |                 (df['is_guideline']==False)
 41 |             ]
 42 |             fast_lookup = fast_confs['lookup_in_ns'].min()
 43 |             fast_conf = fast_confs[fast_confs['lookup_in_ns']==fast_lookup]
 44 |             fast_size = fast_conf['size_in_bytes'].iloc[0]
 45 | 
 46 |             fast_lookups.append(fast_lookup)
 47 |             fast_sizes.append(fast_size)
 48 | 
 49 |             # Guideline configuration
 50 |             guide_conf = df[
 51 |                 (df['dataset']==dataset) &
 52 |                 (df['budget_in_bytes']==budget) &
 53 |                 (df['is_guideline']==True)
 54 |             ]
 55 |             guide_lookup = guide_conf['lookup_in_ns'].iloc[0]
 56 |             guide_size = guide_conf['size_in_bytes'].iloc[0]
 57 | 
 58 |             guide_lookups.append(guide_lookup)
 59 |             guide_sizes.append(guide_size)
 60 | 
 61 |         # Plot lookup times
 62 |         ax.plot(fast_sizes, fast_lookups, marker='+', markersize=5, c=colors['fastest'], label='RMI (fastest)')
 63 |         ax.plot(guide_sizes, guide_lookups, c=colors['guideline'], linestyle='dotted', label='RMI (guideline)')
 64 | 
 65 |         # Title
 66 |         ax.set_title(f'{dataset}')
 67 | 
 68 |         # Labels
 69 |         if col==0:
 70 |             ax.set_ylabel('Lookup time [ns]')
 71 |         ax.set_xlabel('Index size [MiB]')
 72 | 
 73 |         # Visuals
 74 |         ax.set_xscale('log')
 75 |         if col==n_cols-1:
 76 |             ax.set_ylim(bottom=0)
 77 | 
 78 |         # Legend
 79 |         if col==0:
 80 |             fig.legend(ncol=2, bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False)
 81 | 
 82 |     fig.savefig(os.path.join(path, filename), bbox_inches='tight')
 83 | 
 84 | 
 85 | if __name__ == "__main__":
 86 |     path = 'results'
 87 | 
 88 |     # Read csv file
 89 |     file = os.path.join(path, 'rmi_guideline.csv')
 90 |     df = pd.read_csv(file, delimiter=',', header=0, comment='#')
 91 | 
 92 |     # Compute median of lookup times
 93 |     df = df.groupby(['dataset','layer1','layer2','n_models','bounds','search','is_guideline']).median().reset_index()
 94 | 
 95 |     # Replace datasets
 96 |     dataset_dict = {
 97 |         "books_200M_uint64": "books",
 98 |         "fb_200M_uint64": "fb",
 99 |         "osm_cellids_200M_uint64": "osmc",
100 |         "wiki_ts_200M_uint64": "wiki"
101 |     }
102 |     df.replace({**dataset_dict}, inplace=True)
103 | 
104 |     # Compute metrics
105 |     df['size_in_MiB'] = df['size_in_bytes'] / (1024 * 1024)
106 |     df['lookup_in_ns'] = df['lookup_time'] / df['n_samples']
107 | 
108 |     # Define varibale lists
109 |     datasets = sorted(df['dataset'].unique())
110 |     budgets = sorted(df['budget_in_bytes'].unique())
111 | 
112 |     # Set colors
113 |     colors = {}
114 |     cmap = cm.get_cmap('tab10')
115 |     n_colors = 8
116 |     for i, x in enumerate(['fastest', 'guideline']):
117 |         colors[x] = cmap((i)/n_colors)
118 | 
119 |     if args['paper']:
120 |         # Plot guideline
121 |         filename = 'rmi_guideline.pdf'
122 |         print(f'Plotting guideline to \'{filename}\'...')
123 |         plot_guideline(filename, 2.7, 2)
124 |     else:
125 |         # Plot guideline
126 |         filename = 'rmi_guideline.pdf'
127 |         print(f'Plotting guideline to \'{filename}\'...')
128 |         plot_guideline(filename)
129 | 


--------------------------------------------------------------------------------
/scripts/plot_rmi_segmentation.py:
--------------------------------------------------------------------------------
  1 | #!python3
  2 | import argparse
  3 | import matplotlib.cm as cm
  4 | import matplotlib.pyplot as plt
  5 | import matplotlib.ticker as mtick
  6 | import os
  7 | import pandas as pd
  8 | import warnings
  9 | 
 10 | plt.style.use(os.path.join('scripts','matplotlibrc'))
 11 | 
 12 | # Ignore warnings
 13 | warnings.filterwarnings( "ignore")
 14 | 
 15 | # Argparse
 16 | parser = argparse.ArgumentParser()
 17 | parser.add_argument('-p', '--paper', help='produce paper plots', action='store_true')
 18 | args = vars(parser.parse_args())
 19 | 
 20 | 
 21 | def plot_frac_empty(filename='rmi_segmentation-frac_empty.pdf', width_fact=5, height_fact=4.2):
 22 |     n_cols = len(datasets)
 23 |     n_rows = 1
 24 | 
 25 |     fig, axs = plt.subplots(n_rows, n_cols, figsize=(width_fact*n_cols, height_fact*n_rows), sharey=True, sharex=True)
 26 |     fig.tight_layout()
 27 | 
 28 |     for col, dataset in enumerate(datasets):
 29 |         ax = axs[col]
 30 |         for model in models:
 31 |             data = df[
 32 |                     (df['dataset']==dataset) &
 33 |                     (df['model']==model)
 34 |             ]
 35 |             if not data.empty:
 36 |                 ax.plot(data['n_segments'], data['frac_empty'], label=model, c=colors[model])
 37 | 
 38 |         # Title
 39 |         ax.set_title(dataset)
 40 | 
 41 |         # Labels
 42 |         if col==0:
 43 |             ax.set_ylabel('Percentage of\nempty segments')
 44 |         ax.set_xlabel('# of segments')
 45 | 
 46 |         # Visuals
 47 |         ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
 48 |         ax.set_xscale('log', base=2)
 49 |         ax.set_xticks([2**12, 2**20])
 50 | 
 51 |         # Legend
 52 |         if col==0:
 53 |             fig.legend(ncol=len(models), bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False)
 54 | 
 55 |     fig.savefig(os.path.join(path, filename), bbox_inches='tight')
 56 | 
 57 | 
 58 | def plot_max_segment(filename='rmi_segmentation-max_segment.pdf', width_fact=5, height_fact=4.2):
 59 |     n_cols = len(datasets)
 60 |     n_rows = 1
 61 | 
 62 |     fig, axs = plt.subplots(n_rows, n_cols, figsize=(width_fact*n_cols, height_fact*n_rows), sharey=True, sharex=True)
 63 |     fig.tight_layout()
 64 | 
 65 |     for col, dataset in enumerate(datasets):
 66 |         ax = axs[col]
 67 |         for model in models:
 68 |             data = df[
 69 |                     (df['dataset']==dataset) &
 70 |                     (df['model']==model)
 71 |             ]
 72 |             if not data.empty:
 73 |                 ax.plot(data['n_segments'], data['max'], label=model, c=colors[model])
 74 | 
 75 |         # Title
 76 |         ax.set_title(dataset)
 77 | 
 78 |         # Labels
 79 |         if col==0:
 80 |             ax.set_ylabel('Size of largest\nsegment')
 81 |         ax.set_xlabel('# of segments')
 82 | 
 83 |         # Visuals
 84 |         ax.set_yscale('log')
 85 |         ax.set_xscale('log', base=2)
 86 |         ax.set_yticks([10**2, 10**4, 10**6, 10**8])
 87 |         ax.set_xticks([2**12, 2**20])
 88 | 
 89 |         # Legend
 90 |         if col==0:
 91 |             fig.legend(ncol=len(models), bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False)
 92 | 
 93 |     fig.savefig(os.path.join(path, filename), bbox_inches='tight')
 94 | 
 95 | if __name__ == "__main__":
 96 |     path = 'results'
 97 | 
 98 |     # Read csv file
 99 |     file = os.path.join(path, 'rmi_segmentation.csv')
100 |     df = pd.read_csv(file, delimiter=',', header=0, comment='#')
101 | 
102 |     # Replace datasets and model names
103 |     dataset_dict = {
104 |         "books_200M_uint64": "books",
105 |         "fb_200M_uint64": "fb",
106 |         "osm_cellids_200M_uint64": "osmc",
107 |         "wiki_ts_200M_uint64": "wiki"
108 |     }
109 |     model_dict = {
110 |         "linear_regression": "LR",
111 |         "linear_spline": "LS",
112 |         "cubic_spline": "CS",
113 |         "radix": "RX"
114 |     }
115 |     df.replace({**dataset_dict, **model_dict}, inplace=True)
116 | 
117 |     # Compute metrics
118 |     df['frac_empty'] = df['n_empty'] / df['n_segments']
119 | 
120 |     # Define variable lists
121 |     datasets = sorted(df['dataset'].unique())
122 |     models = sorted(df['model'].unique())
123 | 
124 |     # Set colors
125 |     cmap = cm.get_cmap('tab20b')
126 |     n_colors = 5
127 |     colors = {}
128 |     for i, model in enumerate(models):
129 |         colors[model] = cmap(i/n_colors+0.1)
130 | 
131 |     if args['paper']:
132 |         # Plot empty segments
133 |         filename = 'rmi_segmentation-frac_empty.pdf'
134 |         print(f'Plotting empty segments to \'{filename}\'...')
135 |         plot_frac_empty(filename, 2, 1.8)
136 | 
137 |         # Plot max segment
138 |         filename = 'rmi_segmentation-max_segment.pdf'
139 |         print(f'Plotting max segments to \'{filename}\'...')
140 |         plot_max_segment(filename, 2, 1.8)
141 |     else:
142 |         # Plot empty segments
143 |         filename = 'rmi_segmentation-frac_empty.pdf'
144 |         print(f'Plotting empty segments to \'{filename}\'...')
145 |         plot_frac_empty(filename)
146 | 
147 |         # Plot max segment
148 |         filename = 'rmi_segmentation-max_segment.pdf'
149 |         print(f'Plotting max segments to \'{filename}\'...')
150 |         plot_max_segment(filename)
151 | 


--------------------------------------------------------------------------------
/experiments/rmi_segmentation.cpp:
--------------------------------------------------------------------------------
  1 | #include "argparse/argparse.hpp"
  2 | 
  3 | #include "rmi/models.hpp"
  4 | #include "rmi/util/fn.hpp"
  5 | 
  6 | using key_type = uint64_t;
  7 | 
  8 | 
  9 | /**
 10 |  * Computes statistical properties of the segments created when segmenting the @p keys with @p Model and writes results
 11 |  * to `std::cout` as a single csv row.
 12 |  *
 13 |  * The row holds the dataset name, the number of keys, the model name and the number of segments, followed by the
 14 |  * values of `mean`, `stdev`, `median`, `min` and `max` over the per-segment key counts and the number of empty segments.
 15 |  * @tparam Key key type (not referenced by the body, the signature uses `key_type`)
 16 |  * @tparam Model model type
 17 |  * @param keys on which the model is built
 18 |  * @param n_segments number of segments to be created
 19 |  * @param dataset_name name of the dataset
 20 |  * @param model name of the model type used for segmenting the keys
 21 |  */
 22 | template<typename Key, typename Model>
 23 | void experiment(const std::vector<key_type> &keys,
 24 |                 const std::size_t n_segments,
 25 |                 const std::string dataset_name,
 26 |                 const std::string model)
 27 | {
 28 |     // Build model.
 29 |     const double segments_per_key = static_cast<double>(n_segments) / keys.size();
 30 |     Model m(keys.begin(), keys.end(), 0, segments_per_key);
 31 | 
 32 |     // Count how many keys are assigned to each segment.
 33 |     std::vector<std::size_t> segments(n_segments, 0);
 34 |     for (const auto key : keys) {
 35 |         // Clamp the prediction to a valid segment id.
 36 |         const std::size_t segment = std::clamp<double>(m.predict(key), 0, n_segments - 1);
 37 |         ++segments[segment];
 38 |     }
 39 | 
 40 |     // Number of segments that received no key.
 41 |     const auto n_empty = std::count(segments.begin(), segments.end(), 0);
 42 | 
 43 |     // Report results.
 44 |                  // Dataset
 45 |     std::cout << dataset_name << ','
 46 |               << keys.size() << ','
 47 |                  // Model config
 48 |               << model << ','
 49 |               << n_segments << ','
 50 |                  // Segment sizes
 51 |               << mean(segments) << ','
 52 |               << stdev(segments) << ','
 53 |               << median(segments) << ','
 54 |               << min(segments) << ','
 55 |               << max(segments) << ','
 56 |               << n_empty << std::endl;
 57 | }
 58 | 
 59 | /**
 60 |  * @brief pointer to an instantiation of `experiment`
 61 |  */
 62 | using exp_fn_ptr = void (*)(const std::vector<key_type>&,
 63 |                             const std::size_t,
 64 |                             const std::string,
 65 |                             const std::string);
 66 | 
 67 | #define ENTRY(L, T) \
 68 |     { #L, &experiment<key_type, T> }
 69 | 
 70 | static std::map<std::string, exp_fn_ptr> exp_map {
 71 |     ENTRY(linear_regression, rmi::LinearRegression),
 72 |     ENTRY(linear_spline,     rmi::LinearSpline),
 73 |     ENTRY(cubic_spline,      rmi::CubicSpline),
 74 |     ENTRY(radix,             rmi::Radix<key_type>),
 75 | }; ///< Maps the name of a model type to the experiment instantiated for it.
 76 | #undef ENTRY
 77 | 
 78 | 
 79 | /**
 80 |  * Performs segmentation using a model type and segment count provided via command line argument and reports several
 81 |  * statistical properties of the resulting segments.
 82 |  * @param argc arguments counter
 83 |  * @param argv arguments vector
 84 |  */
 85 | int main(int argc, char *argv[])
 86 | {
 87 |     // Initialize argument parser.
 88 |     argparse::ArgumentParser program(argv[0], "0.1");
 89 | 
 90 |     // Define arguments.
 91 |     program.add_argument("filename")
 92 |         .help("path to binary file containing uint64_t keys");
 93 | 
 94 |     program.add_argument("model")
 95 |         .help("model type, either linear_regression, linear_spline, cubic_spline, or radix.");
 96 | 
 97 |     // The action must yield exactly `std::size_t`, the type requested from `get` below.
 98 |     program.add_argument("n_segments")
 99 |         .help("number of segments, power of two is recommended.")
100 |         .action([](const std::string &s) -> std::size_t { return std::stoull(s); });
101 | 
102 |     program.add_argument("--header")
103 |         .help("output csv header")
104 |         .default_value(false)
105 |         .implicit_value(true);
106 | 
107 |     // Parse arguments. Diagnostics go to `std::cerr` to keep them out of the csv written to `std::cout`.
108 |     try {
109 |         program.parse_args(argc, argv);
110 |     }
111 |     catch (const std::exception &err) { // also covers the exceptions thrown by `std::stoull`
112 |         std::cerr << err.what() << '\n' << program;
113 |         exit(EXIT_FAILURE);
114 |     }
115 | 
116 |     // Read arguments.
117 |     const auto filename = program.get<std::string>("filename");
118 |     const auto dataset_name = split(filename, '/').back();
119 |     const auto model = program.get<std::string>("model");
120 |     const auto n_segments = program.get<std::size_t>("n_segments");
121 | 
122 |     // Validate arguments before the keys are loaded.
123 |     const auto exp_it = exp_map.find(model);
124 |     if (exp_it == exp_map.end()) {
125 |         std::cerr << "Error: " << model << " is not a valid model type." << std::endl;
126 |         exit(EXIT_FAILURE);
127 |     }
128 |     if (n_segments == 0) { // `n_segments - 1` in the experiment would wrap around
129 |         std::cerr << "Error: n_segments must be greater than zero." << std::endl;
130 |         exit(EXIT_FAILURE);
131 |     }
132 | 
133 |     // Load keys.
134 |     auto keys = load_data<key_type>(filename);
135 | 
136 |     // Output header.
137 |     if (program["--header"]  == true)
138 |         std::cout << "dataset,n_keys,"
139 |                   << "model,n_segments,"
140 |                   << "mean,stdev,median,"
141 |                   << "min,max,"
142 |                   << "n_empty"
143 |                   << std::endl;
144 | 
145 |     // Run experiment.
146 |     (*exp_it->second)(keys, n_segments, dataset_name, model);
147 | 
148 |     exit(EXIT_SUCCESS);
149 | }
150 | 


--------------------------------------------------------------------------------
/scripts/plot_rmi_errors.py:
--------------------------------------------------------------------------------
  1 | #!python3
  2 | import argparse
  3 | import itertools
  4 | import matplotlib.cm as cm
  5 | import matplotlib.pyplot as plt
  6 | import os
  7 | import pandas as pd
  8 | import warnings
  9 | 
 10 | plt.style.use(os.path.join('scripts', 'matplotlibrc'))
 11 | 
 12 | # Ignore warnings
 13 | warnings.filterwarnings( "ignore")
 14 | 
 15 | # Argparse
 16 | parser = argparse.ArgumentParser()
 17 | parser.add_argument('-p', '--paper', help='produce paper plots', action='store_true')
 18 | args = vars(parser.parse_args())
 19 | 
 20 | 
 21 | def plot(x, y, xlabel, ylabel, filename):
 22 |     n_cols = len(datasets)
 23 |     n_rows = 1
 24 | 
 25 |     fig, axs = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4.2*n_rows), sharey=True, sharex=True)
 26 |     fig.tight_layout()
 27 | 
 28 |     for col, dataset in enumerate(datasets):
 29 |         ax = axs[col]
 30 |         for l1 in l1_models:
 31 |             for l2 in l2_models:
 32 |                 data = df[
 33 |                     (df['dataset']==dataset) &
 34 |                     (df['layer1']==l1) &
 35 |                     (df['layer2']==l2)
 36 |                 ]
 37 |                 if not data.empty:
 38 |                     ax.plot(data[x], data[y], label=f'{l1}$\mapsto${l2}', color=colors[(l1,l2)])
 39 | 
 40 |         # Title
 41 |         ax.set_title(dataset)
 42 | 
 43 |         # Labels
 44 |         if col==0:
 45 |             ax.set_xlabel(xlabel)
 46 |         ax.set_ylabel(ylabel)
 47 | 
 48 |         # Visuals
 49 |         ax.set_xscale('log', base=2)
 50 |         ax.set_yscale('log')
 51 | 
 52 |         # Legend
 53 |         if col==0:
 54 |             fig.legend(ncol=len(l1_models)*len(l2_models), bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False)
 55 | 
 56 |     fig.savefig(os.path.join(path, filename), bbox_inches='tight')
 57 | 
 58 | 
 59 | def plot_paper(x, y, xlabel, ylabel, filename):
 60 |     l1_groups = [['CS','LR'],['LS','RX']]
 61 |     l2_models = ['LR','LS']
 62 | 
 63 |     n_cols = len(l1_groups)
 64 |     n_rows = len(datasets)
 65 | 
 66 |     fig, axs = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 2.7*n_rows), sharey='row', sharex=True)
 67 |     fig.tight_layout()
 68 | 
 69 |     for row, dataset in enumerate(datasets):
 70 |         for col, l1_models in enumerate(l1_groups):
 71 |             ax = axs[row,col]
 72 | 
 73 |             for l1 in l1_models:
 74 |                 for l2 in l2_models:
 75 |                     data = df[
 76 |                         (df['dataset']==dataset) &
 77 |                         (df['layer1']==l1) &
 78 |                         (df['layer2']==l2)
 79 |                     ]
 80 |                     if not data.empty:
 81 |                         ax.plot(data[x], data[y], label=f'{l1}$\mapsto${l2}', color=colors[(l1,l2)])
 82 | 
 83 |             # Title
 84 |             ax.set_title(dataset)
 85 | 
 86 |             # Labels
 87 |             if row==n_rows - 1:
 88 |                 ax.set_xlabel(xlabel)
 89 |             if col==0:
 90 |                 ax.set_ylabel(ylabel)
 91 | 
 92 |             # Visuals
 93 |             ax.set_xscale('log', base=2)
 94 |             ax.set_yscale('log')
 95 |             if dataset in ['fb','osmc']:
 96 |                 ax.set_ylim(bottom=0.7)
 97 | 
 98 |             # Legend
 99 |             if row==0 and col==n_cols - 1:
100 |                 fig.legend(ncol=4, bbox_to_anchor=(0.5, 1), loc='lower center')
101 | 
102 |     fig.savefig(os.path.join(path, filename), bbox_inches='tight')
103 | 
104 | if __name__ == "__main__":
105 |     path = 'results'
106 | 
107 |     # Read csv file
108 |     file = os.path.join(path, 'rmi_errors.csv')
109 |     df = pd.read_csv(file, delimiter=',', header=0, comment='#')
110 | 
111 |     # Replace datasets and model names
112 |     dataset_dict = {
113 |         "books_200M_uint64": "books",
114 |         "fb_200M_uint64": "fb",
115 |         "osm_cellids_200M_uint64": "osmc",
116 |         "wiki_ts_200M_uint64": "wiki"
117 |     }
118 |     model_dict = {
119 |         "linear_regression": "LR",
120 |         "linear_spline": "LS",
121 |         "cubic_spline": "CS",
122 |         "radix": "RX"
123 |     }
124 |     df.replace({**dataset_dict, **model_dict}, inplace=True)
125 | 
126 |     # Define variable lists
127 |     datasets = sorted(df['dataset'].unique())
128 |     l1_models = sorted(df['layer1'].unique())
129 |     l2_models = sorted(df['layer2'].unique())
130 | 
131 |     # Set colors
132 |     colors = {}
133 |     cmap = cm.get_cmap('tab10')
134 |     n_colors = 10
135 |     for i, (l1, l2) in enumerate(itertools.product(l1_models, l2_models)):
136 |         colors[(l1,l2)] = cmap(i/n_colors)
137 | 
138 |     if args['paper']:
139 |         # Plot median absolute error
140 |         filename = 'rmi_errors-median_absolute_error.pdf'
141 |         print(f'Plotting median absolute error to \'{filename}\'...')
142 |         plot_paper('n_models', 'median_ae', '# of segments', 'Median absolute error', filename)
143 |     else:
144 |         # Plot median absolute error
145 |         filename = 'rmi_errors-median_absolute_error.pdf'
146 |         print(f'Plotting median absolute error to \'{filename}\'...')
147 |         plot('n_models', 'median_ae', '# of segments', 'Median absolute error', filename)
148 | 
149 |         # Plot mean absolute error
150 |         filename = 'rmi_errors-mean_absolute_error.pdf'
151 |         print(f'Plotting mean absolute error to \'{filename}\'...')
152 |         plot('n_models', 'mean_ae', '# of segments', 'Mean absolute error', filename)
153 | 
154 |         # Plot max absolute error
155 |         filename = 'rmi_errors-max_absolute_error.pdf'
156 |         print(f'Plotting max absolute error to \'{filename}\'...')
157 |         plot('n_models', 'max_ae', '# of segments', 'Maximum absolute error', filename)
158 | 


--------------------------------------------------------------------------------
/scripts/plot_rmi_intervals.py:
--------------------------------------------------------------------------------
  1 | #!python3
  2 | import argparse
  3 | import matplotlib.cm as cm
  4 | import matplotlib.pyplot as plt
  5 | import os
  6 | import pandas as pd
  7 | import warnings
  8 | 
  9 | plt.style.use(os.path.join('scripts', 'matplotlibrc'))
 10 | 
 11 | # Ignore warnings
 12 | warnings.filterwarnings( "ignore")
 13 | 
 14 | # Argparse
 15 | parser = argparse.ArgumentParser()
 16 | parser.add_argument('-p', '--paper', help='produce paper plots', action='store_true')
 17 | args = vars(parser.parse_args())
 18 | 
 19 | 
 20 | def plot(x, y, xlabel, ylabel, filename):
 21 |     n_cols = len(configs)
 22 |     n_rows = len(datasets)
 23 | 
 24 |     fig, axs = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4.2*n_rows), sharey=True, sharex=True)
 25 |     fig.tight_layout()
 26 | 
 27 |     for row, dataset in enumerate(datasets):
 28 |         for col, config in enumerate(configs):
 29 |             ax = axs[row,col]
 30 |             for bound in bounds:
 31 |                 data = df[
 32 |                     (df['dataset']==dataset) &
 33 |                     (df['config']==config) &
 34 |                     (df['bounds']==bound)
 35 |                 ]
 36 |                 if not data.empty:
 37 |                     ax.plot(data[x], data[y], label=bound, color=colors[bound])
 38 | 
 39 |             # Title
 40 |             ax.set_title(f'{dataset} ({config})')
 41 | 
 42 |             # Labels
 43 |             if row==n_rows - 1:
 44 |                 ax.set_xlabel(xlabel)
 45 |             if col==0:
 46 |                 ax.set_ylabel(ylabel)
 47 | 
 48 |             # Visuals
 49 |             ax.set_xscale('log')
 50 |             ax.set_yscale('log')
 51 | 
 52 |             # Legend
 53 |             if row==0 and col==0:
 54 |                 fig.legend(ncol=len(bounds), bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False)
 55 | 
 56 |     fig.savefig(os.path.join(path, filename), bbox_inches='tight')
 57 | 
 58 | 
 59 | def plot_paper(x, y, xlabel, ylabel, filename):
 60 |     configs = ['LS$\mapsto$LR', 'RX$\mapsto$LS']
 61 |     datasets = ['books', 'osmc', 'wiki']
 62 | 
 63 |     n_cols = len(configs)
 64 |     n_rows = len(datasets)
 65 | 
 66 |     fig, axs = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 2.7*n_rows), sharey='row', sharex=True)
 67 |     fig.tight_layout()
 68 | 
 69 |     for row, dataset in enumerate(datasets):
 70 |         for col, config in enumerate(configs):
 71 |             ax = axs[row,col]
 72 |             for bound in bounds:
 73 |                 data = df[
 74 |                     (df['dataset']==dataset) &
 75 |                     (df['config']==config) &
 76 |                     (df['bounds']==bound)
 77 |                 ]
 78 |                 if not data.empty:
 79 |                     ax.plot(data[x], data[y], label=bound, color=colors[bound])
 80 | 
 81 |             # Title
 82 |             ax.set_title(f'{dataset} ({config})')
 83 | 
 84 |             # Labels
 85 |             if row==n_rows - 1:
 86 |                 ax.set_xlabel(xlabel)
 87 |             if col==0:
 88 |                 ax.set_ylabel(ylabel)
 89 | 
 90 |             # Visuals
 91 |             ax.set_xscale('log')
 92 |             ax.set_yscale('log')
 93 |             if col==n_cols - 1:
 94 |                 ax.set_ylim(bottom=4, top=None)
 95 | 
 96 |             # Legend
 97 |             if row==0 and col==0:
 98 |                 fig.legend(ncol=len(bounds), bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False)
 99 | 
100 |     fig.savefig(os.path.join(path, filename), bbox_inches='tight')
101 | 
102 | 
103 | if __name__ == "__main__":
104 |     path = 'results'
105 | 
106 |     # Read csv file
107 |     file = os.path.join(path, 'rmi_intervals.csv')
108 |     df = pd.read_csv(file, delimiter=',', header=0, comment='#')
109 | 
110 |     # Replace datasets, model names, and bounds
111 |     dataset_dict = {
112 |         "books_200M_uint64": "books",
113 |         "fb_200M_uint64": "fb",
114 |         "osm_cellids_200M_uint64": "osmc",
115 |         "wiki_ts_200M_uint64": "wiki"
116 |     }
117 |     model_dict = {
118 |         "cubic_spline": "CS",
119 |         "linear_spline": "LS",
120 |         "linear_regression": "LR",
121 |         "radix": "RX"
122 |     }
123 |     bounds_dict = {
124 |         "labs": "LAbs",
125 |         "lind": "LInd",
126 |         "gabs": "GAbs",
127 |         "gind": "GInd",
128 |         "none": "NB"
129 |     }
130 |     df.replace({**dataset_dict, **model_dict, **bounds_dict}, inplace=True)
131 | 
132 |     # Compute model combinations and metrics
133 |     df['config'] = df['layer1'] + '$\mapsto$' + df['layer2']
134 |     df['size_in_MiB'] = df['size_in_bytes'] / (1024 * 1024)
135 | 
136 |     # Define varibale lists
137 |     datasets = sorted(df['dataset'].unique())
138 |     configs = sorted(df['config'].unique())
139 |     bounds = sorted(df['bounds'].unique())
140 | 
141 |     # Set colors
142 |     colors = {}
143 |     cmap = cm.get_cmap('Dark2')
144 |     n_colors = 8
145 |     for i, bound in enumerate(bounds):
146 |         colors[bound] = cmap(i/n_colors)
147 | 
148 |     if args['paper']:
149 |         #  Plot median interval size
150 |         filename = 'rmi_intervals-median_interval.pdf'
151 |         print(f'Plotting median interval to \'{filename}\'...')
152 |         plot_paper('size_in_MiB', 'median_interval', 'Index size [MiB]', 'Median search\ninterval size', filename)
153 |     else:
154 |         #  Plot mean interval size
155 |         filename = 'rmi_intervals-mean_interval.pdf'
156 |         print(f'Plotting mean interval to \'{filename}\'...')
157 |         plot('size_in_MiB', 'mean_interval', 'Index size [MiB]', 'Mean search\ninterval size', filename)
158 | 
159 |         #  Plot median interval size
160 |         filename = 'rmi_intervals-median_interval.pdf'
161 |         print(f'Plotting median interval to \'{filename}\'...')
162 |         plot('size_in_MiB', 'median_interval', 'Index size [MiB]', 'Median search\ninterval size', filename)
163 | 
164 |         #  Plot max interval size
165 |         filename = 'rmi_intervals-max_interval.pdf'
166 |         print(f'Plotting max interval to \'{filename}\'...')
167 |         plot('size_in_MiB', 'max_interval', 'Index size [MiB]', 'Max search\ninterval size', filename)
168 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # A Critical Analysis of Recursive Model Indexes
  2 | Code for our [VLDB paper](https://www.vldb.org/pvldb/vol15/p1079-maltry.pdf)
  3 | and [arXiv report](https://arxiv.org/abs/2106.16166).
  4 | 
  5 | ## Build
  6 | First clone the repository including all submodules.
  7 | ```sh
  8 | git clone --recursive https://github.com/BigDataAnalyticsGroup/analysis-rmi.git
  9 | cd analysis-rmi
 10 | ```
 11 | Then download the datasets and generate the source files of the RMI reference
 12 | implementation.
 13 | ```sh
 14 | scripts/download_data.sh
 15 | scripts/rmi_ref/prepare_rmi_ref.sh
 16 | ```
 17 | Finally, the project can then be built as follows.
 18 | ```
 19 | mkdir build
 20 | cd build
 21 | cmake -DCMAKE_BUILD_TYPE=Release ..
 22 | make
 23 | bin/example
 24 | ```
 25 | 
 26 | ## Example
 27 | ```c++
 28 | // Initialize random number generator.
 29 | using key_type = uint64_t;
 30 | std::mt19937 gen(42);
 31 | std::uniform_int_distribution<key_type> key_distrib(0, 1UL << 48);
 32 | auto rand = [&gen, &key_distrib] { return key_distrib(gen); };
 33 | 
 34 | // Create 1M random keys.
 35 | std::size_t n_keys = 1e7;
 36 | std::vector<key_type> keys(n_keys);
 37 | std::generate(keys.begin(), keys.end(), rand);
 38 | std::sort(keys.begin(), keys.end());
 39 | 
 40 | // Build a two-layer RMI.
 41 | using layer1_type = rmi::LinearSpline;
 42 | using layer2_type = rmi::LinearRegression;
 43 | std::size_t layer2_size = 2UL << 16;
 44 | rmi::RmiLAbs<key_type, layer1_type, layer2_type> rmi(keys, layer2_size);
 45 | 
 46 | // Pick a key.
 47 | std::uniform_int_distribution<std::size_t> uniform_distrib(0, n_keys - 1);
 48 | key_type key = keys[uniform_distrib(gen)];
 49 | 
 50 | // Perform a lookup.
 51 | auto range = rmi.search(key);
 52 | auto pos = std::lower_bound(keys.begin() + range.lo, keys.begin() + range.hi, key);
 53 | std::cout << "Key " << key << " is located at position "
 54 |           << std::distance(keys.begin(), pos) << '.' << std::endl;
 55 | ```
 56 | 
 57 | ## Reproducing Experimental Results
 58 | We provide the following experiments from our paper.
 59 | * `rmi_segmentation`: Compute statistical properties on the segment sizes
 60 |   resulting from various root models (Section 5.1).
 61 | * `rmi_errors`: Compute statistical properties on the prediction errors of a
 62 |   wide range of RMI configurations (Section 5.2).
 63 | * `rmi_intervals`: Compute statistical properties on the error interval sizes
 64 |   of a wide range of RMI configurations (Section 5.3).
 65 | * `rmi_lookup`: Measure lookup times for a wide range of RMI configurations
 66 |   (Section 6).
 67 | * `rmi_build`: Measure build times for a wide range of RMI configurations and
 68 |   compare against the reference implementation (Section 7).
 69 | * `rmi_guideline`: Measure lookup times for a wide range of RMI configurations
 70 |   and compare against configurations resulting from our guideline (Section 8).
 71 | * `index_comparison`: Compare several indexes in terms of lookup time and build
 72 |   time (Section 9).
 73 | 
 74 | Below, we explain step by step how to reproduce our experimental results.
 75 | 
 76 | ### Preliminaries
 77 | The following tools are required to reproduce our results.
 78 | * C++ compiler supporting C++17.
 79 | * `bash>=4`: run shell scripts.
 80 | * `cmake>=3.2`: build configuration.
 81 | * `md5sum`: validate the datasets.
 82 | * `rust`: generate reference RMIs from
 83 |   [learnedsystems/RMI](https://github.com/learnedsystems/RMI).
 84 | * `timeout`: abort experiments of slow configurations.
 85 | * `wget`: download the datasets.
 86 | * `zstd`: decompress the datasets.
 87 | 
 88 | In the following, we assume that all scripts are run from the root directory of
 89 | this repository. If you want to plot the results, install the corresponding
 90 | Python requirements.
 91 | ```sh
 92 | pip install -r requirements.txt
 93 | ```
 94 | 
 95 | ### Running And Plotting a Single Experiment
 96 | We provide a script for running each experiment with the exact same
 97 | configuration used in the paper. To run experiment `<experiment>`, simply
 98 | execute the corresponding script `scripts/run_<experiment>.sh`, e.g., to
 99 | reproduce the experiment `index_comparison` proceed as follows.
100 | ```sh
101 | scripts/run_index_comparison.sh
102 | ```
103 | 
104 | Depending on the hardware, experiments involving measurements of lookup time
105 | might run several days. Results will be written to `results/<experiment>.csv`
106 | in csv format with an appropriate header.
107 | 
108 | Afterwards, the results can be plotted by running
109 | `scripts/plot_<experiment>.py`, e.g., to plot the results of the experiment
110 | `index_comparison` proceed as follows.
111 | ```sh
112 | scripts/plot_index_comparison.py
113 | ```
114 | Note that this will visualize _all_ results of the experiment. To reproduce the
115 | paper plots, execute the Python script with argument `--paper`.
116 | 
117 | The plots will be prefixed by the experiment name and placed in `results/`.
118 | 
119 | ### Running and Plotting All Experiments at Once
120 | To reproduce all experiments at once, run the script `scripts/run_all.sh`.
121 | Executing all experiments will take several days. Results will be written to
122 | `results/<experiment>.csv` in csv format with an appropriate header. Plots can
123 | be produced as described above.
124 | 
125 | Afterwards, all results can be visualized by executing the script
126 | `scripts/plot_all.sh`. To reproduce only the plots from the paper, execute the
127 | script `scripts/plot_paper.sh`. The resulting plots will be prefixed by the
128 | experiment name and placed in `results/`.
129 | 
130 | ## Documentation
131 | Code documentation can be generated using `doxygen` by running the following command.
132 | ```sh
133 | doxygen Doxyfile
134 | ```
135 | The code documentation will be placed in `doxy/html/`.
136 | 
137 | ## Cite
138 | VLDB paper:
139 | ```
140 | @article{maltry2022critical,
141 |     title={A Critical Analysis of Recursive Model Indexes},
142 |     author={Marcel Maltry and Jens Dittrich},
143 |     journal={Proc. {VLDB} Endow.},
144 |     volume={15},
145 |     number={5},
146 |     pages={1079--1091},
147 |     year={2022}
148 | }
149 | ```
150 | 
151 | arXiv report:
152 | ```
153 | @misc{maltry2021criticalarxiv,
154 |     title={A Critical Analysis of Recursive Model Indexes},
155 |     author={Marcel Maltry and Jens Dittrich},
156 |     year={2021},
157 |     eprint={2106.16166},
158 |     archivePrefix={arXiv},
159 |     primaryClass={cs.DB}
160 | }
161 | ```
162 | 


--------------------------------------------------------------------------------
/include/rmi/util/fn.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <algorithm>
  4 | #include <fstream>
  5 | #include <iostream>
  6 | #include <limits>
  7 | #include <numeric>
  8 | #include <sstream>
  9 | #include <type_traits>
 10 | #include <vector>
 11 | 
 12 | 
 13 | /*======================================================================================================================
 14 |  * Bit Functions
 15 |  *====================================================================================================================*/
 16 | 
 17 | /**
 18 |  * Computes the amount of bits needed to represent unsigned value @p n.
 19 |  * @tparam Numeric the type of the value
 20 |  * @param n the value
 21 |  * @return the bit-width of the value, which is 0 for `n == 0`
 22 |  */
 23 | template<typename Numeric>
 24 | uint8_t bit_width(Numeric n)
 25 | {
 26 |     static_assert(std::is_unsigned<Numeric>::value, "not defined for signed integral types");
 27 |     // The count-leading-zeros builtins are undefined for 0, so answer that case directly.
 28 |     if (n == 0) return 0;
 29 | 
 30 |     int lz; // number of leading zero bits
 31 |     if constexpr (std::is_same_v<Numeric, unsigned>) {
 32 |         lz = __builtin_clz(n);
 33 |     } else if constexpr (std::is_same_v<Numeric, unsigned long>) {
 34 |         lz = __builtin_clzl(n);
 35 |     } else if constexpr (std::is_same_v<Numeric, unsigned long long>) {
 36 |         lz = __builtin_clzll(n);
 37 |     } else {
 38 |         static_assert(sizeof(Numeric) > sizeof(unsigned long long), "unsupported width of integral type");
 39 |     }
 40 |     return sizeof(Numeric) * 8 - lz;
 41 | }
 42 | 
 43 | /**
 44 |  * Computes the length of the common prefix of two numeric values @p v1 and @p v2.
 45 |  * @tparam Numeric the (integral) type of the values
 46 |  * @param v1, v2 the values to compare
 47 |  * @return the number of leading bits, within the width of `Numeric`, that both values share
 48 |  */
 49 | template<typename Numeric>
 50 | uint8_t common_prefix_width(Numeric v1, Numeric v2)
 51 | {
 52 |     using unsigned_type = std::make_unsigned_t<Numeric>;
 53 |     const auto Xor = static_cast<unsigned_type>(v1 ^ v2); // set bits mark the positions where the values differ
 54 |     if (Xor == 0) return sizeof(Numeric) * 8; // equal values share every bit; the builtins are undefined for 0
 55 |     if constexpr (sizeof(Numeric) <= sizeof(unsigned)) {
 56 |         return __builtin_clz(Xor) - int((sizeof(unsigned) - sizeof(Numeric)) * 8); // discount widening zeros
 57 |     } else if constexpr (sizeof(Numeric) <= sizeof(unsigned long)) {
 58 |         return __builtin_clzl(Xor) - int((sizeof(unsigned long) - sizeof(Numeric)) * 8);
 59 |     } else if constexpr (sizeof(Numeric) <= sizeof(unsigned long long)) {
 60 |         return __builtin_clzll(Xor) - int((sizeof(unsigned long long) - sizeof(Numeric)) * 8);
 61 |     } else {
 62 |         static_assert(sizeof(Numeric) > sizeof(unsigned long long), "unsupported width of integral type");
 63 |     }
 64 | }
 65 | 
 66 | 
 67 | /*======================================================================================================================
 68 |  * String Functions
 69 |  *====================================================================================================================*/
 70 | 
 71 | /**
 72 |  * Splits @p str at each occurrence of @p delimiter.
 73 |  * @param str the string to be split
 74 |  * @param delimiter the delimiter to split the string at
 75 |  * @return vector of substrings; empty if @p str is empty
 76 |  */
 77 | inline std::vector<std::string> split(const std::string &str, char delimiter)
 78 | { // `inline`: the definition lives in a header and may be included by several translation units
 79 |     std::vector<std::string> tokens;
 80 |     std::string token;
 81 |     std::istringstream token_stream(str);
 82 |     while (std::getline(token_stream, token, delimiter)) {
 83 |         tokens.push_back(token);
 84 |     }
 85 |     return tokens;
 86 | }
 87 | 
 88 | 
 89 | /*======================================================================================================================
 90 |  * Arithmetic Functions
 91 |  *====================================================================================================================*/
 92 | 
 93 | /**
 94 |  * Computes the arithmetic mean of a vector @p v of numeric values.
 95 |  * @param v vector of numeric values; not modified
 96 |  * @return arithmetic mean (the result of dividing 0.0 by 0 if @p v is empty)
 97 |  */
 98 | template<typename Numeric>
 99 | double mean(const std::vector<Numeric> &v)
100 | {
101 |     const double sum = std::accumulate(v.begin(), v.end(), 0.0); // the 0.0 seed makes the sum run in double
102 |     return sum / v.size();
103 | }
104 | 
105 | /**
106 |  * Computes the population standard deviation of vector @p v of numeric values as sqrt(E[x^2] - E[x]^2).
107 |  * @param v vector of numeric values; not modified
108 |  * @return standard deviation
109 |  */
110 | template<typename Numeric>
111 | double stdev(const std::vector<Numeric> &v) {
112 |     const double mean = std::accumulate(v.begin(), v.end(), 0.0) / v.size();
113 |     double sq_sum = std::accumulate(v.begin(), v.end(), 0.0, [](double s, Numeric x) { return s + double(x) * x; });
114 |     return std::sqrt(std::max(0.0, sq_sum / v.size() - mean * mean)); // clamp: rounding may dip below zero
115 | }
116 | 
117 | /**
118 |  * Computes the median of vector @p v of numeric values; for an even size this is the upper middle element.
119 |  * @param v vector of numeric values; its elements are partially reordered by the selection
120 |  * @return median
121 |  */
122 | template<typename Numeric>
123 | Numeric median(std::vector<Numeric> &v)
124 | {
125 |     const auto middle = v.begin() + v.size() / 2;
126 |     std::nth_element(v.begin(), middle, v.end()); // places the middle element at its sorted position
127 |     return v.at(v.size() / 2);
128 | }
129 | 
130 | /**
131 |  * Computes the minimum of a vector @p v of numeric values.
132 |  * @param v vector of numeric values; not modified, must not be empty
133 |  * @return minimum
134 |  */
135 | template<typename Numeric>
136 | Numeric min(const std::vector<Numeric> &v)
137 | {
138 |     return *std::min_element(v.begin(), v.end());
139 | }
140 | 
141 | /**
142 |  * Computes the maximum of a vector @p v of numeric values.
143 |  * @param v vector of numeric values; not modified, must not be empty
144 |  * @return maximum
145 |  */
146 | template<typename Numeric>
147 | Numeric max(const std::vector<Numeric> &v)
148 | {
149 |     return *std::max_element(v.begin(), v.end());
150 | }
151 | 
152 | 
153 | /*======================================================================================================================
154 |  * Dataset Functions
155 |  *====================================================================================================================*/
156 | 
157 | /**
158 |  * Reads a dataset file @p filename in binary format and writes keys to vector. The file is read as a 64-bit
159 |  * key count followed by the keys; the program exits with an error if the file cannot be read completely.
160 |  * @tparam Key the type of the key
161 |  * @param filename name of the dataset file
162 |  * @return vector of keys
163 |  */
164 | template<typename Key>
165 | std::vector<Key> load_data(const std::string &filename) {
166 |     // Open file.
167 |     std::ifstream in(filename, std::ios::binary);
168 |     if (!in.is_open()) {
169 |         std::cerr << "Could not load " << filename << '.' << std::endl;
170 |         exit(EXIT_FAILURE);
171 |     }
172 | 
173 |     // Read number of keys.
174 |     uint64_t n_keys = 0;
175 |     in.read(reinterpret_cast<char*>(&n_keys), sizeof(uint64_t));
176 | 
177 |     // Read keys. After a failed header read nothing is allocated and the stream check below reports it.
178 |     std::vector<Key> data(in ? n_keys : 0);
179 |     in.read(reinterpret_cast<char*>(data.data()), data.size() * sizeof(Key));
180 | 
181 |     // A truncated or unreadable file would otherwise be returned as silently zero-filled keys.
182 |     if (!in) {
183 |         std::cerr << "Could not load " << filename << '.' << std::endl;
184 |         exit(EXIT_FAILURE);
185 |     }
186 |     return data; // the stream is closed by its destructor
187 | }
188 | 


--------------------------------------------------------------------------------
/experiments/rmi_errors.cpp:
--------------------------------------------------------------------------------
  1 | #include "argparse/argparse.hpp"
  2 | 
  3 | #include "rmi/models.hpp"
  4 | #include "rmi/rmi.hpp"
  5 | #include "rmi/util/fn.hpp"
  6 | 
  7 | using key_type = uint64_t;
  8 | 
  9 | 
 10 | /**
 11 |  * Builds an @p Rmi on @p keys, predicts the position of every key, and prints one csv line with statistics
 12 |  * on the absolute prediction errors to `std::cout`.
 13 |  * @tparam Key key type
 14 |  * @tparam Rmi RMI type
 15 |  * @param keys on which the RMI is built
 16 |  * @param n_models number of models in the second layer of the RMI
 17 |  * @param dataset_name name of the dataset
 18 |  * @param layer1 model type of the first layer
 19 |  * @param layer2 model type of the second layer
 20 |  */
 21 | template<typename Key, typename Rmi>
 22 | void experiment(const std::vector<key_type> &keys,
 23 |                 const std::size_t n_models,
 24 |                 const std::string dataset_name,
 25 |                 const std::string layer1,
 26 |                 const std::string layer2)
 27 | {
 28 |     Rmi rmi(keys, n_models); // the index under test
 29 | 
 30 |     // One absolute error per key.
 31 |     const auto n_keys = keys.size();
 32 |     std::vector<int64_t> errors;
 33 |     errors.reserve(n_keys);
 34 | 
 35 |     // Predict every key. A run of equal keys is measured against the position of its first occurrence,
 36 |     // which `first_pos` carries along.
 37 |     auto last_key = keys.at(0);
 38 |     int64_t first_pos = 0;
 39 |     for (std::size_t i = 0; i != n_keys; ++i) {
 40 |         const auto key = keys.at(i);
 41 |         const auto pred = rmi.search(key);
 42 | 
 43 |         if (key != last_key) {
 44 |             first_pos = i;
 45 |             last_key = key;
 46 |         }
 47 |         errors.push_back(std::abs(first_pos - static_cast<int64_t>(pred.pos)));
 48 |     }
 49 | 
 50 |     // Aggregate in column order; `median` reorders `errors`, so the order of these calls is kept fixed.
 51 |     const auto mean_ae = mean(errors);
 52 |     const auto median_ae = median(errors);
 53 |     const auto stdev_ae = stdev(errors);
 54 |     const auto min_ae = min(errors);
 55 |     const auto max_ae = max(errors);
 56 | 
 57 |     // Report results.
 58 |     std::cout << dataset_name << ','      // dataset
 59 |               << n_keys << ','
 60 |               << layer1 << ','            // RMI config
 61 |               << layer2 << ','
 62 |               << n_models << ','
 63 |               << mean_ae << ','           // absolute error
 64 |               << median_ae << ','
 65 |               << stdev_ae << ','
 66 |               << min_ae << ',' << max_ae << std::endl;
 67 | }
 68 | 
 69 | 
 70 | /**
 71 |  * @brief Pointer to an instantiation of `experiment`; the parameter list mirrors that of `experiment`.
 72 |  */
 73 | typedef void (*exp_fn_ptr)(const std::vector<key_type>&,
 74 |                            const std::size_t,
 75 |                            const std::string,
 76 |                            const std::string,
 77 |                            const std::string);
 78 | // ENTRY builds one map entry: the stringified names L1/L2 form the key, T1/T2 select the instantiation.
 79 | #define ENTRY(L1, L2, T1, T2) \
 80 |     { std::make_pair(#L1, #L2), &experiment<key_type, rmi::Rmi<key_type, T1, T2>> }
 81 | 
 82 | static std::map<std::pair<std::string, std::string>, exp_fn_ptr> exp_map {
 83 |     ENTRY(linear_regression, linear_regression, rmi::LinearRegression, rmi::LinearRegression),
 84 |     ENTRY(linear_regression, linear_spline,     rmi::LinearRegression, rmi::LinearSpline),
 85 |     ENTRY(linear_spline,     linear_regression, rmi::LinearSpline,     rmi::LinearRegression),
 86 |     ENTRY(linear_spline,     linear_spline,     rmi::LinearSpline,     rmi::LinearSpline),
 87 |     ENTRY(cubic_spline,      linear_regression, rmi::CubicSpline,      rmi::LinearRegression),
 88 |     ENTRY(cubic_spline,      linear_spline,     rmi::CubicSpline,      rmi::LinearSpline),
 89 |     ENTRY(radix,             linear_regression, rmi::Radix<key_type>,  rmi::LinearRegression),
 90 |     ENTRY(radix,             linear_spline,     rmi::Radix<key_type>,  rmi::LinearSpline),
 91 | }; ///< Map that assigns an experiment function pointer to each supported (layer1, layer2) name pair.
 92 | #undef ENTRY
 93 | 
 94 | 
 95 | /**
 96 |  * Triggers computation of several error metrics of an RMI configuration provided via command line arguments.
 97 |  * @param argc arguments counter
 98 |  * @param argv arguments vector
 99 |  */
100 | int main(int argc, char *argv[])
101 | {
102 |     // Initialize argument parser.
103 |     argparse::ArgumentParser program(argv[0], "0.1");
104 | 
105 |     // Define arguments.
106 |     program.add_argument("filename")
107 |         .help("path to binary file containing uin64_t keys");
108 | 
109 |     program.add_argument("layer1")
110 |         .help("layer1 model type, either linear_regression, linear_spline, cubic_spline, or radix.");
111 | 
112 |     program.add_argument("layer2")
113 |         .help("layer2 model type, either linear_regression or linear_spline.");
114 | 
115 |     program.add_argument("n_models")
116 |         .help("number of models on layer2, power of two is recommended.")
117 |         .action([](const std::string &s) { return std::stoul(s); });
118 | 
119 |     program.add_argument("--header")
120 |         .help("output csv header")
121 |         .default_value(false)
122 |         .implicit_value(true);
123 | 
124 |     // Parse arguments.
125 |     try {
126 |         program.parse_args(argc, argv);
127 |     }
128 |     catch (const std::runtime_error &err) {
129 |         std::cout << err.what() << '\n' << program;
130 |         exit(EXIT_FAILURE);
131 |     }
132 | 
133 |     // Read arguments; the dataset name is the last path component of the filename.
134 |     const auto filename = program.get<std::string>("filename");
135 |     const auto dataset_name = split(filename, '/').back();
136 |     const auto layer1 = program.get<std::string>("layer1");
137 |     const auto layer2 = program.get<std::string>("layer2");
138 |     const auto n_models = program.get<std::size_t>("n_models");
139 | 
140 |     // Load keys.
141 |     auto keys = load_data<key_type>(filename);
142 | 
143 |     // Lookup experiment.
144 |     auto config = std::make_pair(layer1, layer2);
145 |     if (exp_map.find(config) == exp_map.end()) {
146 |         std::cerr << "Error: " << layer1 << ',' << layer2 << " is not a valid RMI configuration." << std::endl;
147 |         exit(EXIT_FAILURE);
148 |     }
149 |     exp_fn_ptr exp_fn = exp_map[config];
150 | 
151 |     // Output header; one comma-separated name per column written by `experiment`.
152 |     if (program["--header"]  == true)
153 |         std::cout << "dataset,"
154 |                   << "n_keys,"
155 |                   << "layer1,"
156 |                   << "layer2,"
157 |                   << "n_models,"
158 |                   << "mean_ae,"
159 |                   << "median_ae,"
160 |                   << "stdev_ae,"
161 |                   << "min_ae,"
162 |                   << "max_ae"
163 |                   << std::endl;
164 | 
165 |     // Run experiment.
166 |     (*exp_fn)(keys, n_models, dataset_name, layer1, layer2);
167 | 
168 |     exit(EXIT_SUCCESS);
169 | }
170 | 


--------------------------------------------------------------------------------
/include/rmi/util/search.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <algorithm>
  4 | #include <vector>
  5 | 
  6 | 
  7 | /**
  8 |  * Functor for performing linear search.
  9 |  */
 10 | struct LinearSearch {
 11 |     /**
 12 |      * Scans the interval [first,last) from the front and stops at the first element that is not less than
 13 |      * @p value.
 14 |      * @tparam InputIt input iterator type
 15 |      * @tparam T type of searched value
 16 |      * @param first, last iterators defining the partially-ordered range to examine
 17 |      * @param pred iterator to the predicted position (ignored)
 18 |      * @param value value to compare the elements to
 19 |      * @return iterator to the first element that is not less than @p value, or @p last if there is none
 20 |      */
 21 |     template<typename InputIt, typename T>
 22 |     InputIt operator()(InputIt first, InputIt last, InputIt /* pred */, const T &value) {
 23 |         // `find_if` yields `last` when no element qualifies.
 24 |         const auto not_less = [&value](const auto &element) { return element >= value; };
 25 |         return std::find_if(first, last, not_less);
 26 |     }
 27 | };
 28 | 
 29 | 
 30 | /**
 31 |  * Functor for performing model-biased linear search.
 32 |  */
 33 | struct ModelBiasedLinearSearch {
 34 |     /**
 35 |      * Performs model-biased linear search either in the interval [first,pred) or [pred, last) to find the first element
 36 |      * that is not less than @p value. The iterators are moved in both directions and compared by position.
 37 |      * @tparam InputIt iterator type
 38 |      * @tparam T type of searched value
 39 |      * @param first, last iterators defining the partially-ordered range to examine
 40 |      * @param pred iterator to the predicted position; @p last is accepted and starts the scan at the end
 41 |      * @param value value to compare the elements to
 42 |      * @return iterator to the first element that is not less than @p value
 43 |      */
 44 |     template<typename InputIt, typename T>
 45 |     InputIt operator()(InputIt first, InputIt last, InputIt pred, const T &value) {
 46 |         InputIt runner = pred;
 47 |         if (runner != last and *runner < value) {
 48 |             for (++runner; runner != last; ++runner) // search right side
 49 |                 if (*runner >= value) return runner;
 50 |             return last;
 51 |         }
 52 |         // Inspect the predecessor before stepping so that the runner never moves in front of `first`.
 53 |         while (runner != first and *(runner - 1) >= value) // search left side
 54 |             --runner;
 55 |         return runner;
 56 |     }
 57 | };
 58 | 
 59 | 
 60 | /**
 61 |  * Functor for performing binary search.
 62 |  */
 63 | struct BinarySearch {
 64 |     /**
 65 |      * Delegates to `std::lower_bound` on the interval [first,last).
 66 |      * @tparam InputIt input iterator type
 67 |      * @tparam T type of searched value
 68 |      * @param first, last iterators defining the partially-ordered range to examine
 69 |      * @param pred iterator to the predicted position (ignored)
 70 |      * @param value value to compare the elements to
 71 |      * @return iterator to the first element that is not less than @p value
 72 |      */
 73 |     template<typename InputIt, typename T>
 74 |     InputIt operator()(InputIt first, InputIt last, InputIt /* pred */, const T &value) {
 75 |         const InputIt position = std::lower_bound(first, last, value); return position;
 76 |     }
 77 | };
 78 | 
 79 | 
 80 | /**
 81 |  * Functor for performing model-biased binary search.
 82 |  */
 83 | struct ModelBiasedBinarySearch {
 84 |     /**
 85 |      * Compares the element at @p pred with @p value once and then runs `std::lower_bound` on the matching
 86 |      * half, i.e. on [pred, last) if the element is less than @p value and on [first,pred) otherwise.
 87 |      * @tparam InputIt input iterator type
 88 |      * @tparam T type of searched value
 89 |      * @param first, last iterators defining the partially-ordered range to examine
 90 |      * @param pred iterator to the predicted position; it is dereferenced
 91 |      * @param value value to compare the elements to
 92 |      * @return iterator to the first element that is not less than @p value
 93 |      */
 94 |     template<typename InputIt, typename T>
 95 |     InputIt operator()(InputIt first, InputIt last, InputIt pred, const T &value) {
 96 |         const bool target_is_right = *pred < value;
 97 |         return target_is_right ? std::lower_bound(pred, last, value) : std::lower_bound(first, pred, value);
 98 |     }
 99 | };
100 | 
101 | 
102 | /**
103 |  * Functor for performing exponential search.
104 |  */
105 | struct ExponentialSearch {
106 |     /**
107 |      * Performs exponential search in the interval [first,last) to find the first element that is not less than
108 |      * @p value. An empty interval yields @p first.
109 |      * @tparam InputIt iterator type; iterator differences and offsets are computed on it
110 |      * @tparam T type of searched value
111 |      * @param first, last iterators defining the partially-ordered range to examine
112 |      * @param pred iterator to the predicted position (ignored)
113 |      * @param value value to compare the elements to
114 |      * @return iterator to the first element that is not less than @p value
115 |      */
116 |     template<typename InputIt, typename T>
117 |     InputIt operator()(InputIt first, InputIt last, InputIt /* pred */, const T &value) {
118 |         auto n = last - first; // size of the range; all probing below works on offsets from `first`
119 |         if (n == 0 or *first >= value) return first;
120 |         // Probe offsets 1, 3, 7, ... with doubling steps until an element that is not less is found.
121 |         decltype(n) prev = 0, curr = 1, bound = 1;
122 |         while (curr < n and *(first + curr) < value) {
123 |             bound *= 2;
124 |             prev = curr;
125 |             curr = bound < n - curr ? curr + bound : n; // clamp: never step past `last`
126 |         }
127 |         return std::lower_bound(first + prev, first + std::min(curr + 1, n), value);
128 |     }
129 | };
130 | 
131 | 
132 | /**
133 |  * Functor for performing model-biased exponential search.
134 |  */
135 | struct ModelBiasedExponentialSearch {
136 |     /**
137 |      * Performs model-biased exponential search either in the interval [first,pred) or [pred, last) to find the first
138 |      * element that is not less than @p value.
139 |      * @tparam InputIt iterator type; iterator differences and offsets are computed on it
140 |      * @tparam T type of searched value
141 |      * @param first, last iterators defining the partially-ordered range to examine
142 |      * @param pred iterator to the predicted position; @p last is accepted and searches the left side
143 |      * @param value value to compare the elements to
144 |      * @return iterator to the first element that is not less than @p value
145 |      */
146 |     template<typename InputIt, typename T>
147 |     InputIt operator()(InputIt first, InputIt last, InputIt pred, const T &value) {
148 |         if (pred != last and *pred < value) { // search right side
149 |             auto n = last - pred; // elements from `pred` to the end; offsets are relative to `pred`
150 |             decltype(n) prev = 0, curr = 1, bound = 1;
151 |             while (curr < n and *(pred + curr) < value) {
152 |                 bound *= 2;
153 |                 prev = curr;
154 |                 curr = bound < n - curr ? curr + bound : n; // clamp: never step past `last`
155 |             }
156 |             return std::lower_bound(pred + prev, pred + std::min(curr + 1, n), value);
157 |         } else { // search left side
158 |             auto n = pred - first; // elements in front of `pred`; offsets count leftwards from `pred`
159 |             decltype(n) prev = 0, curr = 1, bound = 1;
160 |             while (curr < n and *(pred - curr) >= value) {
161 |                 bound *= 2;
162 |                 prev = curr;
163 |                 curr = bound < n - curr ? curr + bound : n; // clamp: never step in front of `first`
164 |             }
165 |             // The element at offset `prev` is not less than `value` (or is `pred` itself), so the result
166 |             // lies in [pred - curr, pred - prev].
167 |             return std::lower_bound(pred - std::min(curr, n), pred - prev, value);
168 |         }
169 |     }
170 | };
171 | 


--------------------------------------------------------------------------------
/experiments/rmi_intervals.cpp:
--------------------------------------------------------------------------------
  1 | #include <tuple>
  2 | 
  3 | #include "argparse/argparse.hpp"
  4 | 
  5 | #include "rmi/models.hpp"
  6 | #include "rmi/rmi.hpp"
  7 | #include "rmi/util/fn.hpp"
  8 | 
  9 | using key_type = uint64_t;
 10 | 
 11 | 
 12 | /**
 13 |  * Computes several metrics on the error interval sizes for a given @p Rmi on dataset @p keys and writes results to
 14 |  * `std::cout`.
 15 |  * @tparam Key key type
 16 |  * @tparam Rmi RMI type
 17 |  * @param keys on which the RMI is built
 18 |  * @param n_models number of models in the second layer of the RMI
 19 |  * @param dataset_name name of the dataset
 20 |  * @param layer1 model type of the first layer
 21 |  * @param layer2 model type of the second layer
 22 |  * @param bound_type used by the RMI
 23 |  */
 24 | template<typename Key, typename Rmi>
 25 | void experiment(const std::vector<key_type> &keys,
 26 |                 const std::size_t n_models,
 27 |                 const std::string dataset_name,
 28 |                 const std::string layer1,
 29 |                 const std::string layer2,
 30 |                 const std::string bound_type)
 31 | {
 32 |     using rmi_type = Rmi;
 33 | 
 34 |     // Build RMI.
 35 |     rmi_type rmi(keys, n_models);
 36 | 
 37 |     // Initialize variables.
 38 |     auto n_keys = keys.size();
 39 |     std::vector<int64_t> interval_sizes;
 40 |     interval_sizes.reserve(n_keys);
 41 | 
 42 |     // Perform predictions.
 43 |     for (auto key : keys) {
 44 |         auto pred = rmi.search(key);
 45 | 
 46 |         // Record interval size.
 47 |         auto interval_size = pred.hi - pred.lo;
 48 |         interval_sizes.push_back(interval_size);
 49 |     }
 50 | 
 51 |     // Report results.
 52 |                  // Dataset
 53 |     std::cout << dataset_name << ','
 54 |               << n_keys << ','
 55 |                  // RMI config
 56 |               << layer1 << ','
 57 |               << layer2 << ','
 58 |               << n_models << ','
 59 |               << bound_type << ','
 60 |               << rmi.size_in_bytes() << ','
 61 |                  // Interval sizes
 62 |               << mean(interval_sizes) << ','
 63 |               << median(interval_sizes) << ','
 64 |               << stdev(interval_sizes) << ','
 65 |               << min(interval_sizes) << ','
 66 |               << max(interval_sizes) << std::endl;
 67 | }
 68 | 
 69 | 
 70 | /**
 71 |  * @brief experiment function pointer
 72 |  */
 73 | typedef void (*exp_fn_ptr)(const std::vector<key_type>&,
 74 |                            const std::size_t,
 75 |                            const std::string,
 76 |                            const std::string,
 77 |                            const std::string,
 78 |                            const std::string);
 79 | 
 80 | /**
 81 |  * RMI configuration that holds the string representation of model types of layer 1 and layer 2 and the error bound
 82 |  * type.
 83 |  */
 84 | struct Config {
 85 |     std::string layer1;
 86 |     std::string layer2;
 87 |     std::string bound_type;
 88 | };
 89 | 
 90 | /**
 91 |  * Comparator class for @p Config objects.
 92 |  */
 93 | struct ConfigCompare {
 94 |     bool operator() (const Config &lhs, const Config &rhs) const {
 95 |         if (lhs.layer1 != rhs.layer1) return lhs.layer1 < rhs.layer1;
 96 |         if (lhs.layer2 != rhs.layer2) return lhs.layer2 < rhs.layer2;
 97 |         return lhs.bound_type < rhs.bound_type;
 98 |     }
 99 | };
100 | 
101 | #define ENTRIES(L1, L2, T1, T2) \
102 |     { {#L1, #L2, "labs"}, &experiment<key_type, rmi::RmiLAbs<key_type, T1, T2>> }, \
103 |     { {#L1, #L2, "lind"}, &experiment<key_type, rmi::RmiLInd<key_type, T1, T2>> }, \
104 |     { {#L1, #L2, "gabs"}, &experiment<key_type, rmi::RmiGAbs<key_type, T1, T2>> }, \
105 |     { {#L1, #L2, "gind"}, &experiment<key_type, rmi::RmiGInd<key_type, T1, T2>> },
106 | 
107 | static std::map<Config, exp_fn_ptr, ConfigCompare> exp_map {
108 |     ENTRIES(linear_regression, linear_regression, rmi::LinearRegression, rmi::LinearRegression)
109 |     ENTRIES(linear_regression, linear_spline,     rmi::LinearRegression, rmi::LinearSpline)
110 |     ENTRIES(linear_spline,     linear_regression, rmi::LinearSpline,     rmi::LinearRegression)
111 |     ENTRIES(linear_spline,     linear_spline,     rmi::LinearSpline,     rmi::LinearSpline)
112 |     ENTRIES(cubic_spline,      linear_regression, rmi::CubicSpline,      rmi::LinearRegression)
113 |     ENTRIES(cubic_spline,      linear_spline,     rmi::CubicSpline,      rmi::LinearSpline)
114 |     ENTRIES(radix,             linear_regression, rmi::Radix<key_type>,  rmi::LinearRegression)
115 |     ENTRIES(radix,             linear_spline,     rmi::Radix<key_type>,  rmi::LinearSpline)
116 | }; ///< Map that assigns an experiment function pointer to RMI configurations.
117 | #undef ENTRIES
118 | 
119 | 
120 | /**
121 |  * Triggers computation of several metrics on the error interval sizes of an RMI configuration provided via command line
122 |  * arguments.
123 |  * @param argc arguments counter
124 |  * @param argv arguments vector
125 |  */
126 | int main(int argc, char *argv[])
127 | {
128 |     // Initialize argument parser.
129 |     argparse::ArgumentParser program(argv[0], "0.1");
130 | 
131 |     // Define arguments.
132 |     program.add_argument("filename")
133 |         .help("path to binary file containing uin64_t keys");
134 | 
135 |     program.add_argument("layer1")
136 |         .help("layer1 model type, either linear_regression, linear_spline, cubic_spline, or radix.");
137 | 
138 |     program.add_argument("layer2")
139 |         .help("layer2 model type, either linear_regression, linear_spline, or cubic_spline.");
140 | 
141 |     program.add_argument("n_models")
142 |         .help("number of models on layer2, power of two is recommended.")
143 |         .action([](const std::string &s) { return std::stoul(s); });
144 | 
145 |     program.add_argument("bound_type")
146 |         .help("type of error bounds used, either labs, lind, gabs, or gind.");
147 | 
148 |     program.add_argument("--header")
149 |         .help("output csv header")
150 |         .default_value(false)
151 |         .implicit_value(true);
152 | 
153 |     // Parse arguments.
154 |     try {
155 |         program.parse_args(argc, argv);
156 |     }
157 |     catch (const std::runtime_error &err) {
158 |         std::cout << err.what() << '\n' << program;
159 |         exit(EXIT_FAILURE);
160 |     }
161 | 
162 |     // Read arguments.
163 |     const auto filename = program.get<std::string>("filename");
164 |     const auto dataset_name = split(filename, '/').back();
165 |     const auto layer1 = program.get<std::string>("layer1");
166 |     const auto layer2 = program.get<std::string>("layer2");
167 |     const auto n_models = program.get<std::size_t>("n_models");
168 |     const auto bound_type = program.get<std::string>("bound_type");
169 | 
170 |     // Load keys.
171 |     auto keys = load_data<key_type>(filename);
172 | 
173 |     // Lookup experiment.
174 |     Config config{layer1, layer2, bound_type};
175 |     if (exp_map.find(config) == exp_map.end()) {
176 |         std::cerr << "Error: " << layer1 << ',' << layer2 << ',' << bound_type <<  " is not a valid RMI configuration." << std::endl;
177 |         exit(EXIT_FAILURE);
178 |     }
179 |     exp_fn_ptr exp_fn = exp_map[config];
180 | 
181 |     // Output header.
182 |     if (program["--header"]  == true)
183 |         std::cout << "dataset,"
184 |                   << "n_keys,"
185 |                   << "layer1,"
186 |                   << "layer2,"
187 |                   << "n_models,"
188 |                   << "bounds,"
189 |                   << "size_in_bytes,"
190 |                   << "mean_interval,"
191 |                   << "median_interval,"
192 |                   << "stdev_interval,"
193 |                   << "min_interval,"
194 |                   << "max_interval"
195 |                   << std::endl;
196 | 
197 |     // Run experiment.
198 |     (*exp_fn)(keys, n_models, dataset_name, layer1, layer2, bound_type);
199 | 
200 |     exit(EXIT_SUCCESS);
201 | }
202 | 


--------------------------------------------------------------------------------
/experiments/rmi_build.cpp:
--------------------------------------------------------------------------------
  1 | #include <algorithm>
  2 | #include <chrono>
  3 | 
  4 | #include "argparse/argparse.hpp"
  5 | #include "rmi/models.hpp"
  6 | #include "rmi/rmi.hpp"
  7 | 
  8 | using key_type = uint64_t;
  9 | using namespace std::chrono;
 10 | 
 11 | std::size_t s_glob; ///< global size_t variable
 12 | 
 13 | 
 14 | /**
 15 |  * Measures the build time for a given @p Rmi on dataset @p keys and writes results to `std::cout`.
 16 |  * @tparam Key key type
 17 |  * @tparam Rmi RMI type
 18 |  * @param keys on which the RMI is built
 19 |  * @param n_models number of models in the second layer of the RMI
 20 |  * @param dataset_name name of the dataset
 21 |  * @param layer1 model type of the first layer
 22 |  * @param layer2 model type of the second layer
 23 |  * @param bounds_type used by the RMI
 24 |  */
 25 | template<typename Key, typename Rmi>
 26 | void experiment(const std::vector<key_type> &keys,
 27 |                 const std::size_t n_models,
 28 |                 const std::size_t n_reps,
 29 |                 const std::string dataset_name,
 30 |                 const std::string layer1,
 31 |                 const std::string layer2,
 32 |                 const std::string bound_type)
 33 | {
 34 |     using rmi_type = Rmi;
 35 | 
 36 |     // Perform n_reps runs.
 37 |     for (std::size_t rep = 0; rep != n_reps; ++rep) {
 38 | 
 39 |         // Build RMI.
 40 |         auto start = steady_clock::now();
 41 |         rmi_type rmi(keys, n_models);
 42 |         auto stop = steady_clock::now();
 43 |         auto build_time = duration_cast<nanoseconds>(stop - start).count();
 44 | 
 45 |         // Perform lookup to ensure that RMI is actually built.
 46 |         auto key = keys.at(0);
 47 |         auto range = rmi.search(key);
 48 |         auto pos = std::lower_bound(keys.begin() + range.lo, keys.begin() + range.hi, key);
 49 |         s_glob = std::distance(keys.begin(), pos);
 50 | 
 51 |         // Report results.
 52 |                   // Dataset
 53 |         std::cout << dataset_name << ','
 54 |                   << keys.size() << ','
 55 |                   // Index
 56 |                   << "ours" << ','
 57 |                   << layer1 << ','
 58 |                   << layer2 << ','
 59 |                   << n_models << ','
 60 |                   << bound_type << ','
 61 |                   << rmi.size_in_bytes() << ','
 62 |                   // Experiment
 63 |                   << rep << ','
 64 |                   // Results
 65 |                   << build_time << ','
 66 |                   // Checksums
 67 |                   << s_glob << std::endl;
 68 |     } // reps
 69 | }
 70 | 
 71 | 
 72 | /**
 73 |  * @brief experiment function pointer
 74 |  */
 75 | typedef void (*exp_fn_ptr)(const std::vector<key_type>&,
 76 |                            const std::size_t,
 77 |                            const std::size_t,
 78 |                            const std::string,
 79 |                            const std::string,
 80 |                            const std::string,
 81 |                            const std::string);
 82 | 
 83 | /**
 84 |  * RMI configuration that holds the string representation of model types of layer 1 and layer 2 and the error bound
 85 |  * type.
 86 |  */
 87 | struct Config {
 88 |     std::string layer1;
 89 |     std::string layer2;
 90 |     std::string bound_type;
 91 | };
 92 | 
 93 | /**
 94 |  * Comparator class for @p Config objects.
 95 |  */
 96 | struct ConfigCompare {
 97 |     bool operator() (const Config &lhs, const Config &rhs) const {
 98 |         if (lhs.layer1 != rhs.layer1) return lhs.layer1 < rhs.layer1;
 99 |         if (lhs.layer2 != rhs.layer2) return lhs.layer2 < rhs.layer2;
100 |         return lhs.bound_type < rhs.bound_type;
101 |     }
102 | };
103 | 
104 | #define ENTRIES(L1, L2, T1, T2) \
105 |     { {#L1, #L2, "labs"}, &experiment<key_type, rmi::RmiLAbs<key_type, T1, T2>> }, \
106 |     { {#L1, #L2, "lind"}, &experiment<key_type, rmi::RmiLInd<key_type, T1, T2>> }, \
107 |     { {#L1, #L2, "gabs"}, &experiment<key_type, rmi::RmiGAbs<key_type, T1, T2>> }, \
108 |     { {#L1, #L2, "gind"}, &experiment<key_type, rmi::RmiGInd<key_type, T1, T2>> }, \
109 |     { {#L1, #L2, "none"}, &experiment<key_type, rmi::Rmi<key_type, T1, T2>> },
110 | 
111 | static std::map<Config, exp_fn_ptr, ConfigCompare> exp_map {
112 |     ENTRIES(linear_regression, linear_regression, rmi::LinearRegression, rmi::LinearRegression)
113 |     ENTRIES(linear_regression, linear_spline,     rmi::LinearRegression, rmi::LinearSpline)
114 |     ENTRIES(linear_spline,     linear_regression, rmi::LinearSpline,     rmi::LinearRegression)
115 |     ENTRIES(linear_spline,     linear_spline,     rmi::LinearSpline,     rmi::LinearSpline)
116 |     ENTRIES(cubic_spline,      linear_regression, rmi::CubicSpline,      rmi::LinearRegression)
117 |     ENTRIES(cubic_spline,      linear_spline,     rmi::CubicSpline,      rmi::LinearSpline)
118 |     ENTRIES(radix,             linear_regression, rmi::Radix<key_type>,  rmi::LinearRegression)
119 |     ENTRIES(radix,             linear_spline,     rmi::Radix<key_type>,  rmi::LinearSpline)
120 | }; ///< Map that assigns an experiment function pointer to RMI configurations.
121 | #undef ENTRIES
122 | 
123 | 
124 | /**
125 |  * Triggers measurement of build times for an RMI configuration provided via command line arguments.
126 |  * @param argc arguments counter
127 |  * @param argv arguments vector
128 |  */
129 | int main(int argc, char *argv[])
130 | {
131 |     // Initialize argument parser.
132 |     argparse::ArgumentParser program(argv[0], "0.1");
133 | 
134 |     // Define arguments.
135 |     program.add_argument("filename")
136 |         .help("path to binary file containing uin64_t keys");
137 | 
138 |     program.add_argument("layer1")
139 |         .help("layer1 model type, either linear_regression, linear_spline, cubic_spline, or radix.");
140 | 
141 |     program.add_argument("layer2")
142 |         .help("layer2 model type, either linear_regression, linear_spline, or cubic_spline.");
143 | 
144 |     program.add_argument("n_models")
145 |         .help("number of models on layer2, power of two is recommended.")
146 |         .action([](const std::string &s) { return std::stoul(s); });
147 | 
148 |     program.add_argument("bound_type")
149 |         .help("type of error bounds used, either none, labs, lind, gabs, or gind.");
150 | 
151 |    program.add_argument("-n", "--n_reps")
152 |         .help("number of experiment repetitions")
153 |         .default_value(std::size_t(3))
154 |         .action([](const std::string &s) { return std::stoul(s); });
155 | 
156 |     program.add_argument("--header")
157 |         .help("output csv header")
158 |         .default_value(false)
159 |         .implicit_value(true);
160 | 
161 |     // Parse arguments.
162 |     try {
163 |         program.parse_args(argc, argv);
164 |     }
165 |     catch (const std::runtime_error &err) {
166 |         std::cout << err.what() << '\n' << program;
167 |         exit(EXIT_FAILURE);
168 |     }
169 | 
170 |     // Read arguments.
171 |     const auto filename = program.get<std::string>("filename");
172 |     const auto dataset_name = split(filename, '/').back();
173 |     const auto layer1 = program.get<std::string>("layer1");
174 |     const auto layer2 = program.get<std::string>("layer2");
175 |     const auto n_models = program.get<std::size_t>("n_models");
176 |     const auto bound_type = program.get<std::string>("bound_type");
177 |     const auto n_reps = program.get<std::size_t>("-n");
178 | 
179 |     // Load keys.
180 |     auto keys = load_data<key_type>(filename);
181 | 
182 |     // Lookup experiment.
183 |     Config config{layer1, layer2, bound_type};
184 |     if (exp_map.find(config) == exp_map.end()) {
185 |         std::cerr << "Error: " << layer1 << ',' << layer2 << ',' << bound_type <<  " is not a valid RMI configuration." << std::endl;
186 |         exit(EXIT_FAILURE);
187 |     }
188 |     exp_fn_ptr exp_fn = exp_map[config];
189 | 
190 |     // Output header.
191 |     if (program["--header"]  == true)
192 |         std::cout << "dataset,"
193 |                   << "n_keys,"
194 |                   << "rmi,"
195 |                   << "layer1,"
196 |                   << "layer2,"
197 |                   << "n_models,"
198 |                   << "bounds,"
199 |                   << "size_in_bytes,"
200 |                   << "rep,"
201 |                   << "build_time,"
202 |                   << "checksum"
203 |                   << std::endl;
204 | 
205 |     // Run experiment.
206 |     (*exp_fn)(keys, n_models, n_reps, dataset_name, layer1, layer2, bound_type);
207 | 
208 |     exit(EXIT_SUCCESS);
209 | }
210 | 


--------------------------------------------------------------------------------
/scripts/plot_rmi_lookup.py:
--------------------------------------------------------------------------------
  1 | #!python3
  2 | import argparse
  3 | import itertools
  4 | import matplotlib.cm as cm
  5 | import matplotlib.pyplot as plt
  6 | import os
  7 | import pandas as pd
  8 | import warnings
  9 | 
 10 | plt.style.use(os.path.join('scripts', 'matplotlibrc'))
 11 | 
 12 | # Ignore warnings
 13 | warnings.filterwarnings( "ignore")
 14 | 
 15 | # Argparse
 16 | parser = argparse.ArgumentParser()
 17 | parser.add_argument('-p', '--paper', help='produce paper plots', action='store_true')
 18 | args = vars(parser.parse_args())
 19 | 
 20 | 
 21 | def plot_full(filename='rmi_lookup-full.pdf'):
 22 |     n_rows = len(datasets)
 23 |     n_cols = len(l1models) * len(l2models)
 24 | 
 25 |     configs = itertools.product(l1models, l2models)
 26 | 
 27 |     fig, axs = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4.2*n_rows), sharey=True, sharex=True)
 28 |     fig.tight_layout()
 29 | 
 30 |     for col, (l1, l2) in enumerate(configs):
 31 |         for row, dataset in enumerate(datasets):
 32 |             ax = axs[row,col]
 33 |             for bound in bounds:
 34 |                 for search in searches:
 35 |                     data = df[
 36 |                             (df['dataset']==dataset) &
 37 |                             (df['layer1']==l1) &
 38 |                             (df['layer2']==l2) &
 39 |                             (df['bounds']==bound) &
 40 |                             (df['search']==search)
 41 |                     ]
 42 |                     if not data.empty:
 43 |                         ax.plot(data['size_in_MiB'], data['lookup_in_ns'], label=f'{bound}+{search}', color=corr_colors[(bound,search)])
 44 | 
 45 |             # Title
 46 |             ax.set_title(f'{dataset} ({l1}$\mapsto${l2})')
 47 | 
 48 |             # Labels
 49 |             if row==n_rows-1:
 50 |                 ax.set_xlabel('Index size [MiB]')
 51 |             if col==0:
 52 |                 ax.set_ylabel('Lookup time [ns]')
 53 | 
 54 |             # Visuals
 55 |             ax.set_ylim(bottom=0)
 56 |             ax.set_xscale('log')
 57 | 
 58 |             # Legend
 59 |             if row==0 and col==0:
 60 |                 fig.legend(ncol=len(corr_configs), bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False)
 61 | 
 62 |     fig.savefig(os.path.join(path, filename), bbox_inches='tight')
 63 | 
 64 | 
 65 | def plot_models(filename='rmi_lookup-model_types.pdf', bounds='NB', search='MExp'):
 66 |     l1_groups = [['CS','LR'],['LS','RX']]
 67 |     l2_models = ['LR','LS']
 68 | 
 69 |     n_rows = len(datasets)
 70 |     n_cols = len(l1_groups)
 71 | 
 72 |     fig, axs = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 2.7*n_rows), sharey='row', sharex=True)
 73 |     fig.tight_layout()
 74 | 
 75 |     for row, dataset in enumerate(datasets):
 76 |         for col, l1_models in enumerate(l1_groups):
 77 |             ax = axs[row,col]
 78 |             for l1 in l1_models:
 79 |                 for l2 in l2_models:
 80 |                     data = df[
 81 |                         (df['dataset']==dataset) &
 82 |                         (df['layer1']==l1) &
 83 |                         (df['layer2']==l2) &
 84 |                         (df['bounds']==bounds) &
 85 |                         (df['search']==search)
 86 |                     ]
 87 |                     if not data.empty:
 88 |                         ax.plot(data['size_in_MiB'], data['lookup_in_ns'], label=f'{l1}$\mapsto${l2}', c=model_colors[(l1,l2)])
 89 | 
 90 |             # Title
 91 |             ax.set_title(dataset)
 92 | 
 93 |             # Labels
 94 |             if col==0:
 95 |                 ax.set_ylabel('Lookup time [ns]')
 96 |             if row==n_rows - 1:
 97 |                 ax.set_xlabel('Index size [MiB]')
 98 | 
 99 |             # Visuals
100 |             ax.set_xscale('log')
101 |             if col==n_cols - 1:
102 |                 if dataset=='books':
103 |                     ax.set_ylim(bottom=0, top=850)
104 |                 elif dataset=='fb':
105 |                     ax.set_ylim(bottom=0, top=1850)
106 |                 elif dataset=='osmc':
107 |                     ax.set_ylim(bottom=0, top=1500)
108 |                 elif dataset=='wiki':
109 |                     ax.set_ylim(bottom=0, top=1000)
110 | 
111 |             # Legend
112 |             if row==0 and col==n_cols - 1:
113 |                 fig.legend(ncol=4, bbox_to_anchor=(0.5, 1), loc='lower center')
114 | 
115 |     fig.savefig(os.path.join(path, filename), bbox_inches='tight')
116 | 
117 | 
def plot_correction(filename='rmi_lookup-error_correction.pdf'):
    """Plot lookup time over index size by error correction for selected models.

    The grid has one row per selected dataset and one column per selected
    (layer1, layer2) combination; each axis holds one line per (bounds, search)
    combination that has data. Reads the module-level names `df`, `bounds`,
    `searches`, `corr_colors`, and `path` set up in the `__main__` section.

    :param filename: name of the PDF written into `path`
    """
    models = [('LS', 'LR'),('RX','LS')]
    datasets = ['books','osmc','wiki']

    # Upper y-limits applied to the last column, per dataset.
    y_tops = {'books': 850, 'osmc': 1400, 'wiki': 1000}

    n_rows = len(datasets)
    n_cols = len(models)

    # squeeze=False keeps `axs` two-dimensional regardless of the grid shape.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 2.7*n_rows), sharey='row', sharex=True, squeeze=False)
    fig.tight_layout()

    for row, dataset in enumerate(datasets):
        for col, (l1, l2) in enumerate(models):
            ax = axs[row, col]

            # Rows of this dataset and model combination; narrowed per (bound, search) below.
            subset = df[
                (df['dataset']==dataset) &
                (df['layer1']==l1) &
                (df['layer2']==l2)
            ]
            for bound in bounds:
                for search in searches:
                    data = subset[
                        (subset['bounds']==bound) &
                        (subset['search']==search)
                    ]
                    if not data.empty:
                        ax.plot(data['size_in_MiB'], data['lookup_in_ns'], label=f'{bound}+{search}', color=corr_colors[(bound,search)])

            # Title
            ax.set_title(f'{dataset} ({l1}$\\mapsto${l2})')

            # Labels
            if row==n_rows-1:
                ax.set_xlabel('Index size [MiB]')
            if col==0:
                ax.set_ylabel('Lookup time [ns]')

            # Visuals
            ax.set_xscale('log')
            if col==n_cols - 1 and dataset in y_tops:
                ax.set_ylim(bottom=0, top=y_tops[dataset])

            # Legend (built from the lines of the first axis only)
            if row==0 and col==0:
                fig.legend(ncol=4, bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False)

    fig.savefig(os.path.join(path, filename), bbox_inches='tight')
168 | 
169 | 
if __name__ == "__main__":
    # Directory that holds the result csv file and receives the plots.
    path = 'results'

    # Read csv file
    file = os.path.join(path, 'rmi_lookup.csv')
    df = pd.read_csv(file, delimiter=',', header=0, comment='#')

    # Compute median of lookup times over all runs of the same configuration.
    # numeric_only=True states explicitly that non-numeric columns are dropped;
    # newer pandas versions raise instead of dropping them silently.
    df = df.groupby(['dataset','layer1','layer2','n_models','bounds','search']).median(numeric_only=True).reset_index()

    # Replace datasets, model names, bounds, and searches by their short names
    dataset_dict = {
        "books_200M_uint64": "books",
        "fb_200M_uint64": "fb",
        "osm_cellids_200M_uint64": "osmc",
        "wiki_ts_200M_uint64": "wiki"
    }
    model_dict = {
        "cubic_spline": "CS",
        "linear_spline": "LS",
        "linear_regression": "LR",
        "radix": "RX"
    }
    bounds_dict = {
        "labs": "LAbs",
        "lind": "LInd",
        "gabs": "GAbs",
        "gind": "GInd",
        "none": "NB"
    }
    search_dict = {
        "binary": "Bin",
        "model_biased_binary": "MBin",
        "model_biased_exponential": "MExp",
        "model_biased_linear": "MLin"
    }
    df.replace({**dataset_dict, **model_dict, **bounds_dict, **search_dict}, inplace=True)

    # Compute metrics
    df['size_in_MiB'] = df['size_in_bytes'] / (1024 * 1024)
    df['lookup_in_ns'] = df['lookup_time'] / df['n_samples']

    # Define variable lists (read as globals by the plot functions)
    datasets = sorted(df['dataset'].unique())
    bounds = sorted(df['bounds'].unique())
    searches = sorted(df['search'].unique())
    l1models = sorted(df['layer1'].unique())
    l2models = sorted(df['layer2'].unique())
    corr_configs = [
        ('GAbs','Bin'),
        ('GInd','Bin'),('GInd','MBin'),
        ('LAbs','Bin'),
        ('LInd','Bin'),('LInd','MBin'),
        ('NB','MExp'),('NB','MLin'),
    ]

    # Set colors. plt.get_cmap is used because matplotlib.cm.get_cmap is not
    # available in newer matplotlib releases.
    model_colors = {}
    cmap = plt.get_cmap('tab10')
    n_colors = 10
    for i, (l1, l2) in enumerate(itertools.product(l1models, l2models)):
        model_colors[(l1,l2)] = cmap(i/n_colors)
    corr_colors = {}
    cmap = plt.get_cmap('Dark2')
    n_colors = len(corr_configs)
    for i, (bound, search) in enumerate(corr_configs):
        corr_colors[(bound,search)] = cmap(i/n_colors)

    if args['paper']:
        # Plot model types
        filename = 'rmi_lookup-model_types.pdf'
        print(f'Plotting lookup time by model types to \'{filename}\'...')
        plot_models(filename)

        # Plot error correction
        filename = 'rmi_lookup-error_correction.pdf'
        print(f'Plotting lookup time by error correction to \'{filename}\'...')
        plot_correction(filename)
    else:
        # Plot full results
        filename = 'rmi_lookup-full.pdf'
        print(f'Plotting full lookup time results to \'{filename}\'...')
        plot_full(filename)
253 | 


--------------------------------------------------------------------------------
/scripts/plot_index_comparison.py:
--------------------------------------------------------------------------------
  1 | #!python3
  2 | import argparse
  3 | import matplotlib.cm as cm
  4 | import matplotlib.pyplot as plt
  5 | import numpy as np
  6 | import os
  7 | import pandas as pd
  8 | import warnings
  9 | 
 10 | plt.style.use(os.path.join('scripts', 'matplotlibrc'))
 11 | 
 12 | # Ignore warnings
 13 | warnings.filterwarnings( "ignore")
 14 | 
 15 | # Argparse
 16 | parser = argparse.ArgumentParser()
 17 | parser.add_argument('-p', '--paper', help='produce paper plots', action='store_true')
 18 | args = vars(parser.parse_args())
 19 | 
 20 | 
 21 | def compute_pareto_frontier(source, cost, value):
 22 |     frontier = list()
 23 |     source = source.sort_values(cost)
 24 |     old_val = float('inf')
 25 |     for index, row in source.iterrows():
 26 |         curr_val = row[value]
 27 |         if curr_val < old_val:
 28 |             old_val = curr_val
 29 |             frontier.append(row)
 30 |     result = pd.DataFrame(frontier)
 31 |     return result
 32 | 
 33 | 
 34 | def plot_lookup(filename='index_comparison-lookup_time.pdf', width_fact=5, height_fact=4.2):
 35 |     n_rows = 2
 36 |     n_cols = 2
 37 | 
 38 |     fig, axs = plt.subplots(n_rows, n_cols, figsize=(width_fact*n_cols, height_fact*n_rows), sharey=True, sharex=True)
 39 |     fig.tight_layout()
 40 | 
 41 |     for i, dataset in enumerate(datasets):
 42 |         row = int(i / 2)
 43 |         col = int(i % 2)
 44 |         ax = axs[row,col]
 45 | 
 46 |         # Scatter indexes
 47 |         for index in index_dict.keys():
 48 |             data = df[
 49 |                 (df['dataset']==dataset) &
 50 |                 (df['index']==index)
 51 |             ]
 52 |             if not data.empty and index!='Binary search':
 53 |                 if index=='Compact Hist-Tree' or index=='RadixSpline':
 54 |                     data = compute_pareto_frontier(data, 'size_in_MiB', 'lookup_in_ns')
 55 |                     data = data.sort_values('size_in_MiB')
 56 |                     ax.plot(data['size_in_MiB'], data['lookup_in_ns'], color=colors[index], label=index_dict[index], alpha=0.9)
 57 |                 else:
 58 |                     data = data.sort_values('size_in_MiB')
 59 |                     ax.plot(data['size_in_MiB'], data['lookup_in_ns'], color=colors[index], label=index_dict[index], alpha=0.9)
 60 | 
 61 |         # Title
 62 |         ax.set_title(dataset)
 63 | 
 64 |         # Labels
 65 |         if row==n_rows - 1:
 66 |             ax.set_xlabel('Index size [MiB]')
 67 |         if col==0:
 68 |             ax.set_ylabel('Lookup time [ns]')
 69 | 
 70 |         # Visuals
 71 |         ax.set_xscale('log')
 72 |         ax.set_ylim(bottom=0, top=1250)
 73 | 
 74 |         # Legend
 75 |         if row==0 and col==0:
 76 |             fig.legend(ncol=4, bbox_to_anchor=(0.5, 1), loc='lower center')
 77 | 
 78 |         # Binary search
 79 |         if True:
 80 |             data = df[
 81 |                 (df['dataset']==dataset) &
 82 |                 (df['index']=='Binary search')
 83 |             ].iloc[0]
 84 |             ax.axhline(y=data['lookup_in_ns'], marker='None', color='.2', dashes=(2, 1), label='Binary search')
 85 | 
 86 |     fig.savefig(os.path.join(path, filename), bbox_inches='tight')
 87 | 
 88 | 
 89 | def plot_build(filename='index_comparison-build_time.pdf', width_fact=5, height_fact=4.2):
 90 |     n_cols = 2
 91 |     n_rows = 2
 92 | 
 93 |     fig, axs = plt.subplots(n_rows, n_cols, figsize=(width_fact*n_cols, height_fact*n_rows), sharey=True, sharex=True)
 94 |     fig.tight_layout()
 95 | 
 96 |     for i, dataset in enumerate(datasets):
 97 |         row = int(i / 2)
 98 |         col = int(i % 2)
 99 |         ax = axs[row,col]
100 | 
101 |         # Scatter indexes
102 |         for index in index_dict.keys():
103 |             data = df[
104 |                 (df['dataset']==dataset) &
105 |                 (df['index']==index)
106 |             ]
107 |             if not data.empty and index!='Binary search':
108 |                 if index=='Compact Hist-Tree' or index=='RadixSpline':
109 |                     data = compute_pareto_frontier(data, 'size_in_MiB', 'lookup_in_ns')
110 |                     data = data.sort_values('size_in_MiB')
111 |                     ax.plot(data['size_in_MiB'], data['build_in_s'], color=colors[index], label=index_dict[index], alpha=0.9)
112 |                 else:
113 |                     data = data.sort_values('size_in_MiB')
114 |                     ax.plot(data['size_in_MiB'], data['build_in_s'], color=colors[index], label=index_dict[index], alpha=0.9)
115 | 
116 |         # Title
117 |         ax.set_title(dataset)
118 | 
119 |         # Labels
120 |         if row==n_rows - 1:
121 |             ax.set_xlabel('Index size [MiB]')
122 |         if col==0:
123 |             ax.set_ylabel('Build time [s]')
124 | 
125 |         # Visuals
126 |         ax.set_xscale('log')
127 |         ax.set_ylim(bottom=-1, top=30)
128 | 
129 |         # Legend
130 |         if row==0 and col==0:
131 |             fig.legend(ncol=4, bbox_to_anchor=(0.5, 1), loc='lower center')
132 | 
133 |     fig.savefig(os.path.join(path, filename), bbox_inches='tight')
134 | 
135 | 
def plot_lookup_shares(filename, width_fact=5, height_fact=4.2):
    """Plot, per dataset, how the lookup time of each index splits into evaluation and search.

    For every index in `index_dict` the configuration with the lowest
    `lookup_in_ns` is selected and drawn as a stacked bar (evaluation time at
    the bottom, search time on top). Binary search is drawn first as a
    baseline with zero evaluation time. Indexes without data for a dataset
    get an empty bar.

    Reads the module-level `df`, `datasets`, `index_dict`, `colors` and `path`
    and writes the figure to `os.path.join(path, filename)`.

    :param filename: name of the output file inside `path`
    :param width_fact: width of a single subplot (figure width is `width_fact * n_cols`)
    :param height_fact: height of a single subplot (figure height is `height_fact * n_rows`)
    :raises IndexError: if a dataset has no 'Binary search' row in `df`
    """
    n_cols = len(datasets)
    n_rows = 1

    # squeeze=False keeps `axs` a 2-D array even when there is only one dataset;
    # with the default squeezing a single subplot is returned as a bare Axes
    # object and indexing it would fail.
    fig, axs = plt.subplots(
        n_rows,
        n_cols,
        figsize=(width_fact*n_cols, height_fact*n_rows),
        sharey=True,
        sharex=False,
        squeeze=False,
    )
    fig.tight_layout()

    for col, dataset in enumerate(datasets):
        ax = axs[0, col]

        # Binary search baseline: no model evaluation, the whole lookup is search
        baseline = df[
            (df['dataset']==dataset) &
            (df['index']=='Binary search')
        ].iloc[0]

        # Gather data of fastest configuration per index
        labels = ['Binary search']
        evals = [0]
        searches = [baseline['lookup_in_ns']]
        bar_colors = ['.2']

        for index in index_dict.keys():
            data = df[
                (df['dataset'] == dataset) &
                (df['index'] == index)
            ]
            labels.append(index_dict[index])
            bar_colors.append(colors[index])
            if not data.empty:
                data = data.sort_values(['lookup_in_ns']).reset_index()
                fastest = data.iloc[0] # fastest configuration
                evals.append(fastest['eval_in_ns'])
                searches.append(fastest['search_in_ns'])
            else:
                # Keep the bar slot so all subplots show the same set of indexes
                evals.append(0)
                searches.append(0)

        # Plot results: search time is stacked on top of evaluation time
        ax.bar(labels, evals, color='0.6', edgecolor=bar_colors, linewidth=1, label='Evaluation')
        ax.bar(labels, searches, color='0.9', edgecolor=bar_colors, linewidth=1, label='Search', bottom=evals)

        # Title
        ax.set_title(dataset)

        # Labels
        if col==0:
            ax.set_ylabel('Lookup time [ns]')

        # Visuals
        ax.grid(False, axis='x')
        ax.set_xticklabels(labels=labels, rotation=90)

        # Legend: created once, after the first subplot has been drawn
        if col==0:
            fig.legend(ncol=2, bbox_to_anchor=(0.5, 1), loc='lower center')

    fig.savefig(os.path.join(path, filename), bbox_inches='tight')
198 | 
199 | 
if __name__ == "__main__":
    # Directory that holds the input csv file and receives the plots
    path = 'results'

    # Read csv file
    file = os.path.join(path, 'index_comparison.csv')
    df = pd.read_csv(file, delimiter=',', header=0, comment='#')
    # Replace missing values by a placeholder string.
    # NOTE(review): presumably this keeps rows with a missing group key (e.g. no
    # 'config') from being dropped by the groupby below -- confirm.
    df = df.replace({np.nan: '-'})

    # Compute medians of the numeric columns per (dataset, index, config).
    # numeric_only is passed explicitly: older pandas silently dropped
    # non-numeric columns here, newer versions raise a TypeError instead.
    df = df.groupby(['dataset', 'index', 'config']).median(numeric_only=True).reset_index()

    # Replace datasets
    dataset_dict = {
        "books_200M_uint64": "books",
        "fb_200M_uint64": "fb",
        "osm_cellids_200M_uint64": "osmc",
        "wiki_ts_200M_uint64": "wiki"
    }
    df.replace({**dataset_dict}, inplace=True)

    # Indexes to plot (in this order), mapped to their display names
    index_dict = {
        'RMI-ours': 'RMI (ours)',
        'RMI-ref': 'RMI (ref)',
        'ALEX': 'ALEX',
        'PGM-index': 'PGM-index',
        'RadixSpline': 'RadixSpline',
        'Compact Hist-Tree': 'Hist-Tree',
        'B-tree': 'B-tree',
        'ART': 'ART'
    }

    # Compute metrics
    df['size_in_MiB'] = df['size_in_bytes'] / (1024 * 1024)
    df['build_in_s'] = df['build_time'] / 1000000000
    df['eval_in_ns'] = df['eval_time'] / df['n_samples']
    df['lookup_in_ns'] = df['lookup_time'] / df['n_samples']
    # Search time is whatever remains of the lookup after evaluating the index
    df['search_in_ns'] = df['lookup_in_ns'] - df['eval_in_ns']

    # Define variable lists
    datasets = sorted(df['dataset'].unique())
    indexes = sorted(df['index'].unique())

    # Set colors: one color of the colormap per index, in `index_dict` order
    cmap = cm.get_cmap('tab10')
    n_colors = 10
    colors = {}
    for i, index in enumerate(index_dict.keys()):
        colors[index] = cmap(i/n_colors)

    # Paper plots use explicit (smaller) subplot dimensions; otherwise the
    # plot functions fall back to their default dimensions.
    if args['paper']:
        lookup_dims, build_dims, shares_dims = (4, 2.7), (4, 2.7), (2.1, 2)
    else:
        lookup_dims = build_dims = shares_dims = ()

    # Plot lookup times against index size
    filename = 'index_comparison-lookup_time.pdf'
    print(f'Plotting lookup time results to \'{filename}\'...')
    plot_lookup(filename, *lookup_dims)

    # Plot build times against index size
    filename = 'index_comparison-build_time.pdf'
    print(f'Plotting build time results to \'{filename}\'...')
    plot_build(filename, *build_dims)

    # Plot share of eval time and search time in overall lookup time
    filename = 'index_comparison-lookup_shares.pdf'
    print(f'Plotting lookup time shares to \'{filename}\'...')
    plot_lookup_shares(filename, *shares_dims)
278 | 


--------------------------------------------------------------------------------
/scripts/plot_rmi_build.py:
--------------------------------------------------------------------------------
  1 | #!python3
  2 | import argparse
  3 | import itertools
  4 | import matplotlib.cm as cm
  5 | import matplotlib.pyplot as plt
  6 | import os
  7 | import pandas as pd
  8 | import warnings
  9 | 
 10 | plt.style.use(os.path.join('scripts', 'matplotlibrc'))
 11 | 
 12 | # Ignore warnings
 13 | warnings.filterwarnings( "ignore")
 14 | 
 15 | # Argparse
 16 | parser = argparse.ArgumentParser()
 17 | parser.add_argument('-p', '--paper', help='produce paper plots', action='store_true')
 18 | args = vars(parser.parse_args())
 19 | 
 20 | 
 21 | def plot_ours_full(filename='rmi_build-ours_full.pdf'):
 22 |     rmi='ours'
 23 | 
 24 |     n_rows = len(datasets)
 25 |     n_cols = len(l1models) * len(l2models)
 26 | 
 27 |     configs = itertools.product(l1models, l2models)
 28 | 
 29 |     fig, axs = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4.2*n_rows), sharey=True, sharex=True)
 30 |     fig.tight_layout()
 31 | 
 32 |     for col, (l1, l2) in enumerate(configs):
 33 |         for row, dataset in enumerate(datasets):
 34 |             ax = axs[row,col]
 35 |             for bound in bounds:
 36 |                 data = df[
 37 |                         (df['dataset']==dataset) &
 38 |                         (df['rmi']==rmi) &
 39 |                         (df['layer1']==l1) &
 40 |                         (df['layer2']==l2) &
 41 |                         (df['bounds']==bound)
 42 |                 ]
 43 |                 if not data.empty:
 44 |                     ax.plot(data['size_in_MiB'], data['build_in_s'], c=bound_colors[bound], marker=rmi_markers['ours'], label=bound)
 45 | 
 46 |             # Title
 47 |             ax.set_title(f'{dataset} ({l1}$\mapsto${l2})')
 48 | 
 49 |             # Labels
 50 |             if col==0:
 51 |                 ax.set_ylabel('Build time [s]')
 52 |             if row==n_rows - 1:
 53 |                 ax.set_xlabel('Index size [MiB]')
 54 | 
 55 |             # Visuals
 56 |             ax.set_ylim(bottom=0)
 57 |             ax.set_xscale('log')
 58 | 
 59 |             # Legend
 60 |             if row==0 and col==0:
 61 |                 fig.legend(ncol=len(bounds), bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False)
 62 | 
 63 |     fig.savefig(os.path.join(path, filename), bbox_inches='tight')
 64 | 
 65 | 
 66 | def plot_comp_full(filename='rmi_build-comp_full.pdf'):
 67 |     n_rows = len(datasets)
 68 |     n_cols = len(l1models) * len(l2models)
 69 | 
 70 |     bounds = ['NB','LAbs']
 71 |     configs = itertools.product(l1models, l2models)
 72 | 
 73 |     fig, axs = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4.2*n_rows), sharey=True, sharex=True)
 74 |     fig.tight_layout()
 75 | 
 76 |     for col, (l1, l2) in enumerate(configs):
 77 |         for row, dataset in enumerate(datasets):
 78 |             ax = axs[row,col]
 79 |             for rmi in rmis:
 80 |                 for bound in bounds:
 81 |                     data = df[
 82 |                             (df['dataset']==dataset) &
 83 |                             (df['rmi']==rmi) &
 84 |                             (df['layer1']==l1) &
 85 |                             (df['layer2']==l2) &
 86 |                             (df['bounds']==bound)
 87 |                     ]
 88 |                     if not data.empty:
 89 |                         ax.plot(data['size_in_MiB'], data['build_in_s'], c=bound_colors[bound], marker=rmi_markers[rmi], label=f'{bound} ({rmi})')
 90 | 
 91 |             # Title
 92 |             ax.set_title(f'{dataset} ({l1}$\mapsto${l2})')
 93 | 
 94 |             # Labels
 95 |             if col==0:
 96 |                 ax.set_ylabel('Build time [s]')
 97 |             if row==n_rows - 1:
 98 |                 ax.set_xlabel('Index size [MiB]')
 99 | 
100 |             # Visuals
101 |             ax.set_ylim(bottom=0)
102 |             ax.set_xscale('log')
103 | 
104 |             # Legend
105 |             if row==0 and col==0:
106 |                 fig.legend(ncol=len(bounds), bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False)
107 | 
108 |     fig.savefig(os.path.join(path, filename), bbox_inches='tight')
109 | 
110 | 
def plot_models(dataset, models, bound, filename):
    """Plot build time against index size of the 'ours' RMI for several model combinations.

    Draws one line per (layer1, layer2) pair in `models` for a single dataset
    and a single error-bound type. Pairs without data are skipped.

    Reads the module-level `df`, `model_colors` and `path` and writes the
    figure to `os.path.join(path, filename)`.

    :param dataset: dataset name as it appears in the 'dataset' column of `df`
    :param models: iterable of (layer1, layer2) model name pairs
    :param bound: bound type as it appears in the 'bounds' column of `df`
    :param filename: name of the output file inside `path`
    """
    rmi = 'ours'

    n_cols = 1
    n_rows = 1

    fig, ax = plt.subplots(n_rows, n_cols, figsize=(2.7*n_cols, 2.3*n_rows))
    fig.tight_layout()

    for l1, l2 in models:
        data = df[
            (df['dataset']==dataset) &
            (df['rmi']==rmi) &
            (df['layer1']==l1) &
            (df['layer2']==l2) &
            (df['bounds']==bound)
        ]
        if not data.empty:
            # Raw string: the backslash of \mapsto must reach matplotlib's
            # mathtext unchanged and must not be read as a Python escape
            ax.plot(data['size_in_MiB'], data['build_in_s'], c=model_colors[(l1,l2)], label=rf'{l1}$\mapsto${l2}')

    # Title
    ax.set_title(f'{dataset} ({bound})')

    # Labels
    ax.set_ylabel('Build time [s]')
    ax.set_xlabel('Index size [MiB]')

    # Visuals
    ax.set_xscale('log')
    ax.set_ylim(bottom=0, top=10)

    # Legend
    fig.legend(ncol=2, bbox_to_anchor=(0.5, 1), loc='lower center')

    fig.savefig(os.path.join(path, filename), bbox_inches='tight')
146 | 
147 | 
def plot_bounds(dataset, model, filename):
    """Plot build time against index size of the 'ours' RMI for every error-bound type.

    Draws one line per entry of the module-level `bounds` for a single dataset
    and a single (layer1, layer2) model combination. Bound types without data
    are skipped.

    Reads the module-level `df`, `bounds`, `bound_colors` and `path` and
    writes the figure to `os.path.join(path, filename)`.

    :param dataset: dataset name as it appears in the 'dataset' column of `df`
    :param model: (layer1, layer2) pair of model names
    :param filename: name of the output file inside `path`
    """
    rmi = 'ours'
    l1, l2 = model

    n_cols = 1
    n_rows = 1

    fig, ax = plt.subplots(n_rows, n_cols, figsize=(2.7*n_cols, 2.3*n_rows))
    fig.tight_layout()

    for bound in bounds:
        data = df[
            (df['dataset']==dataset) &
            (df['rmi']==rmi) &
            (df['layer1']==l1) &
            (df['layer2']==l2) &
            (df['bounds']==bound)
        ]
        if not data.empty:
            ax.plot(data['size_in_MiB'], data['build_in_s'], c=bound_colors[bound], label=bound)

    # Title (raw string: the backslash of \mapsto must reach matplotlib's
    # mathtext unchanged and must not be read as a Python escape)
    ax.set_title(rf'{dataset} ({l1}$\mapsto${l2})')

    # Labels
    ax.set_ylabel('Build time [s]')
    ax.set_xlabel('Index size [MiB]')

    # Visuals
    ax.set_xscale('log')
    ax.set_ylim(bottom=0, top=10)

    # Legend
    fig.legend(ncol=3, bbox_to_anchor=(0.5, 1), loc='lower center')

    fig.savefig(os.path.join(path, filename), bbox_inches='tight')
184 | 
185 | 
def plot_comp(dataset, models, bound, filename):
    """Plot build times of all RMI implementations for selected model combinations.

    Draws one line per implementation in the module-level `rmis` and per
    (layer1, layer2) pair in `models`, for a single dataset and a single
    error-bound type. The model pair determines the color, the implementation
    determines the marker. Combinations without data are skipped.

    Reads the module-level `df`, `rmis`, `model_colors`, `rmi_markers` and
    `path` and writes the figure to `os.path.join(path, filename)`.

    :param dataset: dataset name as it appears in the 'dataset' column of `df`
    :param models: list of (layer1, layer2) model name pairs (iterated once per implementation)
    :param bound: bound type as it appears in the 'bounds' column of `df`
    :param filename: name of the output file inside `path`
    """
    n_cols = 1
    n_rows = 1

    fig, ax = plt.subplots(n_rows, n_cols, figsize=(2.7*n_cols, 2.3*n_rows))
    fig.tight_layout()

    for rmi in rmis:
        for l1, l2 in models:
            data = df[
                (df['dataset']==dataset) &
                (df['rmi']==rmi) &
                (df['layer1']==l1) &
                (df['layer2']==l2) &
                (df['bounds']==bound)
            ]
            if not data.empty:
                # Raw string: the backslash of \mapsto must reach matplotlib's
                # mathtext unchanged and must not be read as a Python escape
                ax.plot(data['size_in_MiB'], data['build_in_s'], c=model_colors[(l1,l2)], marker=rmi_markers[rmi], label=rf'{l1}$\mapsto${l2} ({rmi})')

    # Title
    ax.set_title(f'{dataset} ({bound})')

    # Labels
    ax.set_ylabel('Build time [s]')
    ax.set_xlabel('Index size [MiB]')

    # Axes
    ax.set_xscale('log')
    ax.set_ylim(bottom=0)

    # Legend
    fig.legend(ncol=2, bbox_to_anchor=(0.5, 1), loc='lower center')

    fig.savefig(os.path.join(path, filename), bbox_inches='tight')
220 | 
221 | 
if __name__ == "__main__":
    # Directory that holds the input csv file and receives the plots
    path = 'results'

    # Read csv file
    file = os.path.join(path, 'rmi_build.csv')
    df = pd.read_csv(file, delimiter=',', header=0, comment='#')

    # Compute the median of the numeric measurements per configuration.
    # numeric_only is passed explicitly: older pandas silently dropped
    # non-numeric columns here, newer versions raise a TypeError instead.
    group_keys = ['dataset','rmi','layer1','layer2','n_models','bounds']
    df = df.groupby(group_keys).median(numeric_only=True).reset_index()

    # Replace datasets, model names, and bounds by their short display names
    dataset_dict = {
        "books_200M_uint64": "books",
        "fb_200M_uint64": "fb",
        "osm_cellids_200M_uint64": "osmc",
        "wiki_ts_200M_uint64": "wiki"
    }
    model_dict = {
        "cubic_spline": "CS",
        "linear_spline": "LS",
        "linear_regression": "LR",
        "radix": "RX"
    }
    bounds_dict = {
        "labs": "LAbs",
        "lind": "LInd",
        "gabs": "GAbs",
        "gind": "GInd",
        "none": "NB"
    }
    df.replace({**dataset_dict, **model_dict, **bounds_dict}, inplace=True)

    # Compute metrics
    df['size_in_MiB'] = df['size_in_bytes'] / (1024 * 1024)
    df['build_in_s'] = df['build_time'] / 1_000_000_000

    # Define variable lists
    datasets = sorted(df['dataset'].unique())
    rmis = sorted(df['rmi'].unique())
    bounds = sorted(df['bounds'].unique())
    l1models = sorted(df['layer1'].unique())
    l2models = sorted(df['layer2'].unique())

    # Set colors and markers
    # One 'tab10' color per (layer1, layer2) combination
    cmap = cm.get_cmap('tab10')
    n_colors = 10
    model_colors = {
        pair: cmap(i/n_colors)
        for i, pair in enumerate(itertools.product(l1models, l2models))
    }
    # One 'Dark2' color per bound type
    cmap = cm.get_cmap('Dark2')
    n_colors = 8
    bound_colors = {
        bound: cmap(i/n_colors)
        for i, bound in enumerate(bounds)
    }
    # One marker per RMI implementation
    rmi_markers = {'ours': '.', 'ref': 'x'}

    if args['paper']:
        # Plot layer1
        filename = 'rmi_build-layer1.pdf'
        print(f'Plotting build times by layer 1 to \'{filename}\'...')
        plot_models('books', [('CS','LR'),('LR','LR'),('LS','LR'),('RX','LR')], 'NB', filename)

        # Plot layer2
        filename = 'rmi_build-layer2.pdf'
        print(f'Plotting build times by layer 2 to \'{filename}\'...')
        plot_models('books', [('LS','LS'),('LS','LR'),('RX','LS'),('RX','LR')], 'NB', filename)

        # Plot bounds
        filename = 'rmi_build-bounds.pdf'
        print(f'Plotting build times by error bounds to \'{filename}\'...')
        plot_bounds('books', ('LS','LR'), filename)

        # Plot comparison NB
        filename = 'rmi_build-comp_nb.pdf'
        print(f'Plotting build time comparison to reference implementation (NB) to \'{filename}\'...')
        plot_comp('books', [('LS','LR'),('RX','LS')], 'NB', filename)

        # Plot comparison LAbs
        filename = 'rmi_build-comp_labs.pdf'
        print(f'Plotting build time comparison to reference implementation (LAbs) to \'{filename}\'...')
        plot_comp('books', [('LS','LR'),('RX','LS')], 'LAbs', filename)
    else:
        # Plot ours
        filename = 'rmi_build-ours_full.pdf'
        print(f'Plotting full build time results to \'{filename}\'...')
        plot_ours_full(filename)

        # Plot reference
        filename = 'rmi_build-comp_full.pdf'
        print(f'Plotting full build time comparison to reference implementation to \'{filename}\'...')
        plot_comp_full(filename)
313 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/experiments/rmi_lookup.cpp:
--------------------------------------------------------------------------------
  1 | #include <chrono>
  2 | #include <random>
  3 | 
  4 | #include "argparse/argparse.hpp"
  5 | 
  6 | #include "rmi/models.hpp"
  7 | #include "rmi/rmi.hpp"
  8 | #include "rmi/util/fn.hpp"
  9 | #include "rmi/util/search.hpp"
 10 | 
 11 | using key_type = uint64_t;
 12 | using namespace std::chrono;
 13 | 
 14 | std::size_t s_glob; ///< Global sink that receives each repetition's lookup checksum (presumably so the timed lookups are not optimized away).
 15 | 
 16 | 
 17 | /**
 18 |  * Measures lookup times of @p samples on a given @p Rmi and writes one CSV row per repetition to `std::cout`.
 19 |  * @tparam Key key type (not referenced in the function body)
 20 |  * @tparam Rmi RMI type
 21 |  * @tparam Search search type
 22 |  * @param keys on which the RMI is built
 23 |  * @param n_models number of models in the second layer of the RMI
 24 |  * @param samples for which the lookup time is measured
 25 |  * @param n_reps number of repetitions
 26 |  * @param dataset_name name of the dataset
 27 |  * @param layer1 model type of the first layer
 28 |  * @param layer2 model type of the second layer
 29 |  * @param bound_type used by the RMI
 30 |  * @param search used by the RMI for correcting prediction errors
 31 |  */
 32 | template<typename Key, typename Rmi, typename Search>
 33 | void experiment(const std::vector<key_type> &keys,
 34 |                 const std::size_t n_models,
 35 |                 const std::vector<key_type> &samples,
 36 |                 const std::size_t n_reps,
 37 |                 const std::string dataset_name,
 38 |                 const std::string layer1,
 39 |                 const std::string layer2,
 40 |                 const std::string bound_type,
 41 |                 const std::string search)
 42 | {
 43 |     using rmi_type = Rmi;
 44 |     auto search_fn = Search();
 45 | 
 46 |     // Build the RMI once; the same index is reused by every repetition.
 47 |     rmi_type rmi(keys, n_models);
 48 | 
 49 |     // Perform n_reps runs.
 50 |     for (std::size_t rep = 0; rep != n_reps; ++rep) {
 51 | 
 52 |         // Time the lookups of all samples as a single measurement.
 53 |         std::size_t lookup_accu = 0;
 54 |         auto start = steady_clock::now();
 55 |         for (std::size_t i = 0; i != samples.size(); ++i) {
 56 |             auto key = samples.at(i);
 57 |             auto range = rmi.search(key);
 58 |             auto pos = search_fn(keys.begin() + range.lo, keys.begin() + range.hi, keys.begin() + range.pos, key);
 59 |             lookup_accu += std::distance(keys.begin(), pos);
 60 |         }
 61 |         auto stop = steady_clock::now();
 62 |         auto lookup_time = duration_cast<nanoseconds>(stop - start).count();
 63 |         s_glob = lookup_accu; // publish the checksum of found positions to the global sink
 64 | 
 65 |         // Report results.
 66 |                   // Dataset
 67 |         std::cout << dataset_name << ','
 68 |                   << keys.size() << ','
 69 |                   // Index
 70 |                   << layer1 << ','
 71 |                   << layer2 << ','
 72 |                   << n_models << ','
 73 |                   << bound_type << ','
 74 |                   << search << ','
 75 |                   << rmi.size_in_bytes() << ','
 76 |                   // Experiment
 77 |                   << rep << ','
 78 |                   << samples.size() << ','
 79 |                   // Results (total time for all samples in nanoseconds)
 80 |                   << lookup_time << ','
 81 |                   // Checksums
 82 |                   << lookup_accu << std::endl;
 83 |     } // reps
 84 | }
 85 | 
 86 | 
 87 | /**
 88 |  * @brief Pointer to a function with the signature shared by all instantiations of the `experiment` template.
 89 |  */
 90 | typedef void (*exp_fn_ptr)(const std::vector<key_type>&,
 91 |                            const std::size_t,
 92 |                            const std::vector<key_type>&,
 93 |                            const std::size_t,
 94 |                            const std::string,
 95 |                            const std::string,
 96 |                            const std::string,
 97 |                            const std::string,
 98 |                            const std::string);
 99 | 
100 | /**
101 |  * RMI configuration that holds the string representation of the model types of layer 1 and layer 2, the error bound
102 |  * type, and the search algorithm. Used as the key of `exp_map`.
103 |  */
104 | struct Config {
105 |     std::string layer1;     ///< Model type of the first layer, e.g. "linear_spline".
106 |     std::string layer2;     ///< Model type of the second layer, e.g. "linear_regression".
107 |     std::string bound_type; ///< Error bound type, e.g. "none" or "labs".
108 |     std::string search;     ///< Search algorithm, e.g. "binary".
109 | };
110 | 
111 | /**
112 |  * Comparator for @p Config objects: orders lexicographically by layer1, layer2, bound_type, and finally search.
113 |  */
114 | struct ConfigCompare {
115 |     bool operator() (const Config &lhs, const Config &rhs) const {
116 |         if (lhs.layer1 != rhs.layer1) return lhs.layer1 < rhs.layer1;
117 |         if (lhs.layer2 != rhs.layer2) return lhs.layer2 < rhs.layer2;
118 |         if (lhs.bound_type != rhs.bound_type) return lhs.bound_type < rhs.bound_type;
119 |         return lhs.search < rhs.search;
120 |     }
121 | };
122 | 
123 | #define ENTRIES(L1, L2, LT1, LT2) \
124 |     { {#L1, #L2, "none", "binary"}, &experiment<key_type, rmi::Rmi<key_type, LT1, LT2>, BinarySearch> }, \
125 |     { {#L1, #L2, "labs", "binary"}, &experiment<key_type, rmi::RmiLAbs<key_type, LT1, LT2>, BinarySearch> }, \
126 |     { {#L1, #L2, "lind", "binary"}, &experiment<key_type, rmi::RmiLInd<key_type, LT1, LT2>, BinarySearch> }, \
127 |     { {#L1, #L2, "gabs", "binary"}, &experiment<key_type, rmi::RmiGAbs<key_type, LT1, LT2>, BinarySearch> }, \
128 |     { {#L1, #L2, "gind", "binary"}, &experiment<key_type, rmi::RmiGInd<key_type, LT1, LT2>, BinarySearch> }, \
129 |     { {#L1, #L2, "none", "model_biased_binary"}, &experiment<key_type, rmi::Rmi<key_type, LT1, LT2>, ModelBiasedBinarySearch> }, \
130 |     { {#L1, #L2, "labs", "model_biased_binary"}, &experiment<key_type, rmi::RmiLAbs<key_type, LT1, LT2>, ModelBiasedBinarySearch> }, \
131 |     { {#L1, #L2, "lind", "model_biased_binary"}, &experiment<key_type, rmi::RmiLInd<key_type, LT1, LT2>, ModelBiasedBinarySearch> }, \
132 |     { {#L1, #L2, "gabs", "model_biased_binary"}, &experiment<key_type, rmi::RmiGAbs<key_type, LT1, LT2>, ModelBiasedBinarySearch> }, \
133 |     { {#L1, #L2, "gind", "model_biased_binary"}, &experiment<key_type, rmi::RmiGInd<key_type, LT1, LT2>, ModelBiasedBinarySearch> }, \
134 |     { {#L1, #L2, "none", "linear"}, &experiment<key_type, rmi::Rmi<key_type, LT1, LT2>, LinearSearch> }, \
135 |     { {#L1, #L2, "labs", "linear"}, &experiment<key_type, rmi::RmiLAbs<key_type, LT1, LT2>, LinearSearch> }, \
136 |     { {#L1, #L2, "lind", "linear"}, &experiment<key_type, rmi::RmiLInd<key_type, LT1, LT2>, LinearSearch> }, \
137 |     { {#L1, #L2, "gabs", "linear"}, &experiment<key_type, rmi::RmiGAbs<key_type, LT1, LT2>, LinearSearch> }, \
138 |     { {#L1, #L2, "gind", "linear"}, &experiment<key_type, rmi::RmiGInd<key_type, LT1, LT2>, LinearSearch> }, \
139 |     { {#L1, #L2, "none", "model_biased_linear"}, &experiment<key_type, rmi::Rmi<key_type, LT1, LT2>, ModelBiasedLinearSearch> }, \
140 |     { {#L1, #L2, "labs", "model_biased_linear"}, &experiment<key_type, rmi::RmiLAbs<key_type, LT1, LT2>, ModelBiasedLinearSearch> }, \
141 |     { {#L1, #L2, "lind", "model_biased_linear"}, &experiment<key_type, rmi::RmiLInd<key_type, LT1, LT2>, ModelBiasedLinearSearch> }, \
142 |     { {#L1, #L2, "gabs", "model_biased_linear"}, &experiment<key_type, rmi::RmiGAbs<key_type, LT1, LT2>, ModelBiasedLinearSearch> }, \
143 |     { {#L1, #L2, "gind", "model_biased_linear"}, &experiment<key_type, rmi::RmiGInd<key_type, LT1, LT2>, ModelBiasedLinearSearch> }, \
144 |     { {#L1, #L2, "none", "exponential"}, &experiment<key_type, rmi::Rmi<key_type, LT1, LT2>, ExponentialSearch> }, \
145 |     { {#L1, #L2, "labs", "exponential"}, &experiment<key_type, rmi::RmiLAbs<key_type, LT1, LT2>, ExponentialSearch> }, \
146 |     { {#L1, #L2, "lind", "exponential"}, &experiment<key_type, rmi::RmiLInd<key_type, LT1, LT2>, ExponentialSearch> }, \
147 |     { {#L1, #L2, "gabs", "exponential"}, &experiment<key_type, rmi::RmiGAbs<key_type, LT1, LT2>, ExponentialSearch> }, \
148 |     { {#L1, #L2, "gind", "exponential"}, &experiment<key_type, rmi::RmiGInd<key_type, LT1, LT2>, ExponentialSearch> }, \
149 |     { {#L1, #L2, "none", "model_biased_exponential"}, &experiment<key_type, rmi::Rmi<key_type, LT1, LT2>, ModelBiasedExponentialSearch> }, \
150 |     { {#L1, #L2, "labs", "model_biased_exponential"}, &experiment<key_type, rmi::RmiLAbs<key_type, LT1, LT2>, ModelBiasedExponentialSearch> }, \
151 |     { {#L1, #L2, "lind", "model_biased_exponential"}, &experiment<key_type, rmi::RmiLInd<key_type, LT1, LT2>, ModelBiasedExponentialSearch> }, \
152 |     { {#L1, #L2, "gabs", "model_biased_exponential"}, &experiment<key_type, rmi::RmiGAbs<key_type, LT1, LT2>, ModelBiasedExponentialSearch> }, \
153 |     { {#L1, #L2, "gind", "model_biased_exponential"}, &experiment<key_type, rmi::RmiGInd<key_type, LT1, LT2>, ModelBiasedExponentialSearch> }, \
154 | 
155 | static std::map<Config, exp_fn_ptr, ConfigCompare> exp_map {
156 |     ENTRIES(linear_regression, linear_regression, rmi::LinearRegression, rmi::LinearRegression)
157 |     ENTRIES(linear_regression, linear_spline,     rmi::LinearRegression, rmi::LinearSpline)
158 |     ENTRIES(linear_spline,     linear_regression, rmi::LinearSpline,     rmi::LinearRegression)
159 |     ENTRIES(linear_spline,     linear_spline,     rmi::LinearSpline,     rmi::LinearSpline)
160 |     ENTRIES(cubic_spline,      linear_regression, rmi::CubicSpline,      rmi::LinearRegression)
161 |     ENTRIES(cubic_spline,      linear_spline,     rmi::CubicSpline,      rmi::LinearSpline)
162 |     ENTRIES(radix,             linear_regression, rmi::Radix<key_type>,  rmi::LinearRegression)
163 |     ENTRIES(radix,             linear_spline,     rmi::Radix<key_type>,  rmi::LinearSpline)
164 | }; ///< Maps each supported RMI configuration (30 bound/search combinations per layer pair) to its `experiment` instantiation.
165 | #undef ENTRIES
166 | 
167 | 
168 | /**
169 |  * Triggers measurement of lookup times for an RMI configuration provided via command line arguments.
170 |  *
171 |  * Exits with `EXIT_FAILURE` if the arguments cannot be parsed, the dataset contains no keys, or the requested
172 |  * configuration is not registered in `exp_map`.
173 |  * @param argc arguments counter
174 |  * @param argv arguments vector
175 |  */
176 | int main(int argc, char *argv[])
177 | {
178 |     // Initialize argument parser.
179 |     argparse::ArgumentParser program(argv[0], "0.1");
180 | 
181 |     // Define arguments.
182 |     program.add_argument("filename")
183 |         .help("path to binary file containing uint64_t keys");
184 | 
185 |     program.add_argument("layer1")
186 |         .help("layer1 model type, either linear_regression, linear_spline, cubic_spline, or radix.");
187 | 
188 |     program.add_argument("layer2")
189 |         .help("layer2 model type, either linear_regression or linear_spline.");
190 | 
191 |     program.add_argument("n_models")
192 |         .help("number of models on layer2, power of two is recommended.")
193 |         .action([](const std::string &s) { return std::stoul(s); });
194 | 
195 |     program.add_argument("bound_type")
196 |         .help("type of error bounds used, either none, labs, lind, gabs, or gind.");
197 | 
198 |     program.add_argument("search")
199 |         .help("search algorithm for error correction, either binary, model_biased_binary, exponential, model_biased_exponential, linear, or model_biased_linear.");
200 | 
201 |     program.add_argument("-n", "--n_reps")
202 |         .help("number of experiment repetitions")
203 |         .default_value(std::size_t(3))
204 |         .action([](const std::string &s) { return std::stoul(s); });
205 | 
206 |     program.add_argument("-s", "--n_samples")
207 |         .help("number of sampled lookup keys")
208 |         .default_value(std::size_t(1'000'000))
209 |         .action([](const std::string &s) { return std::stoul(s); });
210 | 
211 |     program.add_argument("--header")
212 |         .help("output csv header")
213 |         .default_value(false)
214 |         .implicit_value(true);
215 | 
216 |     // Parse arguments. std::exception is caught (not only std::runtime_error) because std::stoul, used by the
217 |     // numeric actions above, reports malformed numbers with std::invalid_argument or std::out_of_range.
218 |     try {
219 |         program.parse_args(argc, argv);
220 |     }
221 |     catch (const std::exception &err) {
222 |         std::cout << err.what() << '\n' << program;
223 |         exit(EXIT_FAILURE);
224 |     }
225 | 
226 |     // Read arguments.
227 |     const auto filename = program.get<std::string>("filename");
228 |     const auto dataset_name = split(filename, '/').back();
229 |     const auto layer1 = program.get<std::string>("layer1");
230 |     const auto layer2 = program.get<std::string>("layer2");
231 |     const auto n_models = program.get<std::size_t>("n_models");
232 |     const auto bound_type = program.get<std::string>("bound_type");
233 |     const auto search = program.get<std::string>("search");
234 |     const auto n_reps = program.get<std::size_t>("-n");
235 |     const auto n_samples = program.get<std::size_t>("-s");
236 | 
237 |     // Load keys.
238 |     auto keys = load_data<key_type>(filename);
239 |     if (keys.empty()) { // sampling below needs at least one key (keys.size() - 1 would wrap around)
240 |         std::cerr << "Error: " << filename << " does not contain any keys." << std::endl;
241 |         exit(EXIT_FAILURE);
242 |     }
243 | 
244 |     // Sample keys with a fixed seed so that runs are reproducible. The distribution is instantiated with
245 |     // std::size_t so that the upper bound is not narrowed to int for large datasets.
246 |     uint64_t seed = 42;
247 |     std::mt19937 gen(seed);
248 |     std::uniform_int_distribution<std::size_t> distrib(0, keys.size() - 1);
249 |     std::vector<key_type> samples;
250 |     samples.reserve(n_samples);
251 |     for (std::size_t i = 0; i != n_samples; ++i)
252 |         samples.push_back(keys[distrib(gen)]);
253 | 
254 |     // Lookup experiment: search the map once and reuse the iterator.
255 |     const Config config{layer1, layer2, bound_type, search};
256 |     const auto entry = exp_map.find(config);
257 |     if (entry == exp_map.end()) {
258 |         std::cerr << "Error: " << layer1 << ',' << layer2 << ',' << bound_type << ',' << search << " is not a valid RMI configuration." << std::endl;
259 |         exit(EXIT_FAILURE);
260 |     }
261 |     exp_fn_ptr exp_fn = entry->second;
262 | 
263 |     // Output header. No trailing comma, so the header has exactly as many columns as the data rows.
264 |     if (program["--header"]  == true)
265 |         std::cout << "dataset,"
266 |                   << "n_keys,"
267 |                   << "layer1,"
268 |                   << "layer2,"
269 |                   << "n_models,"
270 |                   << "bounds,"
271 |                   << "search,"
272 |                   << "size_in_bytes,"
273 |                   << "rep,"
274 |                   << "n_samples,"
275 |                   << "lookup_time,"
276 |                   << "lookup_accu"
277 |                   << std::endl;
278 | 
279 |     // Run experiment.
280 |     (*exp_fn)(keys, n_models, samples, n_reps, dataset_name, layer1, layer2, bound_type, search);
281 | 
282 |     exit(EXIT_SUCCESS);
283 | }
284 | 


--------------------------------------------------------------------------------
/include/rmi/models.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cmath>
  4 | #include <x86intrin.h>
  5 | 
  6 | #include "rmi/util/fn.hpp"
  7 | 
  8 | namespace rmi {
  9 | 
 10 | /**
 11 |  * A model that fits a linear segment from the first to the last data point.
 12 |  *
 13 |  * We assume that x-values are sorted in ascending order and y-values are handed implicitly where @p offset and @p
 14 |  * offset + distance(first, last) are the first and last y-value, respectively. The y-values can be scaled by
 15 |  * providing a @p compression_factor.
 16 |  */
 17 | class LinearSpline
 18 | {
 19 |     private:
 20 |     double slope_;     ///< The slope of the linear segment.
 21 |     double intercept_; ///< The y-intercept of the linear segment.
 22 | 
 23 |     public:
 24 |     /**
 25 |      * Default constructor. Leaves slope and intercept uninitialized.
 26 |      */
 27 |     LinearSpline() = default;
 28 | 
 29 |     /**
 30 |      * Builds a linear segment between the first and last data point.
 31 |      * @param first, last iterators to the first and last x-value the linear segment is fit on
 32 |      * @param offset first y-value the linear segment is fit on
 33 |      * @param compression_factor by which the y-values are scaled
 34 |      */
 35 |     template<typename RandomIt>
 36 |     LinearSpline(RandomIt first, RandomIt last, std::size_t offset = 0, double compression_factor = 1.f) {
 37 |         std::size_t n = std::distance(first, last);
 38 | 
 39 |         if (n == 0) {
 40 |             slope_ = 0.f;
 41 |             intercept_ = 0.f;
 42 |             return;
 43 |         }
 44 |         if (n == 1) {
 45 |             slope_ = 0.f;
 46 |             intercept_ = static_cast<double>(offset) * compression_factor;
 47 |             return;
 48 |         }
 49 | 
 50 |         double numerator = static_cast<double>(n); // (offset + n) - offset
 51 |         double denominator = static_cast<double>(*(last - 1) - *first);
 52 | 
 53 |         slope_ = denominator != 0.0 ? numerator/denominator * compression_factor : 0.0; // all x equal: constant model
 54 |         intercept_ = offset * compression_factor - slope_ * *first;
 55 |     }
 56 | 
 57 |     /**
 58 |      * Returns the estimated y-value of @p x.
 59 |      * @param x to estimate a y-value for
 60 |      * @return the estimated y-value for @p x
 61 |      */
 62 |     template<typename X>
 63 |     double predict(const X x) const { return std::fma(slope_, static_cast<double>(x), intercept_); }
 64 | 
 65 |     /**
 66 |      * Returns the slope of the linear segment.
 67 |      * @return the slope of the linear segment
 68 |      */
 69 |     double slope() const { return slope_; }
 70 | 
 71 |     /**
 72 |      * Returns the y-intercept of the linear segment.
 73 |      * @return the y-intercept of the linear segment
 74 |      */
 75 |     double intercept() const { return intercept_; }
 76 | 
 77 |     /**
 78 |      * Returns the size of the linear segment in bytes. Const so that it can be queried on const models.
 79 |      * @return segment size in bytes.
 80 |      */
 81 |     std::size_t size_in_bytes() const { return 2 * sizeof(double); }
 82 | 
 83 |     /**
 84 |      * Writes the mathematical representation of the linear segment to an output stream.
 85 |      * @param out output stream to write the linear segment to
 86 |      * @param m the linear segment
 87 |      * @returns the output stream
 88 |      */
 89 |     friend std::ostream & operator<<(std::ostream &out, const LinearSpline &m) {
 90 |         return out << m.slope() << " * x + " << m.intercept();
 91 |     }
 92 | };
 93 | 
 94 | 
 95 | /**
 96 |  * A linear regression model that fits a straight line to minimize the mean squared error.
 97 |  *
 98 |  * We assume that x-values are sorted in ascending order and y-values are handed implicitly where @p offset and @p
 99 |  * offset + distance(first, last) are the first and last y-value, respectively. The y-values can be scaled by
100 |  * providing a @p compression_factor.
101 |  */
102 | class LinearRegression
103 | {
104 |     private:
105 |     double slope_;     ///< The slope of the linear function.
106 |     double intercept_; ///< The y-intercept of the linear function.
107 | 
108 |     public:
109 |     /**
110 |      * Default constructor. Leaves slope and intercept uninitialized.
111 |      */
112 |     LinearRegression() = default;
113 | 
114 |     /**
115 |      * Builds a linear regression model on the given data points.
116 |      * @param first, last iterators to the first and last x-value the linear regression is fit on
117 |      * @param offset first y-value the linear regression is fit on
118 |      * @param compression_factor by which the y-values are scaled
119 |      */
120 |     template<typename RandomIt>
121 |     LinearRegression(RandomIt first, RandomIt last, std::size_t offset = 0, double compression_factor = 1.f) {
122 |         std::size_t n = std::distance(first, last);
123 | 
124 |         if (n == 0) {
125 |             slope_ = 0.f;
126 |             intercept_ = 0.f;
127 |             return;
128 |         }
129 |         if (n == 1) {
130 |             slope_ = 0.f;
131 |             intercept_ = static_cast<double>(offset) * compression_factor;
132 |             return;
133 |         }
134 | 
135 |         double mean_x = 0.0; // running mean of the x-values
136 |         double mean_y = 0.0; // running mean of the (unscaled) y-values
137 |         double c = 0.0;      // running co-moment of x and y
138 |         double m2 = 0.0;     // running second moment of x
139 | 
140 |         for (std::size_t i = 0; i != n; ++i) {
141 |             auto x = *(first + i);
142 |             std::size_t y = offset + i;
143 | 
144 |             double dx = x - mean_x;
145 |             mean_x += dx /  (i + 1);
146 |             mean_y += (y - mean_y) / (i + 1);
147 |             c += dx * (y - mean_y);
148 | 
149 |             double dx2 = x - mean_x;
150 |             m2 += dx * dx2;
151 |         }
152 | 
153 |         double cov = c / (n - 1);
154 |         double var = m2 / (n - 1);
155 | 
156 |         if (var == 0.f) {
157 |             slope_  = 0.f;
158 |             intercept_ = mean_y * compression_factor; // all x equal: constant model, scaled like every other branch
159 |             return;
160 |         }
161 | 
162 |         slope_ = cov / var * compression_factor;
163 |         intercept_ = mean_y * compression_factor - slope_ * mean_x;
164 |     }
165 | 
166 |     /**
167 |      * Returns the estimated y-value of @p x.
168 |      * @param x to estimate a y-value for
169 |      * @return the estimated y-value for @p x
170 |      */
171 |     template<typename X>
172 |     double predict(const X x) const { return std::fma(slope_, static_cast<double>(x), intercept_); }
173 | 
174 |     /**
175 |      * Returns the slope of the linear regression model.
176 |      * @return the slope of the linear regression model
177 |      */
178 |     double slope() const { return slope_; }
179 | 
180 |     /**
181 |      * Returns the y-intercept of the linear regression model.
182 |      * @return the y-intercept of the linear regression model
183 |      */
184 |     double intercept() const { return intercept_; }
185 | 
186 |     /**
187 |      * Returns the size of the linear regression model in bytes. Const so that it can be queried on const models.
188 |      * @return model size in bytes.
189 |      */
190 |     std::size_t size_in_bytes() const { return 2 * sizeof(double); }
191 | 
192 |     /**
193 |      * Writes the mathematical representation of the linear regression model to an output stream.
194 |      * @param out output stream to write the linear regression model to
195 |      * @param m the linear regression model
196 |      * @returns the output stream
197 |      */
198 |     friend std::ostream & operator<<(std::ostream &out, const LinearRegression &m) {
199 |         return out << m.slope() << " * x + " << m.intercept();
200 |     }
201 | };
202 | 
203 | 
204 | /**
205 |  * A model that fits a cubic segment from the first to the last data point.
206 |  *
207 |  * We assume that x-values are sorted in ascending order and y-values are handed implicitly where @p offset and @p
208 |  * offset + distance(first, last) are the first and last y-value, respectively. The y-values can be scaled by
209 |  * providing a @p compression_factor.
210 |  */
211 | class CubicSpline
212 | {
213 |     private:
214 |     double a_; ///< The cubic coefficient.
215 |     double b_; ///< The quadratic coefficient.
216 |     double c_; ///< The linear coefficient.
217 |     double d_; ///< The y-intercept.
218 | 
219 |     public:
220 |     /**
221 |      * Default constructor. Leaves the coefficients uninitialized.
222 |      */
223 |     CubicSpline() = default;
224 | 
225 |     /**
226 |      * Builds a cubic segment between the first and last data point.
227 |      * @param first, last iterators to the first and last x-value the cubic segment is fit on
228 |      * @param offset first y-value the cubic segment is fit on
229 |      * @param compression_factor by which the y-values are scaled
230 |      */
231 |     template<typename RandomIt>
232 |     CubicSpline(RandomIt first, RandomIt last, std::size_t offset = 0, double compression_factor = 1.f) {
233 |         std::size_t n = std::distance(first, last);
234 | 
235 |         if (n == 0) {
236 |             a_ = 0.f;
237 |             b_ = 0.f;
238 |             c_ = 1.f;
239 |             d_ = 0.f;
240 |             return;
241 |         }
242 |         if (n == 1 or *first == *(last - 1)) {
243 |             a_ = 0.f;
244 |             b_ = 0.f;
245 |             c_ = 0.f;
246 |             d_ = static_cast<double>(offset) * compression_factor;
247 |             return;
248 |         }
249 | 
250 |         double xmin = static_cast<double>(*first);
251 |         double ymin = static_cast<double>(offset) * compression_factor;
252 |         double xmax = static_cast<double>(*(last - 1));
253 |         double ymax = static_cast<double>(offset + n - 1) * compression_factor;
254 | 
255 |         double x1 = 0.0; // end points of the segment after normalizing x and y to [0, 1]
256 |         double y1 = 0.0;
257 |         double x2 = 1.0;
258 |         double y2 = 1.0;
259 | 
260 |         double sxn = 0.0, syn = 0.0; // first normalized data point to the right of the left end point
261 |         for (std::size_t i = 0; i != n; ++i) {
262 |             double x = static_cast<double>(*(first + i));
263 |             double y = static_cast<double>(offset + i) * compression_factor;
264 |             sxn = (x - xmin) / (xmax - xmin);
265 |             if (sxn > 0.0) {
266 |                 syn = (y - ymin) / (ymax - ymin);
267 |                 break;
268 |             }
269 |         }
270 |         double m1 = (syn - y1) / (sxn - x1);
271 | 
272 |         double sxp = 0.0, syp = 0.0; // last normalized data point to the left of the right end point
273 |         for (std::size_t i = n; i-- != 0; ) { // scan backwards; a forward scan always stops at i == 0 (m2 == 1)
274 |             double x = static_cast<double>(*(first + i));
275 |             double y = static_cast<double>(offset + i) * compression_factor;
276 |             sxp = (x - xmin) / (xmax - xmin);
277 |             if (sxp < 1.0) {
278 |                 syp = (y - ymin) / (ymax - ymin);
279 |                 break;
280 |             }
281 |         }
282 |         double m2 = (y2 - syp) / (x2 - sxp);
283 | 
284 |         if (std::pow(m1, 2.0) + std::pow(m2, 2.0) > 9.0) { // rescale the end slopes onto the circle of radius 3
285 |             double tau = 3.0 / std::sqrt(std::pow(m1, 2.0) + std::pow(m2, 2.0));
286 |             m1 *= tau;
287 |             m2 *= tau;
288 |         }
289 | 
290 |         a_ = (m1 + m2 - 2.0)
291 |             / std::pow(xmax - xmin, 3.0);
292 | 
293 |         b_ = -(xmax * (2.0 * m1 + m2 - 3.0) + xmin * (m1 + 2.0 * m2 - 3.0))
294 |             / std::pow(xmax - xmin, 3.0);
295 | 
296 |         c_ = (m1 * std::pow(xmax, 2.0) + m2 * std::pow(xmin, 2.0) + xmax * xmin * (2.0 * m1 + 2.0 * m2 - 6.0))
297 |             / std::pow(xmax - xmin, 3.0);
298 | 
299 |         d_ = -xmin * (m1 * std::pow(xmax, 2.0) + xmax * xmin * (m2 - 3.0) + std::pow(xmin, 2.0))
300 |             / std::pow(xmax - xmin, 3.0);
301 | 
302 |         a_ *= ymax - ymin; // map the normalized polynomial back to the original y-range
303 |         b_ *= ymax - ymin;
304 |         c_ *= ymax - ymin;
305 |         d_ *= ymax - ymin;
306 |         d_ += ymin;
307 | 
308 |         // Check if linear spline performs better.
309 |         // LinearSpline ls(first, last, offset, compression_factor);
310 | 
311 |         // double ls_error = 0.f;
312 |         // double cs_error = 0.f;
313 | 
314 |         // for (std::size_t i = 0; i != n; ++i) {
315 |         //     double y = (offset +i) * compression_factor;
316 |         //     auto key = *(first + i);
317 |         //     double ls_pred = ls.predict(key);
318 |         //     double cs_pred = predict(key);
319 |         //     ls_error += std::abs(ls_pred - y);
320 |         //     cs_error += std::abs(cs_pred - y);
321 |         // }
322 | 
323 |         // if (ls_error < cs_error) {
324 |         //     a_ = 0;
325 |         //     b_ = 0;
326 |         //     c_ = ls.slope();
327 |         //     d_ = ls.intercept();
328 |         // }
329 |     }
330 | 
331 |     /**
332 |      * Returns the estimated y-value of @p x, evaluated with Horner's scheme.
333 |      * @param x to estimate a y-value for
334 |      * @return the estimated y-value for @p x
335 |      */
336 |     template<typename X>
337 |     double predict(const X x) const {
338 |         double x_ = static_cast<double>(x);
339 |         double v1 = std::fma(a_, x_, b_);
340 |         double v2 = std::fma(v1, x_, c_);
341 |         double v3 = std::fma(v2, x_, d_);
342 |         return v3;
343 |     }
344 | 
345 |     /** Returns the cubic coefficient.
346 |      * @return the cubic coefficient
347 |      */
348 |     double a() const { return a_; }
349 | 
350 |     /** Returns the quadratic coefficient.
351 |      * @return the quadratic coefficient
352 |      */
353 |     double b() const { return b_; }
354 | 
355 |     /** Returns the linear coefficient.
356 |      * @return the linear coefficient
357 |      */
358 |     double c() const { return c_; }
359 | 
360 |     /** Returns the y-intercept.
361 |      * @return the y-intercept
362 |      */
363 |     double d() const { return d_; }
364 | 
365 |     /**
366 |      * Returns the size of the cubic segment in bytes. Const so that it can be queried on const models.
367 |      * @return segment size in bytes.
368 |      */
369 |     std::size_t size_in_bytes() const { return 4 * sizeof(double); }
370 | 
371 |     /**
372 |      * Writes the mathematical representation of the cubic segment to an output stream.
373 |      * @param out output stream to write the cubic segment to
374 |      * @param m the cubic segment
375 |      * @returns the output stream
376 |      */
377 |     friend std::ostream & operator<<(std::ostream &out, const CubicSpline &m) {
378 |         return out << m.a() << " * x^3 + "
379 |                    << m.b() << " * x^2 + "
380 |                    << m.c() << " * x + " << m.d();
381 |     }
382 | };
383 | 
384 | 
385 | /**
386 |  * A radix model that projects a x-values to their most significant bits after eliminating the common prefix.
387 |  *
388 |  * We assume that x-values are sorted in ascending order and y-values are handed implicitly where @p offset and @p
389 |  * offset + distance(first, last) are the first and last y-value, respectively. The y-values can be scaled by
390 |  * providing a @p compression_factor.
391 |  *
392 |  * @tparam the type of x-values.
393 |  */
394 | template<typename X = uint64_t>
395 | class Radix
396 | {
397 |     using x_type = X;
398 | 
399 |     private:
400 |     x_type mask_; ///< The mask for parallel bits extract.
401 | 
402 |     public:
403 |     /*
404 |      * Default constructor.
405 |      */
406 |     Radix() = default;
407 | 
408 |     /**
409 |      * Builds a radix model on the given data points.
410 |      * @param first, last iterators to the first and last x-value the linear regression is fit on
411 |      * @param offset first y-value the linear regression is fit on
412 |      * @param compression_factor by which the y-values are scaled
413 |      */
414 |     template<typename RandomIt>
415 |     Radix(RandomIt first, RandomIt last, std::size_t offset = 0, double compression_factor = 1.f) {
416 |         std::size_t n = std::distance(first, last);
417 | 
418 |         if (n == 0) {
419 |             mask_ = 0;
420 |             return;
421 |         }
422 | 
423 |         auto prefix = common_prefix_width(*first, *(last - 1)); // compute common prefix length
424 | 
425 |         if (prefix == (sizeof(x_type) * 8)) {
426 |             mask_ = 42; // TODO: What should the mask be in this case?
427 |             return;
428 |         }
429 | 
430 |         // Determine radix width.
431 |         std::size_t max = static_cast<std::size_t>(offset + n - 1) * compression_factor;
432 |         bool is_mersenne = (max & (max + 1)) == 0; // check if max is 2^n-1
433 |         auto radix = is_mersenne ? bit_width<std::size_t>(max) : bit_width<std::size_t>(max) - 1;
434 | 
435 |         // Mask all bits but the radix
436 |         mask_ = (~(x_type)0 >> prefix) & (~(x_type)0 << ((sizeof(x_type) * 8) - radix - prefix)); //0xffff << prefix_
437 |     }
438 | 
439 |     /**
440 |      * Returns the estimated y-value of @p x, i.e., the bits of @p x selected by the mask, packed contiguously.
441 |      * @param x to estimate a y-value for
442 |      * @return the estimated y-value for @p x
443 |      */
444 |     // Key types wider than `unsigned long long` are rejected at compile time by the static_assert in the last branch.
445 |     double predict(const x_type x) const {
446 |         if constexpr(sizeof(x_type) <= sizeof(unsigned)) {
447 |             return _pext_u32(x, mask_);
448 |         } else if constexpr(sizeof(x_type) <= sizeof(unsigned long long)) {
449 |             return _pext_u64(x, mask_);
450 |         } else {
451 |             static_assert(sizeof(x_type) <= sizeof(unsigned long long), "unsupported width of integral type");
452 |         }
453 |     }
454 | 
455 |     /**
456 |      * Returns the mask used for parallel bits extraction.
457 |      * @return the mask in its stored width (a fixed `uint8_t` return type would keep only its lowest 8 bits)
458 |      */
459 |     auto mask() const { return mask_; }
460 | 
461 |     /**
462 |      * Returns the size of the radix model in bytes. Does not modify the model, hence callable on const models.
463 |      * @return radix model size in bytes, i.e., the size of the stored mask
464 |      */
465 |     std::size_t size_in_bytes() const { return sizeof(mask_); }
466 | 
467 |     /**
468 |      * Writes a human readable representation of the radix model to an output stream.
469 |      * @param out output stream to write the radix model to
470 |      * @param m the radix model whose mask is printed as a decimal number (not as a raw character)
471 |      * @returns the output stream
472 |      */
473 |     friend std::ostream & operator<<(std::ostream &out, const Radix &m) {
474 |         return out << "_pext(x, " << static_cast<unsigned long long>(m.mask_) << ")";
475 |     }
476 | };
477 | 
478 | } // namespace rmi
479 | 


--------------------------------------------------------------------------------
/include/rmi/rmi.hpp:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <algorithm>
  4 | #include <vector>
  5 | 
  6 | 
  7 | namespace rmi {
  8 | 
  9 | /**
 10 |  * Struct to hold the approximated position and the search range [lo, hi) returned by the index.
 11 |  */
 12 | struct Approx {
 13 |     std::size_t pos; ///< The estimated position of the key.
 14 |     std::size_t lo;  ///< The lower bound of the search range (first position to inspect).
 15 |     std::size_t hi;  ///< The upper bound of the search range (one past the last position; at most the number of keys).
 16 | };
 17 | 
 18 | /**
 19 |  * This is a reimplementation of a two-layer recursive model index (RMI) supporting a variety of (monotonic) models.
 20 |  * RMIs were invented by Kraska et al. (https://dl.acm.org/doi/epdf/10.1145/3183713.3196909).
 21 |  *
 22 |  * Note that this is the base class which does not provide error bounds. The index owns the array of layer2 models
 23 |  * exclusively; it can be moved but not copied.
 24 |  * @tparam Key the type of the keys to be indexed
 25 |  * @tparam Layer1 the type of the model used in layer1
 26 |  * @tparam Layer2 the type of the models used in layer2
 27 |  */
 28 | template<typename Key, typename Layer1, typename Layer2>
 29 | class Rmi
 30 | {
 31 |     using key_type = Key;
 32 |     using layer1_type = Layer1;
 33 |     using layer2_type = Layer2;
 34 | 
 35 |     protected:
 36 |     std::size_t n_keys_ = 0;      ///< The number of keys the index was built on.
 37 |     std::size_t layer2_size_ = 0; ///< The number of models in layer2.
 38 |     layer1_type l1_;              ///< The layer1 model.
 39 |     layer2_type *l2_ = nullptr;   ///< The owned array of layer2 models (null if no index was built).
 40 | 
 41 |     public:
 42 |     /** Default constructor. Creates an empty index that owns no layer2 models and is safe to destroy. */
 43 |     Rmi() = default;
 44 | 
 45 |     // The layer2 array has exactly one owner: copies are disabled (both would delete it), moves empty the source.
 46 |     Rmi(const Rmi &) = delete;
 47 |     Rmi & operator=(const Rmi &) = delete;
 48 |     Rmi(Rmi &&other) : Rmi() { *this = static_cast<Rmi &&>(other); }
 49 |     Rmi & operator=(Rmi &&other) {
 50 |         if (this == &other) return *this;
 51 |         delete[] l2_;
 52 |         n_keys_ = other.n_keys_; layer2_size_ = other.layer2_size_; l2_ = other.l2_;
 53 |         l1_ = static_cast<layer1_type &&>(other.l1_);
 54 |         other.n_keys_ = other.layer2_size_ = 0; other.l2_ = nullptr;
 55 |         return *this;
 56 |     }
 57 | 
 58 |     /**
 59 |      * Builds the index with @p layer2_size models in layer2 on the sorted @p keys.
 60 |      * @param keys vector of sorted keys to be indexed (must not be empty)
 61 |      * @param layer2_size the number of models in layer2 (must be greater than zero)
 62 |      */
 63 |     Rmi(const std::vector<key_type> &keys, const std::size_t layer2_size)
 64 |         : Rmi(keys.begin(), keys.end(), layer2_size) { }
 65 | 
 66 |     /**
 67 |      * Builds the index with @p layer2_size models in layer2 on the sorted keys in the range [first, last).
 68 |      * @param first, last iterators that define the (non-empty) range of sorted keys to be indexed
 69 |      * @param layer2_size the number of models in layer2 (must be greater than zero)
 70 |      */
 71 |     template<typename RandomIt>
 72 |     Rmi(RandomIt first, RandomIt last, const std::size_t layer2_size)
 73 |         : n_keys_(std::distance(first, last))
 74 |         , layer2_size_(layer2_size)
 75 |     {
 76 |         // Train layer1.
 77 |         l1_ = layer1_type(first, last, 0, static_cast<double>(layer2_size) / n_keys_); // train with compression
 78 | 
 79 |         // Train layer2.
 80 |         l2_ = new layer2_type[layer2_size];
 81 |         std::size_t segment_start = 0;
 82 |         std::size_t segment_id = 0;
 83 |         // Assign each key to its segment.
 84 |         for (std::size_t i = 0; i != n_keys_; ++i) {
 85 |             auto pos = first + i;
 86 |             std::size_t pred_segment_id = get_segment_id(*pos);
 87 |             // If a key is assigned to a new segment, all models must be trained up to the new segment.
 88 |             if (pred_segment_id > segment_id) {
 89 |                 new (&l2_[segment_id]) layer2_type(first + segment_start, pos, segment_start);
 90 |                 for (std::size_t j = segment_id + 1; j < pred_segment_id; ++j) {
 91 |                     new (&l2_[j]) layer2_type(pos - 1, pos, i - 1); // train other models on last key in previous segment
 92 |                 }
 93 |                 segment_id = pred_segment_id;
 94 |                 segment_start = i;
 95 |             }
 96 |         }
 97 |         // Train remaining models.
 98 |         new (&l2_[segment_id]) layer2_type(first + segment_start, last, segment_start);
 99 |         for (std::size_t j = segment_id + 1; j < layer2_size; ++j) {
100 |             new (&l2_[j]) layer2_type(last - 1, last, n_keys_ - 1); // train remaining models on last key
101 |         }
102 |     }
103 | 
104 |     /** Destructor. Releases the array of layer2 models (a no-op for an empty or moved-from index). */
105 |     ~Rmi() { delete[] l2_; }
106 | 
107 |     /**
108 |      * Returns the id of the segment @p key belongs to.
109 |      * @param key to get segment id for
110 |      * @return segment id of the given key, clamped to the valid range of layer2 models
111 |      */
112 |     std::size_t get_segment_id(const key_type key) const {
113 |         return std::clamp<double>(l1_.predict(key), 0, layer2_size_ - 1);
114 |     }
115 | 
116 |     /**
117 |      * Returns a position estimate and search bounds for a given key.
118 |      * @param key to search for
119 |      * @return position estimate and search bounds (the base class always reports the full key range)
120 |      */
121 |     Approx search(const key_type key) const {
122 |         auto segment_id = get_segment_id(key);
123 |         std::size_t pred = std::clamp<double>(l2_[segment_id].predict(key), 0, n_keys_ - 1);
124 |         return {pred, 0, n_keys_};
125 |     }
126 | 
127 |     /** Returns the number of keys the index was built on. */
128 |     std::size_t n_keys() const { return n_keys_; }
129 | 
130 |     /** Returns the number of models in layer2. */
131 |     std::size_t layer2_size() const { return layer2_size_; }
132 | 
133 |     /** Returns the size of the index in bytes: layer1, all layer2 models, and the two stored counters. */
134 |     std::size_t size_in_bytes() {
135 |         return l1_.size_in_bytes() + layer2_size_ * l2_[0].size_in_bytes() + sizeof(n_keys_) + sizeof(layer2_size_);
136 |     }
137 | };
138 | 
139 | 
140 | /**
141 |  * Recursive model index with global absolute bounds: one symmetric error bound shared by all layer2 models.
142 |  */
143 | template<typename Key, typename Layer1, typename Layer2>
144 | class RmiGAbs : public Rmi<Key, Layer1, Layer2>
145 | {
146 |     using base_type = Rmi<Key, Layer1, Layer2>;
147 |     using key_type = Key;
148 |     using layer1_type = Layer1;
149 |     using layer2_type = Layer2;
150 | 
151 |     protected:
152 |     std::size_t error_; ///< The largest absolute prediction error observed over all indexed keys.
153 | 
154 |     public:
155 |     /**
156 |      * Default constructor.
157 |      */
158 |     RmiGAbs() = default;
159 | 
160 |     /**
161 |      * Builds the index on the sorted @p keys using @p layer2_size models in layer2.
162 |      * @param keys sorted vector of the keys to index
163 |      * @param layer2_size number of layer2 models
164 |      */
165 |     RmiGAbs(const std::vector<key_type> &keys, const std::size_t layer2_size)
166 |         : RmiGAbs(keys.begin(), keys.end(), layer2_size) { }
167 | 
168 |     /**
169 |      * Builds the index on the sorted keys in [first, last) using @p layer2_size models in layer2.
170 |      * @param first, last iterators delimiting the sorted keys to index
171 |      * @param layer2_size number of layer2 models
172 |      */
173 |     template<typename RandomIt>
174 |     RmiGAbs(RandomIt first, RandomIt last, const std::size_t layer2_size) : base_type(first, last, layer2_size) {
175 |         // Compute the global absolute error bound: replay every key through the trained index and keep the largest
176 |         // absolute deviation of the predicted position from the true position.
177 |         std::size_t worst = 0;
178 |         std::size_t idx = 0;
179 |         for (RandomIt it = first; idx != base_type::n_keys_; ++it, ++idx) {
180 |             const key_type k = *it;
181 |             const std::size_t seg = base_type::get_segment_id(k);
182 |             const std::size_t est = std::clamp<double>(base_type::l2_[seg].predict(k), 0, base_type::n_keys_ - 1);
183 |             const std::size_t deviation = est > idx ? est - idx : idx - est; // over- or underestimation
184 |             if (deviation > worst) worst = deviation;
185 |         }
186 |         error_ = worst;
187 |     }
188 | 
189 |     /**
190 |      * Estimates the position of @p key and derives the search range from the global error bound.
191 |      * @param key to search for
192 |      * @return position estimate and search bounds
193 |      */
194 |     Approx search(const key_type key) const {
195 |         const std::size_t seg = base_type::get_segment_id(key);
196 |         const std::size_t est = std::clamp<double>(base_type::l2_[seg].predict(key), 0, base_type::n_keys_ - 1);
197 |         const std::size_t range_begin = est - std::min(est, error_); // saturates at position 0
198 |         const std::size_t range_end = std::min(est + error_ + 1, base_type::n_keys_);
199 |         return Approx{est, range_begin, range_end};
200 |     }
201 | 
202 |     /**
203 |      * Reports the memory footprint of the index, i.e., the base index plus the stored error bound.
204 |      * @return index size in bytes
205 |      */
206 |     std::size_t size_in_bytes() { return sizeof(error_) + base_type::size_in_bytes(); }
207 | };
208 | 
209 | 
210 | /**
211 |  * Recursive model index with global individual bounds: separate lower and upper error bounds shared by all models.
212 |  */
213 | template<typename Key, typename Layer1, typename Layer2>
214 | class RmiGInd : public Rmi<Key, Layer1, Layer2>
215 | {
216 |     using base_type = Rmi<Key, Layer1, Layer2>;
217 |     using key_type = Key;
218 |     using layer1_type = Layer1;
219 |     using layer2_type = Layer2;
220 | 
221 |     protected:
222 |     std::size_t error_lo_; ///< The largest overestimation, i.e., how far the search range extends below a prediction.
223 |     std::size_t error_hi_; ///< The largest underestimation, i.e., how far the search range extends above a prediction.
224 | 
225 |     public:
226 |     /**
227 |      * Default constructor.
228 |      */
229 |     RmiGInd() = default;
230 | 
231 |     /**
232 |      * Builds the index on the sorted @p keys using @p layer2_size models in layer2.
233 |      * @param keys sorted vector of the keys to index
234 |      * @param layer2_size number of layer2 models
235 |      */
236 |     RmiGInd(const std::vector<key_type> &keys, const std::size_t layer2_size)
237 |         : RmiGInd(keys.begin(), keys.end(), layer2_size) { }
238 | 
239 |     /**
240 |      * Builds the index on the sorted keys in [first, last) using @p layer2_size models in layer2.
241 |      * @param first, last iterators delimiting the sorted keys to index
242 |      * @param layer2_size number of layer2 models
243 |      */
244 |     template<typename RandomIt>
245 |     RmiGInd(RandomIt first, RandomIt last, const std::size_t layer2_size) : base_type(first, last, layer2_size) {
246 |         // Compute global individual error bounds: track over- and underestimation separately over all keys.
247 |         std::size_t over = 0;
248 |         std::size_t under = 0;
249 |         std::size_t idx = 0;
250 |         for (RandomIt it = first; idx != base_type::n_keys_; ++it, ++idx) {
251 |             const key_type k = *it;
252 |             const std::size_t seg = base_type::get_segment_id(k);
253 |             const std::size_t est = std::clamp<double>(base_type::l2_[seg].predict(k), 0, base_type::n_keys_ - 1);
254 |             if (est > idx) over = std::max(over, est - idx);
255 |             else under = std::max(under, idx - est);
256 |         }
257 |         error_lo_ = over;
258 |         error_hi_ = under;
259 |     }
260 | 
261 |     /**
262 |      * Estimates the position of @p key and derives the search range from the two global error bounds.
263 |      * @param key to search for
264 |      * @return position estimate and search bounds
265 |      */
266 |     Approx search(const key_type key) const {
267 |         const std::size_t seg = base_type::get_segment_id(key);
268 |         const std::size_t est = std::clamp<double>(base_type::l2_[seg].predict(key), 0, base_type::n_keys_ - 1);
269 |         const std::size_t range_begin = est - std::min(est, error_lo_); // saturates at position 0
270 |         const std::size_t range_end = std::min(est + error_hi_ + 1, base_type::n_keys_);
271 |         return Approx{est, range_begin, range_end};
272 |     }
273 | 
274 |     /**
275 |      * Reports the memory footprint of the index, i.e., the base index plus the two stored error bounds.
276 |      * @return index size in bytes
277 |      */
278 |     std::size_t size_in_bytes() { return sizeof(error_lo_) + sizeof(error_hi_) + base_type::size_in_bytes(); }
279 | };
280 | 
281 | 
282 | /**
283 |  * Recursive model index with local absolute bounds: one symmetric error bound per layer2 model.
284 |  */
285 | template<typename Key, typename Layer1, typename Layer2>
286 | class RmiLAbs : public Rmi<Key, Layer1, Layer2>
287 | {
288 |     using base_type = Rmi<Key, Layer1, Layer2>;
289 |     using key_type = Key;
290 |     using layer1_type = Layer1;
291 |     using layer2_type = Layer2;
292 | 
293 |     protected:
294 |     std::vector<std::size_t> errors_; ///< Per layer2 model: the largest absolute prediction error of its keys.
295 | 
296 |     public:
297 |     /**
298 |      * Default constructor.
299 |      */
300 |     RmiLAbs() = default;
301 | 
302 |     /**
303 |      * Builds the index on the sorted @p keys using @p layer2_size models in layer2.
304 |      * @param keys sorted vector of the keys to index
305 |      * @param layer2_size number of layer2 models
306 |      */
307 |     RmiLAbs(const std::vector<key_type> &keys, const std::size_t layer2_size)
308 |         : RmiLAbs(keys.begin(), keys.end(), layer2_size) { }
309 | 
310 |     /**
311 |      * Builds the index on the sorted keys in [first, last) using @p layer2_size models in layer2.
312 |      * @param first, last iterators delimiting the sorted keys to index
313 |      * @param layer2_size number of layer2 models
314 |      */
315 |     template<typename RandomIt>
316 |     RmiLAbs(RandomIt first, RandomIt last, const std::size_t layer2_size) : base_type(first, last, layer2_size) {
317 |         // Compute local absolute error bounds: every key raises the bound of the model it is routed to, if its
318 |         // absolute deviation exceeds the bound recorded so far. Models without keys keep a bound of zero.
319 |         errors_.assign(layer2_size, 0);
320 |         std::size_t idx = 0;
321 |         for (RandomIt it = first; idx != base_type::n_keys_; ++it, ++idx) {
322 |             const key_type k = *it;
323 |             const std::size_t seg = base_type::get_segment_id(k);
324 |             const std::size_t est = std::clamp<double>(base_type::l2_[seg].predict(k), 0, base_type::n_keys_ - 1);
325 |             const std::size_t deviation = est > idx ? est - idx : idx - est; // over- or underestimation
326 |             std::size_t &worst = errors_[seg];
327 |             if (deviation > worst) worst = deviation;
328 |         }
329 |     }
330 | 
331 |     /**
332 |      * Estimates the position of @p key and derives the search range from the error bound of the responsible model.
333 |      * @param key to search for
334 |      * @return position estimate and search bounds
335 |      */
336 |     Approx search(const key_type key) const {
337 |         const std::size_t seg = base_type::get_segment_id(key);
338 |         const std::size_t est = std::clamp<double>(base_type::l2_[seg].predict(key), 0, base_type::n_keys_ - 1);
339 |         const std::size_t bound = errors_[seg];
340 |         const std::size_t range_begin = est - std::min(est, bound); // saturates at position 0
341 |         const std::size_t range_end = std::min(est + bound + 1, base_type::n_keys_);
342 |         return Approx{est, range_begin, range_end};
343 |     }
344 | 
345 |     /**
346 |      * Reports the memory footprint of the index, i.e., the base index plus one error bound per layer2 model.
347 |      * @return index size in bytes
348 |      */
349 |     std::size_t size_in_bytes() { return sizeof(std::size_t) * errors_.size() + base_type::size_in_bytes(); }
350 | };
351 | 
352 | 
353 | /**
354 |  * Recursive model index with local individual bounds: separate lower and upper error bounds per layer2 model.
355 |  */
356 | template<typename Key, typename Layer1, typename Layer2>
357 | class RmiLInd : public Rmi<Key, Layer1, Layer2>
358 | {
359 |     using base_type = Rmi<Key, Layer1, Layer2>;
360 |     using key_type = Key;
361 |     using layer1_type = Layer1;
362 |     using layer2_type = Layer2;
363 | 
364 |     protected:
365 |     /**
366 |      * Pair of a lower and an upper error bound of a single layer2 model.
367 |      */
368 |     struct bounds {
369 |         std::size_t lo; ///< The largest overestimation (extends the search range below a prediction).
370 |         std::size_t hi; ///< The largest underestimation (extends the search range above a prediction).
371 | 
372 |         /**
373 |          * Default constructor. Starts with both bounds at zero.
374 |          */
375 |         bounds() : lo(0), hi(0) { }
376 |     };
377 | 
378 |     std::vector<bounds> errors_; ///< Per layer2 model: its lower and upper error bound.
379 | 
380 |     public:
381 |     /**
382 |      * Default constructor.
383 |      */
384 |     RmiLInd() = default;
385 | 
386 |     /**
387 |      * Builds the index on the sorted @p keys using @p layer2_size models in layer2.
388 |      * @param keys sorted vector of the keys to index
389 |      * @param layer2_size number of layer2 models
390 |      */
391 |     RmiLInd(const std::vector<key_type> &keys, const std::size_t layer2_size)
392 |         : RmiLInd(keys.begin(), keys.end(), layer2_size) { }
393 | 
394 |     /**
395 |      * Builds the index on the sorted keys in [first, last) using @p layer2_size models in layer2.
396 |      * @param first, last iterators delimiting the sorted keys to index
397 |      * @param layer2_size number of layer2 models
398 |      */
399 |     template<typename RandomIt>
400 |     RmiLInd(RandomIt first, RandomIt last, const std::size_t layer2_size) : base_type(first, last, layer2_size) {
401 |         // Compute local individual error bounds: every key widens one bound of the model it is routed to.
402 |         errors_.assign(layer2_size, bounds());
403 |         std::size_t idx = 0;
404 |         for (RandomIt it = first; idx != base_type::n_keys_; ++it, ++idx) {
405 |             const key_type k = *it;
406 |             const std::size_t seg = base_type::get_segment_id(k);
407 |             const std::size_t est = std::clamp<double>(base_type::l2_[seg].predict(k), 0, base_type::n_keys_ - 1);
408 |             bounds &seg_bounds = errors_[seg];
409 |             if (est > idx) { // overestimation
410 |                 seg_bounds.lo = std::max(seg_bounds.lo, est - idx);
411 |             } else { // underestimation
412 |                 seg_bounds.hi = std::max(seg_bounds.hi, idx - est);
413 |             }
414 |         }
415 |     }
416 | 
417 |     /**
418 |      * Estimates the position of @p key and derives the search range from the bounds of the responsible model.
419 |      * @param key to search for
420 |      * @return position estimate and search bounds
421 |      */
422 |     Approx search(const key_type key) const {
423 |         const std::size_t seg = base_type::get_segment_id(key);
424 |         const std::size_t est = std::clamp<double>(base_type::l2_[seg].predict(key), 0, base_type::n_keys_ - 1);
425 |         const bounds &seg_bounds = errors_[seg];
426 |         const std::size_t range_begin = est - std::min(est, seg_bounds.lo); // saturates at position 0
427 |         const std::size_t range_end = std::min(est + seg_bounds.hi + 1, base_type::n_keys_);
428 |         return Approx{est, range_begin, range_end};
429 |     }
430 | 
431 |     /**
432 |      * Reports the memory footprint of the index, i.e., the base index plus one pair of bounds per layer2 model.
433 |      * @return index size in bytes
434 |      */
435 |     std::size_t size_in_bytes() { return sizeof(bounds) * errors_.size() + base_type::size_in_bytes(); }
436 | };
437 | 
438 | } // namespace rmi
439 | 


--------------------------------------------------------------------------------
/experiments/rmi_guideline.cpp:
--------------------------------------------------------------------------------
  1 | #include <chrono>
  2 | #include <cmath>
  3 | #include <random>
  4 | 
  5 | #include "argparse/argparse.hpp"
  6 | 
  7 | #include "rmi/models.hpp"
  8 | #include "rmi/rmi.hpp"
  9 | #include "rmi/util/fn.hpp"
 10 | #include "rmi/util/search.hpp"
 11 | 
 12 | using key_type = uint64_t; ///< Key type of the datasets and samples used in this file.
 13 | using namespace std::chrono; // brings steady_clock, duration_cast, and nanoseconds into scope for the timing code
 14 | 
 15 | std::size_t s_glob; ///< Global that receives the lookup checksum of each repetition (presumably an optimization sink).
 16 | 
 17 | 
 18 | /**
 19 |  * Measures lookup times of @p samples on a given @p Rmi and writes results to `std::cout`.
 20 |  * @tparam Key key type (not referenced by the body, which uses the global `key_type` alias)
 21 |  * @tparam Rmi RMI type
 22 |  * @tparam Search search type
 23 |  * @param keys on which the RMI is built
 24 |  * @param n_models number of models in the second layer of the RMI
 25 |  * @param samples for which the lookup time is measured
 26 |  * @param n_reps number of repetitions
 27 |  * @param dataset_name name of the dataset
 28 |  * @param layer1, layer2 names of the model types of the first and the second layer
 29 |  * @param bounds name of the error bounds used by the RMI
 30 |  * @param search name of the search algorithm used by the RMI for correcting prediction errors
 31 |  * @param budget the budget under which the configuration was chosen
 32 |  * @param is_guideline flag reported with the results; the guideline evaluation passes `true`
 33 |  */
 34 | template<typename Key, typename Rmi, typename Search>
 35 | void experiment(const std::vector<key_type> &keys,
 36 |                 const std::size_t n_models,
 37 |                 const std::vector<key_type> &samples,
 38 |                 const std::size_t n_reps,
 39 |                 const std::string dataset_name,
 40 |                 const std::string layer1,
 41 |                 const std::string layer2,
 42 |                 const std::string bounds,
 43 |                 const std::string search,
 44 |                 const std::size_t budget,
 45 |                 const bool is_guideline)
 46 | {
 47 |     using rmi_type = Rmi;
 48 |     auto search_fn = Search();
 49 | 
 50 |     // Build RMI.
 51 |     rmi_type rmi(keys, n_models);
 52 | 
 53 |     // Skip model-biased linear search configurations that are guaranteed to not be the fastest.
 54 |     if (search == "model_biased_linear") {
 55 |         auto n_keys = keys.size();
 56 |         std::vector<std::size_t> errors;
 57 |         errors.reserve(n_keys);
 58 | 
 59 |         for (std::size_t i = 0; i != n_keys; ++i) {
 60 |             auto key = keys.at(i);
 61 |             auto pred = rmi.search(key).pos;
 62 |             auto err = pred > i ? pred - i : i - pred; // absolute prediction error of the i-th key
 63 |             errors.push_back(err);
 64 |         }
 65 | 
 66 |         auto mean_ae = mean(errors);
 67 |         if (mean_ae > 10) return; // no results are reported for a mean absolute error above 10 positions
 68 |     }
 69 | 
 70 |     // Perform n_reps runs.
 71 |     for (std::size_t rep = 0; rep != n_reps; ++rep) {
 72 | 
 73 |         // Lookup time.
 74 |         std::size_t lookup_accu = 0; // checksum: sum of all found positions
 75 |         auto start = steady_clock::now();
 76 |         for (std::size_t i = 0; i != samples.size(); ++i) {
 77 |             auto key = samples.at(i);
 78 |             auto range = rmi.search(key);
 79 |             auto pos = search_fn(keys.begin() + range.lo, keys.begin() + range.hi, keys.begin() + range.pos, key);
 80 |             lookup_accu += std::distance(keys.begin(), pos);
 81 |         }
 82 |         auto stop = steady_clock::now();
 83 |         auto lookup_time = duration_cast<nanoseconds>(stop - start).count(); // total time of all lookups in ns
 84 |         s_glob = lookup_accu; // publish the checksum (presumably keeps the lookups from being optimized away)
 85 | 
 86 |         // Report results as one comma-separated line per repetition.
 87 |                   // Dataset
 88 |         std::cout << dataset_name << ','
 89 |                   << keys.size() << ','
 90 |                   // Index
 91 |                   << layer1 << ','
 92 |                   << layer2 << ','
 93 |                   << n_models << ','
 94 |                   << bounds << ','
 95 |                   << search << ','
 96 |                   << rmi.size_in_bytes() << ','
 97 |                   // Experiment
 98 |                   << rep << ','
 99 |                   << samples.size() << ','
100 |                   << budget << ','
101 |                   << is_guideline << ','
102 |                   // Results
103 |                   << lookup_time << ','
104 |                   // Checksums
105 |                   << lookup_accu << std::endl;
106 |     } // reps
107 | }
108 | 
109 | 
110 | /**
111 |  * @brief Pointer to an instantiation of the `experiment` function template.
112 |  */
113 | using exp_fn_ptr = void (*)(const std::vector<key_type>& /* keys */,
114 |                             std::size_t                  /* n_models */,
115 |                             const std::vector<key_type>& /* samples */,
116 |                             std::size_t                  /* n_reps */,
117 |                             std::string                  /* dataset_name */,
118 |                             std::string                  /* layer1 */,
119 |                             std::string                  /* layer2 */,
120 |                             std::string                  /* bounds */,
121 |                             std::string                  /* search */,
122 |                             std::size_t                  /* budget */,
123 |                             bool                         /* is_guideline */);
124 | 
125 | 
126 | /**
127 |  * RMI configuration that holds the string representation of model types of layer 1 and layer 2, error bound type, and
128 |  * search algorithm. Used as the key of the map of experiment functions.
129 |  */
130 | struct Config {
131 |     std::string layer1; ///< Model type of layer 1, e.g., "linear_spline".
132 |     std::string layer2; ///< Model type of layer 2, e.g., "linear_regression".
133 |     std::string bounds; ///< Error bound type: "none", "labs", "lind", "gabs", or "gind".
134 |     std::string search; ///< Search algorithm, e.g., "binary" or "model_biased_exponential".
135 | };
136 | 
137 | 
138 | /**
139 |  * Strict weak ordering of @p Config objects: lexicographic over layer1, layer2, bounds, and search (in this order).
140 |  */
141 | struct ConfigCompare {
142 |     bool operator() (const Config &lhs, const Config &rhs) const {
143 |         if (const int order = lhs.layer1.compare(rhs.layer1)) return order < 0; // first differing field decides
144 |         if (const int order = lhs.layer2.compare(rhs.layer2)) return order < 0;
145 |         if (const int order = lhs.bounds.compare(rhs.bounds)) return order < 0;
146 |         return lhs.search.compare(rhs.search) < 0;
147 |     }
148 | };
149 | 
150 | 
151 | #define ENTRIES(L1, L2, LT1, LT2) /* map entries for all 5 bounds x 6 search variants of one layer1/layer2 pair */ \
152 |     { {#L1, #L2, "none", "binary"}, &experiment<key_type, rmi::Rmi<key_type, LT1, LT2>, BinarySearch> }, \
153 |     { {#L1, #L2, "labs", "binary"}, &experiment<key_type, rmi::RmiLAbs<key_type, LT1, LT2>, BinarySearch> }, \
154 |     { {#L1, #L2, "lind", "binary"}, &experiment<key_type, rmi::RmiLInd<key_type, LT1, LT2>, BinarySearch> }, \
155 |     { {#L1, #L2, "gabs", "binary"}, &experiment<key_type, rmi::RmiGAbs<key_type, LT1, LT2>, BinarySearch> }, \
156 |     { {#L1, #L2, "gind", "binary"}, &experiment<key_type, rmi::RmiGInd<key_type, LT1, LT2>, BinarySearch> }, \
157 |     { {#L1, #L2, "none", "model_biased_binary"}, &experiment<key_type, rmi::Rmi<key_type, LT1, LT2>, ModelBiasedBinarySearch> }, \
158 |     { {#L1, #L2, "labs", "model_biased_binary"}, &experiment<key_type, rmi::RmiLAbs<key_type, LT1, LT2>, ModelBiasedBinarySearch> }, \
159 |     { {#L1, #L2, "lind", "model_biased_binary"}, &experiment<key_type, rmi::RmiLInd<key_type, LT1, LT2>, ModelBiasedBinarySearch> }, \
160 |     { {#L1, #L2, "gabs", "model_biased_binary"}, &experiment<key_type, rmi::RmiGAbs<key_type, LT1, LT2>, ModelBiasedBinarySearch> }, \
161 |     { {#L1, #L2, "gind", "model_biased_binary"}, &experiment<key_type, rmi::RmiGInd<key_type, LT1, LT2>, ModelBiasedBinarySearch> }, \
162 |     { {#L1, #L2, "none", "linear"}, &experiment<key_type, rmi::Rmi<key_type, LT1, LT2>, LinearSearch> }, \
163 |     { {#L1, #L2, "labs", "linear"}, &experiment<key_type, rmi::RmiLAbs<key_type, LT1, LT2>, LinearSearch> }, \
164 |     { {#L1, #L2, "lind", "linear"}, &experiment<key_type, rmi::RmiLInd<key_type, LT1, LT2>, LinearSearch> }, \
165 |     { {#L1, #L2, "gabs", "linear"}, &experiment<key_type, rmi::RmiGAbs<key_type, LT1, LT2>, LinearSearch> }, \
166 |     { {#L1, #L2, "gind", "linear"}, &experiment<key_type, rmi::RmiGInd<key_type, LT1, LT2>, LinearSearch> }, \
167 |     { {#L1, #L2, "none", "model_biased_linear"}, &experiment<key_type, rmi::Rmi<key_type, LT1, LT2>, ModelBiasedLinearSearch> }, \
168 |     { {#L1, #L2, "labs", "model_biased_linear"}, &experiment<key_type, rmi::RmiLAbs<key_type, LT1, LT2>, ModelBiasedLinearSearch> }, \
169 |     { {#L1, #L2, "lind", "model_biased_linear"}, &experiment<key_type, rmi::RmiLInd<key_type, LT1, LT2>, ModelBiasedLinearSearch> }, \
170 |     { {#L1, #L2, "gabs", "model_biased_linear"}, &experiment<key_type, rmi::RmiGAbs<key_type, LT1, LT2>, ModelBiasedLinearSearch> }, \
171 |     { {#L1, #L2, "gind", "model_biased_linear"}, &experiment<key_type, rmi::RmiGInd<key_type, LT1, LT2>, ModelBiasedLinearSearch> }, \
172 |     { {#L1, #L2, "none", "exponential"}, &experiment<key_type, rmi::Rmi<key_type, LT1, LT2>, ExponentialSearch> }, \
173 |     { {#L1, #L2, "labs", "exponential"}, &experiment<key_type, rmi::RmiLAbs<key_type, LT1, LT2>, ExponentialSearch> }, \
174 |     { {#L1, #L2, "lind", "exponential"}, &experiment<key_type, rmi::RmiLInd<key_type, LT1, LT2>, ExponentialSearch> }, \
175 |     { {#L1, #L2, "gabs", "exponential"}, &experiment<key_type, rmi::RmiGAbs<key_type, LT1, LT2>, ExponentialSearch> }, \
176 |     { {#L1, #L2, "gind", "exponential"}, &experiment<key_type, rmi::RmiGInd<key_type, LT1, LT2>, ExponentialSearch> }, \
177 |     { {#L1, #L2, "none", "model_biased_exponential"}, &experiment<key_type, rmi::Rmi<key_type, LT1, LT2>, ModelBiasedExponentialSearch> }, \
178 |     { {#L1, #L2, "labs", "model_biased_exponential"}, &experiment<key_type, rmi::RmiLAbs<key_type, LT1, LT2>, ModelBiasedExponentialSearch> }, \
179 |     { {#L1, #L2, "lind", "model_biased_exponential"}, &experiment<key_type, rmi::RmiLInd<key_type, LT1, LT2>, ModelBiasedExponentialSearch> }, \
180 |     { {#L1, #L2, "gabs", "model_biased_exponential"}, &experiment<key_type, rmi::RmiGAbs<key_type, LT1, LT2>, ModelBiasedExponentialSearch> }, \
181 |     { {#L1, #L2, "gind", "model_biased_exponential"}, &experiment<key_type, rmi::RmiGInd<key_type, LT1, LT2>, ModelBiasedExponentialSearch> }, \
182 | 
183 | static std::map<Config, exp_fn_ptr, ConfigCompare> exp_map {
184 |     ENTRIES(linear_regression, linear_regression, rmi::LinearRegression, rmi::LinearRegression)
185 |     ENTRIES(linear_regression, linear_spline,     rmi::LinearRegression, rmi::LinearSpline)
186 |     ENTRIES(linear_spline,     linear_regression, rmi::LinearSpline,     rmi::LinearRegression)
187 |     ENTRIES(linear_spline,     linear_spline,     rmi::LinearSpline,     rmi::LinearSpline)
188 |     ENTRIES(cubic_spline,      linear_regression, rmi::CubicSpline,      rmi::LinearRegression)
189 |     ENTRIES(cubic_spline,      linear_spline,     rmi::CubicSpline,      rmi::LinearSpline)
190 |     ENTRIES(radix,             linear_regression, rmi::Radix<key_type>,  rmi::LinearRegression)
191 |     ENTRIES(radix,             linear_spline,     rmi::Radix<key_type>,  rmi::LinearSpline)
192 | }; ///< Map that assigns to each RMI configuration the matching instantiation of the experiment function.
193 | #undef ENTRIES // the helper macro is only needed to fill the map above
194 | 
195 | 
196 | /**
197 |  * Computes the recommended RMI configuration by following a simple guideline and evaluates its performance.
198 |  * @param keys on which the RMI is built
199 |  * @param samples for which the lookup time is measured
200 |  * @param n_reps number of repetitions
201 |  * @param dataset_name name of the dataset
202 |  * @param budget the budget in bytes under which the configuration is to be chosen (must fit one layer2 model)
203 |  */
204 | void evaluate_guideline(const std::vector<key_type> &keys,
205 |                         const std::vector<key_type> &samples,
206 |                         const std::size_t n_reps,
207 |                         const std::string dataset_name,
208 |                         const std::size_t budget)
209 | {
210 |     // Number of layer2 models of a given size that fit into the budget; 0 instead of an unsigned wrap-around.
211 |     const std::size_t fixed_size = 2 * sizeof(double) + 2 * sizeof(std::size_t);
212 |     const auto max_models = [&](const std::size_t model_size) -> std::size_t {
213 |         return budget > fixed_size ? (budget - fixed_size) / model_size : 0;
214 |     };
215 |     const auto report_budget_too_small = [&] {
216 |         std::cerr << "budget of " << budget << " bytes is too small for the guideline configuration" << std::endl;
217 |     };
218 | 
219 |     // Determine maximum number of layer 2 models for LS->LR NB+MExp.
220 |     auto n_models = max_models(2 * sizeof(double));
221 |     if (n_models == 0) return report_budget_too_small(); // an RMI needs at least one layer2 model
222 | 
223 |     // Train RMI.
224 |     rmi::Rmi<key_type, rmi::LinearSpline, rmi::LinearRegression> rmi(keys, n_models);
225 | 
226 |     // Evaluate RMI error.
227 |     auto n_keys = keys.size();
228 |     std::vector<double> log2_errors;
229 |     log2_errors.reserve(n_keys);
230 | 
231 |     for (std::size_t i = 0; i != n_keys; ++i) {
232 |         auto key = keys.at(i);
233 |         auto pred = rmi.search(key).pos;
234 |         auto err = pred > i ? pred - i : i - pred;
235 |         log2_errors.push_back(std::log2(err+1));
236 |     }
237 | 
238 |     auto mean_log2e = mean(log2_errors);
239 | 
240 |     // Pick guideline config based on errors: no bounds + model-biased exponential search unless the error is large.
241 |     auto l1 = "linear_spline";
242 |     auto l2 = "linear_regression";
243 |     auto threshold = 5.8; // This is hardware-dependent.
244 |     auto bounds = "none";
245 |     auto search = "model_biased_exponential";
246 |     if (!(mean_log2e < threshold)) {
247 |         bounds = "labs";
248 |         search = "binary";
249 |         n_models = max_models(2 * sizeof(double) + sizeof(std::size_t)); // each model also stores an error bound
250 |         if (n_models == 0) return report_budget_too_small();
251 |     }
252 |     exp_fn_ptr exp_fn = exp_map.at(Config{l1, l2, bounds, search}); // at() never inserts a null entry
253 |     (*exp_fn)(keys, n_models, samples, n_reps, dataset_name, l1, l2, bounds, search, budget, true);
254 | }
255 | 
256 | 
257 | /**
258 |  * Tests RMI configurations for a given size budget and compares them against the configuration chosen by the
259 |  * guideline in terms of lookup time. Diagnostics are written to stderr, the optional csv header to stdout.
260 |  * @param argc arguments counter
261 |  * @param argv arguments vector
262 |  * @return EXIT_SUCCESS on completion, EXIT_FAILURE if the arguments are invalid or no keys were loaded
263 |  */
264 | int main(int argc, char *argv[])
265 | {
266 |     // Initialize argument parser.
267 |     argparse::ArgumentParser program(argv[0], "0.1");
268 | 
269 |     // Define arguments. Numeric actions return std::size_t explicitly, as that is the type read back below.
270 |     program.add_argument("filename")
271 |         .help("path to binary file containing uint64_t keys");
272 | 
273 |     program.add_argument("budget")
274 |         .help("target size in bytes for the RMI configurations to test")
275 |         .action([](const std::string &s) -> std::size_t { return std::stoull(s); });
276 | 
277 |     program.add_argument("-n", "--n_reps")
278 |         .help("number of experiment repetitions")
279 |         .default_value(std::size_t(3))
280 |         .action([](const std::string &s) -> std::size_t { return std::stoull(s); });
281 | 
282 |     program.add_argument("-s", "--n_samples")
283 |         .help("number of sampled lookup keys")
284 |         .default_value(std::size_t(1'000'000))
285 |         .action([](const std::string &s) -> std::size_t { return std::stoull(s); });
286 | 
287 |     program.add_argument("--header")
288 |         .help("output csv header")
289 |         .default_value(false)
290 |         .implicit_value(true);
291 | 
292 |     // Parse arguments. std::stoull reports malformed numbers via std::invalid_argument or std::out_of_range, which
293 |     // do not derive from std::runtime_error, hence the handler for std::exception.
294 |     try {
295 |         program.parse_args(argc, argv);
296 |     }
297 |     catch (const std::exception &err) {
298 |         std::cerr << err.what() << '\n' << program;
299 |         return EXIT_FAILURE;
300 |     }
301 | 
302 |     // Read arguments.
303 |     const auto filename = program.get<std::string>("filename");
304 |     const auto dataset_name = split(filename, '/').back();
305 |     const auto budget = program.get<std::size_t>("budget");
306 |     const auto n_reps = program.get<std::size_t>("-n");
307 |     const auto n_samples = program.get<std::size_t>("-s");
308 | 
309 |     // Load keys. Sampling below requires at least one key.
310 |     auto keys = load_data<key_type>(filename);
311 |     if (keys.empty()) {
312 |         std::cerr << "no keys loaded from " << filename << '\n';
313 |         return EXIT_FAILURE;
314 |     }
315 | 
316 |     // Sample keys with a fixed seed. The distribution is defined over std::size_t so that the largest index is
317 |     // not narrowed to int.
318 |     uint64_t seed = 42;
319 |     std::mt19937 gen(seed);
320 |     std::uniform_int_distribution<std::size_t> distrib(0, keys.size() - 1);
321 |     std::vector<key_type> samples;
322 |     samples.reserve(n_samples);
323 |     for (std::size_t i = 0; i != n_samples; ++i)
324 |         samples.push_back(keys[distrib(gen)]);
325 | 
326 |     // List configuration parameters.
327 |     const std::vector<std::string> l1_models = {"linear_spline", "cubic_spline", "linear_regression", "radix"};
328 |     const std::vector<std::string> l2_models = {"linear_regression"}; // We know that lr is always better than ls from previous experiments.
329 |     const std::vector<std::pair<std::string, std::string>> err_corrs = {
330 |         std::make_pair("none", "model_biased_exponential"),
331 |         std::make_pair("none", "model_biased_linear"),
332 |         std::make_pair("labs", "binary"),
333 |         std::make_pair("lind", "model_biased_binary"),
334 |     };
335 | 
336 |     // List model and bound sizes.
337 |     const std::map<std::string, std::size_t> model_size = {
338 |         { "linear_spline", 2 * sizeof(double) },
339 |         { "cubic_spline", 4 * sizeof(double) },
340 |         { "linear_regression", 2 * sizeof(double) },
341 |         { "radix", 1 * sizeof(key_type) },
342 |     };
343 |     const std::map<std::string, std::size_t> bounds_size = {
344 |         { "none", 0 },
345 |         { "labs", sizeof(std::size_t) },
346 |         { "lind", 2 * sizeof(std::size_t) },
347 |     };
348 | 
349 |     // Output header.
350 |     if (program["--header"] == true)
351 |         std::cout << "dataset,n_keys,layer1,layer2,n_models,"
352 |                   << "bounds,search,size_in_bytes,rep,n_samples,"
353 |                   << "budget_in_bytes,is_guideline,lookup_time,lookup_accu"
354 |                   << std::endl;
355 | 
356 |     // Enumerate and evaluate configurations.
357 |     for (const auto &l1 : l1_models) {
358 |         for (const auto &l2 : l2_models) {
359 |             for (const auto &corr : err_corrs) {
360 |                 const auto &bounds = corr.first;
361 |                 const auto &search = corr.second;
362 | 
363 |                 // Skip configurations that fit no layer 2 model (the subtraction below would wrap around) or
364 |                 // that have no registered evaluation function; find() does not insert a null entry.
365 |                 const auto fixed_size = model_size.at(l1) + 2 * sizeof(std::size_t);
366 |                 const auto per_model_size = model_size.at(l2) + bounds_size.at(bounds);
367 |                 const auto entry = exp_map.find(Config {l1, l2, bounds, search});
368 |                 if (budget < fixed_size + per_model_size || entry == exp_map.end()) {
369 |                     std::cerr << "skipping " << l1 << ',' << l2 << ',' << bounds << ',' << search << '\n';
370 |                     continue;
371 |                 }
372 | 
373 |                 // Determine maximum number of layer 2 models.
374 |                 auto n_models = (budget - fixed_size) / per_model_size;
375 | 
376 |                 // Call evaluation function with keys and n_models.
377 |                 (*entry->second)(keys, n_models, samples, n_reps, dataset_name, l1, l2, bounds, search, budget, false);
378 |             }
379 |         }
380 |     }
381 | 
382 |     // Evaluate guideline configuration.
383 |     evaluate_guideline(keys, samples, n_reps, dataset_name, budget);
384 | 
385 |     return EXIT_SUCCESS;
386 | }
387 | 


--------------------------------------------------------------------------------