├── requirements.txt ├── .gitignore ├── scripts ├── run_all.sh ├── plot_all.sh ├── plot_paper.sh ├── run_rmi_segmentation.sh ├── run_rmi_guideline.sh ├── rmi_ref │ ├── prepare_rmi_ref.sh │ ├── fb_200M_uint64.json │ ├── books_200M_uint64.json │ ├── osm_cellids_200M_uint64.json │ └── wiki_ts_200M_uint64.json ├── run_rmi_errors.sh ├── run_index_comparison.sh ├── run_rmi_intervals.sh ├── run_rmi_lookup.sh ├── download_data.sh ├── run_rmi_build.sh ├── plot_rmi_guideline.py ├── plot_rmi_segmentation.py ├── plot_rmi_errors.py ├── plot_rmi_intervals.py ├── plot_rmi_lookup.py ├── plot_index_comparison.py └── plot_rmi_build.py ├── .gitmodules ├── example.cpp ├── CMakeLists.txt ├── experiments ├── CMakeLists.txt ├── rmi_segmentation.cpp ├── rmi_errors.cpp ├── rmi_intervals.cpp ├── rmi_build.cpp ├── rmi_lookup.cpp └── rmi_guideline.cpp ├── README.md ├── include └── rmi │ ├── util │ ├── fn.hpp │ └── search.hpp │ ├── models.hpp │ └── rmi.hpp └── LICENSE /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib==3.4.2 2 | numpy>=1.22 3 | pandas==1.2.4 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | 34 | # Reference optimizer remnants 35 | *.json_results 36 | 37 | # Plots 38 | *.pdf 39 | 40 | # Directories 41 | build/ 42 | data/ 43 | doxy/ 44 | -------------------------------------------------------------------------------- /scripts/run_all.sh: 
-------------------------------------------------------------------------------- 1 | #!bash 2 | # set -x 3 | trap "exit" SIGINT 4 | 5 | echo "Running RMI Segmentation (Section 5.1)..." 6 | source scripts/run_rmi_segmentation.sh 7 | 8 | echo "Running RMI Errors (Section 5.2)..." 9 | source scripts/run_rmi_errors.sh 10 | 11 | echo "Running RMI Intervals (Section 5.3)..." 12 | source scripts/run_rmi_intervals.sh 13 | 14 | echo "Running RMI Lookup (Section 6)..." 15 | source scripts/run_rmi_lookup.sh 16 | 17 | echo "Running RMI Build (Section 7)..." 18 | source scripts/run_rmi_build.sh 19 | 20 | echo "Running RMI Guideline (Section 8)..." 21 | source scripts/run_rmi_guideline.sh 22 | 23 | echo "Running Index Comparison (Section 9)..." 24 | source scripts/run_index_comparison.sh 25 | -------------------------------------------------------------------------------- /scripts/plot_all.sh: -------------------------------------------------------------------------------- 1 | #!bash 2 | # set -x 3 | trap "exit" SIGINT 4 | 5 | echo "Plotting RMI Segmentation (Section 5.1)..." 6 | python3 scripts/plot_rmi_segmentation.py 7 | 8 | echo "Plotting RMI Errors (Section 5.2)..." 9 | python3 scripts/plot_rmi_errors.py 10 | 11 | echo "Plotting RMI Intervals (Section 5.3)..." 12 | python3 scripts/plot_rmi_intervals.py 13 | 14 | echo "Plotting RMI Lookup (Section 6)..." 15 | python3 scripts/plot_rmi_lookup.py 16 | 17 | echo "Plotting RMI Build (Section 7)..." 18 | python3 scripts/plot_rmi_build.py 19 | 20 | echo "Plotting RMI Guideline (Section 8)..." 21 | python3 scripts/plot_rmi_guideline.py 22 | 23 | echo "Plotting Index Comparison (Section 9)..." 24 | python3 scripts/plot_index_comparison.py 25 | -------------------------------------------------------------------------------- /scripts/plot_paper.sh: -------------------------------------------------------------------------------- 1 | #!bash 2 | # set -x 3 | trap "exit" SIGINT 4 | 5 | echo "Plotting RMI Segmentation (Section 5.1)..." 
6 | python3 scripts/plot_rmi_segmentation.py --paper 7 | 8 | echo "Plotting RMI Errors (Section 5.2)..." 9 | python3 scripts/plot_rmi_errors.py --paper 10 | 11 | echo "Plotting RMI Intervals (Section 5.3)..." 12 | python3 scripts/plot_rmi_intervals.py --paper 13 | 14 | echo "Plotting RMI Lookup (Section 6)..." 15 | python3 scripts/plot_rmi_lookup.py --paper 16 | 17 | echo "Plotting RMI Build (Section 7)..." 18 | python3 scripts/plot_rmi_build.py --paper 19 | 20 | echo "Plotting RMI Guideline (Section 8)..." 21 | python3 scripts/plot_rmi_guideline.py --paper 22 | 23 | echo "Plotting Index Comparison (Section 9)..." 24 | python3 scripts/plot_index_comparison.py --paper 25 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/CHT"] 2 | path = third_party/CHT 3 | url = https://github.com/stoianmihail/CHT 4 | [submodule "third_party/ALEX"] 5 | path = third_party/ALEX 6 | url = https://github.com/microsoft/ALEX 7 | [submodule "third_party/RadixSpline"] 8 | path = third_party/RadixSpline 9 | url = https://github.com/learnedsystems/RadixSpline.git 10 | [submodule "third_party/PGM-index"] 11 | path = third_party/PGM-index 12 | url = https://github.com/gvinciguerra/PGM-index.git 13 | [submodule "third_party/tlx"] 14 | path = third_party/tlx 15 | url = https://github.com/tlx/tlx 16 | [submodule "third_party/argparse"] 17 | path = third_party/argparse 18 | url = https://github.com/p-ranav/argparse.git 19 | [submodule "third_party/RMI"] 20 | path = third_party/RMI 21 | url = https://github.com/learnedsystems/RMI 22 | ignore = dirty 23 | -------------------------------------------------------------------------------- /scripts/run_rmi_segmentation.sh: -------------------------------------------------------------------------------- 1 | #!bash 2 | # set -x 3 | trap "exit" SIGINT 4 | 5 | EXPERIMENT="rmi segmentation" 6 | 7 | 
DIR_DATA="data" 8 | DIR_RESULTS="results" 9 | FILE_RESULTS="${DIR_RESULTS}/rmi_segmentation.csv" 10 | 11 | BIN="build/bin/rmi_segmentation" 12 | 13 | run() { 14 | DATASET=$1 15 | MODEL=$2 16 | N_SEGMENTS=$3 17 | DATA_FILE="${DIR_DATA}/${DATASET}" 18 | ${BIN} ${DATA_FILE} ${MODEL} ${N_SEGMENTS} >> ${FILE_RESULTS} 19 | } 20 | 21 | # Create results directory 22 | if [ ! -d "${DIR_RESULTS}" ]; 23 | then 24 | mkdir -p "${DIR_RESULTS}"; 25 | fi 26 | 27 | # Check data downloaded 28 | if [ ! -d "${DIR_DATA}" ]; 29 | then 30 | >&2 echo "Please download datasets first." 31 | return 1 32 | fi 33 | 34 | DATASETS="books_200M_uint64 fb_200M_uint64 osm_cellids_200M_uint64 wiki_ts_200M_uint64" 35 | MODELS="linear_spline cubic_spline linear_regression radix" 36 | 37 | # Run experiments 38 | echo "dataset,n_keys,model,n_segments,mean,stdev,median,min,max,n_empty" > ${FILE_RESULTS} # Write csv header 39 | for dataset in ${DATASETS}; 40 | do 41 | echo "Performing ${EXPERIMENT} on '${dataset}'..." 42 | for model in ${MODELS}; 43 | do 44 | for ((i=6; i<=25; i += 1)); 45 | do 46 | n_segments=$((2**$i)) 47 | run ${dataset} ${model} ${n_segments} 48 | done 49 | done 50 | done 51 | 52 | -------------------------------------------------------------------------------- /scripts/run_rmi_guideline.sh: -------------------------------------------------------------------------------- 1 | #!bash 2 | # set -x 3 | trap "exit" SIGINT 4 | 5 | EXPERIMENT="rmi guideline" 6 | 7 | DIR_DATA="data" 8 | DIR_RESULTS="results" 9 | FILE_RESULTS="${DIR_RESULTS}/rmi_guideline.csv" 10 | 11 | BIN="build/bin/rmi_guideline" 12 | 13 | # Set number of repetitions and samples 14 | N_REPS="3" 15 | N_SAMPLES="20000000" 16 | PARAMS="--n_reps ${N_REPS} --n_samples ${N_SAMPLES}" 17 | 18 | run() { 19 | DATASET=$1 20 | BUDGET=$2 21 | DATA_FILE="${DIR_DATA}/${DATASET}" 22 | ${BIN} ${DATA_FILE} ${BUDGET} ${PARAMS} >> ${FILE_RESULTS} 23 | } 24 | 25 | # Create results directory 26 | if [ ! 
-d "${DIR_RESULTS}" ]; 27 | then 28 | mkdir -p "${DIR_RESULTS}"; 29 | fi 30 | 31 | # Check data downloaded 32 | if [ ! -d "${DIR_DATA}" ]; 33 | then 34 | >&2 echo "Please download datasets first." 35 | return 1 36 | fi 37 | 38 | DATASETS="books_200M_uint64 osm_cellids_200M_uint64 wiki_ts_200M_uint64" 39 | 40 | # Run experiments 41 | echo "dataset,n_keys,layer1,layer2,n_models,bounds,search,size_in_bytes,rep,n_samples,budget_in_bytes,is_guideline,lookup_time,lookup_accu" > ${FILE_RESULTS} # Write csv header 42 | for dataset in ${DATASETS}; 43 | do 44 | echo "Performing ${EXPERIMENT} on '${dataset}'..." 45 | for ((i=1; i<=20; i += 1)); 46 | do 47 | budget=$((2**$i * 1024)) 48 | run ${dataset} ${budget} 49 | done 50 | done 51 | -------------------------------------------------------------------------------- /example.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "rmi/models.hpp" 6 | #include "rmi/rmi.hpp" 7 | 8 | 9 | int main() 10 | { 11 | // Initialize random number generator. 12 | using key_type = uint64_t; 13 | std::mt19937 gen(42); 14 | std::uniform_int_distribution key_distrib(0, 1UL << 48); 15 | auto rand = [&gen, &key_distrib] { return key_distrib(gen); }; 16 | 17 | // Create 10M random keys. 18 | std::size_t n_keys = 1e7; 19 | std::vector keys(n_keys); 20 | std::generate(keys.begin(), keys.end(), rand); 21 | std::sort(keys.begin(), keys.end()); 22 | 23 | // Build a two-layer RMI. 24 | using layer1_type = rmi::LinearSpline; 25 | using layer2_type = rmi::LinearRegression; 26 | std::size_t layer2_size = 2UL << 16; 27 | rmi::RmiLAbs rmi(keys, layer2_size); 28 | 29 | // Pick a key. 30 | std::uniform_int_distribution uniform_distrib(0, n_keys - 1); 31 | key_type key = keys[uniform_distrib(gen)]; 32 | 33 | // Perform a lookup.
34 | auto range = rmi.search(key); 35 | auto pos = std::lower_bound(keys.begin() + range.lo, keys.begin() + range.hi, key); 36 | std::cout << "Key " << key << " is located at position " 37 | << std::distance(keys.begin(), pos) << '.' << std::endl; 38 | 39 | return 0; 40 | } 41 | -------------------------------------------------------------------------------- /scripts/rmi_ref/prepare_rmi_ref.sh: -------------------------------------------------------------------------------- 1 | #!bash 2 | # set -x 3 | trap "exit" SIGINT 4 | 5 | DIR_DATA="data" 6 | RMI_PATH="third_party/RMI" 7 | CONFIG_PATH="scripts/rmi_ref" 8 | 9 | DATASETS="books_200M_uint64 fb_200M_uint64 osm_cellids_200M_uint64 wiki_ts_200M_uint64" 10 | 11 | gen_config_json() { 12 | DATASET=$1 13 | CWD=$(pwd) 14 | DATA_FILE="${CWD}/${DIR_DATA}/${DATASET}" 15 | CONFIG_FILE="${CWD}/${CONFIG_PATH}/${DATASET}.json" 16 | MANIFEST_FILE="${CWD}/${RMI_PATH}/Cargo.toml" 17 | 18 | echo "Generating reference RMI config json for ${DATASET}..." 19 | cargo run --manifest-path "${MANIFEST_FILE}" --release -- ${DATA_FILE} --optimize "${CONFIG_FILE}" 20 | } 21 | 22 | train_rmi () { 23 | DATASET=$1 24 | CWD=$(pwd) 25 | DATA_FILE="${CWD}/${DIR_DATA}/${DATASET}" 26 | CONFIG_FILE="${CWD}/${CONFIG_PATH}/${DATASET}.json" 27 | MANIFEST_FILE="${CWD}/${RMI_PATH}/Cargo.toml" 28 | 29 | # Create include dir 30 | INCLUDE_PATH="${CWD}/${RMI_PATH}/include/rmi_ref" 31 | mkdir -p "${INCLUDE_PATH}" 32 | cd "${INCLUDE_PATH}" 33 | 34 | echo "Training reference RMIs on ${DATASET}..." 
35 | cargo run --manifest-path "${MANIFEST_FILE}" --release -- ${DATA_FILE} --param-grid "${CONFIG_FILE}" --disable-parallel-training 36 | 37 | cd ${CWD} 38 | } 39 | 40 | for dataset in ${DATASETS}; 41 | do 42 | # Generate RMI configurations 43 | # gen_config_json "$dataset" # configs are pre-generated 44 | 45 | # Train RMIs 46 | train_rmi "$dataset" 47 | done 48 | -------------------------------------------------------------------------------- /scripts/run_rmi_errors.sh: -------------------------------------------------------------------------------- 1 | #!bash 2 | # set -x 3 | trap "exit" SIGINT 4 | 5 | EXPERIMENT="rmi errors" 6 | 7 | DIR_DATA="data" 8 | DIR_RESULTS="results" 9 | FILE_RESULTS="${DIR_RESULTS}/rmi_errors.csv" 10 | 11 | BIN="build/bin/rmi_errors" 12 | 13 | run() { 14 | DATASET=$1 15 | LAYER1=$2 16 | LAYER2=$3 17 | N_MODELS=$4 18 | DATA_FILE="${DIR_DATA}/${DATASET}" 19 | ${BIN} ${DATA_FILE} ${LAYER1} ${LAYER2} ${N_MODELS} >> ${FILE_RESULTS} 20 | } 21 | 22 | # Create results directory 23 | if [ ! -d "${DIR_RESULTS}" ]; 24 | then 25 | mkdir -p "${DIR_RESULTS}"; 26 | fi 27 | 28 | # Check data downloaded 29 | if [ ! -d "${DIR_DATA}" ]; 30 | then 31 | >&2 echo "Please download datasets first." 32 | return 1 33 | fi 34 | 35 | DATASETS="books_200M_uint64 fb_200M_uint64 osm_cellids_200M_uint64 wiki_ts_200M_uint64" 36 | LAYER1_MODELS="linear_spline cubic_spline linear_regression radix" 37 | LAYER2_MODELS="linear_spline linear_regression" 38 | 39 | # Run experiments 40 | echo "dataset,n_keys,layer1,layer2,n_models,mean_ae,median_ae,stdev_ae,min_ae,max_ae" > ${FILE_RESULTS} # Write csv header 41 | for dataset in ${DATASETS}; 42 | do 43 | echo "Performing ${EXPERIMENT} on '${dataset}'..." 
44 | for ((i=6; i<=25; i += 1)); 45 | do 46 | n_models=$((2**$i)) 47 | for l1 in ${LAYER1_MODELS}; 48 | do 49 | for l2 in ${LAYER2_MODELS}; 50 | do 51 | run ${dataset} ${l1} ${l2} ${n_models} 52 | done 53 | done 54 | done 55 | done 56 | 57 | -------------------------------------------------------------------------------- /scripts/rmi_ref/fb_200M_uint64.json: -------------------------------------------------------------------------------- 1 | {"configs":[{"layers":"robust_linear,linear","branching factor":16777216,"namespace":"fb_200M_uint64_0","size":402653200,"average log2 error":5.046965439910379,"binary":true},{"layers":"robust_linear,linear","branching factor":8388608,"namespace":"fb_200M_uint64_1","size":201326608,"average log2 error":5.63038432124456,"binary":true},{"layers":"robust_linear,linear","branching factor":4194304,"namespace":"fb_200M_uint64_2","size":100663312,"average log2 error":6.264421574769353,"binary":true},{"layers":"robust_linear,linear","branching factor":1048576,"namespace":"fb_200M_uint64_3","size":25165840,"average log2 error":7.61933264490036,"binary":true},{"layers":"robust_linear,linear","branching factor":524288,"namespace":"fb_200M_uint64_4","size":12582928,"average log2 error":8.309666117218308,"binary":true},{"layers":"robust_linear,linear","branching factor":262144,"namespace":"fb_200M_uint64_5","size":6291472,"average log2 error":8.993034097192816,"binary":true},{"layers":"robust_linear,linear","branching factor":131072,"namespace":"fb_200M_uint64_6","size":3145744,"average log2 error":9.664330303064656,"binary":true},{"layers":"robust_linear,linear","branching factor":32768,"namespace":"fb_200M_uint64_7","size":786448,"average log2 error":10.905758730947948,"binary":true},{"layers":"robust_linear,linear","branching factor":1024,"namespace":"fb_200M_uint64_8","size":24592,"average log2 error":13.674001584650402,"binary":true},{"layers":"robust_linear,linear","branching 
factor":128,"namespace":"fb_200M_uint64_9","size":3088,"average log2 error":15.312913003960264,"binary":true}]} -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.12) # project(... HOMEPAGE_URL ...) below needs CMake >= 3.12; minimums below 3.5 are rejected by CMake 4.x 2 | 3 | project("analysis-rmi" 4 | LANGUAGES C CXX 5 | HOMEPAGE_URL https://github.com/BigDataAnalyticsGroup/analysis-rmi 6 | ) 7 | 8 | # Set output directories 9 | set(EXECUTABLE_OUTPUT_PATH "${PROJECT_BINARY_DIR}/bin") 10 | 11 | # Set compilation flags 12 | SET(CMAKE_CXX_STANDARD 17) 13 | SET(CMAKE_COMPILE_FLAGS "-W -Wall -pedantic -DLEVEL1_DCACHE_LINESIZE=${LEVEL1_DCACHE_LINESIZE} -DPAGESIZE=${PAGESIZE} -march=native -Wno-variadic-macros -Wno-gnu-zero-variadic-macro-arguments -Wno-gnu-label-as-value -Wno-vla-extension") 14 | SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_COMPILE_FLAGS}") 15 | SET(CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS} ${CMAKE_COMPILE_FLAGS}") 16 | SET(CMAKE_CXX_FLAGS_DEBUG "-ggdb3 -fno-omit-frame-pointer -fno-optimize-sibling-calls -fsanitize=address,undefined -fsanitize-address-use-after-scope") 17 | SET(CMAKE_CXX_FLAGS_RELEASE "-O2") 18 | SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -ggdb3") 19 | 20 | # Directories 21 | include_directories(include) 22 | 23 | # Third party 24 | include_directories(third_party/ALEX/src) 25 | include_directories(third_party/argparse/include) 26 | include_directories(third_party/ART/include) 27 | include_directories(third_party/CHT/include) 28 | include_directories(third_party/PGM-index/include) 29 | include_directories(third_party/RadixSpline/include) 30 | include_directories(third_party/RMI/include) 31 | include_directories(third_party/tlx) 32 | 33 | # Executables 34 | add_executable(example example.cpp) 35 | add_subdirectory(experiments) 36 | 
-------------------------------------------------------------------------------- /scripts/run_index_comparison.sh: 
-------------------------------------------------------------------------------- 1 | #!bash 2 | # set -x 3 | trap "exit" SIGINT 4 | 5 | EXPERIMENT="index comparison" 6 | 7 | DIR_DATA="data" 8 | DIR_RESULTS="results" 9 | FILE_RESULTS="${DIR_RESULTS}/index_comparison.csv" 10 | 11 | BIN="build/bin/index_comparison" 12 | 13 | # Set number of repetitions and samples 14 | N_REPS="3" 15 | N_SAMPLES="20000000" 16 | PARAMS="--n_reps ${N_REPS} --n_samples ${N_SAMPLES}" 17 | 18 | # Set which indexes to run on datasets 19 | declare -A flags 20 | flags['books_200M_uint64']="--rmi --alex --pgm --rs --cht --art --tlx --ref --bin" 21 | flags['fb_200M_uint64']="--rmi --alex --pgm --rs --cht --art --tlx --ref --bin" 22 | flags['osm_cellids_200M_uint64']="--rmi --alex --pgm --rs --cht --art --tlx --ref --bin" 23 | flags['wiki_ts_200M_uint64']="--rmi --alex --pgm --rs --tlx --ref --bin" # ART and CHT do not support duplicates 24 | 25 | run() { 26 | DATASET=$1 27 | DATA_FILE="${DIR_DATA}/${DATASET}" 28 | ${BIN} ${PARAMS} ${flags[${DATASET}]} ${DATA_FILE} >> ${FILE_RESULTS} 29 | } 30 | 31 | # Create results directory 32 | if [ ! -d "${DIR_RESULTS}" ]; 33 | then 34 | mkdir -p "${DIR_RESULTS}"; 35 | fi 36 | 37 | # Check data downloaded 38 | if [ ! -d "${DIR_DATA}" ]; 39 | then 40 | >&2 echo "Please download datasets first." 41 | return 1 42 | fi 43 | 44 | # Run experiments 45 | echo "dataset,n_keys,index,config,size_in_bytes,rep,n_samples,build_time,eval_time,lookup_time,eval_accu,lookup_accu" > ${FILE_RESULTS} # Write csv header 46 | for dataset in ${!flags[@]}; 47 | do 48 | echo "Performing ${EXPERIMENT} on '${dataset}'..." 
49 | run $dataset 50 | done 51 | -------------------------------------------------------------------------------- /scripts/rmi_ref/books_200M_uint64.json: -------------------------------------------------------------------------------- 1 | {"configs":[{"layers":"linear_spline,linear","branching factor":16777216,"namespace":"books_200M_uint64_0","size":402653200,"average log2 error":3.881372461290736,"binary":true},{"layers":"linear_spline,linear","branching factor":8388608,"namespace":"books_200M_uint64_1","size":201326608,"average log2 error":4.157408768582805,"binary":true},{"layers":"linear_spline,linear","branching factor":4194304,"namespace":"books_200M_uint64_2","size":100663312,"average log2 error":4.515015335099573,"binary":true},{"layers":"radix22,linear","branching factor":1048576,"namespace":"books_200M_uint64_3","size":41943040,"average log2 error":5.240585141688408,"binary":true},{"layers":"linear_spline,linear","branching factor":524288,"namespace":"books_200M_uint64_4","size":12582928,"average log2 error":5.779940878504297,"binary":true},{"layers":"linear_spline,linear","branching factor":262144,"namespace":"books_200M_uint64_5","size":6291472,"average log2 error":6.234719007601088,"binary":true},{"layers":"linear_spline,linear","branching factor":131072,"namespace":"books_200M_uint64_6","size":3145744,"average log2 error":6.698291356077185,"binary":true},{"layers":"linear_spline,linear","branching factor":32768,"namespace":"books_200M_uint64_7","size":786448,"average log2 error":7.656127767412821,"binary":true},{"layers":"linear_spline,linear","branching factor":1024,"namespace":"books_200M_uint64_8","size":24592,"average log2 error":10.182451513884163,"binary":true},{"layers":"linear_spline,linear","branching factor":128,"namespace":"books_200M_uint64_9","size":3088,"average log2 error":12.653101078683737,"binary":true}]} -------------------------------------------------------------------------------- /scripts/rmi_ref/osm_cellids_200M_uint64.json: 
-------------------------------------------------------------------------------- 1 | {"configs":[{"layers":"cubic,linear","branching factor":16777216,"namespace":"osm_cellids_200M_uint64_0","size":402653216,"average log2 error":7.433883845329264,"binary":true},{"layers":"cubic,linear","branching factor":8388608,"namespace":"osm_cellids_200M_uint64_1","size":201326624,"average log2 error":8.206800487100525,"binary":true},{"layers":"cubic,linear","branching factor":4194304,"namespace":"osm_cellids_200M_uint64_2","size":100663328,"average log2 error":9.020228908629712,"binary":true},{"layers":"radix22,linear","branching factor":1048576,"namespace":"osm_cellids_200M_uint64_3","size":41943040,"average log2 error":9.651565886104518,"binary":true},{"layers":"cubic,linear","branching factor":524288,"namespace":"osm_cellids_200M_uint64_4","size":12582944,"average log2 error":11.587355989718806,"binary":true},{"layers":"radix18,linear","branching factor":32768,"namespace":"osm_cellids_200M_uint64_5","size":1835008,"average log2 error":13.362024384454787,"binary":true},{"layers":"linear,linear","branching factor":32768,"namespace":"osm_cellids_200M_uint64_6","size":786448,"average log2 error":15.128271341741858,"binary":true},{"layers":"robust_linear,cubic","branching factor":1024,"namespace":"osm_cellids_200M_uint64_7","size":40976,"average log2 error":19.590492696796756,"binary":true},{"layers":"robust_linear,linear","branching factor":512,"namespace":"osm_cellids_200M_uint64_8","size":12304,"average log2 error":20.541583565893747,"binary":true},{"layers":"robust_linear,linear","branching factor":128,"namespace":"osm_cellids_200M_uint64_9","size":3088,"average log2 error":22.416249075800633,"binary":true}]} -------------------------------------------------------------------------------- /scripts/rmi_ref/wiki_ts_200M_uint64.json: -------------------------------------------------------------------------------- 1 | {"configs":[{"layers":"linear_spline,linear","branching 
factor":16777216,"namespace":"wiki_ts_200M_uint64_0","size":402653200,"average log2 error":4.283256474039042,"binary":true},{"layers":"linear_spline,linear","branching factor":8388608,"namespace":"wiki_ts_200M_uint64_1","size":201326608,"average log2 error":4.473650521507884,"binary":true},{"layers":"linear_spline,linear","branching factor":4194304,"namespace":"wiki_ts_200M_uint64_2","size":100663312,"average log2 error":4.714111817674564,"binary":true},{"layers":"linear_spline,linear","branching factor":1048576,"namespace":"wiki_ts_200M_uint64_3","size":25165840,"average log2 error":5.349370483548578,"binary":true},{"layers":"linear_spline,linear","branching factor":524288,"namespace":"wiki_ts_200M_uint64_4","size":12582928,"average log2 error":5.769176055947262,"binary":true},{"layers":"linear_spline,linear","branching factor":262144,"namespace":"wiki_ts_200M_uint64_5","size":6291472,"average log2 error":6.287461494618375,"binary":true},{"layers":"linear_spline,linear","branching factor":131072,"namespace":"wiki_ts_200M_uint64_6","size":3145744,"average log2 error":6.927357373267273,"binary":true},{"layers":"linear,linear","branching factor":32768,"namespace":"wiki_ts_200M_uint64_7","size":786448,"average log2 error":8.727133637122125,"binary":true},{"layers":"linear_spline,linear","branching factor":1024,"namespace":"wiki_ts_200M_uint64_8","size":24592,"average log2 error":14.509911274780844,"binary":true},{"layers":"linear_spline,linear","branching factor":128,"namespace":"wiki_ts_200M_uint64_9","size":3088,"average log2 error":16.279077272674099,"binary":true}]} -------------------------------------------------------------------------------- /scripts/run_rmi_intervals.sh: -------------------------------------------------------------------------------- 1 | #!bash 2 | # set -x 3 | trap "exit" SIGINT 4 | 5 | EXPERIMENT="rmi intervals" 6 | 7 | DIR_DATA="data" 8 | DIR_RESULTS="results" 9 | FILE_RESULTS="${DIR_RESULTS}/rmi_intervals.csv" 10 | 11 | 
BIN="build/bin/rmi_intervals" 12 | 13 | run() { 14 | DATASET=$1 15 | LAYER1=$2 16 | LAYER2=$3 17 | N_MODELS=$4 18 | BOUND=$5 19 | DATA_FILE="${DIR_DATA}/${DATASET}" 20 | ${BIN} ${DATA_FILE} ${LAYER1} ${LAYER2} ${N_MODELS} ${BOUND} >> ${FILE_RESULTS} 21 | } 22 | 23 | # Create results directory 24 | if [ ! -d "${DIR_RESULTS}" ]; 25 | then 26 | mkdir -p "${DIR_RESULTS}"; 27 | fi 28 | 29 | # Check data downloaded 30 | if [ ! -d "${DIR_DATA}" ]; 31 | then 32 | >&2 echo "Please download datasets first." 33 | return 1 34 | fi 35 | 36 | DATASETS="books_200M_uint64 fb_200M_uint64 osm_cellids_200M_uint64 wiki_ts_200M_uint64" 37 | LAYERS1="linear_spline cubic_spline linear_regression radix" 38 | LAYERS2="linear_spline linear_regression" 39 | BOUNDS="gabs gind labs lind" 40 | 41 | # Run experiments 42 | echo "dataset,n_keys,layer1,layer2,n_models,bounds,size_in_bytes,mean_interval,median_interval,stdev_interval,min_interval,max_interval" > ${FILE_RESULTS} # Write csv header 43 | for dataset in ${DATASETS}; 44 | do 45 | echo "Performing ${EXPERIMENT} on '${dataset}'..." 
46 | for ((i=6; i<=25; i += 1)); 47 | do 48 | n_models=$((2**$i)) 49 | for l1 in ${LAYERS1}; 50 | do 51 | for l2 in ${LAYERS2}; 52 | do 53 | for bound in ${BOUNDS}; 54 | do 55 | run ${dataset} ${l1} ${l2} ${n_models} ${bound} 56 | done 57 | done 58 | done 59 | done 60 | done 61 | 62 | -------------------------------------------------------------------------------- /experiments/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.12) # minimums below 3.5 are rejected by CMake 4.x; 3.12 is what the top-level project(HOMEPAGE_URL) call needs 2 | 3 | add_executable(rmi_segmentation rmi_segmentation.cpp) 4 | add_executable(rmi_errors rmi_errors.cpp) 5 | add_executable(rmi_intervals rmi_intervals.cpp) 6 | add_executable(rmi_lookup rmi_lookup.cpp) 7 | add_executable(rmi_build rmi_build.cpp) 8 | add_executable(rmi_guideline rmi_guideline.cpp) 9 | 10 | set(SOSD_PATH "${PROJECT_SOURCE_DIR}/third_party/RMI/include/rmi_ref") 11 | add_executable(index_comparison 12 | index_comparison.cpp 13 | ${SOSD_PATH}/books_200M_uint64_0.cpp 14 | ${SOSD_PATH}/books_200M_uint64_1.cpp 15 | ${SOSD_PATH}/books_200M_uint64_2.cpp 16 | ${SOSD_PATH}/books_200M_uint64_3.cpp 17 | ${SOSD_PATH}/books_200M_uint64_4.cpp 18 | ${SOSD_PATH}/books_200M_uint64_5.cpp 19 | ${SOSD_PATH}/books_200M_uint64_6.cpp 20 | ${SOSD_PATH}/books_200M_uint64_7.cpp 21 | ${SOSD_PATH}/books_200M_uint64_8.cpp 22 | ${SOSD_PATH}/books_200M_uint64_9.cpp 23 | ${SOSD_PATH}/fb_200M_uint64_0.cpp 24 | ${SOSD_PATH}/fb_200M_uint64_1.cpp 25 | ${SOSD_PATH}/fb_200M_uint64_2.cpp 26 | ${SOSD_PATH}/fb_200M_uint64_3.cpp 27 | ${SOSD_PATH}/fb_200M_uint64_4.cpp 28 | ${SOSD_PATH}/fb_200M_uint64_5.cpp 29 | ${SOSD_PATH}/fb_200M_uint64_6.cpp 30 | ${SOSD_PATH}/fb_200M_uint64_7.cpp 31 | ${SOSD_PATH}/fb_200M_uint64_8.cpp 32 | ${SOSD_PATH}/fb_200M_uint64_9.cpp 33 | ${SOSD_PATH}/osm_cellids_200M_uint64_0.cpp 34 | 
${SOSD_PATH}/osm_cellids_200M_uint64_1.cpp 35 | ${SOSD_PATH}/osm_cellids_200M_uint64_2.cpp 36 | ${SOSD_PATH}/osm_cellids_200M_uint64_3.cpp 37 | 
${SOSD_PATH}/osm_cellids_200M_uint64_4.cpp 38 | ${SOSD_PATH}/osm_cellids_200M_uint64_5.cpp 39 | ${SOSD_PATH}/osm_cellids_200M_uint64_6.cpp 40 | ${SOSD_PATH}/osm_cellids_200M_uint64_7.cpp 41 | ${SOSD_PATH}/osm_cellids_200M_uint64_8.cpp 42 | ${SOSD_PATH}/osm_cellids_200M_uint64_9.cpp 43 | ${SOSD_PATH}/wiki_ts_200M_uint64_0.cpp 44 | ${SOSD_PATH}/wiki_ts_200M_uint64_1.cpp 45 | ${SOSD_PATH}/wiki_ts_200M_uint64_2.cpp 46 | ${SOSD_PATH}/wiki_ts_200M_uint64_3.cpp 47 | ${SOSD_PATH}/wiki_ts_200M_uint64_4.cpp 48 | ${SOSD_PATH}/wiki_ts_200M_uint64_5.cpp 49 | ${SOSD_PATH}/wiki_ts_200M_uint64_6.cpp 50 | ${SOSD_PATH}/wiki_ts_200M_uint64_7.cpp 51 | ${SOSD_PATH}/wiki_ts_200M_uint64_8.cpp 52 | ${SOSD_PATH}/wiki_ts_200M_uint64_9.cpp 53 | ) 54 | -------------------------------------------------------------------------------- /scripts/run_rmi_lookup.sh: -------------------------------------------------------------------------------- 1 | #!bash 2 | # set -x 3 | trap "exit" SIGINT 4 | 5 | EXPERIMENT="rmi lookup" 6 | 7 | DIR_DATA="data" 8 | DIR_RESULTS="results" 9 | FILE_RESULTS="${DIR_RESULTS}/rmi_lookup.csv" 10 | 11 | BIN="build/bin/rmi_lookup" 12 | 13 | # Set number of repetitions and samples 14 | N_REPS="3" 15 | N_SAMPLES="20000000" 16 | PARAMS="--n_reps ${N_REPS} --n_samples ${N_SAMPLES}" 17 | TIMEOUT="90s" 18 | 19 | DATASETS="books_200M_uint64 fb_200M_uint64 osm_cellids_200M_uint64 wiki_ts_200M_uint64" 20 | LAYER1="cubic_spline linear_spline linear_regression radix" 21 | LAYER2="linear_spline linear_regression" 22 | 23 | run() { 24 | DATASET=$1 25 | L1=$2 26 | L2=$3 27 | N_MODELS=$4 28 | BOUND=$5 29 | SEARCH=$6 30 | DATA_FILE="${DIR_DATA}/${DATASET}" 31 | timeout ${TIMEOUT} ${BIN} ${DATA_FILE} ${L1} ${L2} ${N_MODELS} ${BOUND} ${SEARCH} ${PARAMS} >> ${FILE_RESULTS} 32 | } 33 | 34 | # Create results directory 35 | if [ ! -d "${DIR_RESULTS}" ]; 36 | then 37 | mkdir -p "${DIR_RESULTS}"; 38 | fi 39 | 40 | # Check data downloaded 41 | if [ ! 
-d "${DIR_DATA}" ]; 42 | then 43 | >&2 echo "Please download datasets first." 44 | return 1 45 | fi 46 | 47 | # Write csv header 48 | echo "dataset,n_keys,layer1,layer2,n_models,bounds,search,size_in_bytes,rep,n_samples,lookup_time,lookup_accu" > ${FILE_RESULTS} # Write csv header 49 | 50 | # Run model type experiment 51 | for dataset in ${DATASETS}; 52 | do 53 | echo "Performing ${EXPERIMENT} on '${dataset}'..." 54 | for l1 in ${LAYER1}; 55 | do 56 | for l2 in ${LAYER2}; 57 | do 58 | for ((i=6; i<=25; i += 1)); 59 | do 60 | n_models=$((2**$i)) 61 | run ${dataset} ${l1} ${l2} ${n_models} none model_biased_linear 62 | run ${dataset} ${l1} ${l2} ${n_models} none model_biased_exponential 63 | 64 | run ${dataset} ${l1} ${l2} ${n_models} gabs binary 65 | 66 | run ${dataset} ${l1} ${l2} ${n_models} gind model_biased_binary 67 | run ${dataset} ${l1} ${l2} ${n_models} gind binary 68 | 69 | run ${dataset} ${l1} ${l2} ${n_models} labs binary 70 | 71 | run ${dataset} ${l1} ${l2} ${n_models} lind model_biased_binary 72 | run ${dataset} ${l1} ${l2} ${n_models} lind binary 73 | done 74 | done 75 | done 76 | done 77 | -------------------------------------------------------------------------------- /scripts/download_data.sh: -------------------------------------------------------------------------------- 1 | #!bash 2 | # set -x 3 | trap "exit" SIGINT 4 | 5 | DIR_DATA="data" 6 | 7 | # Set download urls 8 | declare -A urls 9 | urls["books_200M_uint64"]="https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/JGVF9A/A6HDNT" 10 | urls["fb_200M_uint64"]="https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/JGVF9A/EATHF7" 11 | urls["osm_cellids_200M_uint64"]="https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/JGVF9A/8FX9BV" 12 | urls["wiki_ts_200M_uint64"]="https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/JGVF9A/SVN8PI" 13 | 
14 | # Set md5 for compressed files 15 | declare -A md5zst 16 | md5zst["books_200M_uint64"]="cd1f8bcb0dfd36f9ab08d160b887bf8a" 17 | md5zst["fb_200M_uint64"]="fec241e8b021b198b0849fbd5564c05f" 18 | md5zst["osm_cellids_200M_uint64"]="42575cb58f24bb7ea0a623d422d4c9a6" 19 | md5zst["wiki_ts_200M_uint64"]="6a2b17020959084ce2640177ee4afd5e" 20 | 21 | # Set md5 for decompressed files 22 | declare -A md5bin 23 | md5bin["books_200M_uint64"]="aeedc7be338399ced89d0bb82287e024" 24 | md5bin["fb_200M_uint64"]="3b0f820caa0d62150e87ce94ec989978" 25 | md5bin["osm_cellids_200M_uint64"]="a7f6b8d2df09fcda5d9cfbc87d765979" 26 | md5bin["wiki_ts_200M_uint64"]="4f1402b1c476d67f77d2da4955432f7d" 27 | 28 | check_md5() { 29 | FILE=$1 30 | MD5_EXPECTED=$2 31 | echo "Checking '${FILE}'..." 32 | MD5_ACTUAL=$(md5sum -b ${FILE} | cut -d ' ' -f 1) 33 | [ ${MD5_EXPECTED} == ${MD5_ACTUAL} ] 34 | } 35 | 36 | download() { 37 | DATASET=$1 38 | FILE="${DIR_DATA}/${DATASET}.zst" 39 | URL=${urls[${DATASET}]} 40 | echo "Downloading '${DATASET}'..." 41 | wget -q --show-progress -O ${FILE} ${URL} 42 | return $? 43 | } 44 | 45 | decompress() { 46 | FILE=$1 47 | echo "Decompressing '${FILE}'..." 48 | zstd -f -d ${FILE} 49 | return $? 50 | } 51 | 52 | # Create data directory 53 | if [ ! -d "${DIR_DATA}" ]; 54 | then 55 | mkdir -p "${DIR_DATA}"; 56 | fi 57 | 58 | # Download datasets 59 | for dataset in ${!urls[@]}; 60 | do 61 | FILE_BIN=${DIR_DATA}/${dataset} 62 | if [ -f ${FILE_BIN} ]; 63 | then 64 | echo "File '${FILE_BIN}' already exists." 65 | check_md5 ${FILE_BIN} ${md5bin[${dataset}]} && continue 66 | fi 67 | 68 | FILE_ZST=${DIR_DATA}/${dataset}.zst 69 | if [ -f ${FILE_ZST} ]; 70 | then 71 | echo "File '${FILE_ZST}' already exists." 
72 | check_md5 ${FILE_ZST} ${md5zst[${dataset}]} && decompress ${FILE_ZST} && check_md5 ${FILE_BIN} ${md5bin[${dataset}]} && continue 73 | fi 74 | 75 | download ${dataset} && check_md5 ${FILE_ZST} ${md5zst[${dataset}]} && decompress ${FILE_ZST} && check_md5 ${FILE_BIN} ${md5bin[${dataset}]} && continue 76 | echo "Download failed. Please try again." 77 | done 78 | -------------------------------------------------------------------------------- /scripts/run_rmi_build.sh: -------------------------------------------------------------------------------- 1 | #!bash 2 | # set -x 3 | trap "exit" SIGINT 4 | 5 | EXPERIMENT="rmi build" 6 | 7 | DIR_DATA="data" 8 | DIR_RESULTS="results" 9 | FILE_RESULTS="${DIR_RESULTS}/rmi_build.csv" 10 | 11 | BIN="build/bin/rmi_build" 12 | 13 | # Set number of repetitions and samples 14 | N_REPS="3" 15 | PARAMS="--n_reps ${N_REPS}" 16 | TIMEOUT="60s" 17 | 18 | DATASETS="books_200M_uint64 fb_200M_uint64 osm_cellids_200M_uint64 wiki_ts_200M_uint64" 19 | LAYER1="cubic_spline linear_spline linear_regression radix" 20 | LAYER2="linear_spline linear_regression" 21 | BOUNDS="none gabs gind labs lind" 22 | 23 | run() { 24 | DATASET=$1 25 | L1=$2 26 | L2=$3 27 | N_MODELS=$4 28 | BOUND=$5 29 | DATA_FILE="${DIR_DATA}/${DATASET}" 30 | timeout ${TIMEOUT} ${BIN} ${DATA_FILE} ${L1} ${L2} ${N_MODELS} ${BOUND} ${PARAMS} >> ${FILE_RESULTS} 31 | } 32 | 33 | # Create results directory 34 | if [ ! -d "${DIR_RESULTS}" ]; 35 | then 36 | mkdir -p "${DIR_RESULTS}"; 37 | fi 38 | 39 | # Check data downloaded 40 | if [ ! -d "${DIR_DATA}" ]; 41 | then 42 | >&2 echo "Please download datasets first." 43 | return 1 44 | fi 45 | 46 | # Write csv header 47 | echo "dataset,n_keys,rmi,layer1,layer2,n_models,bounds,size_in_bytes,rep,build_time,checksum" > ${FILE_RESULTS} # Write csv header 48 | 49 | # Run layer1 and layer 2 model type experiment 50 | for dataset in ${DATASETS}; 51 | do 52 | echo "Performing ${EXPERIMENT} (ours) on '${dataset}'..." 
53 | for ((i=6; i<=25; i += 1)); 54 | do 55 | n_models=$((2**$i)) 56 | for l1 in ${LAYER1}; 57 | do 58 | for l2 in ${LAYER2}; 59 | do 60 | for bound in ${BOUNDS}; 61 | do 62 | run ${dataset} ${l1} ${l2} ${n_models} ${bound} 63 | done 64 | done 65 | done 66 | done 67 | done 68 | 69 | 70 | # Prepare reference implementation experiment 71 | CWD=$(pwd) 72 | RMI_PATH="third_party/RMI" 73 | TMP_PATH="${CWD}/${RMI_PATH}/tmp" 74 | MANIFEST_FILE="${CWD}/${RMI_PATH}/Cargo.toml" 75 | NAMESPACE="tmp" 76 | RESULTS_FILE=${CWD}/${FILE_RESULTS} 77 | mkdir -p ${TMP_PATH} 78 | cd ${TMP_PATH} 79 | 80 | declare -A l1models 81 | l1models['linear_spline']="linear_spline" 82 | l1models['cubic_spline']="cubic" 83 | l1models['linear_regression']="linear" 84 | l1models['radix']="radix" 85 | 86 | declare -A l2models 87 | l2models['linear_spline']="linear_spline" 88 | l2models['linear_regression']="linear" 89 | 90 | declare -A bounds 91 | bounds['labs']="" 92 | bounds['none']="--no-errors" 93 | 94 | # Run reference implementation experiment 95 | for dataset in ${DATASETS}; 96 | do 97 | DATA_FILE="${CWD}/${DIR_DATA}/${dataset}" 98 | echo "Performing ${EXPERIMENT} (ref) on '${dataset}'..." 99 | for ((i=6; i<=25; i += 1)); 100 | do 101 | n_models=$((2**$i)) 102 | for l1 in ${!l1models[@]}; 103 | do 104 | for l2 in ${!l2models[@]}; 105 | do 106 | for bound in ${!bounds[@]}; 107 | do 108 | for ((rep=0; rep<${N_REPS}; rep += 1)); 109 | do 110 | # Build RMI. 111 | cargo run --manifest-path ${MANIFEST_FILE} --release -- ${DATA_FILE} ${NAMESPACE} ${l1models[${l1}]},${l2models[${l2}]} ${n_models} ${bounds[${bound}]} > /dev/null 112 | 113 | # Exract results. 114 | size=$(cat ${TMP_PATH}/tmp.h | grep SIZE | sed 's/.*=//' | tr -d -c 0-9) 115 | build_time=$(cat ${TMP_PATH}/tmp.h | grep BUILD | sed 's/.*=//' | tr -d -c 0-9) 116 | 117 | # Append results to csv. 
118 | echo "${dataset},200000000,ref,${l1},${l2},${n_models},${bound},${size},${rep},${build_time},0" >> ${RESULTS_FILE} 119 | done 120 | done 121 | done 122 | done 123 | done 124 | done 125 | cd $CWD 126 | -------------------------------------------------------------------------------- /scripts/plot_rmi_guideline.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | import argparse 3 | import itertools 4 | import matplotlib.cm as cm 5 | import matplotlib.pyplot as plt 6 | import os 7 | import pandas as pd 8 | import warnings 9 | 10 | plt.style.use(os.path.join('scripts', 'matplotlibrc')) 11 | 12 | # Ignore warnings 13 | warnings.filterwarnings( "ignore") 14 | 15 | # Argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('-p', '--paper', help='produce paper plots', action='store_true') 18 | args = vars(parser.parse_args()) 19 | 20 | 21 | def plot_guideline(filename='rmi_guideline.pdf', width_fact=5, height_fact=4.2): 22 | n_cols = len(datasets) 23 | n_rows = 1 24 | 25 | fig, axs = plt.subplots(n_rows, n_cols, figsize=(width_fact*n_cols, height_fact*n_rows), sharey=False, sharex=True) 26 | fig.tight_layout() 27 | 28 | for col, dataset in enumerate(datasets): 29 | ax = axs[col] 30 | fast_lookups = list() 31 | fast_sizes = list() 32 | guide_lookups = list() 33 | guide_sizes = list() 34 | for budget in budgets: 35 | 36 | # Fastest configuration 37 | fast_confs = df[ 38 | (df['dataset']==dataset) & 39 | (df['budget_in_bytes']==budget) & 40 | (df['is_guideline']==False) 41 | ] 42 | fast_lookup = fast_confs['lookup_in_ns'].min() 43 | fast_conf = fast_confs[fast_confs['lookup_in_ns']==fast_lookup] 44 | fast_size = fast_conf['size_in_bytes'].iloc[0] 45 | 46 | fast_lookups.append(fast_lookup) 47 | fast_sizes.append(fast_size) 48 | 49 | # Guideline configuration 50 | guide_conf = df[ 51 | (df['dataset']==dataset) & 52 | (df['budget_in_bytes']==budget) & 53 | (df['is_guideline']==True) 54 | ] 55 | guide_lookup 
= guide_conf['lookup_in_ns'].iloc[0] 56 | guide_size = guide_conf['size_in_bytes'].iloc[0] 57 | 58 | guide_lookups.append(guide_lookup) 59 | guide_sizes.append(guide_size) 60 | 61 | # Plot lookup times 62 | ax.plot(fast_sizes, fast_lookups, marker='+', markersize=5, c=colors['fastest'], label='RMI (fastest)') 63 | ax.plot(guide_sizes, guide_lookups, c=colors['guideline'], linestyle='dotted', label='RMI (guideline)') 64 | 65 | # Title 66 | ax.set_title(f'{dataset}') 67 | 68 | # Labels 69 | if col==0: 70 | ax.set_ylabel('Lookup time [ns]') 71 | ax.set_xlabel('Index size [MiB]') 72 | 73 | # Visuals 74 | ax.set_xscale('log') 75 | if col==n_cols-1: 76 | ax.set_ylim(bottom=0) 77 | 78 | # Legend 79 | if col==0: 80 | fig.legend(ncol=2, bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False) 81 | 82 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 83 | 84 | 85 | if __name__ == "__main__": 86 | path = 'results' 87 | 88 | # Read csv file 89 | file = os.path.join(path, 'rmi_guideline.csv') 90 | df = pd.read_csv(file, delimiter=',', header=0, comment='#') 91 | 92 | # Compute median of lookup times 93 | df = df.groupby(['dataset','layer1','layer2','n_models','bounds','search','is_guideline']).median().reset_index() 94 | 95 | # Replace datasets 96 | dataset_dict = { 97 | "books_200M_uint64": "books", 98 | "fb_200M_uint64": "fb", 99 | "osm_cellids_200M_uint64": "osmc", 100 | "wiki_ts_200M_uint64": "wiki" 101 | } 102 | df.replace({**dataset_dict}, inplace=True) 103 | 104 | # Compute metrics 105 | df['size_in_MiB'] = df['size_in_bytes'] / (1024 * 1024) 106 | df['lookup_in_ns'] = df['lookup_time'] / df['n_samples'] 107 | 108 | # Define varibale lists 109 | datasets = sorted(df['dataset'].unique()) 110 | budgets = sorted(df['budget_in_bytes'].unique()) 111 | 112 | # Set colors 113 | colors = {} 114 | cmap = cm.get_cmap('tab10') 115 | n_colors = 8 116 | for i, x in enumerate(['fastest', 'guideline']): 117 | colors[x] = cmap((i)/n_colors) 118 | 119 | if 
args['paper']: 120 | # Plot guideline 121 | filename = 'rmi_guideline.pdf' 122 | print(f'Plotting guideline to \'{filename}\'...') 123 | plot_guideline(filename, 2.7, 2) 124 | else: 125 | # Plot guideline 126 | filename = 'rmi_guideline.pdf' 127 | print(f'Plotting guideline to \'{filename}\'...') 128 | plot_guideline(filename) 129 | -------------------------------------------------------------------------------- /scripts/plot_rmi_segmentation.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | import argparse 3 | import matplotlib.cm as cm 4 | import matplotlib.pyplot as plt 5 | import matplotlib.ticker as mtick 6 | import os 7 | import pandas as pd 8 | import warnings 9 | 10 | plt.style.use(os.path.join('scripts','matplotlibrc')) 11 | 12 | # Ignore warnings 13 | warnings.filterwarnings( "ignore") 14 | 15 | # Argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('-p', '--paper', help='produce paper plots', action='store_true') 18 | args = vars(parser.parse_args()) 19 | 20 | 21 | def plot_frac_empty(filename='rmi_segmentation-frac_empty.pdf', width_fact=5, height_fact=4.2): 22 | n_cols = len(datasets) 23 | n_rows = 1 24 | 25 | fig, axs = plt.subplots(n_rows, n_cols, figsize=(width_fact*n_cols, height_fact*n_rows), sharey=True, sharex=True) 26 | fig.tight_layout() 27 | 28 | for col, dataset in enumerate(datasets): 29 | ax = axs[col] 30 | for model in models: 31 | data = df[ 32 | (df['dataset']==dataset) & 33 | (df['model']==model) 34 | ] 35 | if not data.empty: 36 | ax.plot(data['n_segments'], data['frac_empty'], label=model, c=colors[model]) 37 | 38 | # Title 39 | ax.set_title(dataset) 40 | 41 | # Labels 42 | if col==0: 43 | ax.set_ylabel('Percentage of\nempty segments') 44 | ax.set_xlabel('# of segments') 45 | 46 | # Visuals 47 | ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0)) 48 | ax.set_xscale('log', base=2) 49 | ax.set_xticks([2**12, 2**20]) 50 | 51 | # Legend 52 | if col==0: 53 | 
fig.legend(ncol=len(models), bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False) 54 | 55 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 56 | 57 | 58 | def plot_max_segment(filename='rmi_segmentation-max_segment.pdf', width_fact=5, height_fact=4.2): 59 | n_cols = len(datasets) 60 | n_rows = 1 61 | 62 | fig, axs = plt.subplots(n_rows, n_cols, figsize=(width_fact*n_cols, height_fact*n_rows), sharey=True, sharex=True) 63 | fig.tight_layout() 64 | 65 | for col, dataset in enumerate(datasets): 66 | ax = axs[col] 67 | for model in models: 68 | data = df[ 69 | (df['dataset']==dataset) & 70 | (df['model']==model) 71 | ] 72 | if not data.empty: 73 | ax.plot(data['n_segments'], data['max'], label=model, c=colors[model]) 74 | 75 | # Title 76 | ax.set_title(dataset) 77 | 78 | # Labels 79 | if col==0: 80 | ax.set_ylabel('Size of largest\nsegment') 81 | ax.set_xlabel('# of segments') 82 | 83 | # Visuals 84 | ax.set_yscale('log') 85 | ax.set_xscale('log', base=2) 86 | ax.set_yticks([10**2, 10**4, 10**6, 10**8]) 87 | ax.set_xticks([2**12, 2**20]) 88 | 89 | # Legend 90 | if col==0: 91 | fig.legend(ncol=len(models), bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False) 92 | 93 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 94 | 95 | if __name__ == "__main__": 96 | path = 'results' 97 | 98 | # Read csv file 99 | file = os.path.join(path, 'rmi_segmentation.csv') 100 | df = pd.read_csv(file, delimiter=',', header=0, comment='#') 101 | 102 | # Replace datasets and model names 103 | dataset_dict = { 104 | "books_200M_uint64": "books", 105 | "fb_200M_uint64": "fb", 106 | "osm_cellids_200M_uint64": "osmc", 107 | "wiki_ts_200M_uint64": "wiki" 108 | } 109 | model_dict = { 110 | "linear_regression": "LR", 111 | "linear_spline": "LS", 112 | "cubic_spline": "CS", 113 | "radix": "RX" 114 | } 115 | df.replace({**dataset_dict, **model_dict}, inplace=True) 116 | 117 | # Compute metrics 118 | df['frac_empty'] = df['n_empty'] / df['n_segments'] 119 | 
120 | # Define variable lists 121 | datasets = sorted(df['dataset'].unique()) 122 | models = sorted(df['model'].unique()) 123 | 124 | # Set colors 125 | cmap = cm.get_cmap('tab20b') 126 | n_colors = 5 127 | colors = {} 128 | for i, model in enumerate(models): 129 | colors[model] = cmap(i/n_colors+0.1) 130 | 131 | if args['paper']: 132 | # Plot empty segments 133 | filename = 'rmi_segmentation-frac_empty.pdf' 134 | print(f'Plotting empty segments to \'{filename}\'...') 135 | plot_frac_empty(filename, 2, 1.8) 136 | 137 | # Plot max segment 138 | filename = 'rmi_segmentation-max_segment.pdf' 139 | print(f'Plotting max segments to \'{filename}\'...') 140 | plot_max_segment(filename, 2, 1.8) 141 | else: 142 | # Plot empty segments 143 | filename = 'rmi_segmentation-frac_empty.pdf' 144 | print(f'Plotting empty segments to \'{filename}\'...') 145 | plot_frac_empty(filename) 146 | 147 | # Plot max segment 148 | filename = 'rmi_segmentation-max_segment.pdf' 149 | print(f'Plotting max segments to \'{filename}\'...') 150 | plot_max_segment(filename) 151 | -------------------------------------------------------------------------------- /experiments/rmi_segmentation.cpp: -------------------------------------------------------------------------------- 1 | #include "argparse/argparse.hpp" 2 | 3 | #include "rmi/models.hpp" 4 | #include "rmi/util/fn.hpp" 5 | 6 | using key_type = uint64_t; 7 | 8 | 9 | /** 10 | * Computes statistical properties of the segments created when segmenting the @p keys with @p Model and writes results 11 | * to `std::cout`. 
12 | * @tparam Key key type 13 | * @tparam Model model type 14 | * @param keys on which the RMI is built 15 | * @param n_segments number of segments to be created 16 | * @param dataset_name name of the dataset 17 | * @param model model type used for segementing the keys 18 | */ 19 | template 20 | void experiment(const std::vector &keys, 21 | const std::size_t n_segments, 22 | const std::string dataset_name, 23 | const std::string model) 24 | { 25 | using model_type = Model; 26 | 27 | // Build model. 28 | auto m = model_type(keys.begin(), keys.end(), 0, static_cast(n_segments) / keys.size()); 29 | 30 | // Initialize variables. 31 | std::vector segments(n_segments, 0); 32 | 33 | // Segment keys. 34 | for (std::size_t i =0; i != keys.size(); ++i) { 35 | auto key = keys.at(i); 36 | std::size_t segment = std::clamp(m.predict(key), 0, n_segments - 1); 37 | segments[segment]++; 38 | } 39 | 40 | // Compute properties. 41 | auto n_empty = std::count(segments.begin(), segments.end(), 0); 42 | 43 | // Report results. 44 | // Dataset 45 | std::cout << dataset_name << ',' 46 | << keys.size() << ',' 47 | // Model config 48 | << model << ',' 49 | << n_segments << ',' 50 | // Absolute error 51 | << mean(segments) << ',' 52 | << stdev(segments) << ',' 53 | << median(segments) << ',' 54 | << min(segments) << ',' 55 | << max(segments) << ',' 56 | << n_empty << std::endl; 57 | } 58 | 59 | /** 60 | * @brief experiment function pointer 61 | */ 62 | typedef void (*exp_fn_ptr)(const std::vector&, 63 | const std::size_t, 64 | const std::string, 65 | const std::string); 66 | 67 | #define ENTRY(L, T) \ 68 | { #L, &experiment } 69 | 70 | static std::map exp_map { 71 | ENTRY(linear_regression, rmi::LinearRegression), 72 | ENTRY(linear_spline, rmi::LinearSpline), 73 | ENTRY(cubic_spline, rmi::CubicSpline), 74 | ENTRY(radix, rmi::Radix), 75 | }; ///< Map that assigns an experiment function pointer to model types. 
76 | #undef ENTRY 77 | 78 | 79 | /** 80 | * Performs segmentation using a model type and segment count provided via command line arguemnt and reports several 81 | * statistical properties of the resulting segments. 82 | * @param argc arguments counter 83 | * @param argv arguments vector 84 | */ 85 | int main(int argc, char *argv[]) 86 | { 87 | // Initialize argument parser. 88 | argparse::ArgumentParser program(argv[0], "0.1"); 89 | 90 | // Define arguments. 91 | program.add_argument("filename") 92 | .help("path to binary file containing uin64_t keys"); 93 | 94 | program.add_argument("model") 95 | .help("model type, either linear_regression, linear_spline, cubic_spline, or radix."); 96 | 97 | program.add_argument("n_segments") 98 | .help("number of segments, power of two is recommended.") 99 | .action([](const std::string &s) { return std::stoul(s); }); 100 | 101 | program.add_argument("--header") 102 | .help("output csv header") 103 | .default_value(false) 104 | .implicit_value(true); 105 | 106 | // Parse arguments. 107 | try { 108 | program.parse_args(argc, argv); 109 | } 110 | catch (const std::runtime_error &err) { 111 | std::cout << err.what() << '\n' << program; 112 | exit(EXIT_FAILURE); 113 | } 114 | 115 | // Read arguments. 116 | const auto filename = program.get("filename"); 117 | const auto dataset_name = split(filename, '/').back(); 118 | const auto model = program.get("model"); 119 | const auto n_segments = program.get("n_segments"); 120 | 121 | // Load keys. 122 | auto keys = load_data(filename); 123 | 124 | // Lookup experiment. 125 | if (exp_map.find(model) == exp_map.end()) { 126 | std::cerr << "Error: " << model << " is not a valid model type." << std::endl; 127 | exit(EXIT_FAILURE); 128 | } 129 | exp_fn_ptr exp_fn = exp_map[model]; 130 | 131 | // Output header. 
132 | if (program["--header"] == true) 133 | std::cout << "dataset," 134 | << "n_keys," 135 | << "model," 136 | << "n_segments," 137 | << "mean," 138 | << "stdev," 139 | << "median," 140 | << "min," 141 | << "max," 142 | << "n_empty" 143 | << std::endl; 144 | 145 | // Run experiment. 146 | (*exp_fn)(keys, n_segments, dataset_name, model); 147 | 148 | exit(EXIT_SUCCESS); 149 | } 150 | -------------------------------------------------------------------------------- /scripts/plot_rmi_errors.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | import argparse 3 | import itertools 4 | import matplotlib.cm as cm 5 | import matplotlib.pyplot as plt 6 | import os 7 | import pandas as pd 8 | import warnings 9 | 10 | plt.style.use(os.path.join('scripts', 'matplotlibrc')) 11 | 12 | # Ignore warnings 13 | warnings.filterwarnings( "ignore") 14 | 15 | # Argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('-p', '--paper', help='produce paper plots', action='store_true') 18 | args = vars(parser.parse_args()) 19 | 20 | 21 | def plot(x, y, xlabel, ylabel, filename): 22 | n_cols = len(datasets) 23 | n_rows = 1 24 | 25 | fig, axs = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4.2*n_rows), sharey=True, sharex=True) 26 | fig.tight_layout() 27 | 28 | for col, dataset in enumerate(datasets): 29 | ax = axs[col] 30 | for l1 in l1_models: 31 | for l2 in l2_models: 32 | data = df[ 33 | (df['dataset']==dataset) & 34 | (df['layer1']==l1) & 35 | (df['layer2']==l2) 36 | ] 37 | if not data.empty: 38 | ax.plot(data[x], data[y], label=f'{l1}$\mapsto${l2}', color=colors[(l1,l2)]) 39 | 40 | # Title 41 | ax.set_title(dataset) 42 | 43 | # Labels 44 | if col==0: 45 | ax.set_xlabel(xlabel) 46 | ax.set_ylabel(ylabel) 47 | 48 | # Visuals 49 | ax.set_xscale('log', base=2) 50 | ax.set_yscale('log') 51 | 52 | # Legend 53 | if col==0: 54 | fig.legend(ncol=len(l1_models)*len(l2_models), bbox_to_anchor=(0.5, 1), loc='lower center', 
frameon=False) 55 | 56 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 57 | 58 | 59 | def plot_paper(x, y, xlabel, ylabel, filename): 60 | l1_groups = [['CS','LR'],['LS','RX']] 61 | l2_models = ['LR','LS'] 62 | 63 | n_cols = len(l1_groups) 64 | n_rows = len(datasets) 65 | 66 | fig, axs = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 2.7*n_rows), sharey='row', sharex=True) 67 | fig.tight_layout() 68 | 69 | for row, dataset in enumerate(datasets): 70 | for col, l1_models in enumerate(l1_groups): 71 | ax = axs[row,col] 72 | 73 | for l1 in l1_models: 74 | for l2 in l2_models: 75 | data = df[ 76 | (df['dataset']==dataset) & 77 | (df['layer1']==l1) & 78 | (df['layer2']==l2) 79 | ] 80 | if not data.empty: 81 | ax.plot(data[x], data[y], label=f'{l1}$\mapsto${l2}', color=colors[(l1,l2)]) 82 | 83 | # Title 84 | ax.set_title(dataset) 85 | 86 | # Labels 87 | if row==n_rows - 1: 88 | ax.set_xlabel(xlabel) 89 | if col==0: 90 | ax.set_ylabel(ylabel) 91 | 92 | # Visuals 93 | ax.set_xscale('log', base=2) 94 | ax.set_yscale('log') 95 | if dataset in ['fb','osmc']: 96 | ax.set_ylim(bottom=0.7) 97 | 98 | # Legend 99 | if row==0 and col==n_cols - 1: 100 | fig.legend(ncol=4, bbox_to_anchor=(0.5, 1), loc='lower center') 101 | 102 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 103 | 104 | if __name__ == "__main__": 105 | path = 'results' 106 | 107 | # Read csv file 108 | file = os.path.join(path, 'rmi_errors.csv') 109 | df = pd.read_csv(file, delimiter=',', header=0, comment='#') 110 | 111 | # Replace datasets and model names 112 | dataset_dict = { 113 | "books_200M_uint64": "books", 114 | "fb_200M_uint64": "fb", 115 | "osm_cellids_200M_uint64": "osmc", 116 | "wiki_ts_200M_uint64": "wiki" 117 | } 118 | model_dict = { 119 | "linear_regression": "LR", 120 | "linear_spline": "LS", 121 | "cubic_spline": "CS", 122 | "radix": "RX" 123 | } 124 | df.replace({**dataset_dict, **model_dict}, inplace=True) 125 | 126 | # Define variable lists 127 | datasets = 
sorted(df['dataset'].unique()) 128 | l1_models = sorted(df['layer1'].unique()) 129 | l2_models = sorted(df['layer2'].unique()) 130 | 131 | # Set colors 132 | colors = {} 133 | cmap = cm.get_cmap('tab10') 134 | n_colors = 10 135 | for i, (l1, l2) in enumerate(itertools.product(l1_models, l2_models)): 136 | colors[(l1,l2)] = cmap(i/n_colors) 137 | 138 | if args['paper']: 139 | # Plot median absolute error 140 | filename = 'rmi_errors-median_absolute_error.pdf' 141 | print(f'Plotting median absolute error to \'{filename}\'...') 142 | plot_paper('n_models', 'median_ae', '# of segments', 'Median absolute error', filename) 143 | else: 144 | # Plot median absolute error 145 | filename = 'rmi_errors-median_absolute_error.pdf' 146 | print(f'Plotting median absolute error to \'{filename}\'...') 147 | plot('n_models', 'median_ae', '# of segments', 'Median absolute error', filename) 148 | 149 | # Plot mean absolute error 150 | filename = 'rmi_errors-mean_absolute_error.pdf' 151 | print(f'Plotting mean absolute error to \'{filename}\'...') 152 | plot('n_models', 'mean_ae', '# of segments', 'Mean absolute error', filename) 153 | 154 | # Plot max absolute error 155 | filename = 'rmi_errors-max_absolute_error.pdf' 156 | print(f'Plotting max absolute error to \'{filename}\'...') 157 | plot('n_models', 'max_ae', '# of segments', 'Maximum absolute error', filename) 158 | -------------------------------------------------------------------------------- /scripts/plot_rmi_intervals.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | import argparse 3 | import matplotlib.cm as cm 4 | import matplotlib.pyplot as plt 5 | import os 6 | import pandas as pd 7 | import warnings 8 | 9 | plt.style.use(os.path.join('scripts', 'matplotlibrc')) 10 | 11 | # Ignore warnings 12 | warnings.filterwarnings( "ignore") 13 | 14 | # Argparse 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('-p', '--paper', help='produce paper plots', 
action='store_true') 17 | args = vars(parser.parse_args()) 18 | 19 | 20 | def plot(x, y, xlabel, ylabel, filename): 21 | n_cols = len(configs) 22 | n_rows = len(datasets) 23 | 24 | fig, axs = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4.2*n_rows), sharey=True, sharex=True) 25 | fig.tight_layout() 26 | 27 | for row, dataset in enumerate(datasets): 28 | for col, config in enumerate(configs): 29 | ax = axs[row,col] 30 | for bound in bounds: 31 | data = df[ 32 | (df['dataset']==dataset) & 33 | (df['config']==config) & 34 | (df['bounds']==bound) 35 | ] 36 | if not data.empty: 37 | ax.plot(data[x], data[y], label=bound, color=colors[bound]) 38 | 39 | # Title 40 | ax.set_title(f'{dataset} ({config})') 41 | 42 | # Labels 43 | if row==n_rows - 1: 44 | ax.set_xlabel(xlabel) 45 | if col==0: 46 | ax.set_ylabel(ylabel) 47 | 48 | # Visuals 49 | ax.set_xscale('log') 50 | ax.set_yscale('log') 51 | 52 | # Legend 53 | if row==0 and col==0: 54 | fig.legend(ncol=len(bounds), bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False) 55 | 56 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 57 | 58 | 59 | def plot_paper(x, y, xlabel, ylabel, filename): 60 | configs = ['LS$\mapsto$LR', 'RX$\mapsto$LS'] 61 | datasets = ['books', 'osmc', 'wiki'] 62 | 63 | n_cols = len(configs) 64 | n_rows = len(datasets) 65 | 66 | fig, axs = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 2.7*n_rows), sharey='row', sharex=True) 67 | fig.tight_layout() 68 | 69 | for row, dataset in enumerate(datasets): 70 | for col, config in enumerate(configs): 71 | ax = axs[row,col] 72 | for bound in bounds: 73 | data = df[ 74 | (df['dataset']==dataset) & 75 | (df['config']==config) & 76 | (df['bounds']==bound) 77 | ] 78 | if not data.empty: 79 | ax.plot(data[x], data[y], label=bound, color=colors[bound]) 80 | 81 | # Title 82 | ax.set_title(f'{dataset} ({config})') 83 | 84 | # Labels 85 | if row==n_rows - 1: 86 | ax.set_xlabel(xlabel) 87 | if col==0: 88 | ax.set_ylabel(ylabel) 89 | 90 | # Visuals 91 
| ax.set_xscale('log') 92 | ax.set_yscale('log') 93 | if col==n_cols - 1: 94 | ax.set_ylim(bottom=4, top=None) 95 | 96 | # Legend 97 | if row==0 and col==0: 98 | fig.legend(ncol=len(bounds), bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False) 99 | 100 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 101 | 102 | 103 | if __name__ == "__main__": 104 | path = 'results' 105 | 106 | # Read csv file 107 | file = os.path.join(path, 'rmi_intervals.csv') 108 | df = pd.read_csv(file, delimiter=',', header=0, comment='#') 109 | 110 | # Replace datasets, model names, and bounds 111 | dataset_dict = { 112 | "books_200M_uint64": "books", 113 | "fb_200M_uint64": "fb", 114 | "osm_cellids_200M_uint64": "osmc", 115 | "wiki_ts_200M_uint64": "wiki" 116 | } 117 | model_dict = { 118 | "cubic_spline": "CS", 119 | "linear_spline": "LS", 120 | "linear_regression": "LR", 121 | "radix": "RX" 122 | } 123 | bounds_dict = { 124 | "labs": "LAbs", 125 | "lind": "LInd", 126 | "gabs": "GAbs", 127 | "gind": "GInd", 128 | "none": "NB" 129 | } 130 | df.replace({**dataset_dict, **model_dict, **bounds_dict}, inplace=True) 131 | 132 | # Compute model combinations and metrics 133 | df['config'] = df['layer1'] + '$\mapsto$' + df['layer2'] 134 | df['size_in_MiB'] = df['size_in_bytes'] / (1024 * 1024) 135 | 136 | # Define varibale lists 137 | datasets = sorted(df['dataset'].unique()) 138 | configs = sorted(df['config'].unique()) 139 | bounds = sorted(df['bounds'].unique()) 140 | 141 | # Set colors 142 | colors = {} 143 | cmap = cm.get_cmap('Dark2') 144 | n_colors = 8 145 | for i, bound in enumerate(bounds): 146 | colors[bound] = cmap(i/n_colors) 147 | 148 | if args['paper']: 149 | # Plot median interval size 150 | filename = 'rmi_intervals-median_interval.pdf' 151 | print(f'Plotting median interval to \'{filename}\'...') 152 | plot_paper('size_in_MiB', 'median_interval', 'Index size [MiB]', 'Median search\ninterval size', filename) 153 | else: 154 | # Plot mean interval size 155 | 
filename = 'rmi_intervals-mean_interval.pdf' 156 | print(f'Plotting mean interval to \'{filename}\'...') 157 | plot('size_in_MiB', 'mean_interval', 'Index size [MiB]', 'Mean search\ninterval size', filename) 158 | 159 | # Plot median interval size 160 | filename = 'rmi_intervals-median_interval.pdf' 161 | print(f'Plotting median interval to \'{filename}\'...') 162 | plot('size_in_MiB', 'median_interval', 'Index size [MiB]', 'Median search\ninterval size', filename) 163 | 164 | # Plot max interval size 165 | filename = 'rmi_intervals-max_interval.pdf' 166 | print(f'Plotting max interval to \'{filename}\'...') 167 | plot('size_in_MiB', 'max_interval', 'Index size [MiB]', 'Max search\ninterval size', filename) 168 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Critical Analysis of Recursive Model Indexes 2 | Code for our [VLDB paper](https://www.vldb.org/pvldb/vol15/p1079-maltry.pdf) 3 | and [arXiv report](https://arxiv.org/abs/2106.16166). 4 | 5 | ## Build 6 | First clone the repository including all submodules. 7 | ```sh 8 | git clone --recursive https://github.com/BigDataAnalyticsGroup/analysis-rmi.git 9 | cd analysis-rmi 10 | ``` 11 | Then download the datasets and generate the source files of the RMI reference 12 | implementation. 13 | ```sh 14 | scripts/download_data.sh 15 | scripts/rmi_ref/prepare_rmi_ref.sh 16 | ``` 17 | Finally, the project can then be built as follows. 18 | ``` 19 | mkdir build 20 | cd build 21 | cmake -DCMAKE_BUILD_TYPE=Release .. 22 | make 23 | bin/example 24 | ``` 25 | 26 | ## Example 27 | ```c++ 28 | // Initialize random number generator. 29 | using key_type = uint64_t; 30 | std::mt19937 gen(42); 31 | std::uniform_int_distribution<key_type> key_distrib(0, 1UL << 48); 32 | auto rand = [&gen, &key_distrib] { return key_distrib(gen); }; 33 | 34 | // Create 10M random keys. 
35 | std::size_t n_keys = 1e7; 36 | std::vector<key_type> keys(n_keys); 37 | std::generate(keys.begin(), keys.end(), rand); 38 | std::sort(keys.begin(), keys.end()); 39 | 40 | // Build a two-layer RMI. 41 | using layer1_type = rmi::LinearSpline; 42 | using layer2_type = rmi::LinearRegression; 43 | std::size_t layer2_size = 2UL << 16; 44 | rmi::RmiLAbs<key_type, layer1_type, layer2_type> rmi(keys, layer2_size); 45 | 46 | // Pick a key. 47 | std::uniform_int_distribution<std::size_t> uniform_distrib(0, n_keys - 1); 48 | key_type key = keys[uniform_distrib(gen)]; 49 | 50 | // Perform a lookup. 51 | auto range = rmi.search(key); 52 | auto pos = std::lower_bound(keys.begin() + range.lo, keys.begin() + range.hi, key); 53 | std::cout << "Key " << key << " is located at position " 54 | << std::distance(keys.begin(), pos) << '.' << std::endl; 55 | ``` 56 | 57 | ## Reproducing Experimental Results 58 | We provide the following experiments from our paper. 59 | * `rmi_segmentation`: Compute statistical properties on the segment sizes 60 | resulting from various root models (Section 5.1). 61 | * `rmi_errors`: Compute statistical properties on the prediction errors of a 62 | wide range of RMI configurations (Section 5.2). 63 | * `rmi_intervals`: Compute statistical properties on the error interval sizes 64 | of a wide range of RMI configurations (Section 5.3). 65 | * `rmi_lookup`: Measure lookup times for a wide range of RMI configurations 66 | (Section 6). 67 | * `rmi_build`: Measure build times for a wide range of RMI configurations and 68 | compare against the reference implementation (Section 7). 69 | * `rmi_guideline`: Measure lookup times for a wide range of RMI configurations 70 | and compare against configurations resulting from our guideline (Section 8). 71 | * `index_comparison`: Compare several indexes in terms of lookup time and build 72 | time (Section 9). 73 | 74 | Below, we explain step by step how to reproduce our experimental results. 
75 | 76 | ### Preliminaries 77 | The following tools are required to reproduce our results. 78 | * C++ compiler supporting C++17. 79 | * `bash>=4`: run shell scripts. 80 | * `cmake>=3.2`: build configuration. 81 | * `md5sum`: validate the datasets. 82 | * `rust`: generate reference RMIs from 83 | [learnedsystems/RMI](https://github.com/learnedsystems/RMI). 84 | * `timeout`: abort experiments of slow configurations. 85 | * `wget`: download the datasets. 86 | * `zstd`: decompress the datasets. 87 | 88 | In the following, we assume that all scripts are run from the root directory of 89 | this repository. If you want to plot the results, install the corresponding 90 | Python requirements. 91 | ```sh 92 | pip install -r requirements.txt 93 | ``` 94 | 95 | ### Running and Plotting a Single Experiment 96 | We provide a script for running each experiment with the exact same 97 | configuration used in the paper. To run experiment `<experiment>`, simply 98 | execute the corresponding script `scripts/run_<experiment>.sh`, e.g., to 99 | reproduce the experiment `index_comparison` proceed as follows. 100 | ```sh 101 | scripts/run_index_comparison.sh 102 | ``` 103 | 104 | Depending on the hardware, experiments involving measurements of lookup time 105 | might run several days. Results will be written to `results/<experiment>.csv` 106 | in csv format with an appropriate header. 107 | 108 | Afterwards, the results can be plotted by running 109 | `scripts/plot_<experiment>.py`, e.g., to plot the results of the experiment 110 | `index_comparison` proceed as follows. 111 | ```sh 112 | scripts/plot_index_comparison.py 113 | ``` 114 | Note that this will visualize _all_ results of the experiment. To reproduce the 115 | paper plots, execute the Python script with argument `--paper`. 116 | 117 | The plots will be prefixed by the experiment name and placed in `results/`. 118 | 119 | ### Running and Plotting All Experiments at Once 120 | To reproduce all experiments at once, run the script `scripts/run_all.sh`. 
121 | Executing all experiments will take several days. Results will be written to 122 | `results/<experiment>.csv` in csv format with an appropriate header. Plots can 123 | be produced as described above. 124 | 125 | Afterwards, all results can be visualized by executing the script 126 | `scripts/plot_all.sh`. To reproduce only the plots from the paper, execute the 127 | script `scripts/plot_paper.sh`. The resulting plots will be prefixed by the 128 | experiment name and placed in `results/`. 129 | 130 | ## Documentation 131 | Code documentation can be generated using `doxygen` by running the following command. 132 | ```sh 133 | doxygen Doxyfile 134 | ``` 135 | The code documentation will be placed in `doxy/html/`. 136 | 137 | ## Cite 138 | VLDB paper: 139 | ``` 140 | @article{maltry2022critical, 141 | title={A Critical Analysis of Recursive Model Indexes}, 142 | author={Marcel Maltry and Jens Dittrich}, 143 | journal={Proc. {VLDB} Endow.}, 144 | volume={15}, 145 | number={5}, 146 | pages={1079--1091}, 147 | year={2022} 148 | } 149 | ``` 150 | 151 | arXiv report: 152 | ``` 153 | @misc{maltry2021criticalarxiv, 154 | title={A Critical Analysis of Recursive Model Indexes}, 155 | author={Marcel Maltry and Jens Dittrich}, 156 | year={2021}, 157 | eprint={2106.16166}, 158 | archivePrefix={arXiv}, 159 | primaryClass={cs.DB} 160 | } 161 | ``` 162 | -------------------------------------------------------------------------------- /include/rmi/util/fn.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | 13 | /*====================================================================================================================== 14 | * Bit Functions 15 | *====================================================================================================================*/ 16 | 17 | /** 18 | * Computes the amount of bits 
/**
 * Computes the number of bits needed to represent unsigned value @p n.
 * Zero needs no bits, so `bit_width(0) == 0`.
 * @tparam Numeric the type of the value; must be `unsigned`, `unsigned long`, or `unsigned long long`
 * @param n the value
 * @return the bit-width of the value
 */
template<typename Numeric>
uint8_t bit_width(Numeric n)
{
    static_assert(std::is_unsigned<Numeric>::value, "not defined for signed integral types");

    // The __builtin_clz* family is undefined for an argument of 0, so handle it up front.
    if (n == 0) return 0;

    // Count leading zeros.
    int lz = 0;
    if constexpr (std::is_same_v<Numeric, unsigned>) {
        lz = __builtin_clz(n);
    } else if constexpr (std::is_same_v<Numeric, unsigned long>) {
        lz = __builtin_clzl(n);
    } else if constexpr (std::is_same_v<Numeric, unsigned long long>) {
        lz = __builtin_clzll(n);
    } else {
        // Dependent, always-false condition: rejects every unsupported type at compile time.
        static_assert(sizeof(Numeric) == 0, "unsupported width of integral type");
    }

    return sizeof(Numeric) * 8 - lz;
}

/**
 * Computes the length of the common (most-significant) bit prefix of two integral values @p v1 and @p v2.
 * Equal values share all bits, so the result is then the full bit-width of @p Numeric.
 * @tparam Numeric the (integral) type of the values
 * @param v1 the first value
 * @param v2 the second value
 * @return the length of the common prefix in bits, in the range [0, sizeof(Numeric) * 8]
 */
template<typename Numeric>
uint8_t common_prefix_width(Numeric v1, Numeric v2)
{
    // Work on the unsigned representation so that widening never sign-extends.
    using unsigned_type = std::make_unsigned_t<Numeric>;
    const unsigned_type diff = static_cast<unsigned_type>(v1 ^ v2); // bit-wise xor

    // The __builtin_clz* family is undefined for 0: identical values share every bit.
    if (diff == 0) return sizeof(Numeric) * 8;

    // The builtins operate on a possibly wider type; subtract the bits added by widening.
    if constexpr (sizeof(Numeric) <= sizeof(unsigned)) {
        return __builtin_clz(static_cast<unsigned>(diff))
               - (sizeof(unsigned) - sizeof(Numeric)) * 8;
    } else if constexpr (sizeof(Numeric) <= sizeof(unsigned long)) {
        return __builtin_clzl(static_cast<unsigned long>(diff))
               - (sizeof(unsigned long) - sizeof(Numeric)) * 8;
    } else if constexpr (sizeof(Numeric) <= sizeof(unsigned long long)) {
        return __builtin_clzll(static_cast<unsigned long long>(diff))
               - (sizeof(unsigned long long) - sizeof(Numeric)) * 8;
    } else {
        // Dependent, always-false condition: rejects every unsupported type at compile time.
        static_assert(sizeof(Numeric) == 0, "unsupported width of integral type");
    }
}
/**
 * Splits @p str at each occurrence of @p delimiter.
 * Declared `inline` because this header may be included from several translation units.
 * @param str the string to be split
 * @param delimiter the delimiter to split the string at
 * @return vector of substrings, in order of appearance (empty if @p str is empty)
 */
inline std::vector<std::string> split(const std::string &str, char delimiter)
{
    std::vector<std::string> tokens;
    std::string token;
    std::istringstream token_stream(str);
    while (std::getline(token_stream, token, delimiter)) {
        tokens.push_back(token);
    }
    return tokens;
}


/*======================================================================================================================
 * Arithmetic Functions
 *====================================================================================================================*/

/**
 * Computes the arithmetic mean of a vector @p v of numeric values.
 * @param v vector of numeric values
 * @return arithmetic mean (NaN if @p v is empty)
 */
template<typename Numeric>
double mean(std::vector<Numeric> &v)
{
    // Accumulate in double (init 0.0) so that integral inputs do not overflow.
    double sum = std::accumulate(v.begin(), v.end(), 0.0);
    return sum / v.size();
}

/**
 * Computes the (population) standard deviation of a vector @p v of numeric values.
 * @param v vector of numeric values
 * @return standard deviation (NaN if @p v is empty)
 */
template<typename Numeric>
double stdev(std::vector<Numeric> &v) {
    const double mu = ::mean(v);
    // Sum squared deviations in double: squaring in Numeric overflows for large integers, and the
    // one-pass form E[x^2] - mean^2 can turn slightly negative due to rounding (sqrt -> NaN).
    const double sq_dev_sum = std::accumulate(v.begin(), v.end(), 0.0,
        [mu](double acc, const Numeric &x) {
            const double dev = static_cast<double>(x) - mu;
            return acc + dev * dev;
        });
    return std::sqrt(sq_dev_sum / v.size());
}

/**
 * Computes the median of vector @p v of numeric values. For an even number of elements the upper of the two
 * middle elements is returned. Note that @p v is partially reordered in the process.
 * @param v vector of numeric values (must not be empty, otherwise `std::out_of_range` is thrown)
 * @return median
 */
template<typename Numeric>
Numeric median(std::vector<Numeric> &v)
{
    std::size_t n = v.size() / 2;
    // Places the element that a full sort would put at position n, without sorting everything.
    std::nth_element(v.begin(), v.begin()+n, v.end());
    return v.at(n);
}
/**
 * Computes the minimum of a vector @p v of numeric values.
 * @param v vector of numeric values (must not be empty)
 * @return minimum
 */
template<typename Numeric>
Numeric min(std::vector<Numeric> &v)
{
    return *std::min_element(v.begin(), v.end());
}

/**
 * Computes the maximum of a vector @p v of numeric values.
 * @param v vector of numeric values (must not be empty)
 * @return maximum
 */
template<typename Numeric>
Numeric max(std::vector<Numeric> &v)
{
    return *std::max_element(v.begin(), v.end());
}


/*======================================================================================================================
 * Dataset Functions
 *====================================================================================================================*/

/**
 * Reads a dataset file @p filename in binary format and writes keys to vector. The file is expected to start
 * with a `uint64_t` holding the number of keys, followed by that many keys of type @p Key. The process is
 * terminated with `EXIT_FAILURE` if the file cannot be opened or is shorter than its header announces.
 * @tparam Key the type of the key
 * @param filename name of the dataset file
 * @return vector of keys
 */
template<typename Key>
std::vector<Key> load_data(const std::string &filename) {
    using key_type = Key;

    // Open file.
    std::ifstream in(filename, std::ios::binary);
    if (!in.is_open()) {
        std::cerr << "Could not load " << filename << '.' << std::endl;
        exit(EXIT_FAILURE);
    }

    // Read number of keys.
    uint64_t n_keys = 0;
    in.read(reinterpret_cast<char*>(&n_keys), sizeof(uint64_t));
    if (!in) {
        std::cerr << "Could not read number of keys from " << filename << '.' << std::endl;
        exit(EXIT_FAILURE);
    }

    // Initialize vector.
    std::vector<key_type> data;
    data.resize(n_keys);

    // Read keys. A short read would otherwise leave zero-initialized keys behind unnoticed.
    in.read(reinterpret_cast<char*>(data.data()), n_keys * sizeof(key_type));
    if (!in) {
        std::cerr << "Could not read " << n_keys << " keys from " << filename << '.' << std::endl;
        exit(EXIT_FAILURE);
    }
    in.close();

    return data;
}
54 | // Dataset 55 | std::cout << dataset_name << ',' 56 | << n_keys << ',' 57 | // RMI config 58 | << layer1 << ',' 59 | << layer2 << ',' 60 | << n_models << ',' 61 | // Absolute error 62 | << mean(absolute_errors) << ',' 63 | << median(absolute_errors) << ',' 64 | << stdev(absolute_errors) << ',' 65 | << min(absolute_errors) << ',' 66 | << max(absolute_errors) << std::endl; 67 | } 68 | 69 | 70 | /** 71 | * @brief experiment function pointer 72 | */ 73 | typedef void (*exp_fn_ptr)(const std::vector&, 74 | const std::size_t, 75 | const std::string, 76 | const std::string, 77 | const std::string); 78 | 79 | #define ENTRY(L1, L2, T1, T2) \ 80 | { std::make_pair(#L1, #L2), &experiment> } 81 | 82 | static std::map, exp_fn_ptr> exp_map { 83 | ENTRY(linear_regression, linear_regression, rmi::LinearRegression, rmi::LinearRegression), 84 | ENTRY(linear_regression, linear_spline, rmi::LinearRegression, rmi::LinearSpline), 85 | ENTRY(linear_spline, linear_regression, rmi::LinearSpline, rmi::LinearRegression), 86 | ENTRY(linear_spline, linear_spline, rmi::LinearSpline, rmi::LinearSpline), 87 | ENTRY(cubic_spline, linear_regression, rmi::CubicSpline, rmi::LinearRegression), 88 | ENTRY(cubic_spline, linear_spline, rmi::CubicSpline, rmi::LinearSpline), 89 | ENTRY(radix, linear_regression, rmi::Radix, rmi::LinearRegression), 90 | ENTRY(radix, linear_spline, rmi::Radix, rmi::LinearSpline), 91 | }; ///< Map that assigns an experiment function pointer to RMI configurations. 92 | #undef ENTRY 93 | 94 | 95 | /** 96 | * Triggers computation of several error metrics of an RMI configuration provided via command line arguments. 97 | * @param argc arguments counter 98 | * @param argv arguments vector 99 | */ 100 | int main(int argc, char *argv[]) 101 | { 102 | // Initialize argument parser. 103 | argparse::ArgumentParser program(argv[0], "0.1"); 104 | 105 | // Define arguments. 
106 | program.add_argument("filename") 107 | .help("path to binary file containing uin64_t keys"); 108 | 109 | program.add_argument("layer1") 110 | .help("layer1 model type, either linear_regression, linear_spline, cubic_spline, or radix."); 111 | 112 | program.add_argument("layer2") 113 | .help("layer2 model type, either linear_regression, linear_spline, or cubic_spline."); 114 | 115 | program.add_argument("n_models") 116 | .help("number of models on layer2, power of two is recommended.") 117 | .action([](const std::string &s) { return std::stoul(s); }); 118 | 119 | program.add_argument("--header") 120 | .help("output csv header") 121 | .default_value(false) 122 | .implicit_value(true); 123 | 124 | // Parse arguments. 125 | try { 126 | program.parse_args(argc, argv); 127 | } 128 | catch (const std::runtime_error &err) { 129 | std::cout << err.what() << '\n' << program; 130 | exit(EXIT_FAILURE); 131 | } 132 | 133 | // Read arguments. 134 | const auto filename = program.get("filename"); 135 | const auto dataset_name = split(filename, '/').back(); 136 | const auto layer1 = program.get("layer1"); 137 | const auto layer2 = program.get("layer2"); 138 | const auto n_models = program.get("n_models"); 139 | 140 | // Load keys. 141 | auto keys = load_data(filename); 142 | 143 | // Lookup experiment. 144 | auto config = std::make_pair(layer1, layer2); 145 | if (exp_map.find(config) == exp_map.end()) { 146 | std::cerr << "Error: " << layer1 << ',' << layer2 << " is not a valid RMI configuration." << std::endl; 147 | exit(EXIT_FAILURE); 148 | } 149 | exp_fn_ptr exp_fn = exp_map[config]; 150 | 151 | // Output header. 152 | if (program["--header"] == true) 153 | std::cout << "dataset," 154 | << "n_keys," 155 | << "layer1," 156 | << "layer2," 157 | << "n_models," 158 | << "mean_ae," 159 | << "median_ae," 160 | << "stdev_ae" 161 | << "min_ae" 162 | << "max_ae" 163 | << std::endl; 164 | 165 | // Run experiment. 
166 | (*exp_fn)(keys, n_models, dataset_name, layer1, layer2); 167 | 168 | exit(EXIT_SUCCESS); 169 | } 170 | -------------------------------------------------------------------------------- /include/rmi/util/search.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | 7 | /** 8 | * Functor for performing linear search. 9 | */ 10 | struct LinearSearch { 11 | /** 12 | * Performs linear search in the interval [first,last) to find the first element that is not less than @t value. 13 | * @tparam InputIt input iterator type 14 | * @tparam T type of searched value 15 | * @param first, last iterators defining the partially-ordered range to examine 16 | * @param pred iterator to the predicted position (ignored) 17 | * @param value value to compare the elements to 18 | * @return iterator to the first element that is not less than @p value 19 | */ 20 | template 21 | InputIt operator()(InputIt first, InputIt last, InputIt /* pred */, const T &value) { 22 | InputIt runner = first; 23 | for (; runner != last; ++runner) 24 | if (*runner >= value) return runner; 25 | return last; 26 | } 27 | }; 28 | 29 | 30 | /** 31 | * Functor for performing model-biased linear search. 32 | */ 33 | struct ModelBiasedLinearSearch { 34 | /** 35 | * Performs model-biased linear search either in the interval [first,pred) or [pred, last) to find the first element 36 | * that is not less than @t value. 
/**
 * Functor for performing model-biased linear search.
 */
struct ModelBiasedLinearSearch {
    /**
     * Performs model-biased linear search either in the interval [first,pred) or [pred, last) to find the first element
     * that is not less than @p value. The scan starts at the predicted position and walks towards the result.
     * @tparam InputIt input iterator type (random access is required)
     * @tparam T type of searched value
     * @param first, last iterators defining the partially-ordered range to examine
     * @param pred iterator to the predicted position, anywhere in [first, last]
     * @param value value to compare the elements to
     * @return iterator to the first element that is not less than @p value
     */
    template<typename InputIt, typename T>
    InputIt operator()(InputIt first, InputIt last, InputIt pred, const T &value) {
        // pred == last must not be dereferenced; it is handled by the left-side scan.
        if (pred != last and *pred < value) {
            // search right side; *pred itself is already known to be too small
            for (InputIt runner = pred + 1; runner < last; ++runner)
                if (*runner >= value) return runner;
            return last;
        } else {
            // search left side; inspect the predecessor so that no iterator before first is ever formed
            InputIt runner = pred;
            while (runner != first and *(runner - 1) >= value)
                --runner;
            return runner;
        }
    }
};


/**
 * Functor for performing binary search.
 */
struct BinarySearch {
    /**
     * Performs binary search in the interval [first,last) to find the first element that is not less than @p value.
     * @tparam InputIt input iterator type
     * @tparam T type of searched value
     * @param first, last iterators defining the partially-ordered range to examine
     * @param pred iterator to the predicted position (ignored)
     * @param value value to compare the elements to
     * @return iterator to the first element that is not less than @p value
     */
    template<typename InputIt, typename T>
    InputIt operator()(InputIt first, InputIt last, InputIt /* pred */, const T &value) {
        return std::lower_bound(first, last, value);
    }
};
/**
 * Functor for performing model-biased binary search.
 */
struct ModelBiasedBinarySearch {
    /**
     * Performs model-biased binary search either in the interval [first,pred) or [pred, last) to find the first element
     * that is not less than @p value.
     * @tparam InputIt input iterator type
     * @tparam T type of searched value
     * @param first, last iterators defining the partially-ordered range to examine
     * @param pred iterator to the predicted position, anywhere in [first, last]
     * @param value value to compare the elements to
     * @return iterator to the first element that is not less than @p value
     */
    template<typename InputIt, typename T>
    InputIt operator()(InputIt first, InputIt last, InputIt pred, const T &value) {
        // pred == last must not be dereferenced; the whole range then lies on the left side.
        if (pred != last and *pred < value) return std::lower_bound(pred, last, value); // search right side
        else return std::lower_bound(first, pred, value); // search left side
    }
};


/**
 * Functor for performing exponential search.
 */
struct ExponentialSearch {
    /**
     * Performs exponential search in the interval [first,last) to find the first element that is not less than
     * @p value. Probes offsets 1, 3, 7, ... from @p first until the value is bracketed, then finishes with binary
     * search inside the bracket.
     * @tparam InputIt input iterator type (random access is required)
     * @tparam T type of searched value
     * @param first, last iterators defining the partially-ordered range to examine
     * @param pred iterator to the predicted position (ignored)
     * @param value value to compare the elements to
     * @return iterator to the first element that is not less than @p value
     */
    template<typename InputIt, typename T>
    InputIt operator()(InputIt first, InputIt last, InputIt /* pred */, const T &value) {
        // An empty range has no element to inspect.
        if (first == last or *first >= value) return first;

        // Work on offsets instead of iterators so that no iterator beyond last is ever formed.
        using diff_t = decltype(last - first);
        const diff_t n = last - first;
        diff_t bound = 1;
        diff_t prev = 0;
        diff_t curr = prev + bound;
        while (curr < n and first[curr] < value) {
            bound *= 2;
            prev = curr;
            curr += bound;
        }
        return std::lower_bound(first + prev, first + std::min(curr + 1, n), value);
    }
};
/**
 * Functor for performing model-biased exponential search.
 */
struct ModelBiasedExponentialSearch {
    /**
     * Performs model-biased exponential search either in the interval [first,pred) or [pred, last) to find the first
     * element that is not less than @p value. Starting at the predicted position, the probe distance doubles until
     * the value is bracketed; binary search then finishes inside the bracket.
     * @tparam InputIt input iterator type (random access is required)
     * @tparam T type of searched value
     * @param first, last iterators defining the partially-ordered range to examine
     * @param pred iterator to the predicted position, anywhere in [first, last]
     * @param value value to compare the elements to
     * @return iterator to the first element that is not less than @p value
     */
    template<typename InputIt, typename T>
    InputIt operator()(InputIt first, InputIt last, InputIt pred, const T &value) {
        // Work on signed offsets relative to first so that no iterator outside [first, last] is ever formed.
        using diff_t = decltype(last - first);
        const diff_t n = last - first;
        diff_t bound = 1;
        diff_t prev = pred - first;

        // pred == last must not be dereferenced; it is handled by the left-side search.
        if (pred != last and *pred < value) { // search right side
            diff_t curr = prev + bound;
            while (curr < n and first[curr] < value) {
                bound *= 2;
                prev = curr;
                curr += bound;
            }
            return std::lower_bound(first + prev, first + std::min(curr + 1, n), value);
        } else { // search left side
            diff_t curr = prev - bound; // may become negative; clamped to 0 below
            while (curr > 0 and first[curr] >= value) {
                bound *= 2;
                prev = curr;
                curr -= bound;
            }
            return std::lower_bound(first + std::max(curr, diff_t(0)), first + prev, value);
        }
    }
};
15 | * @tparam Key key type 16 | * @tparam Rmi RMI type 17 | * @param keys on which the RMI is built 18 | * @param n_models number of models in the second layer of the RMI 19 | * @param dataset_name name of the dataset 20 | * @param layer1 model type of the first layer 21 | * @param layer2 model type of the second layer 22 | * @param bound_type used by the RMI 23 | */ 24 | template 25 | void experiment(const std::vector &keys, 26 | const std::size_t n_models, 27 | const std::string dataset_name, 28 | const std::string layer1, 29 | const std::string layer2, 30 | const std::string bound_type) 31 | { 32 | using rmi_type = Rmi; 33 | 34 | // Build RMI. 35 | rmi_type rmi(keys, n_models); 36 | 37 | // Initialize variables. 38 | auto n_keys = keys.size(); 39 | std::vector interval_sizes; 40 | interval_sizes.reserve(n_keys); 41 | 42 | // Perform predictions. 43 | for (auto key : keys) { 44 | auto pred = rmi.search(key); 45 | 46 | // Record interval size. 47 | auto interval_size = pred.hi - pred.lo; 48 | interval_sizes.push_back(interval_size); 49 | } 50 | 51 | // Report results. 52 | // Dataset 53 | std::cout << dataset_name << ',' 54 | << n_keys << ',' 55 | // RMI config 56 | << layer1 << ',' 57 | << layer2 << ',' 58 | << n_models << ',' 59 | << bound_type << ',' 60 | << rmi.size_in_bytes() << ',' 61 | // Interval sizes 62 | << mean(interval_sizes) << ',' 63 | << median(interval_sizes) << ',' 64 | << stdev(interval_sizes) << ',' 65 | << min(interval_sizes) << ',' 66 | << max(interval_sizes) << std::endl; 67 | } 68 | 69 | 70 | /** 71 | * @brief experiment function pointer 72 | */ 73 | typedef void (*exp_fn_ptr)(const std::vector&, 74 | const std::size_t, 75 | const std::string, 76 | const std::string, 77 | const std::string, 78 | const std::string); 79 | 80 | /** 81 | * RMI configuration that holds the string representation of model types of layer 1 and layer 2 and the error bound 82 | * type. 
/**
 * RMI configuration: string representations of the layer 1 model type, the layer 2 model type, and the error
 * bound type.
 */
struct Config {
    std::string layer1;
    std::string layer2;
    std::string bound_type;
};

/**
 * Strict weak ordering on @p Config objects: lexicographic by `layer1`, then `layer2`, then `bound_type`.
 */
struct ConfigCompare {
    bool operator() (const Config &lhs, const Config &rhs) const {
        // The first field that differs decides the order.
        const int by_layer1 = lhs.layer1.compare(rhs.layer1);
        if (by_layer1 != 0) return by_layer1 < 0;

        const int by_layer2 = lhs.layer2.compare(rhs.layer2);
        if (by_layer2 != 0) return by_layer2 < 0;

        return lhs.bound_type.compare(rhs.bound_type) < 0;
    }
};
132 | program.add_argument("filename") 133 | .help("path to binary file containing uin64_t keys"); 134 | 135 | program.add_argument("layer1") 136 | .help("layer1 model type, either linear_regression, linear_spline, cubic_spline, or radix."); 137 | 138 | program.add_argument("layer2") 139 | .help("layer2 model type, either linear_regression, linear_spline, or cubic_spline."); 140 | 141 | program.add_argument("n_models") 142 | .help("number of models on layer2, power of two is recommended.") 143 | .action([](const std::string &s) { return std::stoul(s); }); 144 | 145 | program.add_argument("bound_type") 146 | .help("type of error bounds used, either labs, lind, gabs, or gind."); 147 | 148 | program.add_argument("--header") 149 | .help("output csv header") 150 | .default_value(false) 151 | .implicit_value(true); 152 | 153 | // Parse arguments. 154 | try { 155 | program.parse_args(argc, argv); 156 | } 157 | catch (const std::runtime_error &err) { 158 | std::cout << err.what() << '\n' << program; 159 | exit(EXIT_FAILURE); 160 | } 161 | 162 | // Read arguments. 163 | const auto filename = program.get("filename"); 164 | const auto dataset_name = split(filename, '/').back(); 165 | const auto layer1 = program.get("layer1"); 166 | const auto layer2 = program.get("layer2"); 167 | const auto n_models = program.get("n_models"); 168 | const auto bound_type = program.get("bound_type"); 169 | 170 | // Load keys. 171 | auto keys = load_data(filename); 172 | 173 | // Lookup experiment. 174 | Config config{layer1, layer2, bound_type}; 175 | if (exp_map.find(config) == exp_map.end()) { 176 | std::cerr << "Error: " << layer1 << ',' << layer2 << ',' << bound_type << " is not a valid RMI configuration." << std::endl; 177 | exit(EXIT_FAILURE); 178 | } 179 | exp_fn_ptr exp_fn = exp_map[config]; 180 | 181 | // Output header. 
182 | if (program["--header"] == true) 183 | std::cout << "dataset," 184 | << "n_keys," 185 | << "layer1," 186 | << "layer2," 187 | << "n_models," 188 | << "bounds," 189 | << "size_in_bytes," 190 | << "mean_interval," 191 | << "median_interval," 192 | << "stdev_interval," 193 | << "min_interval," 194 | << "max_interval" 195 | << std::endl; 196 | 197 | // Run experiment. 198 | (*exp_fn)(keys, n_models, dataset_name, layer1, layer2, bound_type); 199 | 200 | exit(EXIT_SUCCESS); 201 | } 202 | -------------------------------------------------------------------------------- /experiments/rmi_build.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "argparse/argparse.hpp" 5 | #include "rmi/models.hpp" 6 | #include "rmi/rmi.hpp" 7 | 8 | using key_type = uint64_t; 9 | using namespace std::chrono; 10 | 11 | std::size_t s_glob; ///< global size_t variable 12 | 13 | 14 | /** 15 | * Measures the build time for a given @p Rmi on dataset @p keys and writes results to `std::cout`. 16 | * @tparam Key key type 17 | * @tparam Rmi RMI type 18 | * @param keys on which the RMI is built 19 | * @param n_models number of models in the second layer of the RMI 20 | * @param dataset_name name of the dataset 21 | * @param layer1 model type of the first layer 22 | * @param layer2 model type of the second layer 23 | * @param bounds_type used by the RMI 24 | */ 25 | template 26 | void experiment(const std::vector &keys, 27 | const std::size_t n_models, 28 | const std::size_t n_reps, 29 | const std::string dataset_name, 30 | const std::string layer1, 31 | const std::string layer2, 32 | const std::string bound_type) 33 | { 34 | using rmi_type = Rmi; 35 | 36 | // Perform n_reps runs. 37 | for (std::size_t rep = 0; rep != n_reps; ++rep) { 38 | 39 | // Build RMI. 
/**
 * RMI configuration: string representations of the layer 1 model type, the layer 2 model type, and the error
 * bound type.
 */
struct Config {
    std::string layer1;
    std::string layer2;
    std::string bound_type;
};

/**
 * Strict weak ordering on @p Config objects: lexicographic by `layer1`, then `layer2`, then `bound_type`.
 */
struct ConfigCompare {
    bool operator() (const Config &lhs, const Config &rhs) const {
        // Fall through to the next field only while the previous ones are equal.
        const bool same_layer1 = lhs.layer1 == rhs.layer1;
        if (not same_layer1)
            return lhs.layer1 < rhs.layer1;

        const bool same_layer2 = lhs.layer2 == rhs.layer2;
        return same_layer2 ? lhs.bound_type < rhs.bound_type
                           : lhs.layer2 < rhs.layer2;
    }
};
135 | program.add_argument("filename") 136 | .help("path to binary file containing uin64_t keys"); 137 | 138 | program.add_argument("layer1") 139 | .help("layer1 model type, either linear_regression, linear_spline, cubic_spline, or radix."); 140 | 141 | program.add_argument("layer2") 142 | .help("layer2 model type, either linear_regression, linear_spline, or cubic_spline."); 143 | 144 | program.add_argument("n_models") 145 | .help("number of models on layer2, power of two is recommended.") 146 | .action([](const std::string &s) { return std::stoul(s); }); 147 | 148 | program.add_argument("bound_type") 149 | .help("type of error bounds used, either none, labs, lind, gabs, or gind."); 150 | 151 | program.add_argument("-n", "--n_reps") 152 | .help("number of experiment repetitions") 153 | .default_value(std::size_t(3)) 154 | .action([](const std::string &s) { return std::stoul(s); }); 155 | 156 | program.add_argument("--header") 157 | .help("output csv header") 158 | .default_value(false) 159 | .implicit_value(true); 160 | 161 | // Parse arguments. 162 | try { 163 | program.parse_args(argc, argv); 164 | } 165 | catch (const std::runtime_error &err) { 166 | std::cout << err.what() << '\n' << program; 167 | exit(EXIT_FAILURE); 168 | } 169 | 170 | // Read arguments. 171 | const auto filename = program.get("filename"); 172 | const auto dataset_name = split(filename, '/').back(); 173 | const auto layer1 = program.get("layer1"); 174 | const auto layer2 = program.get("layer2"); 175 | const auto n_models = program.get("n_models"); 176 | const auto bound_type = program.get("bound_type"); 177 | const auto n_reps = program.get("-n"); 178 | 179 | // Load keys. 180 | auto keys = load_data(filename); 181 | 182 | // Lookup experiment. 183 | Config config{layer1, layer2, bound_type}; 184 | if (exp_map.find(config) == exp_map.end()) { 185 | std::cerr << "Error: " << layer1 << ',' << layer2 << ',' << bound_type << " is not a valid RMI configuration." 
<< std::endl; 186 | exit(EXIT_FAILURE); 187 | } 188 | exp_fn_ptr exp_fn = exp_map[config]; 189 | 190 | // Output header. 191 | if (program["--header"] == true) 192 | std::cout << "dataset," 193 | << "n_keys," 194 | << "rmi," 195 | << "layer1," 196 | << "layer2," 197 | << "n_models," 198 | << "bounds," 199 | << "size_in_bytes," 200 | << "rep," 201 | << "build_time," 202 | << "checksum" 203 | << std::endl; 204 | 205 | // Run experiment. 206 | (*exp_fn)(keys, n_models, n_reps, dataset_name, layer1, layer2, bound_type); 207 | 208 | exit(EXIT_SUCCESS); 209 | } 210 | -------------------------------------------------------------------------------- /scripts/plot_rmi_lookup.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | import argparse 3 | import itertools 4 | import matplotlib.cm as cm 5 | import matplotlib.pyplot as plt 6 | import os 7 | import pandas as pd 8 | import warnings 9 | 10 | plt.style.use(os.path.join('scripts', 'matplotlibrc')) 11 | 12 | # Ignore warnings 13 | warnings.filterwarnings( "ignore") 14 | 15 | # Argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('-p', '--paper', help='produce paper plots', action='store_true') 18 | args = vars(parser.parse_args()) 19 | 20 | 21 | def plot_full(filename='rmi_lookup-full.pdf'): 22 | n_rows = len(datasets) 23 | n_cols = len(l1models) * len(l2models) 24 | 25 | configs = itertools.product(l1models, l2models) 26 | 27 | fig, axs = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4.2*n_rows), sharey=True, sharex=True) 28 | fig.tight_layout() 29 | 30 | for col, (l1, l2) in enumerate(configs): 31 | for row, dataset in enumerate(datasets): 32 | ax = axs[row,col] 33 | for bound in bounds: 34 | for search in searches: 35 | data = df[ 36 | (df['dataset']==dataset) & 37 | (df['layer1']==l1) & 38 | (df['layer2']==l2) & 39 | (df['bounds']==bound) & 40 | (df['search']==search) 41 | ] 42 | if not data.empty: 43 | ax.plot(data['size_in_MiB'], 
data['lookup_in_ns'], label=f'{bound}+{search}', color=corr_colors[(bound,search)]) 44 | 45 | # Title 46 | ax.set_title(f'{dataset} ({l1}$\mapsto${l2})') 47 | 48 | # Labels 49 | if row==n_rows-1: 50 | ax.set_xlabel('Index size [MiB]') 51 | if col==0: 52 | ax.set_ylabel('Lookup time [ns]') 53 | 54 | # Visuals 55 | ax.set_ylim(bottom=0) 56 | ax.set_xscale('log') 57 | 58 | # Legend 59 | if row==0 and col==0: 60 | fig.legend(ncol=len(corr_configs), bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False) 61 | 62 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 63 | 64 | 65 | def plot_models(filename='rmi_lookup-model_types.pdf', bounds='NB', search='MExp'): 66 | l1_groups = [['CS','LR'],['LS','RX']] 67 | l2_models = ['LR','LS'] 68 | 69 | n_rows = len(datasets) 70 | n_cols = len(l1_groups) 71 | 72 | fig, axs = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 2.7*n_rows), sharey='row', sharex=True) 73 | fig.tight_layout() 74 | 75 | for row, dataset in enumerate(datasets): 76 | for col, l1_models in enumerate(l1_groups): 77 | ax = axs[row,col] 78 | for l1 in l1_models: 79 | for l2 in l2_models: 80 | data = df[ 81 | (df['dataset']==dataset) & 82 | (df['layer1']==l1) & 83 | (df['layer2']==l2) & 84 | (df['bounds']==bounds) & 85 | (df['search']==search) 86 | ] 87 | if not data.empty: 88 | ax.plot(data['size_in_MiB'], data['lookup_in_ns'], label=f'{l1}$\mapsto${l2}', c=model_colors[(l1,l2)]) 89 | 90 | # Title 91 | ax.set_title(dataset) 92 | 93 | # Labels 94 | if col==0: 95 | ax.set_ylabel('Lookup time [ns]') 96 | if row==n_rows - 1: 97 | ax.set_xlabel('Index size [MiB]') 98 | 99 | # Visuals 100 | ax.set_xscale('log') 101 | if col==n_cols - 1: 102 | if dataset=='books': 103 | ax.set_ylim(bottom=0, top=850) 104 | elif dataset=='fb': 105 | ax.set_ylim(bottom=0, top=1850) 106 | elif dataset=='osmc': 107 | ax.set_ylim(bottom=0, top=1500) 108 | elif dataset=='wiki': 109 | ax.set_ylim(bottom=0, top=1000) 110 | 111 | # Legend 112 | if row==0 and col==n_cols - 1: 113 
| fig.legend(ncol=4, bbox_to_anchor=(0.5, 1), loc='lower center') 114 | 115 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 116 | 117 | 118 | def plot_correction(filename='rmi_lookup-error_correction.pdf'): 119 | models = [('LS', 'LR'),('RX','LS')] 120 | datasets = ['books','osmc','wiki'] 121 | 122 | n_rows = len(datasets) 123 | n_cols = len(models) 124 | 125 | fig, axs = plt.subplots(n_rows, n_cols, figsize=(4*n_cols, 2.7*n_rows), sharey='row', sharex=True) 126 | fig.tight_layout() 127 | 128 | for row, dataset in enumerate(datasets): 129 | for col, model in enumerate(models): 130 | ax = axs[row,col] 131 | l1, l2 = model 132 | for bound in bounds: 133 | for search in searches: 134 | data = df[ 135 | (df['dataset']==dataset) & 136 | (df['layer1']==l1) & 137 | (df['layer2']==l2) & 138 | (df['bounds']==bound) & 139 | (df['search']==search) 140 | ] 141 | if not data.empty: 142 | ax.plot(data['size_in_MiB'], data['lookup_in_ns'], label=f'{bound}+{search}', color=corr_colors[(bound,search)]) 143 | 144 | # Title 145 | ax.set_title(f'{dataset} ({l1}$\mapsto${l2})') 146 | 147 | # Labels 148 | if row==n_rows-1: 149 | ax.set_xlabel('Index size [MiB]') 150 | if col==0: 151 | ax.set_ylabel('Lookup time [ns]') 152 | 153 | # Visuals 154 | ax.set_xscale('log') 155 | if col==n_cols - 1: 156 | if dataset=='books': 157 | ax.set_ylim(bottom=0, top=850) 158 | elif dataset=='osmc': 159 | ax.set_ylim(bottom=0, top=1400) 160 | elif dataset=='wiki': 161 | ax.set_ylim(bottom=0, top=1000) 162 | 163 | # Legend 164 | if row==0 and col==0: 165 | fig.legend(ncol=4, bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False) 166 | 167 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 168 | 169 | 170 | if __name__ == "__main__": 171 | path = 'results' 172 | 173 | # Read csv file 174 | file = os.path.join(path, 'rmi_lookup.csv') 175 | df = pd.read_csv(file, delimiter=',', header=0, comment='#') 176 | 177 | # Compute median of lookup times 178 | df = 
df.groupby(['dataset','layer1','layer2','n_models','bounds','search']).median().reset_index() 179 | 180 | # Replace datasets, model names, bounds, and searches 181 | dataset_dict = { 182 | "books_200M_uint64": "books", 183 | "fb_200M_uint64": "fb", 184 | "osm_cellids_200M_uint64": "osmc", 185 | "wiki_ts_200M_uint64": "wiki" 186 | } 187 | model_dict = { 188 | "cubic_spline": "CS", 189 | "linear_spline": "LS", 190 | "linear_regression": "LR", 191 | "radix": "RX" 192 | } 193 | bounds_dict = { 194 | "labs": "LAbs", 195 | "lind": "LInd", 196 | "gabs": "GAbs", 197 | "gind": "GInd", 198 | "none": "NB" 199 | } 200 | search_dict = { 201 | "binary": "Bin", 202 | "model_biased_binary": "MBin", 203 | "model_biased_exponential": "MExp", 204 | "model_biased_linear": "MLin" 205 | } 206 | df.replace({**dataset_dict, **model_dict, **bounds_dict, **search_dict}, inplace=True) 207 | 208 | # Compute metrics 209 | df['size_in_MiB'] = df['size_in_bytes'] / (1024 * 1024) 210 | df['lookup_in_ns'] = df['lookup_time'] / df['n_samples'] 211 | 212 | # Define variable lists 213 | datasets = sorted(df['dataset'].unique()) 214 | bounds = sorted(df['bounds'].unique()) 215 | searches = sorted(df['search'].unique()) 216 | l1models = sorted(df['layer1'].unique()) 217 | l2models = sorted(df['layer2'].unique()) 218 | corr_configs = [ 219 | ('GAbs','Bin'), 220 | ('GInd','Bin'),('GInd','MBin'), 221 | ('LAbs','Bin'), 222 | ('LInd','Bin'),('LInd','MBin'), 223 | ('NB','MExp'),('NB','MLin'), 224 | ] 225 | 226 | # Set colors 227 | model_colors = {} 228 | cmap = cm.get_cmap('tab10') 229 | n_colors = 10 230 | for i, (l1, l2) in enumerate(itertools.product(l1models, l2models)): 231 | model_colors[(l1,l2)] = cmap(i/n_colors) 232 | corr_colors = {} 233 | cmap = cm.get_cmap('Dark2') 234 | n_colors = len(corr_configs) 235 | for i, (bound, search) in enumerate(corr_configs): 236 | corr_colors[(bound,search)] = cmap(i/n_colors) 237 | 238 | if args['paper']: 239 | # Plot model types 240 | filename = 
'rmi_lookup-model_types.pdf' 241 | print(f'Plotting lookup time by model types to \'{filename}\'...') 242 | plot_models(filename) 243 | 244 | # Plot error correction 245 | filename = 'rmi_lookup-error_correction.pdf' 246 | print(f'Plotting lookup time by error correction to \'{filename}\'...') 247 | plot_correction(filename) 248 | else: 249 | # Plot full results 250 | filename = 'rmi_lookup-full.pdf' 251 | print(f'Plotting full lookup time results to \'{filename}\'...') 252 | plot_full(filename) 253 | -------------------------------------------------------------------------------- /scripts/plot_index_comparison.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | import argparse 3 | import matplotlib.cm as cm 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import os 7 | import pandas as pd 8 | import warnings 9 | 10 | plt.style.use(os.path.join('scripts', 'matplotlibrc')) 11 | 12 | # Ignore warnings 13 | warnings.filterwarnings( "ignore") 14 | 15 | # Argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('-p', '--paper', help='produce paper plots', action='store_true') 18 | args = vars(parser.parse_args()) 19 | 20 | 21 | def compute_pareto_frontier(source, cost, value): 22 | frontier = list() 23 | source = source.sort_values(cost) 24 | old_val = float('inf') 25 | for index, row in source.iterrows(): 26 | curr_val = row[value] 27 | if curr_val < old_val: 28 | old_val = curr_val 29 | frontier.append(row) 30 | result = pd.DataFrame(frontier) 31 | return result 32 | 33 | 34 | def plot_lookup(filename='index_comparison-lookup_time.pdf', width_fact=5, height_fact=4.2): 35 | n_rows = 2 36 | n_cols = 2 37 | 38 | fig, axs = plt.subplots(n_rows, n_cols, figsize=(width_fact*n_cols, height_fact*n_rows), sharey=True, sharex=True) 39 | fig.tight_layout() 40 | 41 | for i, dataset in enumerate(datasets): 42 | row = int(i / 2) 43 | col = int(i % 2) 44 | ax = axs[row,col] 45 | 46 | # Scatter indexes 
47 | for index in index_dict.keys(): 48 | data = df[ 49 | (df['dataset']==dataset) & 50 | (df['index']==index) 51 | ] 52 | if not data.empty and index!='Binary search': 53 | if index=='Compact Hist-Tree' or index=='RadixSpline': 54 | data = compute_pareto_frontier(data, 'size_in_MiB', 'lookup_in_ns') 55 | data = data.sort_values('size_in_MiB') 56 | ax.plot(data['size_in_MiB'], data['lookup_in_ns'], color=colors[index], label=index_dict[index], alpha=0.9) 57 | else: 58 | data = data.sort_values('size_in_MiB') 59 | ax.plot(data['size_in_MiB'], data['lookup_in_ns'], color=colors[index], label=index_dict[index], alpha=0.9) 60 | 61 | # Title 62 | ax.set_title(dataset) 63 | 64 | # Labels 65 | if row==n_rows - 1: 66 | ax.set_xlabel('Index size [MiB]') 67 | if col==0: 68 | ax.set_ylabel('Lookup time [ns]') 69 | 70 | # Visuals 71 | ax.set_xscale('log') 72 | ax.set_ylim(bottom=0, top=1250) 73 | 74 | # Legend 75 | if row==0 and col==0: 76 | fig.legend(ncol=4, bbox_to_anchor=(0.5, 1), loc='lower center') 77 | 78 | # Binary search 79 | if True: 80 | data = df[ 81 | (df['dataset']==dataset) & 82 | (df['index']=='Binary search') 83 | ].iloc[0] 84 | ax.axhline(y=data['lookup_in_ns'], marker='None', color='.2', dashes=(2, 1), label='Binary search') 85 | 86 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 87 | 88 | 89 | def plot_build(filename='index_comparison-build_time.pdf', width_fact=5, height_fact=4.2): 90 | n_cols = 2 91 | n_rows = 2 92 | 93 | fig, axs = plt.subplots(n_rows, n_cols, figsize=(width_fact*n_cols, height_fact*n_rows), sharey=True, sharex=True) 94 | fig.tight_layout() 95 | 96 | for i, dataset in enumerate(datasets): 97 | row = int(i / 2) 98 | col = int(i % 2) 99 | ax = axs[row,col] 100 | 101 | # Scatter indexes 102 | for index in index_dict.keys(): 103 | data = df[ 104 | (df['dataset']==dataset) & 105 | (df['index']==index) 106 | ] 107 | if not data.empty and index!='Binary search': 108 | if index=='Compact Hist-Tree' or index=='RadixSpline': 109 | 
data = compute_pareto_frontier(data, 'size_in_MiB', 'lookup_in_ns') 110 | data = data.sort_values('size_in_MiB') 111 | ax.plot(data['size_in_MiB'], data['build_in_s'], color=colors[index], label=index_dict[index], alpha=0.9) 112 | else: 113 | data = data.sort_values('size_in_MiB') 114 | ax.plot(data['size_in_MiB'], data['build_in_s'], color=colors[index], label=index_dict[index], alpha=0.9) 115 | 116 | # Title 117 | ax.set_title(dataset) 118 | 119 | # Labels 120 | if row==n_rows - 1: 121 | ax.set_xlabel('Index size [MiB]') 122 | if col==0: 123 | ax.set_ylabel('Build time [s]') 124 | 125 | # Visuals 126 | ax.set_xscale('log') 127 | ax.set_ylim(bottom=-1, top=30) 128 | 129 | # Legend 130 | if row==0 and col==0: 131 | fig.legend(ncol=4, bbox_to_anchor=(0.5, 1), loc='lower center') 132 | 133 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 134 | 135 | 136 | def plot_lookup_shares(filename, width_fact=5, height_fact=4.2): 137 | n_cols = len(datasets) 138 | n_rows = 1 139 | 140 | fig, axs = plt.subplots(n_rows, n_cols, figsize=(width_fact*n_cols, height_fact*n_rows), sharey=True, sharex=False) 141 | fig.tight_layout() 142 | 143 | for col, dataset in enumerate(datasets): 144 | ax = axs[col] 145 | 146 | # Gather data of fastest configuration per index 147 | labels = [] 148 | evals = [] 149 | searches = [] 150 | bar_colors = [] 151 | 152 | # Binary search 153 | row = df[ 154 | (df['dataset']==dataset) & 155 | (df['index']=='Binary search') 156 | ].iloc[0] 157 | labels.append('Binary search') 158 | evals.append(0) 159 | searches.append(row['lookup_in_ns']) 160 | bar_colors.append('.2') 161 | 162 | for index in index_dict.keys(): 163 | data = df[ 164 | (df['dataset'] == dataset) & 165 | (df['index'] == index) 166 | ] 167 | labels.append(index_dict[index]) 168 | bar_colors.append(colors[index]) 169 | if not data.empty: 170 | data = data.sort_values(['lookup_in_ns']).reset_index() 171 | row = data.iloc[0] # fastest configuration 172 | 
evals.append(row['eval_in_ns']) 173 | searches.append(row['search_in_ns']) 174 | else: 175 | evals.append(0) 176 | searches.append(0) 177 | 178 | # Plot results 179 | ax.bar(labels, evals, color='0.6', edgecolor=bar_colors, linewidth=1, label='Evaluation') 180 | ax.bar(labels, searches, color='0.9', edgecolor=bar_colors, linewidth=1, label='Search', bottom=evals) 181 | 182 | # Title 183 | ax.set_title(dataset) 184 | 185 | # Labels 186 | if col==0: 187 | ax.set_ylabel('Lookup time [ns]') 188 | 189 | # Visuals 190 | ax.grid(False, axis='x') 191 | ax.set_xticklabels(labels=labels, rotation=90) 192 | 193 | # Legend 194 | if col==0: 195 | fig.legend(ncol=2, bbox_to_anchor=(0.5, 1), loc='lower center') 196 | 197 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 198 | 199 | 200 | if __name__ == "__main__": 201 | path = 'results' 202 | 203 | # Read csv file 204 | file = os.path.join(path, 'index_comparison.csv') 205 | df = pd.read_csv(file, delimiter=',', header=0, comment='#') 206 | df = df.replace({np.nan: '-'}) 207 | 208 | # Compute medians 209 | df = df.groupby(['dataset', 'index', 'config']).median().reset_index() 210 | 211 | # Replace datasets 212 | dataset_dict = { 213 | "books_200M_uint64": "books", 214 | "fb_200M_uint64": "fb", 215 | "osm_cellids_200M_uint64": "osmc", 216 | "wiki_ts_200M_uint64": "wiki" 217 | } 218 | df.replace({**dataset_dict}, inplace=True) 219 | index_dict = { 220 | 'RMI-ours': 'RMI (ours)', 221 | 'RMI-ref': 'RMI (ref)', 222 | 'ALEX': 'ALEX', 223 | 'PGM-index': 'PGM-index', 224 | 'RadixSpline': 'RadixSpline', 225 | 'Compact Hist-Tree': 'Hist-Tree', 226 | 'B-tree': 'B-tree', 227 | 'ART': 'ART' 228 | } 229 | 230 | # Compute metrics 231 | df['size_in_MiB'] = df['size_in_bytes'] / (1024 * 1024) 232 | df['build_in_s'] = df['build_time'] / 1000000000 233 | df['eval_in_ns'] = df['eval_time'] / df['n_samples'] 234 | df['lookup_in_ns'] = df['lookup_time'] / df['n_samples'] 235 | df['search_in_ns'] = df['lookup_in_ns'] - df['eval_in_ns'] 
236 | 237 | # Define variable lists 238 | datasets = sorted(df['dataset'].unique()) 239 | indexes = sorted(df['index'].unique()) 240 | 241 | # Set colors 242 | cmap = cm.get_cmap('tab10') 243 | n_colors = 10 244 | colors = {} 245 | for i, index in enumerate(index_dict.keys()): 246 | colors[index] = cmap(i/n_colors) 247 | 248 | if args['paper']: 249 | # Plot lookup times against index size 250 | filename = 'index_comparison-lookup_time.pdf' 251 | print(f'Plotting lookup time results to \'{filename}\'...') 252 | plot_lookup(filename, 4, 2.7) 253 | 254 | # Plot build times against index size 255 | filename = 'index_comparison-build_time.pdf' 256 | print(f'Plotting build time results to \'{filename}\'...') 257 | plot_build(filename, 4, 2.7) 258 | 259 | # Plot share of eval time and search time in overall lookup time 260 | filename = 'index_comparison-lookup_shares.pdf' 261 | print(f'Plotting lookup time shares to \'{filename}\'...') 262 | plot_lookup_shares(filename, 2.1, 2) 263 | else: 264 | # Plot lookup times against index size 265 | filename = 'index_comparison-lookup_time.pdf' 266 | print(f'Plotting lookup time results to \'{filename}\'...') 267 | plot_lookup(filename) 268 | 269 | # Plot build times against index size 270 | filename = 'index_comparison-build_time.pdf' 271 | print(f'Plotting build time results to \'{filename}\'...') 272 | plot_build(filename) 273 | 274 | # Plot share of eval time and search time in overall lookup time 275 | filename = 'index_comparison-lookup_shares.pdf' 276 | print(f'Plotting lookup time shares to \'{filename}\'...') 277 | plot_lookup_shares(filename) 278 | -------------------------------------------------------------------------------- /scripts/plot_rmi_build.py: -------------------------------------------------------------------------------- 1 | #!python3 2 | import argparse 3 | import itertools 4 | import matplotlib.cm as cm 5 | import matplotlib.pyplot as plt 6 | import os 7 | import pandas as pd 8 | import warnings 9 | 10 | 
plt.style.use(os.path.join('scripts', 'matplotlibrc')) 11 | 12 | # Ignore warnings 13 | warnings.filterwarnings( "ignore") 14 | 15 | # Argparse 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('-p', '--paper', help='produce paper plots', action='store_true') 18 | args = vars(parser.parse_args()) 19 | 20 | 21 | def plot_ours_full(filename='rmi_build-ours_full.pdf'): 22 | rmi='ours' 23 | 24 | n_rows = len(datasets) 25 | n_cols = len(l1models) * len(l2models) 26 | 27 | configs = itertools.product(l1models, l2models) 28 | 29 | fig, axs = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4.2*n_rows), sharey=True, sharex=True) 30 | fig.tight_layout() 31 | 32 | for col, (l1, l2) in enumerate(configs): 33 | for row, dataset in enumerate(datasets): 34 | ax = axs[row,col] 35 | for bound in bounds: 36 | data = df[ 37 | (df['dataset']==dataset) & 38 | (df['rmi']==rmi) & 39 | (df['layer1']==l1) & 40 | (df['layer2']==l2) & 41 | (df['bounds']==bound) 42 | ] 43 | if not data.empty: 44 | ax.plot(data['size_in_MiB'], data['build_in_s'], c=bound_colors[bound], marker=rmi_markers['ours'], label=bound) 45 | 46 | # Title 47 | ax.set_title(f'{dataset} ({l1}$\mapsto${l2})') 48 | 49 | # Labels 50 | if col==0: 51 | ax.set_ylabel('Build time [s]') 52 | if row==n_rows - 1: 53 | ax.set_xlabel('Index size [MiB]') 54 | 55 | # Visuals 56 | ax.set_ylim(bottom=0) 57 | ax.set_xscale('log') 58 | 59 | # Legend 60 | if row==0 and col==0: 61 | fig.legend(ncol=len(bounds), bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False) 62 | 63 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 64 | 65 | 66 | def plot_comp_full(filename='rmi_build-comp_full.pdf'): 67 | n_rows = len(datasets) 68 | n_cols = len(l1models) * len(l2models) 69 | 70 | bounds = ['NB','LAbs'] 71 | configs = itertools.product(l1models, l2models) 72 | 73 | fig, axs = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 4.2*n_rows), sharey=True, sharex=True) 74 | fig.tight_layout() 75 | 76 | for col, (l1, l2) in 
enumerate(configs): 77 | for row, dataset in enumerate(datasets): 78 | ax = axs[row,col] 79 | for rmi in rmis: 80 | for bound in bounds: 81 | data = df[ 82 | (df['dataset']==dataset) & 83 | (df['rmi']==rmi) & 84 | (df['layer1']==l1) & 85 | (df['layer2']==l2) & 86 | (df['bounds']==bound) 87 | ] 88 | if not data.empty: 89 | ax.plot(data['size_in_MiB'], data['build_in_s'], c=bound_colors[bound], marker=rmi_markers[rmi], label=f'{bound} ({rmi})') 90 | 91 | # Title 92 | ax.set_title(f'{dataset} ({l1}$\mapsto${l2})') 93 | 94 | # Labels 95 | if col==0: 96 | ax.set_ylabel('Build time [s]') 97 | if row==n_rows - 1: 98 | ax.set_xlabel('Index size [MiB]') 99 | 100 | # Visuals 101 | ax.set_ylim(bottom=0) 102 | ax.set_xscale('log') 103 | 104 | # Legend 105 | if row==0 and col==0: 106 | fig.legend(ncol=len(bounds), bbox_to_anchor=(0.5, 1), loc='lower center', frameon=False) 107 | 108 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 109 | 110 | 111 | def plot_models(dataset, models, bound, filename): 112 | rmi = 'ours' 113 | 114 | n_cols = 1 115 | n_rows = 1 116 | 117 | fig, ax = plt.subplots(n_rows, n_cols, figsize=(2.7*n_cols, 2.3*n_rows)) 118 | fig.tight_layout() 119 | 120 | for l1, l2 in models: 121 | data = df[ 122 | (df['dataset']==dataset) & 123 | (df['rmi']==rmi) & 124 | (df['layer1']==l1) & 125 | (df['layer2']==l2) & 126 | (df['bounds']==bound) 127 | ] 128 | if not data.empty: 129 | ax.plot(data['size_in_MiB'], data['build_in_s'], c=model_colors[(l1,l2)], label=f'{l1}$\mapsto${l2}') 130 | 131 | # Title 132 | ax.set_title(f'{dataset} ({bound})') 133 | 134 | # Labels 135 | ax.set_ylabel('Build time [s]') 136 | ax.set_xlabel('Index size [MiB]') 137 | 138 | # Visuals 139 | ax.set_xscale('log') 140 | ax.set_ylim(bottom=0, top=10) 141 | 142 | # Legend 143 | fig.legend(ncol=2, bbox_to_anchor=(0.5, 1), loc='lower center') 144 | 145 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 146 | 147 | 148 | def plot_bounds(dataset, model, filename): 149 | 
rmi = 'ours' 150 | l1, l2 = model 151 | 152 | n_cols = 1 153 | n_rows = 1 154 | 155 | fig, ax = plt.subplots(n_rows, n_cols, figsize=(2.7*n_cols, 2.3*n_rows)) 156 | fig.tight_layout() 157 | 158 | for bound in bounds: 159 | data = df[ 160 | (df['dataset']==dataset) & 161 | (df['rmi']==rmi) & 162 | (df['layer1']==l1) & 163 | (df['layer2']==l2) & 164 | (df['bounds']==bound) 165 | ] 166 | if not data.empty: 167 | ax.plot(data['size_in_MiB'], data['build_in_s'], c=bound_colors[bound], label=bound) 168 | 169 | # Title 170 | ax.set_title(f'{dataset} ({l1}$\mapsto${l2})') 171 | 172 | # Labels 173 | ax.set_ylabel('Build time [s]') 174 | ax.set_xlabel('Index size [MiB]') 175 | 176 | # Visuals 177 | ax.set_xscale('log') 178 | ax.set_ylim(bottom=0, top=10) 179 | 180 | # Legend 181 | fig.legend(ncol=3, bbox_to_anchor=(0.5, 1), loc='lower center') 182 | 183 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 184 | 185 | 186 | def plot_comp(dataset, models, bound, filename): 187 | n_cols = 1 188 | n_rows = 1 189 | 190 | fig, ax = plt.subplots(n_rows, n_cols, figsize=(2.7*n_cols, 2.3*n_rows)) 191 | fig.tight_layout() 192 | 193 | for rmi in rmis: 194 | for l1, l2 in models: 195 | data = df[ 196 | (df['dataset']==dataset) & 197 | (df['rmi']==rmi) & 198 | (df['layer1']==l1) & 199 | (df['layer2']==l2) & 200 | (df['bounds']==bound) 201 | ] 202 | if not data.empty: 203 | ax.plot(data['size_in_MiB'], data['build_in_s'], c=model_colors[(l1,l2)], marker=rmi_markers[rmi], label=f'{l1}$\mapsto${l2} ({rmi})') 204 | 205 | # Title 206 | ax.set_title(f'{dataset} ({bound})') 207 | 208 | # Labels 209 | ax.set_ylabel('Build time [s]') 210 | ax.set_xlabel('Index size [MiB]') 211 | 212 | # Axes 213 | ax.set_xscale('log') 214 | ax.set_ylim(bottom=0) 215 | 216 | # Legend 217 | fig.legend(ncol=2, bbox_to_anchor=(0.5, 1), loc='lower center') 218 | 219 | fig.savefig(os.path.join(path, filename), bbox_inches='tight') 220 | 221 | 222 | if __name__ == "__main__": 223 | path = 'results' 224 | 225 
| # Read csv file 226 | file = os.path.join(path, 'rmi_build.csv') 227 | df = pd.read_csv(file, delimiter=',', header=0, comment='#') 228 | 229 | # Compute median of lookup times 230 | df = df.groupby(['dataset','rmi','layer1','layer2','n_models','bounds']).median().reset_index() 231 | 232 | # Replace datasets, model names, and bounds 233 | dataset_dict = { 234 | "books_200M_uint64": "books", 235 | "fb_200M_uint64": "fb", 236 | "osm_cellids_200M_uint64": "osmc", 237 | "wiki_ts_200M_uint64": "wiki" 238 | } 239 | model_dict = { 240 | "cubic_spline": "CS", 241 | "linear_spline": "LS", 242 | "linear_regression": "LR", 243 | "radix": "RX" 244 | } 245 | bounds_dict = { 246 | "labs": "LAbs", 247 | "lind": "LInd", 248 | "gabs": "GAbs", 249 | "gind": "GInd", 250 | "none": "NB" 251 | } 252 | df.replace({**dataset_dict, **model_dict, **bounds_dict}, inplace=True) 253 | 254 | # Compute metrics 255 | df['size_in_MiB'] = df['size_in_bytes'] / (1024 * 1024) 256 | df['build_in_s'] = df['build_time'] / 1_000_000_000 257 | 258 | # Define variable lists 259 | datasets = sorted(df['dataset'].unique()) 260 | rmis = sorted(df['rmi'].unique()) 261 | bounds = sorted(df['bounds'].unique()) 262 | l1models = sorted(df['layer1'].unique()) 263 | l2models = sorted(df['layer2'].unique()) 264 | 265 | # Set colors and markers 266 | model_colors = {} 267 | cmap = cm.get_cmap('tab10') 268 | n_colors = 10 269 | for i, (l1, l2) in enumerate(itertools.product(l1models, l2models)): 270 | model_colors[(l1,l2)] = cmap(i/n_colors) 271 | bound_colors = {} 272 | cmap = cm.get_cmap('Dark2') 273 | n_colors = 8 274 | for i, bound in enumerate(bounds): 275 | bound_colors[bound] = cmap(i/n_colors) 276 | rmi_markers = {'ours': '.', 'ref': 'x'} 277 | 278 | if args['paper']: 279 | # Plot layer1 280 | filename = 'rmi_build-layer1.pdf' 281 | print(f'Plotting build times by layer 1 to \'{filename}\'...') 282 | plot_models('books', [('CS','LR'),('LR','LR'),('LS','LR'),('RX','LR')], 'NB', filename) 283 | 284 | # Plot 
layer2 285 | filename = 'rmi_build-layer2.pdf' 286 | print(f'Plotting build times by layer 2 to \'{filename}\'...') 287 | plot_models('books', [('LS','LS'),('LS','LR'),('RX','LS'),('RX','LR')], 'NB', filename) 288 | 289 | # Plot bounds 290 | filename = 'rmi_build-bounds.pdf' 291 | print(f'Plotting build times by error bounds to \'{filename}\'...') 292 | plot_bounds('books', ('LS','LR'), filename) 293 | 294 | # Plot comparison NB 295 | filename = 'rmi_build-comp_nb.pdf' 296 | print(f'Plotting build time comparison to reference implementation (NB) to \'{filename}\'...') 297 | plot_comp('books', [('LS','LR'),('RX','LS')], 'NB', filename) 298 | 299 | # Plot comparison LAbs 300 | filename = 'rmi_build-comp_labs.pdf' 301 | print(f'Plotting build time comparison to reference implementation (LAbs) to \'{filename}\'...') 302 | plot_comp('books', [('LS','LR'),('RX','LS')], 'LAbs', filename) 303 | else: 304 | # Plot ours 305 | filename = 'rmi_build-ours_full.pdf' 306 | print(f'Plotting full build time results to \'{filename}\'...') 307 | plot_ours_full(filename) 308 | 309 | # Plot reference 310 | filename = 'rmi_build-comp_full.pdf' 311 | print(f'Plotting full build time comparison to reference implementation to \'{filename}\'...') 312 | plot_comp_full(filename) 313 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /experiments/rmi_lookup.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "argparse/argparse.hpp" 5 | 6 | #include "rmi/models.hpp" 7 | #include "rmi/rmi.hpp" 8 | #include "rmi/util/fn.hpp" 9 | #include "rmi/util/search.hpp" 10 | 11 | using key_type = uint64_t; 12 | using namespace std::chrono; 13 | 14 | std::size_t s_glob; ///< global size_t variable 15 | 16 | 17 | /** 18 | * Measures lookup times of @p samples on a given @p Rmi and writes results to `std::cout`. 
19 | * @tparam Key key type 20 | * @tparam Rmi RMI type 21 | * @tparam Search search type 22 | * @param keys on which the RMI is built 23 | * @param n_models number of models in the second layer of the RMI 24 | * @param samples for which the lookup time is measured 25 | * @param n_reps number of repetitions 26 | * @param dataset_name name of the dataset 27 | * @param layer1 model type of the first layer 28 | * @param layer2 model type of the second layer 29 | * @param bound_type used by the RMI 30 | * @param search used by the RMI for correcting prediction errors 31 | */ 32 | template 33 | void experiment(const std::vector &keys, 34 | const std::size_t n_models, 35 | const std::vector &samples, 36 | const std::size_t n_reps, 37 | const std::string dataset_name, 38 | const std::string layer1, 39 | const std::string layer2, 40 | const std::string bound_type, 41 | const std::string search) 42 | { 43 | using rmi_type = Rmi; 44 | auto search_fn = Search(); 45 | 46 | // Build RMI. 47 | rmi_type rmi(keys, n_models); 48 | 49 | // Perform n_reps runs. 50 | for (std::size_t rep = 0; rep != n_reps; ++rep) { 51 | 52 | // Lookup time. 53 | std::size_t lookup_accu = 0; 54 | auto start = steady_clock::now(); 55 | for (std::size_t i = 0; i != samples.size(); ++i) { 56 | auto key = samples.at(i); 57 | auto range = rmi.search(key); 58 | auto pos = search_fn(keys.begin() + range.lo, keys.begin() + range.hi, keys.begin() + range.pos, key); 59 | lookup_accu += std::distance(keys.begin(), pos); 60 | } 61 | auto stop = steady_clock::now(); 62 | auto lookup_time = duration_cast(stop - start).count(); 63 | s_glob = lookup_accu; 64 | 65 | // Report results.
66 | // Dataset 67 | std::cout << dataset_name << ',' 68 | << keys.size() << ',' 69 | // Index 70 | << layer1 << ',' 71 | << layer2 << ',' 72 | << n_models << ',' 73 | << bound_type << ',' 74 | << search << ',' 75 | << rmi.size_in_bytes() << ',' 76 | // Experiment 77 | << rep << ',' 78 | << samples.size() << ',' 79 | // Results 80 | << lookup_time << ',' 81 | // Checksums 82 | << lookup_accu << std::endl; 83 | } // reps 84 | } 85 | 86 | 87 | /** 88 | * @brief experiment function pointer 89 | */ 90 | typedef void (*exp_fn_ptr)(const std::vector&, 91 | const std::size_t, 92 | const std::vector&, 93 | const std::size_t, 94 | const std::string, 95 | const std::string, 96 | const std::string, 97 | const std::string, 98 | const std::string); 99 | 100 | /** 101 | * RMI configuration that holds the string representation of model types of layer 1 and layer 2, error bound type, and 102 | * search algorithm. 103 | */ 104 | struct Config { 105 | std::string layer1; 106 | std::string layer2; 107 | std::string bound_type; 108 | std::string search; 109 | }; 110 | 111 | /** 112 | * Comparator class for @p Config objects. 
113 | */ 114 | struct ConfigCompare { 115 | bool operator() (const Config &lhs, const Config &rhs) const { 116 | if (lhs.layer1 != rhs.layer1) return lhs.layer1 < rhs.layer1; 117 | if (lhs.layer2 != rhs.layer2) return lhs.layer2 < rhs.layer2; 118 | if (lhs.bound_type != rhs.bound_type) return lhs.bound_type < rhs.bound_type; 119 | return lhs.search < rhs.search; 120 | } 121 | }; 122 | 123 | #define ENTRIES(L1, L2, LT1, LT2) \ 124 | { {#L1, #L2, "none", "binary"}, &experiment, BinarySearch> }, \ 125 | { {#L1, #L2, "labs", "binary"}, &experiment, BinarySearch> }, \ 126 | { {#L1, #L2, "lind", "binary"}, &experiment, BinarySearch> }, \ 127 | { {#L1, #L2, "gabs", "binary"}, &experiment, BinarySearch> }, \ 128 | { {#L1, #L2, "gind", "binary"}, &experiment, BinarySearch> }, \ 129 | { {#L1, #L2, "none", "model_biased_binary"}, &experiment, ModelBiasedBinarySearch> }, \ 130 | { {#L1, #L2, "labs", "model_biased_binary"}, &experiment, ModelBiasedBinarySearch> }, \ 131 | { {#L1, #L2, "lind", "model_biased_binary"}, &experiment, ModelBiasedBinarySearch> }, \ 132 | { {#L1, #L2, "gabs", "model_biased_binary"}, &experiment, ModelBiasedBinarySearch> }, \ 133 | { {#L1, #L2, "gind", "model_biased_binary"}, &experiment, ModelBiasedBinarySearch> }, \ 134 | { {#L1, #L2, "none", "linear"}, &experiment, LinearSearch> }, \ 135 | { {#L1, #L2, "labs", "linear"}, &experiment, LinearSearch> }, \ 136 | { {#L1, #L2, "lind", "linear"}, &experiment, LinearSearch> }, \ 137 | { {#L1, #L2, "gabs", "linear"}, &experiment, LinearSearch> }, \ 138 | { {#L1, #L2, "gind", "linear"}, &experiment, LinearSearch> }, \ 139 | { {#L1, #L2, "none", "model_biased_linear"}, &experiment, ModelBiasedLinearSearch> }, \ 140 | { {#L1, #L2, "labs", "model_biased_linear"}, &experiment, ModelBiasedLinearSearch> }, \ 141 | { {#L1, #L2, "lind", "model_biased_linear"}, &experiment, ModelBiasedLinearSearch> }, \ 142 | { {#L1, #L2, "gabs", "model_biased_linear"}, &experiment, ModelBiasedLinearSearch> }, \ 143 | { {#L1, #L2, 
"gind", "model_biased_linear"}, &experiment, ModelBiasedLinearSearch> }, \ 144 | { {#L1, #L2, "none", "exponential"}, &experiment, ExponentialSearch> }, \ 145 | { {#L1, #L2, "labs", "exponential"}, &experiment, ExponentialSearch> }, \ 146 | { {#L1, #L2, "lind", "exponential"}, &experiment, ExponentialSearch> }, \ 147 | { {#L1, #L2, "gabs", "exponential"}, &experiment, ExponentialSearch> }, \ 148 | { {#L1, #L2, "gind", "exponential"}, &experiment, ExponentialSearch> }, \ 149 | { {#L1, #L2, "none", "model_biased_exponential"}, &experiment, ModelBiasedExponentialSearch> }, \ 150 | { {#L1, #L2, "labs", "model_biased_exponential"}, &experiment, ModelBiasedExponentialSearch> }, \ 151 | { {#L1, #L2, "lind", "model_biased_exponential"}, &experiment, ModelBiasedExponentialSearch> }, \ 152 | { {#L1, #L2, "gabs", "model_biased_exponential"}, &experiment, ModelBiasedExponentialSearch> }, \ 153 | { {#L1, #L2, "gind", "model_biased_exponential"}, &experiment, ModelBiasedExponentialSearch> }, \ 154 | 155 | static std::map exp_map { 156 | ENTRIES(linear_regression, linear_regression, rmi::LinearRegression, rmi::LinearRegression) 157 | ENTRIES(linear_regression, linear_spline, rmi::LinearRegression, rmi::LinearSpline) 158 | ENTRIES(linear_spline, linear_regression, rmi::LinearSpline, rmi::LinearRegression) 159 | ENTRIES(linear_spline, linear_spline, rmi::LinearSpline, rmi::LinearSpline) 160 | ENTRIES(cubic_spline, linear_regression, rmi::CubicSpline, rmi::LinearRegression) 161 | ENTRIES(cubic_spline, linear_spline, rmi::CubicSpline, rmi::LinearSpline) 162 | ENTRIES(radix, linear_regression, rmi::Radix, rmi::LinearRegression) 163 | ENTRIES(radix, linear_spline, rmi::Radix, rmi::LinearSpline) 164 | }; ///< Map that assigns an experiment function pointer to RMI configurations. 165 | #undef ENTRIES 166 | 167 | 168 | /** 169 | * Triggers measurement of lookup times for an RMI configuration provided via command line arguments. 
170 | * @param argc arguments counter 171 | * @param argv arguments vector 172 | */ 173 | int main(int argc, char *argv[]) 174 | { 175 | // Initialize argument parser. 176 | argparse::ArgumentParser program(argv[0], "0.1"); 177 | 178 | // Define arguments. 179 | program.add_argument("filename") 180 | .help("path to binary file containing uin64_t keys"); 181 | 182 | program.add_argument("layer1") 183 | .help("layer1 model type, either linear_regression, linear_spline, cubic_spline, or radix."); 184 | 185 | program.add_argument("layer2") 186 | .help("layer2 model type, either linear_regression, linear_spline, or cubic_spline."); 187 | 188 | program.add_argument("n_models") 189 | .help("number of models on layer2, power of two is recommended.") 190 | .action([](const std::string &s) { return std::stoul(s); }); 191 | 192 | program.add_argument("bound_type") 193 | .help("type of error bounds used, either none, labs, lind, gabs, or gind."); 194 | 195 | program.add_argument("search") 196 | .help("search algorithm for error correction, either binary, model_biased_binary, exponential, model_biased_exponential, linear, or model_biased_linear."); 197 | 198 | program.add_argument("-n", "--n_reps") 199 | .help("number of experiment repetitions") 200 | .default_value(std::size_t(3)) 201 | .action([](const std::string &s) { return std::stoul(s); }); 202 | 203 | program.add_argument("-s", "--n_samples") 204 | .help("number of sampled lookup keys") 205 | .default_value(std::size_t(1'000'000)) 206 | .action([](const std::string &s) { return std::stoul(s); }); 207 | 208 | program.add_argument("--header") 209 | .help("output csv header") 210 | .default_value(false) 211 | .implicit_value(true); 212 | 213 | // Parse arguments. 214 | try { 215 | program.parse_args(argc, argv); 216 | } 217 | catch (const std::runtime_error &err) { 218 | std::cout << err.what() << '\n' << program; 219 | exit(EXIT_FAILURE); 220 | } 221 | 222 | // Read arguments. 
223 | const auto filename = program.get("filename"); 224 | const auto dataset_name = split(filename, '/').back(); 225 | const auto layer1 = program.get("layer1"); 226 | const auto layer2 = program.get("layer2"); 227 | const auto n_models = program.get("n_models"); 228 | const auto bound_type = program.get("bound_type"); 229 | const auto search = program.get("search"); 230 | const auto n_reps = program.get("-n"); 231 | const auto n_samples = program.get("-s"); 232 | 233 | // Load keys. 234 | auto keys = load_data(filename); 235 | 236 | // Sample keys. 237 | uint64_t seed = 42; 238 | std::mt19937 gen(seed); 239 | std::uniform_int_distribution<> distrib(0, keys.size() - 1); 240 | std::vector samples; 241 | samples.reserve(n_samples); 242 | for (std::size_t i = 0; i != n_samples; ++i) 243 | samples.push_back(keys[distrib(gen)]); 244 | 245 | // Lookup experiment. 246 | Config config{layer1, layer2, bound_type, search}; 247 | if (exp_map.find(config) == exp_map.end()) { 248 | std::cerr << "Error: " << layer1 << ',' << layer2 << ',' << bound_type << ',' << search << " is not a valid RMI configuration." << std::endl; 249 | exit(EXIT_FAILURE); 250 | } 251 | exp_fn_ptr exp_fn = exp_map[config]; 252 | 253 | // Output header. 254 | if (program["--header"] == true) 255 | std::cout << "dataset," 256 | << "n_keys," 257 | << "layer1," 258 | << "layer2," 259 | << "n_models," 260 | << "bounds," 261 | << "search," 262 | << "size_in_bytes," 263 | << "rep," 264 | << "n_samples," 265 | << "lookup_time," 266 | << "lookup_accu," 267 | << std::endl; 268 | 269 | // Run experiment. 
270 | (*exp_fn)(keys, n_models, samples, n_reps, dataset_name, layer1, layer2, bound_type, search); 271 | 272 | exit(EXIT_SUCCESS); 273 | } 274 | -------------------------------------------------------------------------------- /include/rmi/models.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "rmi/util/fn.hpp" 7 | 8 | namespace rmi { 9 | 10 | /** 11 | * A model that fits a linear segment from the first to the last data point. 12 | * 13 | * We assume that x-values are sorted in ascending order and y-values are handed implicitly where @p offset and @p 14 | * offset + distance(first, last) are the first and last y-value, respectively. The y-values can be scaled by 15 | * providing a @p compression_factor. 16 | */ 17 | class LinearSpline 18 | { 19 | private: 20 | double slope_; ///< The slope of the linear segment. 21 | double intercept_; ///< The y-intercept of the linear segment. 22 | 23 | public: 24 | /** 25 | * Default constructor. 26 | */ 27 | LinearSpline() = default; 28 | 29 | /** 30 | * Builds a linear segment between the first and last data point. 31 | * @param first, last iterators to the first and last x-value the linear segment is fit on 32 | * @param offset first y-value the linear segment is fit on 33 | * @param compression_factor by which the y-values are scaled 34 | */ 35 | template 36 | LinearSpline(RandomIt first, RandomIt last, std::size_t offset = 0, double compression_factor = 1.f) { 37 | std::size_t n = std::distance(first, last); 38 | 39 | if (n == 0) { 40 | slope_ = 0.f; 41 | intercept_ = 0.f; 42 | return; 43 | } 44 | if (n == 1) { 45 | slope_ = 0.f; 46 | intercept_ = static_cast(offset) * compression_factor; 47 | return; 48 | } 49 | 50 | double numerator = static_cast(n); // (offset + n) - offset 51 | double denominator = static_cast(*(last - 1) - *first); 52 | 53 | slope_ = denominator != 0.0 ?
numerator/denominator * compression_factor : 0.0; 54 | intercept_ = offset * compression_factor - slope_ * *first; 55 | } 56 | 57 | /** 58 | * Returns the estimated y-value of @p x. 59 | * @param x to estimate a y-value for 60 | * @return the estimated y-value for @p x 61 | */ 62 | template 63 | double predict(const X x) const { return std::fma(slope_, static_cast(x), intercept_); } 64 | 65 | /** 66 | * Returns the slope of the linear segment. 67 | * @return the slope of the linear segment 68 | */ 69 | double slope() const { return slope_; } 70 | 71 | /** 72 | * Returns the y-intercept of the linear segment. 73 | * @return the y-intercept of the linear segment 74 | */ 75 | double intercept() const { return intercept_; } 76 | 77 | /** 78 | * Returns the size of the linear segment in bytes. 79 | * @return segment size in bytes. 80 | */ 81 | std::size_t size_in_bytes() { return 2 * sizeof(double); } 82 | 83 | /** 84 | * Writes the mathematical representation of the linear segment to an output stream. 85 | * @param out output stream to write the linear segment to 86 | * @param m the linear segment 87 | * @returns the output stream 88 | */ 89 | friend std::ostream & operator<<(std::ostream &out, const LinearSpline &m) { 90 | return out << m.slope() << " * x + " << m.intercept(); 91 | } 92 | }; 93 | 94 | 95 | /** 96 | * A linear regression model that fits a straight line to minimize the mean squared error. 97 | * 98 | * We assume that x-values are sorted in ascending order and y-values are handed implicitly where @p offset and @p 99 | * offset + distance(first, last) are the first and last y-value, respectively. The y-values can be scaled by 100 | * providing a @p compression_factor. 101 | */ 102 | class LinearRegression 103 | { 104 | private: 105 | double slope_; ///< The slope of the linear function. 106 | double intercept_; ///< The y-intercept of the linear function. 107 | 108 | public: 109 | /** 110 | * Default constructor.
111 | */ 112 | LinearRegression() = default; 113 | 114 | /** 115 | * Builds a linear regression model on the given data points. 116 | * @param first, last iterators to the first and last x-value the linear regression is fit on 117 | * @param offset first y-value the linear regression is fit on 118 | * @param compression_factor by which the y-values are scaled 119 | */ 120 | template 121 | LinearRegression(RandomIt first, RandomIt last, std::size_t offset = 0, double compression_factor = 1.f) { 122 | std::size_t n = std::distance(first, last); 123 | 124 | if (n == 0) { 125 | slope_ = 0.f; 126 | intercept_ = 0.f; 127 | return; 128 | } 129 | if (n == 1) { 130 | slope_ = 0.f; 131 | intercept_ = static_cast(offset) * compression_factor; 132 | return; 133 | } 134 | 135 | double mean_x = 0.0; 136 | double mean_y = 0.0; 137 | double c = 0.0; 138 | double m2 = 0.0; 139 | 140 | for (std::size_t i = 0; i != n; ++i) { 141 | auto x = *(first + i); 142 | std::size_t y = offset + i; 143 | 144 | double dx = x - mean_x; 145 | mean_x += dx / (i + 1); 146 | mean_y += (y - mean_y) / (i + 1); 147 | c += dx * (y - mean_y); 148 | 149 | double dx2 = x - mean_x; 150 | m2 += dx * dx2; 151 | } 152 | 153 | double cov = c / (n - 1); 154 | double var = m2 / (n - 1); 155 | 156 | if (var == 0.f) { 157 | slope_ = 0.f; 158 | intercept_ = mean_y; 159 | return; 160 | } 161 | 162 | slope_ = cov / var * compression_factor; 163 | intercept_ = mean_y * compression_factor - slope_ * mean_x; 164 | } 165 | 166 | /** 167 | * Returns the estimated y-value of @p x. 168 | * @param x to estimate a y-value for 169 | * @return the estimated y-value for @p x 170 | */ 171 | template 172 | double predict(const X x) const { return std::fma(slope_, static_cast(x), intercept_); } 173 | 174 | /** 175 | * Returns the slope of the linear regression model.
176 | * @return the slope of the linear regression model 177 | */ 178 | double slope() const { return slope_; } 179 | 180 | /** 181 | * Returns the y-intercept of the linear regression model. 182 | * @return the y-intercept of the linear regression model 183 | */ 184 | double intercept() const { return intercept_; } 185 | 186 | /** 187 | * Returns the size of the linear regression model in bytes. 188 | * @return model size in bytes. 189 | */ 190 | std::size_t size_in_bytes() { return 2 * sizeof(double); } 191 | 192 | /** 193 | * Writes the mathematical representation of the linear regression model to an output stream. 194 | * @param out output stream to write the linear regression model to 195 | * @param m the linear regression model 196 | * @returns the output stream 197 | */ 198 | friend std::ostream & operator<<(std::ostream &out, const LinearRegression &m) { 199 | return out << m.slope() << " * x + " << m.intercept(); 200 | } 201 | }; 202 | 203 | 204 | /** 205 | * A model that fits a cubic segment from the first to the last data point. 206 | * 207 | * We assume that x-values are sorted in ascending order and y-values are handed implicitly where @p offset and @p 208 | * offset + distance(first, last) are the first and last y-value, respectively. The y-values can be scaled by 209 | * providing a @p compression_factor. 210 | */ 211 | class CubicSpline 212 | { 213 | private: 214 | double a_; ///< The cubic coefficient. 215 | double b_; ///< The quadratic coefficient. 216 | double c_; ///< The linear coefficient. 217 | double d_; ///< The y-intercept. 218 | 219 | public: 220 | /** 221 | * Default constructor. 222 | */ 223 | CubicSpline() = default; 224 | 225 | /** 226 | * Builds a cubic segment between the first and last data point.
227 | * @param first, last iterators to the first and last x-value the cubic segment is fit on 228 | * @param offset first y-value the cubic segment is fit on 229 | * @param compression_factor by which the y-values are scaled 230 | */ 231 | template 232 | CubicSpline(RandomIt first, RandomIt last, std::size_t offset = 0, double compression_factor = 1.f) { 233 | std::size_t n = std::distance(first, last); 234 | 235 | if (n == 0) { 236 | a_ = 0.f; 237 | b_ = 0.f; 238 | c_ = 1.f; 239 | d_ = 0.f; 240 | return; 241 | } 242 | if (n == 1 or *first == *(last - 1)) { 243 | a_ = 0.f; 244 | b_ = 0.f; 245 | c_ = 0.f; 246 | d_ = static_cast(offset) * compression_factor; 247 | return; 248 | } 249 | 250 | double xmin = static_cast(*first); 251 | double ymin = static_cast(offset) * compression_factor; 252 | double xmax = static_cast(*(last - 1)); 253 | double ymax = static_cast(offset + n - 1) * compression_factor; 254 | 255 | double x1 = 0.0; 256 | double y1 = 0.0; 257 | double x2 = 1.0; 258 | double y2 = 1.0; 259 | 260 | double sxn, syn = 0.0; 261 | for (std::size_t i = 0; i != n; ++i) { 262 | double x = static_cast(*(first + i)); 263 | double y = static_cast(offset + i) * compression_factor; 264 | sxn = (x - xmin) / (xmax - xmin); 265 | if (sxn > 0.0) { 266 | syn = (y - ymin) / (ymax - ymin); 267 | break; 268 | } 269 | } 270 | double m1 = (syn - y1) / (sxn - x1); 271 | 272 | double sxp, syp = 0.0; 273 | for (std::size_t i = 0; i != n; ++i) { 274 | double x = static_cast(*(first + i)); 275 | double y = static_cast(offset + i) * compression_factor; 276 | sxp = (x - xmin) / (xmax - xmin); 277 | if (sxp < 1.0) { 278 | syp = (y - ymin) / (ymax - ymin); 279 | break; 280 | } 281 | } 282 | double m2 = (y2 - syp) / (x2 - sxp); 283 | 284 | if (std::pow(m1, 2.0) + std::pow(m2, 2.0) > 9.0) { 285 | double tau = 3.0 / std::sqrt(std::pow(m1, 2.0) + std::pow(m2, 2.0)); 286 | m1 *= tau; 287 | m2 *= tau; 288 | } 289 | 290 | a_ = (m1 + m2 - 2.0) 291 | / std::pow(xmax - xmin, 3.0); 292 | 293 | b_ 
= -(xmax * (2.0 * m1 + m2 - 3.0) + xmin * (m1 + 2.0 * m2 - 3.0)) 294 | / std::pow(xmax - xmin, 3.0); 295 | 296 | c_ = (m1 * std::pow(xmax, 2.0) + m2 * std::pow(xmin, 2.0) + xmax * xmin * (2.0 * m1 + 2.0 * m2 - 6.0)) 297 | / std::pow(xmax - xmin, 3.0); 298 | 299 | d_ = -xmin * (m1 * std::pow(xmax, 2.0) + xmax * xmin * (m2 - 3.0) + std::pow(xmin, 2.0)) 300 | / std::pow(xmax - xmin, 3.0); 301 | 302 | a_ *= ymax - ymin; 303 | b_ *= ymax - ymin; 304 | c_ *= ymax - ymin; 305 | d_ *= ymax - ymin; 306 | d_ += ymin; 307 | 308 | // Check if linear spline performs better. 309 | // LinearSpline ls(first, last, offset, compression_factor); 310 | 311 | // double ls_error = 0.f; 312 | // double cs_error = 0.f; 313 | 314 | // for (std::size_t i = 0; i != n; ++i) { 315 | // double y = (offset +i) * compression_factor; 316 | // auto key = *(first + i); 317 | // double ls_pred = ls.predict(key); 318 | // double cs_pred = predict(key); 319 | // ls_error += std::abs(ls_pred - y); 320 | // cs_error += std::abs(cs_pred - y); 321 | // } 322 | 323 | // if (ls_error < cs_error) { 324 | // a_ = 0; 325 | // b_ = 0; 326 | // c_ = ls.slope(); 327 | // d_ = ls.intercept(); 328 | // } 329 | } 330 | 331 | /** 332 | * Returns the estimated y-value of @p x. 333 | * @param x to estimate a y-value for 334 | * @return the estimated y-value for @p x 335 | */ 336 | template 337 | double predict(const X x) const { 338 | double x_ = static_cast(x); 339 | double v1 = std::fma(a_, x_, b_); 340 | double v2 = std::fma(v1, x_, c_); 341 | double v3 = std::fma(v2, x_, d_); 342 | return v3; 343 | } 344 | 345 | /** Returns the cubic coefficient. 346 | * @return the cubic coefficient 347 | */ 348 | double a() const { return a_; } 349 | 350 | /** Returns the quadratic coefficient. 351 | * @return the quadratic coefficient 352 | */ 353 | double b() const { return b_; } 354 | 355 | /** Returns the linear coefficient.
356 | * @return the linear coefficient 357 | */ 358 | double c() const { return c_; } 359 | 360 | /** Returns the y-intercept. 361 | * @return the y-intercept 362 | */ 363 | double d() const { return d_; } 364 | 365 | /** 366 | * Returns the size of the cubic segment in bytes. 367 | * @return segment size in bytes. 368 | */ 369 | std::size_t size_in_bytes() { return 4 * sizeof(double); } 370 | 371 | /** 372 | * Writes the mathematical representation of the cubic segment to an output stream. 373 | * @param out output stream to write the cubic segment to 374 | * @param m the cubic segment 375 | * @returns the output stream 376 | */ 377 | friend std::ostream & operator<<(std::ostream &out, const CubicSpline &m) { 378 | return out << m.a() << " * x^3 + " 379 | << m.b() << " * x^2 + " 380 | << m.c() << " * x + d"; 381 | } 382 | }; 383 | 384 | 385 | /** 386 | * A radix model that projects x-values to their most significant bits after eliminating the common prefix. 387 | * 388 | * We assume that x-values are sorted in ascending order and y-values are handed implicitly where @p offset and @p 389 | * offset + distance(first, last) are the first and last y-value, respectively. The y-values can be scaled by 390 | * providing a @p compression_factor. 391 | * 392 | * @tparam X the type of x-values. 393 | */ 394 | template 395 | class Radix 396 | { 397 | using x_type = X; 398 | 399 | private: 400 | x_type mask_; ///< The mask for parallel bits extract. 401 | 402 | public: 403 | /** 404 | * Default constructor. 405 | */ 406 | Radix() = default; 407 | 408 | /** 409 | * Builds a radix model on the given data points.
410 | * @param first, last iterators to the first and last x-value the radix model is built on 411 | * @param offset first y-value the radix model is built on 412 | * @param compression_factor by which the y-values are scaled 413 | */ 414 | template 415 | Radix(RandomIt first, RandomIt last, std::size_t offset = 0, double compression_factor = 1.f) { 416 | std::size_t n = std::distance(first, last); 417 | 418 | if (n == 0) { 419 | mask_ = 0; 420 | return; 421 | } 422 | 423 | auto prefix = common_prefix_width(*first, *(last - 1)); // compute common prefix length 424 | 425 | if (prefix == (sizeof(x_type) * 8)) { 426 | mask_ = 42; // TODO: What should the mask be in this case? 427 | return; 428 | } 429 | 430 | // Determine radix width. 431 | std::size_t max = static_cast(offset + n - 1) * compression_factor; 432 | bool is_mersenne = (max & (max + 1)) == 0; // check if max is 2^n-1 433 | auto radix = is_mersenne ? bit_width(max) : bit_width(max) - 1; 434 | 435 | // Mask all bits but the radix 436 | mask_ = (~(x_type)0 >> prefix) & (~(x_type)0 << ((sizeof(x_type) * 8) - radix - prefix)); //0xffff << prefix_ 437 | } 438 | 439 | /** 440 | * Returns the estimated y-value of @p x. 441 | * @param x to estimate a y-value for 442 | * @return the estimated y-value for @p x 443 | */ 444 | // double predict(const x_type x) const { return (x << prefix_) >> ((sizeof(x_type) * 8) - radix_); } 445 | double predict(const x_type x) const { 446 | if constexpr(sizeof(x_type) <= sizeof(unsigned)) { 447 | return _pext_u32(x, mask_); 448 | } else if constexpr(sizeof(x_type) <= sizeof(unsigned long long)) { 449 | return _pext_u64(x, mask_); 450 | } else { 451 | static_assert(sizeof(x_type) > sizeof(unsigned long long), "unsupported width of integral type"); 452 | } 453 | } 454 | 455 | /** 456 | * Returns the mask used for parallel bits extraction.
457 | * @return the mask 458 | */ 459 | uint8_t mask() const { return mask_; } 460 | 461 | /** 462 | * Returns the size of the radix model in bytes. 463 | * @return radix model size in bytes. 464 | */ 465 | std::size_t size_in_bytes() { return sizeof(mask_); } 466 | 467 | /** 468 | * Writes a human readable representation of the radix model to an output stream. 469 | * @param out output stream to write the radix model to 470 | * @param m the radix model 471 | * @returns the output stream 472 | */ 473 | friend std::ostream & operator<<(std::ostream &out, const Radix &m) { 474 | return out << "_pext(x, " << m.mask() << ")"; 475 | } 476 | }; 477 | 478 | } // namespace rmi 479 | -------------------------------------------------------------------------------- /include/rmi/rmi.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | 7 | namespace rmi { 8 | 9 | /** 10 | * Struct to hold the approximated position and error bounds returned by the index. 11 | */ 12 | struct Approx { 13 | std::size_t pos; ///< The estimated position of the key. 14 | std::size_t lo; ///< The lower bound of the search range. 15 | std::size_t hi; ///< The upper bound of the search range. 16 | }; 17 | 18 | /** 19 | * This is a reimplementation of a two-layer recursive model index (RMI) supporting a variety of (monotonic) models. 20 | * RMIs were invented by Kraska et al. (https://dl.acm.org/doi/epdf/10.1145/3183713.3196909). 21 | * 22 | * Note that this is the base class which does not provide error bounds. 23 | * 24 | * @tparam Key the type of the keys to be indexed 25 | * @tparam Layer1 the type of the model used in layer1 26 | * @tparam Layer2 the type of the models used in layer2 27 | */ 28 | template 29 | class Rmi 30 | { 31 | using key_type = Key; 32 | using layer1_type = Layer1; 33 | using layer2_type = Layer2; 34 | 35 | protected: 36 | std::size_t n_keys_; ///< The number of keys the index was built on. 
37 | std::size_t layer2_size_ = 0; ///< The number of models in layer2. 38 | layer1_type l1_; ///< The layer1 model. 39 | layer2_type *l2_ = nullptr; ///< The array of layer2 models (owned; released with delete[] in the destructor). Null until built, so destroying a default-constructed index is safe. NOTE(review): the class is still implicitly copyable, and a copy would share this pointer -- confirm no caller copies an Rmi. 40 | 41 | public: 42 | /** 43 | * Default constructor. Creates an empty index without layer2 models. 44 | */ 45 | Rmi() = default; 46 | 47 | /** 48 | * Builds the index with @p layer2_size models in layer2 on the sorted @p keys. 49 | * @param keys vector of sorted keys to be indexed 50 | * @param layer2_size the number of models in layer2 51 | */ 52 | Rmi(const std::vector &keys, const std::size_t layer2_size) 53 | : Rmi(keys.begin(), keys.end(), layer2_size) { } 54 | 55 | /** 56 | * Builds the index with @p layer2_size models in layer2 on the sorted keys in the range [first, last). 57 | * @param first, last iterators that define the range of sorted keys to be indexed 58 | * @param layer2_size the number of models in layer2 59 | */ 60 | template 61 | Rmi(RandomIt first, RandomIt last, const std::size_t layer2_size) 62 | : n_keys_(std::distance(first, last)) 63 | , layer2_size_(layer2_size) 64 | { 65 | // Train layer1. NOTE(review): an empty key range or layer2_size == 0 is not handled here (division by n_keys_, last - 1, l2_[0]) -- confirm callers never pass either. 66 | l1_ = layer1_type(first, last, 0, static_cast(layer2_size) / n_keys_); // train with compression 67 | 68 | // Train layer2. 69 | l2_ = new layer2_type[layer2_size]; 70 | std::size_t segment_start = 0; 71 | std::size_t segment_id = 0; 72 | // Assign each key to its segment. 73 | for (std::size_t i = 0; i != n_keys_; ++i) { 74 | auto pos = first + i; 75 | std::size_t pred_segment_id = get_segment_id(*pos); 76 | // If a key is assigned to a new segment, all models must be trained up to the new segment. 77 | if (pred_segment_id > segment_id) { 78 | new (&l2_[segment_id]) layer2_type(first + segment_start, pos, segment_start); 79 | for (std::size_t j = segment_id + 1; j < pred_segment_id; ++j) { 80 | new (&l2_[j]) layer2_type(pos - 1, pos, i - 1); // train other models on last key in previous segment 81 | } 82 | segment_id = pred_segment_id; 83 | segment_start = i; 84 | } 85 | } 86 | // Train remaining models.
87 | new (&l2_[segment_id]) layer2_type(first + segment_start, last, segment_start); 88 | for (std::size_t j = segment_id + 1; j < layer2_size; ++j) { 89 | new (&l2_[j]) layer2_type(last - 1, last, n_keys_ - 1); // train remaining models on last key 90 | } 91 | } 92 | 93 | /** 94 | * Destructor. Releases the array of layer2 models. 95 | */ 96 | ~Rmi() { delete[] l2_; } 97 | 98 | /** 99 | * Returns the id of the segment @p key belongs to, i.e., the layer1 prediction clamped to [0, layer2_size_ - 1]. 100 | * @param key to get segment id for 101 | * @return segment id of the given key 102 | */ 103 | std::size_t get_segment_id(const key_type key) const { 104 | return std::clamp(l1_.predict(key), 0, layer2_size_ - 1); 105 | } 106 | 107 | /** 108 | * Returns a position estimate and search bounds for a given key. 109 | * @param key to search for 110 | * @return position estimate (clamped to [0, n_keys_ - 1]) and search bounds 111 | */ 112 | Approx search(const key_type key) const { 113 | auto segment_id = get_segment_id(key); 114 | std::size_t pred = std::clamp(l2_[segment_id].predict(key), 0, n_keys_ - 1); 115 | return {pred, 0, n_keys_}; // the base class keeps no error bounds: the search range is the whole key range 116 | } 117 | 118 | /** 119 | * Returns the number of keys the index was built on. 120 | * @return the number of keys the index was built on 121 | */ 122 | std::size_t n_keys() const { return n_keys_; } 123 | 124 | /** 125 | * Returns the number of models in layer2. 126 | * @return the number of models in layer2 127 | */ 128 | std::size_t layer2_size() const { return layer2_size_; } 129 | 130 | /** 131 | * Returns the size of the index in bytes. Every layer2 model is counted with the size of the first one. 132 | * @return index size in bytes 133 | */ 134 | std::size_t size_in_bytes() { 135 | return l1_.size_in_bytes() + layer2_size_ * l2_[0].size_in_bytes() + sizeof(n_keys_) + sizeof(layer2_size_); 136 | } 137 | }; 138 | 139 | 140 | /** 141 | * Recursive model index with global absolute bounds.
142 | */ 143 | template 144 | class RmiGAbs : public Rmi 145 | { 146 | using base_type = Rmi; 147 | using key_type = Key; 148 | using layer1_type = Layer1; 149 | using layer2_type = Layer2; 150 | 151 | protected: 152 | std::size_t error_; ///< The maximum absolute prediction error over all keys, shared by all layer2 models. 153 | 154 | public: 155 | /** 156 | * Default constructor. 157 | */ 158 | RmiGAbs() = default; 159 | 160 | /** 161 | * Builds the index with @p layer2_size models in layer2 on the sorted @p keys. 162 | * @param keys vector of sorted keys to be indexed 163 | * @param layer2_size the number of models in layer2 164 | */ 165 | RmiGAbs(const std::vector &keys, const std::size_t layer2_size) 166 | : RmiGAbs(keys.begin(), keys.end(), layer2_size) { } 167 | 168 | /** 169 | * Builds the index with @p layer2_size models in layer2 on the sorted keys in the range [first, last). 170 | * @param first, last iterators that define the range of sorted keys to be indexed 171 | * @param layer2_size the number of models in layer2 172 | */ 173 | template 174 | RmiGAbs(RandomIt first, RandomIt last, const std::size_t layer2_size) : base_type(first, last, layer2_size) { 175 | // Compute global absolute error bounds. 176 | error_ = 0; 177 | for (std::size_t i = 0; i != base_type::n_keys_; ++i) { 178 | key_type key = *(first + i); 179 | std::size_t segment_id = base_type::get_segment_id(key); 180 | std::size_t pred = std::clamp(base_type::l2_[segment_id].predict(key), 0, base_type::n_keys_ - 1); 181 | if (pred > i) { // overestimation 182 | error_ = std::max(error_, pred - i); 183 | } else { // underestimation 184 | error_ = std::max(error_, i - pred); 185 | } 186 | } 187 | } 188 | 189 | /** 190 | * Returns a position estimate and search bounds for a given key.
191 | * @param key to search for 192 | * @return position estimate and search bounds 193 | */ 194 | Approx search(const key_type key) const { 195 | auto segment_id = base_type::get_segment_id(key); 196 | std::size_t pred = std::clamp(base_type::l2_[segment_id].predict(key), 0, base_type::n_keys_ - 1); 197 | std::size_t lo = pred > error_ ? pred - error_ : 0; // saturate at 0 instead of wrapping around 198 | std::size_t hi = std::min(pred + error_ + 1, base_type::n_keys_); // exclusive upper bound, capped at the number of keys 199 | return {pred, lo, hi}; 200 | } 201 | 202 | /** 203 | * Returns the size of the index in bytes. 204 | * @return index size in bytes 205 | */ 206 | std::size_t size_in_bytes() { return base_type::size_in_bytes() + sizeof(error_); } 207 | }; 208 | 209 | 210 | /** 211 | * Recursive model index with global individual bounds. 212 | */ 213 | template 214 | class RmiGInd : public Rmi 215 | { 216 | using base_type = Rmi; 217 | using key_type = Key; 218 | using layer1_type = Layer1; 219 | using layer2_type = Layer2; 220 | 221 | protected: 222 | std::size_t error_lo_; ///< The lower error bound of the layer2 models (largest overestimation over all keys). 223 | std::size_t error_hi_; ///< The upper error bound of the layer2 models (largest underestimation over all keys). 224 | 225 | public: 226 | /** 227 | * Default constructor. 228 | */ 229 | RmiGInd() = default; 230 | 231 | /** 232 | * Builds the index with @p layer2_size models in layer2 on the sorted @p keys. 233 | * @param keys vector of sorted keys to be indexed 234 | * @param layer2_size the number of models in layer2 235 | */ 236 | RmiGInd(const std::vector &keys, const std::size_t layer2_size) 237 | : RmiGInd(keys.begin(), keys.end(), layer2_size) { } 238 | 239 | /** 240 | * Builds the index with @p layer2_size models in layer2 on the sorted keys in the range [first, last).
241 | * @param first, last iterators that define the range of sorted keys to be indexed 242 | * @param layer2_size the number of models in layer2 243 | */ 244 | template 245 | RmiGInd(RandomIt first, RandomIt last, const std::size_t layer2_size) : base_type(first, last, layer2_size) { 246 | // Compute global individual error bounds. 247 | error_lo_ = 0; 248 | error_hi_ = 0; 249 | for (std::size_t i = 0; i != base_type::n_keys_; ++i) { 250 | key_type key = *(first + i); 251 | std::size_t segment_id = base_type::get_segment_id(key); 252 | std::size_t pred = std::clamp(base_type::l2_[segment_id].predict(key), 0, base_type::n_keys_ - 1); 253 | if (pred > i) { // overestimation 254 | error_lo_ = std::max(error_lo_, pred - i); 255 | } else { // underestimation 256 | error_hi_ = std::max(error_hi_, i - pred); 257 | } 258 | } 259 | } 260 | 261 | /** 262 | * Returns a position estimate and search bounds for a given key. 263 | * @param key to search for 264 | * @return position estimate and search bounds 265 | */ 266 | Approx search(const key_type key) const { 267 | auto segment_id = base_type::get_segment_id(key); 268 | std::size_t pred = std::clamp(base_type::l2_[segment_id].predict(key), 0, base_type::n_keys_ - 1); 269 | std::size_t lo = pred > error_lo_ ? pred - error_lo_ : 0; // saturate at 0 instead of wrapping around 270 | std::size_t hi = std::min(pred + error_hi_ + 1, base_type::n_keys_); // exclusive upper bound, capped at the number of keys 271 | return {pred, lo, hi}; 272 | } 273 | 274 | /** 275 | * Returns the size of the index in bytes. 276 | * @return index size in bytes 277 | */ 278 | std::size_t size_in_bytes() { return base_type::size_in_bytes() + sizeof(error_lo_) + sizeof(error_hi_); } 279 | }; 280 | 281 | 282 | /** 283 | * Recursive model index with local absolute bounds. 284 | */ 285 | template 286 | class RmiLAbs : public Rmi 287 | { 288 | using base_type = Rmi; 289 | using key_type = Key; 290 | using layer1_type = Layer1; 291 | using layer2_type = Layer2; 292 | 293 | protected: 294 | std::vector errors_; ///< The error bounds of the layer2 models.
295 | 296 | public: 297 | /** 298 | * Default constructor. 299 | */ 300 | RmiLAbs() = default; 301 | 302 | /** 303 | * Builds the index with @p layer2_size models in layer2 on the sorted @p keys. 304 | * @param keys vector of sorted keys to be indexed 305 | * @param layer2_size the number of models in layer2 306 | */ 307 | RmiLAbs(const std::vector &keys, const std::size_t layer2_size) 308 | : RmiLAbs(keys.begin(), keys.end(), layer2_size) { } 309 | 310 | /** 311 | * Builds the index with @p layer2_size models in layer2 on the sorted keys in the range [first, last). 312 | * @param first, last iterators that define the range of sorted keys to be indexed 313 | * @param layer2_size the number of models in layer2 314 | */ 315 | template 316 | RmiLAbs(RandomIt first, RandomIt last, const std::size_t layer2_size) : base_type(first, last, layer2_size) { 317 | // Compute local absolute error bounds: one maximum absolute error per layer2 model. 318 | errors_ = std::vector(layer2_size); 319 | for (std::size_t i = 0; i != base_type::n_keys_; ++i) { 320 | key_type key = *(first + i); 321 | std::size_t segment_id = base_type::get_segment_id(key); 322 | std::size_t pred = std::clamp(base_type::l2_[segment_id].predict(key), 0, base_type::n_keys_ - 1); 323 | if (pred > i) { // overestimation 324 | errors_[segment_id] = std::max(errors_[segment_id], pred - i); 325 | } else { // underestimation 326 | errors_[segment_id] = std::max(errors_[segment_id], i - pred); 327 | } 328 | } 329 | } 330 | 331 | /** 332 | * Returns a position estimate and search bounds for a given key. 333 | * @param key to search for 334 | * @return position estimate and search bounds 335 | */ 336 | Approx search(const key_type key) const { 337 | auto segment_id = base_type::get_segment_id(key); 338 | std::size_t pred = std::clamp(base_type::l2_[segment_id].predict(key), 0, base_type::n_keys_ - 1); 339 | std::size_t err = errors_[segment_id]; 340 | std::size_t lo = pred > err ?
pred - err : 0; 341 | std::size_t hi = std::min(pred + err + 1, base_type::n_keys_); // exclusive upper bound, capped at the number of keys 342 | return {pred, lo, hi}; 343 | } 344 | 345 | /** 346 | * Returns the size of the index in bytes. 347 | * @return index size in bytes 348 | */ 349 | std::size_t size_in_bytes() { return base_type::size_in_bytes() + errors_.size() * sizeof(errors_.front()); } 350 | }; 351 | 352 | 353 | /** 354 | * Recursive model index with local individual bounds. 355 | */ 356 | template 357 | class RmiLInd : public Rmi 358 | { 359 | using base_type = Rmi; 360 | using key_type = Key; 361 | using layer1_type = Layer1; 362 | using layer2_type = Layer2; 363 | 364 | protected: 365 | /** 366 | * Struct to store a lower and an upper error bound. 367 | */ 368 | struct bounds { 369 | std::size_t lo; ///< The lower error bound (largest overestimation of the model). 370 | std::size_t hi; ///< The upper error bound (largest underestimation of the model). 371 | 372 | /** 373 | * Default constructor. Initializes both bounds to zero. 374 | */ 375 | bounds() : lo(0), hi(0) { } 376 | }; 377 | 378 | std::vector errors_; ///< The error bounds of the layer2 models. 379 | 380 | public: 381 | /** 382 | * Default constructor. 383 | */ 384 | RmiLInd() = default; 385 | 386 | /** 387 | * Builds the index with @p layer2_size models in layer2 on the sorted @p keys. 388 | * @param keys vector of sorted keys to be indexed 389 | * @param layer2_size the number of models in layer2 390 | */ 391 | RmiLInd(const std::vector &keys, const std::size_t layer2_size) 392 | : RmiLInd(keys.begin(), keys.end(), layer2_size) { } 393 | 394 | /** 395 | * Builds the index with @p layer2_size models in layer2 on the sorted keys in the range [first, last). 396 | * @param first, last iterators that define the range of sorted keys to be indexed 397 | * @param layer2_size the number of models in layer2 398 | */ 399 | template 400 | RmiLInd(RandomIt first, RandomIt last, const std::size_t layer2_size) : base_type(first, last, layer2_size) { 401 | // Compute local individual error bounds.
402 | errors_ = std::vector(layer2_size); 403 | for (std::size_t i = 0; i != base_type::n_keys_; ++i) { 404 | key_type key = *(first + i); 405 | std::size_t segment_id = base_type::get_segment_id(key); 406 | std::size_t pred = std::clamp(base_type::l2_[segment_id].predict(key), 0, base_type::n_keys_ - 1); 407 | if (pred > i) { // overestimation 408 | std::size_t &lo = errors_[segment_id].lo; 409 | lo = std::max(lo, pred - i); 410 | } else { // underestimation 411 | std::size_t &hi = errors_[segment_id].hi; 412 | hi = std::max(hi, i - pred); 413 | } 414 | } 415 | } 416 | 417 | /** 418 | * Returns a position estimate and search bounds for a given key, using the bounds of the key's layer2 model. 419 | * @param key to search for 420 | * @return position estimate and search bounds 421 | */ 422 | Approx search(const key_type key) const { 423 | auto segment_id = base_type::get_segment_id(key); 424 | std::size_t pred = std::clamp(base_type::l2_[segment_id].predict(key), 0, base_type::n_keys_ - 1); 425 | bounds err = errors_[segment_id]; 426 | std::size_t lo = pred > err.lo ? pred - err.lo : 0; // saturate at 0 instead of wrapping around 427 | std::size_t hi = std::min(pred + err.hi + 1, base_type::n_keys_); // exclusive upper bound, capped at the number of keys 428 | return {pred, lo, hi}; 429 | } 430 | 431 | /** 432 | * Returns the size of the index in bytes.
433 | * @return index size in bytes 434 | */ 435 | std::size_t size_in_bytes() { return base_type::size_in_bytes() + errors_.size() * sizeof(errors_.front()); } 436 | }; 437 | 438 | } // namespace rmi 439 | -------------------------------------------------------------------------------- /experiments/rmi_guideline.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "argparse/argparse.hpp" 6 | 7 | #include "rmi/models.hpp" 8 | #include "rmi/rmi.hpp" 9 | #include "rmi/util/fn.hpp" 10 | #include "rmi/util/search.hpp" 11 | 12 | using key_type = uint64_t; 13 | using namespace std::chrono; 14 | 15 | std::size_t s_glob; ///< global size_t variable that receives the lookup checksum of each repetition 16 | 17 | 18 | /** 19 | * Measures lookup times of @p samples on a given @p Rmi and writes results to `std::cout`. 20 | * @tparam Key key type 21 | * @tparam Rmi RMI type 22 | * @tparam Search search type 23 | * @param keys on which the RMI is built 24 | * @param n_models number of models in the second layer of the RMI 25 | * @param samples for which the lookup time is measured 26 | * @param n_reps number of repetitions 27 | * @param dataset_name name of the dataset 28 | * @param layer1 model type of the first layer 29 | * @param layer2 model type of the second layer 30 | * @param bounds used by the RMI 31 | * @param search used by the RMI for correcting prediction errors 32 | * @param budget the budget under which the configuration was chosen 33 | * @param is_guideline flag written to the `is_guideline` output column */ 34 | template 35 | void experiment(const std::vector &keys, 36 | const std::size_t n_models, 37 | const std::vector &samples, 38 | const std::size_t n_reps, 39 | const std::string dataset_name, 40 | const std::string layer1, 41 | const std::string layer2, 42 | const std::string bounds, 43 | const std::string search, 44 | const std::size_t budget, 45 | const bool is_guideline) 46 | { 47 | using rmi_type = Rmi; 48 | auto search_fn = Search(); 49 | 50 | // Build RMI.
51 | rmi_type rmi(keys, n_models); 52 | 53 | // Skip configurations that are guaranteed to not be the fastest. 54 | if (search == "model_biased_linear") { 55 | auto n_keys = keys.size(); 56 | std::vector errors; 57 | errors.reserve(n_keys); 58 | 59 | for (std::size_t i = 0; i != n_keys; ++i) { 60 | auto key = keys.at(i); 61 | auto pred = rmi.search(key).pos; 62 | auto err = pred > i ? pred - i : i - pred; // absolute prediction error of key i 63 | errors.push_back(err); 64 | } 65 | 66 | auto mean_ae = mean(errors); 67 | if (mean_ae > 10) return; // linear search is not measured when the mean absolute error exceeds 10 positions 68 | } 69 | 70 | // Perform n_reps runs. 71 | for (std::size_t rep = 0; rep != n_reps; ++rep) { 72 | 73 | // Lookup time. 74 | std::size_t lookup_accu = 0; 75 | auto start = steady_clock::now(); 76 | for (std::size_t i = 0; i != samples.size(); ++i) { 77 | auto key = samples.at(i); 78 | auto range = rmi.search(key); 79 | auto pos = search_fn(keys.begin() + range.lo, keys.begin() + range.hi, keys.begin() + range.pos, key); 80 | lookup_accu += std::distance(keys.begin(), pos); // sum of found positions, reported as checksum 81 | } 82 | auto stop = steady_clock::now(); 83 | auto lookup_time = duration_cast(stop - start).count(); 84 | s_glob = lookup_accu; // presumably stored globally so the lookups cannot be optimized away -- confirm 85 | 86 | // Report results.
87 | // Dataset 88 | std::cout << dataset_name << ',' 89 | << keys.size() << ',' 90 | // Index 91 | << layer1 << ',' 92 | << layer2 << ',' 93 | << n_models << ',' 94 | << bounds << ',' 95 | << search << ',' 96 | << rmi.size_in_bytes() << ',' 97 | // Experiment 98 | << rep << ',' 99 | << samples.size() << ',' 100 | << budget << ',' 101 | << is_guideline << ',' 102 | // Results 103 | << lookup_time << ',' 104 | // Checksums 105 | << lookup_accu << std::endl; 106 | } // reps 107 | } 108 | 109 | 110 | /** 111 | * @brief experiment function pointer; the parameter list mirrors the one of experiment() 112 | */ 113 | typedef void (*exp_fn_ptr)(const std::vector&, 114 | const std::size_t, 115 | const std::vector&, 116 | const std::size_t, 117 | const std::string, 118 | const std::string, 119 | const std::string, 120 | const std::string, 121 | const std::string, 122 | const std::size_t, 123 | const bool); 124 | 125 | 126 | /** 127 | * RMI configuration that holds the string representation of model types of layer 1 and layer 2, error bound type, and 128 | * search algorithm. 129 | */ 130 | struct Config { 131 | std::string layer1; ///< Model type of layer 1. 132 | std::string layer2; ///< Model type of layer 2. 133 | std::string bounds; ///< Error bound type. 134 | std::string search; ///< Search algorithm. 135 | }; 136 | 137 | 138 | /** 139 | * Comparator class for @p Config objects; orders them lexicographically by layer1, layer2, bounds, and search.
140 | */ 141 | struct ConfigCompare { 142 | bool operator() (const Config &lhs, const Config &rhs) const { 143 | if (lhs.layer1 != rhs.layer1) return lhs.layer1 < rhs.layer1; 144 | if (lhs.layer2 != rhs.layer2) return lhs.layer2 < rhs.layer2; 145 | if (lhs.bounds != rhs.bounds) return lhs.bounds < rhs.bounds; 146 | return lhs.search < rhs.search; 147 | } 148 | }; 149 | 150 | // ENTRIES expands to 30 map entries for one layer1/layer2 pair: five bound types times six search algorithms. 151 | #define ENTRIES(L1, L2, LT1, LT2) \ 152 | { {#L1, #L2, "none", "binary"}, &experiment, BinarySearch> }, \ 153 | { {#L1, #L2, "labs", "binary"}, &experiment, BinarySearch> }, \ 154 | { {#L1, #L2, "lind", "binary"}, &experiment, BinarySearch> }, \ 155 | { {#L1, #L2, "gabs", "binary"}, &experiment, BinarySearch> }, \ 156 | { {#L1, #L2, "gind", "binary"}, &experiment, BinarySearch> }, \ 157 | { {#L1, #L2, "none", "model_biased_binary"}, &experiment, ModelBiasedBinarySearch> }, \ 158 | { {#L1, #L2, "labs", "model_biased_binary"}, &experiment, ModelBiasedBinarySearch> }, \ 159 | { {#L1, #L2, "lind", "model_biased_binary"}, &experiment, ModelBiasedBinarySearch> }, \ 160 | { {#L1, #L2, "gabs", "model_biased_binary"}, &experiment, ModelBiasedBinarySearch> }, \ 161 | { {#L1, #L2, "gind", "model_biased_binary"}, &experiment, ModelBiasedBinarySearch> }, \ 162 | { {#L1, #L2, "none", "linear"}, &experiment, LinearSearch> }, \ 163 | { {#L1, #L2, "labs", "linear"}, &experiment, LinearSearch> }, \ 164 | { {#L1, #L2, "lind", "linear"}, &experiment, LinearSearch> }, \ 165 | { {#L1, #L2, "gabs", "linear"}, &experiment, LinearSearch> }, \ 166 | { {#L1, #L2, "gind", "linear"}, &experiment, LinearSearch> }, \ 167 | { {#L1, #L2, "none", "model_biased_linear"}, &experiment, ModelBiasedLinearSearch> }, \ 168 | { {#L1, #L2, "labs", "model_biased_linear"}, &experiment, ModelBiasedLinearSearch> }, \ 169 | { {#L1, #L2, "lind", "model_biased_linear"}, &experiment, ModelBiasedLinearSearch> }, \ 170 | { {#L1, #L2, "gabs", "model_biased_linear"}, &experiment, ModelBiasedLinearSearch> }, \ 171 | { {#L1, #L2, "gind", 
"model_biased_linear"}, &experiment, ModelBiasedLinearSearch> }, \ 172 | { {#L1, #L2, "none", "exponential"}, &experiment, ExponentialSearch> }, \ 173 | { {#L1, #L2, "labs", "exponential"}, &experiment, ExponentialSearch> }, \ 174 | { {#L1, #L2, "lind", "exponential"}, &experiment, ExponentialSearch> }, \ 175 | { {#L1, #L2, "gabs", "exponential"}, &experiment, ExponentialSearch> }, \ 176 | { {#L1, #L2, "gind", "exponential"}, &experiment, ExponentialSearch> }, \ 177 | { {#L1, #L2, "none", "model_biased_exponential"}, &experiment, ModelBiasedExponentialSearch> }, \ 178 | { {#L1, #L2, "labs", "model_biased_exponential"}, &experiment, ModelBiasedExponentialSearch> }, \ 179 | { {#L1, #L2, "lind", "model_biased_exponential"}, &experiment, ModelBiasedExponentialSearch> }, \ 180 | { {#L1, #L2, "gabs", "model_biased_exponential"}, &experiment, ModelBiasedExponentialSearch> }, \ 181 | { {#L1, #L2, "gind", "model_biased_exponential"}, &experiment, ModelBiasedExponentialSearch> }, \ 182 | 183 | static std::map exp_map { 184 | ENTRIES(linear_regression, linear_regression, rmi::LinearRegression, rmi::LinearRegression) 185 | ENTRIES(linear_regression, linear_spline, rmi::LinearRegression, rmi::LinearSpline) 186 | ENTRIES(linear_spline, linear_regression, rmi::LinearSpline, rmi::LinearRegression) 187 | ENTRIES(linear_spline, linear_spline, rmi::LinearSpline, rmi::LinearSpline) 188 | ENTRIES(cubic_spline, linear_regression, rmi::CubicSpline, rmi::LinearRegression) 189 | ENTRIES(cubic_spline, linear_spline, rmi::CubicSpline, rmi::LinearSpline) 190 | ENTRIES(radix, linear_regression, rmi::Radix, rmi::LinearRegression) 191 | ENTRIES(radix, linear_spline, rmi::Radix, rmi::LinearSpline) 192 | }; ///< Map that assigns an experiment function pointer to RMI configurations. 193 | #undef ENTRIES 194 | 195 | 196 | /** 197 | * Computes the recommended RMI configuration by following a simple guideline and evaluates its performance.
198 | * @param keys on which the RMI is built 199 | * @param samples for which the lookup time is measured 200 | * @param n_reps number of repetitions 201 | * @param dataset_name name of the dataset 202 | * @param budget the budget under which the configuration is to be chosen 203 | */ 204 | void evaluate_guideline(const std::vector &keys, 205 | const std::vector &samples, 206 | const std::size_t n_reps, 207 | const std::string dataset_name, 208 | const std::size_t budget) 209 | { 210 | // Determine maximum number of layer 2 models for LS->LR NB+MExp (linear spline -> linear regression, no bounds, model-biased exponential search). NOTE(review): the unsigned subtraction wraps around if budget is smaller than the fixed overhead -- confirm callers pass a large enough budget. 211 | auto n_models = (budget - 2 * sizeof(double) - 2 * sizeof(std::size_t)) / (2 * sizeof(double)); 212 | 213 | // Train RMI. 214 | rmi::Rmi rmi(keys, n_models); 215 | 216 | // Evaluate RMI error. 217 | auto n_keys = keys.size(); 218 | std::vector log2_errors; 219 | log2_errors.reserve(n_keys); 220 | 221 | for (std::size_t i = 0; i != n_keys; ++i) { 222 | auto key = keys.at(i); 223 | auto pred = rmi.search(key).pos; 224 | auto err = pred > i ? pred - i : i - pred; 225 | log2_errors.push_back(std::log2(err+1)); // +1 keeps log2 defined for a zero error 226 | } 227 | 228 | auto mean_log2e = mean(log2_errors); 229 | 230 | // Pick and evaluate guideline config based on errors. 231 | auto l1 = "linear_spline"; 232 | auto l2 = "linear_regression"; 233 | 234 | auto threshold = 5.8; // This is hardware-dependent.
235 | 236 | if (mean_log2e < threshold) { 237 | auto bounds = "none"; 238 | auto search = "model_biased_exponential"; 239 | 240 | Config config {l1, l2, bounds, search}; 241 | exp_fn_ptr exp_fn = exp_map[config]; 242 | 243 | (*exp_fn)(keys, n_models, samples, n_reps, dataset_name, l1, l2, bounds, search, budget, true); 244 | } else { 245 | auto bounds = "labs"; 246 | auto search = "binary"; // large errors: switch to local absolute bounds with binary search; each model then also stores one size_t bound, so fewer models fit 247 | n_models = (budget - 2 * sizeof(double) - 2 * sizeof(std::size_t)) / (2 * sizeof(double) + sizeof(std::size_t)); 248 | 249 | Config config {l1, l2, bounds, search}; 250 | exp_fn_ptr exp_fn = exp_map[config]; 251 | 252 | (*exp_fn)(keys, n_models, samples, n_reps, dataset_name, l1, l2, bounds, search, budget, true); 253 | } 254 | } 255 | 256 | 257 | /** 258 | * Tests RMI configurations for a given size budget and compares them against the configuration chosen by the guideline in 259 | * terms of lookup time. 260 | * @param argc arguments counter 261 | * @param argv arguments vector 262 | */ 263 | int main(int argc, char *argv[]) 264 | { 265 | // Initialize argument parser. 266 | argparse::ArgumentParser program(argv[0], "0.1"); 267 | 268 | // Define arguments.
269 | program.add_argument("filename") 270 | .help("path to binary file containing uin64_t keys"); 271 | 272 | program.add_argument("budget") 273 | .help("target size in bytes for the RMI configurations to test") 274 | .action([](const std::string &s) { return std::stoul(s); }); 275 | 276 | program.add_argument("-n", "--n_reps") 277 | .help("number of experiment repetitions") 278 | .default_value(std::size_t(3)) 279 | .action([](const std::string &s) { return std::stoul(s); }); 280 | 281 | program.add_argument("-s", "--n_samples") 282 | .help("number of sampled lookup keys") 283 | .default_value(std::size_t(1'000'000)) 284 | .action([](const std::string &s) { return std::stoul(s); }); 285 | 286 | program.add_argument("--header") 287 | .help("output csv header") 288 | .default_value(false) 289 | .implicit_value(true); 290 | 291 | // Parse arguments. 292 | try { 293 | program.parse_args(argc, argv); 294 | } 295 | catch (const std::runtime_error &err) { 296 | std::cout << err.what() << '\n' << program; 297 | exit(EXIT_FAILURE); 298 | } 299 | 300 | // Read arguments. 301 | const auto filename = program.get("filename"); 302 | const auto dataset_name = split(filename, '/').back(); 303 | const auto budget = program.get("budget"); 304 | const auto n_reps = program.get("-n"); 305 | const auto n_samples = program.get("-s"); 306 | 307 | // Load keys. 308 | auto keys = load_data(filename); 309 | 310 | // Sample keys with a fixed seed so that every run draws the same lookup keys. NOTE(review): the distribution uses its default result type, and keys.size() - 1 wraps for an empty key file -- confirm inputs are non-empty and small enough. 311 | uint64_t seed = 42; 312 | std::mt19937 gen(seed); 313 | std::uniform_int_distribution<> distrib(0, keys.size() - 1); 314 | std::vector samples; 315 | samples.reserve(n_samples); 316 | for (std::size_t i = 0; i != n_samples; ++i) 317 | samples.push_back(keys[distrib(gen)]); 318 | 319 | // List configuration parameters. 320 | std::vector l1_models = {"linear_spline", "cubic_spline", "linear_regression", "radix"}; 321 | std::vector l2_models = {"linear_regression"}; // We know that lr is always better than ls from previous experiments.
322 | std::vector> err_corrs = { 323 | std::make_pair("none", "model_biased_exponential"), 324 | std::make_pair("none", "model_biased_linear"), 325 | std::make_pair("labs", "binary"), 326 | std::make_pair("lind", "model_biased_binary"), 327 | }; 328 | 329 | // List model and bound sizes. 330 | std::map model_size = { 331 | { "linear_spline", 2 * sizeof(double) }, 332 | { "cubic_spline", 4 * sizeof(double) }, 333 | { "linear_regression", 2 * sizeof(double) }, 334 | { "radix", 1 * sizeof(key_type) }, 335 | }; 336 | std::map bounds_size = { 337 | { "none", 0 }, 338 | { "labs", sizeof(std::size_t) }, 339 | { "lind", 2 * sizeof(std::size_t) }, 340 | }; 341 | 342 | // Output header. 343 | if (program["--header"] == true) 344 | std::cout << "dataset," 345 | << "n_keys," 346 | << "layer1," 347 | << "layer2," 348 | << "n_models," 349 | << "bounds," 350 | << "search," 351 | << "size_in_bytes," 352 | << "rep," 353 | << "n_samples," 354 | << "budget_in_bytes," 355 | << "is_guideline," 356 | << "lookup_time," 357 | << "lookup_accu" 358 | << std::endl; 359 | 360 | // Enumerate and evaluate configurations. 361 | for (auto l1 : l1_models) { 362 | for (auto l2 : l2_models) { 363 | for (auto corr : err_corrs) { 364 | auto bounds = corr.first; 365 | auto search = corr.second; 366 | 367 | // Determine maximum number of layer 2 models. NOTE(review): the unsigned subtraction wraps around if budget is smaller than the fixed overhead, and the result is 0 if no layer 2 model fits -- confirm the budget is large enough. 368 | auto n_models = (budget - model_size[l1] - 2 * sizeof(std::size_t)) / (model_size[l2] + bounds_size[bounds]); 369 | 370 | // Build configuration object. 371 | Config config {l1, l2, bounds, search}; 372 | 373 | // Look up evaluation function. 374 | exp_fn_ptr exp_fn = exp_map[config]; 375 | 376 | // Call evaluation function with keys and n_models. 377 | (*exp_fn)(keys, n_models, samples, n_reps, dataset_name, l1, l2, bounds, search, budget, false); 378 | } 379 | } 380 | } 381 | 382 | // Evaluate guideline configuration.
383 | evaluate_guideline(keys, samples, n_reps, dataset_name, budget); 384 | 385 | exit(EXIT_SUCCESS); 386 | } 387 | --------------------------------------------------------------------------------