├── .env ├── .gitignore ├── .pylintrc ├── Justfile ├── LICENSE ├── README.md ├── dynamic-exp ├── best_hp.csv ├── dynamic_exp.sh └── parse_log_exmaple.py ├── hyper-params.md ├── lecarb ├── __init__.py ├── __main__.py ├── constants.py ├── dataset │ ├── dataset.py │ ├── gen_dataset.py │ └── manipulate_dataset.py ├── dtypes.py ├── estimator │ ├── bayesnet.py │ ├── deepdb │ │ ├── README.md │ │ ├── aqp_spn │ │ │ ├── __init__.py │ │ │ ├── aqp_leaves.py │ │ │ ├── aqp_spn.py │ │ │ ├── code_generation │ │ │ │ ├── convert_conditions.py │ │ │ │ ├── generate_code.py │ │ │ │ └── templates │ │ │ │ │ ├── categorical_leave.cpp │ │ │ │ │ ├── identity_leave.cpp │ │ │ │ │ ├── master.cpp │ │ │ │ │ ├── method_master.cpp │ │ │ │ │ ├── product_node.cpp │ │ │ │ │ ├── registration_master.cpp │ │ │ │ │ └── sum_node.cpp │ │ │ ├── custom_spflow │ │ │ │ ├── __init__.py │ │ │ │ ├── custom_learning.py │ │ │ │ ├── custom_structure_learning.py │ │ │ │ ├── custom_transform_structure.py │ │ │ │ ├── custom_validity.py │ │ │ │ └── utils.py │ │ │ ├── expectations.py │ │ │ ├── group_by_combination.py │ │ │ ├── ranges.py │ │ │ └── util │ │ │ │ ├── Graphics.py │ │ │ │ ├── __init__.py │ │ │ │ └── bloom_filter.py │ │ ├── data_preparation │ │ │ ├── __init__.py │ │ │ ├── join_data_preparation.py │ │ │ └── prepare_single_tables.py │ │ ├── deepdb.py │ │ ├── ensemble_compilation │ │ │ ├── __init__.py │ │ │ ├── graph_representation.py │ │ │ ├── physical_db.py │ │ │ ├── probabilistic_query.py │ │ │ ├── spn_ensemble.py │ │ │ └── utils.py │ │ ├── ensemble_creation │ │ │ ├── __init__.py │ │ │ ├── naive.py │ │ │ ├── rdc_based.py │ │ │ └── utils.py │ │ └── evaluation │ │ │ ├── aqp_evaluation.py │ │ │ ├── cardinality_evaluation.py │ │ │ ├── confidence_interval_evaluation.py │ │ │ ├── spn_statistics.py │ │ │ └── utils.py │ ├── estimator.py │ ├── feedback_kde.py │ ├── lw │ │ ├── README.md │ │ ├── common.py │ │ ├── lw_nn.py │ │ ├── lw_tree.py │ │ └── model.py │ ├── mhist.py │ ├── mscn │ │ ├── README.md │ │ ├── model.py │ │ └── mscn.py │ ├── mysql.py │ ├── naru │ │ ├── README.md │ │ ├── made.py │ │ ├── naru.py │ │ └── transformer.py │ ├── postgres.py │ ├── sample.py │ └── utils.py └── workload │ ├── __init__.py │ ├── dump_quicksel.py │ ├── gen_label.py │ ├── gen_workload.py │ ├── generator.py │ ├── merge_workload.py │ └── workload.py └── pyproject.toml /.env: -------------------------------------------------------------------------------- 1 | DATA_ROOT=data 2 | OUTPUT_ROOT=output 3 | # DATABASE_URL=postgres://card:card@localhost:6666/card 4 | DATABASE_URL=postgres://card:card@localhost:6667/card 5 | KDE_DATABASE_URL=postgres://card:card@localhost:5432/card 6 | 7 | CPU_NUM_THREADS=16 8 | OMP_NUM_THREADS=16 9 | OPENBLAS_NUM_THREADS=16 10 | MKL_NUM_THREADS=16 11 | VECLIB_MAXIMUM_THREADS=16 12 | NUMEXPR_NUM_THREADS=16 13 | 14 | PSQL=/usr/bin/psql 15 | KDE_PSQL=/usr/local/pgsql/bin/psql 16 | KDE_POSTGRES=/usr/local/pgsql/bin/postgres 17 | KDE_PG_DATA=/home/ubuntu/feedback-kde/data 18 | 19 | # MYSQL 20 | MYSQL=mysql 21 | MYSQL_HOST=localhost 22 | MYSQL_DB=card 23 | MYSQL_USER=root 24 | MYSQL_PSWD=card 25 | MYSQL_PORT=10235 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | figures/ 3 | .ipynb_checkpoints/ 4 | cluster/ 5 | nohup.out 6 | __pycache__/ 7 | result/ 8 | log/ 9 | model/ 10 | fmodels/ 11 | tmp/ 12 | tmp.* 13 | *.bk 14 | .vscode 15 | .mypy_cache 16 | .venv 17 | lecarb.egg-info 18 | *.tar 19 | poetry.lock 20 
| quicksel/ 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 SFU Database Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Are We Ready For Learned Cardinality Estimation? 2 | 3 | **Our paper can be found at [arxiv](https://arxiv.org/abs/2012.06743) and [vldb](http://www.vldb.org/pvldb/vol14/p1640-wang.pdf).** 4 | 5 | ## Development Environment Setup 6 | 7 | Setup: 8 | * Install Just 9 | * MacOS: `brew install just` 10 | * Linux: `curl --proto '=https' --tlsv1.2 -sSf https://just.systems/install.sh | bash -s -- --to /usr/local/bin` 11 | * Install Poetry: `pip install poetry` 12 | * Install Python dependencies: `just install-dependencies` 13 | 14 | We define all the commands used in this work in `Justfile`. Run `just -l` for a list of supported tasks. 15 | 16 | All the environmental configurations (e.g. data path, database configurations) are set in file `.env`. 17 | 18 | ## Dataset 19 | 20 | Download the real-world datasets and workloads from [here](https://www.dropbox.com/s/5bmvc1si5hysapf/data.tar.gz?dl=0). 21 | 22 | The path of the data is defined in `.env` as variable `DATA_ROOT`. We support dataset with different versions, typically a csv file is located at: `{DATA_ROOT}/{dataset name}/{version name}.csv`. 23 | 24 | We define the `Table` object, which contains both data, some commonly used statistics and functions for convenient usage. Please refer to `lecarb/dataset/dataset.py` for details. (Most of the methods in our repo take `Table` as the dataset input.) 25 | 26 | - Example: Given a csv file of census dataset (name: census13, version: original), generate the Table object 27 | ```bash 28 | # 1. convert csv file to pickle 29 | just csv2pkl data/census13/original.csv 30 | 31 | # 2. 
convert to Table object 32 | just pkl2table census13 original 33 | ``` 34 | 35 | - Example: Generate synthetic dataset with s=1.0, c=1.0, d=1000 (dataset name: dom1000, version: skew1.0_corr1.0) 36 | ```bash 37 | just data-gen 1.0 1.0 1000 38 | ``` 39 | To update (append to) the dataset, run a command in the following format: 40 | 41 | `just append-data-{update} {seed} {dataset} {version} {interval}`, 42 | 43 | where {update} can be chosen from `cor` and `skew`, {seed} is the random seed, {dataset} is the dataset name, {version} is the version of the data, and {interval} is between 0 and 1 and determines the ratio of data to be appended. 44 | 45 | 46 | 47 | - Example: Generate the appended dataset (name: census13, version: original) with correlated (update: cor) data: 48 | ``` 49 | just append-data-cor 123 census13 original 0.2 50 | ``` 51 | The appended data will be located at: `{DATA_ROOT}/{dataset name}/{version}+{version}_{update}_{interval}.pkl` 52 | 53 | ## Workload 54 | We adopt a unified workload generation framework to produce the synthetic queries used in all the experiments. Specifically, in our framework each query is generated through three steps: 55 | 56 | 1. Choose a set of attributes to place predicates on. 57 | 2. Select the query center for each predicate. 58 | 3. Determine the operator for each predicate (as well as widths for range predicates). 59 | 60 | We provide different implementations of each step in `lecarb/workload/generator.py` (function names start with `asf_`, `csf_` and `wsf_` respectively); users can also add customized implementations to the code for more variations. 61 | 62 | - Example: generate the workload used in the static experiment for the census dataset (workloads for the real-world datasets used in the paper are already provided [here](https://www.dropbox.com/s/5bmvc1si5hysapf/data.tar.gz?dl=0)) 63 | ```bash 64 | # generate workload for small datasets (labels are generated at the same time) 65 | just wkld-gen-base census13 original base 66 | 67 | # for large datasets, start 10 processes to generate the workload and then merge 68 | just wkld-gen-mth10 census13 original base 69 | just wkld-merge census13 original base 70 | rm data/census13/workload/base_[0-9]* 71 | ``` 72 | 73 | - Example: generate the workload for the synthetic dataset (name: dom1000, version: skew1.0_corr1.0) used in the paper; check [hyper-params.md](./hyper-params.md#preparation) for the whole preparation procedure (data generation and workload & label generation) of the micro-benchmark. 74 | ```bash 75 | # 1. generate workload (no labels generated) 76 | just wkld-vood dom1000 skew1.0_corr1.0 77 | 78 | # 2. generate labels 79 | just wkld-label dom1000 skew1.0_corr1.0 vood 80 | ``` 81 | 82 | ## Train & Test 83 | 84 | Training and test commands for all the estimators are defined in `Justfile`; for the hyper-parameters used and examples, please refer to [hyper-params.md](./hyper-params.md). 85 | 86 | Generated models are located at `{OUTPUT_ROOT}/model/{dataset name}/` and prediction results are at `{OUTPUT_ROOT}/result/{dataset name}/` in CSV format. 87 | 88 | Run `just report-error {output file name} {dataset name}` to see different error metrics of the **static** experiment result. 89 | 90 | ## Run Dynamic Experiments 91 | 92 | Code for reproducing the dynamic experiments is in `dynamic-exp/`, and the commands can be found in `Justfile`. 93 | 94 | (1) To run all dynamic experiments, run `bash dynamic-exp/dynamic_exp.sh`. It includes all commands for the dynamic experiments.
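The script redirects each estimator's output to `${log_path}/${dataset}/*.out` (with `log_path='log'` by default), so those log directories need to exist. A minimal sketch of a full run, assuming the default paths in `dynamic-exp/dynamic_exp.sh`:

```bash
# create the per-dataset log directories the script redirects into
mkdir -p log/{census13,forest10,power7,dmv11}
bash dynamic-exp/dynamic_exp.sh
```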
95 | 96 | - Example: to run the dynamic experiment for mscn on dataset 'census13', run the following command: 97 | 98 | ``` 99 | just dynamic-mscn-census13 census13 original base cor 0.2 10000 123 100 | ``` 101 | 102 | 'original' is the old version of census13. 'base' is the training workload generation method. 'cor' is the correlation change we consider for the data update. '0.2' is the appended data size (i.e. 20% of the 'original' data). '10000' is the size of the training workload. '123' is the random seed. 103 | 104 | (2) Run the following command to see different error metrics of the dynamic experiment results. 105 | 106 | `just report-error-dynamic {dataset} {stale model result file} {update model result file} {T} {model update time}`, 107 | 108 | where {model update time} can be extracted by parsing the log files. `dynamic-exp/parse_log_exmaple.py` provides example scripts for extracting {model update time}. 109 | 110 | For convenience, we put the hyperparameters of the different models in `dynamic-exp/best_hp.csv`. It copies the best hyperparameters we tested from [hyper-params.md](./hyper-params.md). 111 | 112 | ## Code References 113 | 114 | * Naru (including implementation of BayesNet): https://github.com/naru-project/naru 115 | * MSCN: https://github.com/andreaskipf/learnedcardinalities 116 | * DeepDB: https://github.com/DataManagementLab/deepdb-public 117 | * QuickSel: https://github.com/illinoisdata/quicksel 118 | * KDE-FB: https://github.com/martinkiefer/feedback-kde 119 | 120 | Our forked repos: 121 | * QuickSel: 122 | * Change: adding a new test class 123 | * Link: https://github.com/sfu-db/quicksel 124 | * KDE-FB: 125 | * Change: making the code support tables with <=15 columns (the original code is limited to <=10) 126 | * Link: https://github.com/sfu-db/feedback-kde 127 | -------------------------------------------------------------------------------- /dynamic-exp/best_hp.csv: -------------------------------------------------------------------------------- 1 | dataset_method,model 2 | census13_naru,"resmade_hid16,16,16,16_emb8_ep100_embedInembedOut_warm0" 3 | census13_mscn,mscn_hid8_sample500_ep100_bs256 4 | census13_chinn,chinn_hid64_64_64_bin200_ep500_bs128 5 | census13_chitree,chixgb_tr64_bin200 6 | census13_deepdb,spn_sample48842_rdc0.4_ms0.01 7 | forest10_naru,"resmade_hid64,64,64,64_emb8_ep100_embedInembedOut_warm4000" 8 | forest10_mscn,mscn_hid32_sample3000_ep100_bs256 9 | forest10_chinn,chinn_hid256_256_128_64_bin200_ep500_bs32 10 | forest10_chitree,chixgb_tr512_bin200 11 | forest10_deepdb,spn_sample581012_rdc0.4_ms0.005 12 | power7_naru,"resmade_hid128,128,128,128,128_emb16_ep100_embedInembedOut_warm4000" 13 | power7_mscn,mscn_hid64_sample5000_ep100_bs256 14 | power7_chinn,chinn_hid512_512_256_bin200_ep500_bs128 15 | power7_chitree,chixgb_tr256_bin200 16 | power7_deepdb,spn_sample2075259_rdc0.3_ms0.001 17 | dmv11_naru,"resmade_hid512,512,512,512_emb128_ep100_embedInembedOut_warm4000" 18 | dmv11_mscn,mscn_hid256_sample10000_ep100_bs256 19 | dmv11_chinn,chinn_hid2048_1024_512_256_bin200_ep500_bs32 20 | dmv11_chitree,chixgb_tr8192_bin200 21 | dmv11_deepdb,spn_sample1000000_rdc0.2_ms0.001 22 | -------------------------------------------------------------------------------- /dynamic-exp/dynamic_exp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #################### Dynamic exp 3 | ### Do not use 0 as the random seed, because our Postgres sets the random seed as 1/seed 4 | log_path='log' 5 | exp_num=1 6 | 
for (( i=1; i < 1+$exp_num; ++i )) 7 | do 8 | for dataset in 'census13' 'forest10' 'power7' 'dmv11' 9 | do 10 | for up in 'cor' #'skew' 11 | do 12 | ## MSCN 13 | just dynamic-mscn-${dataset} ${dataset} 'original' 'base' ${up} '0.2' '10000' "$i" >${log_path}/${dataset}/mscn_${up}-exp${i}.out 2>&1 14 | 15 | ## lw retrain 16 | just dynamic-lw-tree-${dataset}-retrain ${dataset} 'original' 'base' ${up} '0.2' '8000' "$i" >>${log_path}/${dataset}/lwtree_${up}-exp${i}.out 2>&1 17 | just dynamic-lw-nn-${dataset}-retrain ${dataset} 'original' 'base' ${up} '0.2' '16000' "$i" '500' >>${log_path}/${dataset}/lwnn_eq500_${up}-exp${i}.out 2>&1 18 | just dynamic-lw-nn-${dataset}-retrain ${dataset} 'original' 'base' ${up} '0.2' '16000' "$i" '100' >${log_path}/${dataset}/lwnn_eq100_${up}-exp${i}.out 2>&1 19 | 20 | ## Postgres 21 | just dynamic-postgres-${dataset} ${dataset} 'original' 'base' ${up} '0.2' "$i" >${log_path}/${dataset}/postgres_${up}-exp${i}.out 2>&1 22 | 23 | ## MySQL 24 | just dynamic-mysql-${dataset} ${dataset} 'original' 'base' ${up} '0.2' "$i" >${log_path}/${dataset}/mysql_${up}-exp${i}.out 2>&1 25 | 26 | ## Naru 27 | just dynamic-naru-${dataset} ${dataset} 'original' 'base' ${up} '0.2' "$i" '1' >>${log_path}/${dataset}/naru_eq1_${up}-exp${i}.out 2>&1 28 | just dynamic-naru-${dataset} ${dataset} 'original' 'base' ${up} '0.2' "$i" '7' >${log_path}/${dataset}/naru_eq7_${up}-exp${i}.out 2>&1 29 | just dynamic-naru-${dataset} ${dataset} 'original' 'base' ${up} '0.2' "$i" '15' >${log_path}/${dataset}/naru_eq15_${up}-exp${i}.out 2>&1 30 | ## QuickSel 31 | just dynamic-quicksel ${dataset} 'original' 'base' ${up} '0.2' "$i" >${log_path}/${dataset}/quicksel_${up}-exp${i}.out 2>&1 32 | 33 | ## DeepDB 34 | just dynamic-deepdb-${dataset} ${dataset} 'original' 'base' ${up} '0.2' "$i" >${log_path}/${dataset}/deepdb_${up}-exp${i}.out 2>&1 35 | done 36 | done 37 | done 38 | 39 | 40 | # epoch vs accuracy 41 | for (( i=1; i < 1+$exp_num; ++i )) 42 | do 43 | for dataset in 'census13' 'forest10' 'power7' 'dmv11' 44 | do 45 | for up in 'cor' 'skew' 46 | do 47 | ## lwNN 48 | for ep in '100' '200' '300' '400' '500' 49 | do 50 | just dynamic-lw-nn-${dataset}-retrain ${dataset} 'original' 'base' ${up} '0.2' '16000' "$i" $ep >${log_path}/${dataset}/lwnn_eq${ep}_${up}-exp${i}.out 2>&1 51 | done 52 | ## Naru 53 | for ep in '1' '5' '10' '15' '20' 54 | do 55 | just dynamic-naru-${dataset} ${dataset} 'original' 'base' ${up} '0.2' "$i" $ep >${log_path}/${dataset}/naru_eq${ep}_${up}-exp${i}.out 2>&1 56 | done 57 | done 58 | done 59 | done 60 | 61 | 62 | echo `date` "All Finished!" 
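### After the runs finish, the model update times can be pulled out of these logs with the
### helpers in dynamic-exp/parse_log_exmaple.py (e.g. get_lw_nn_training_time,
### get_postgres_time, get_mysql_time) and passed to `just report-error-dynamic`.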
63 | -------------------------------------------------------------------------------- /dynamic-exp/parse_log_exmaple.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from parse import * 4 | from datetime import datetime as dt 5 | 6 | # path hack 7 | sys.path.append(os.getcwd()) 8 | sys.path.append('..') 9 | 10 | TIME_FMT = '%Y-%m-%d %H:%M:%S,%f' 11 | 12 | def get_gen_query_time(logfile, training_size): 13 | '''return time_gen_train_query, time_gen_train_label''' 14 | t_tr_query = [[],[]] 15 | t_tr_label = [[],[]] 16 | # time_update_model = [[],[]] 17 | with open(logfile, 'r') as log_f: 18 | lines = log_f.readlines() 19 | for line in lines: 20 | line = line.strip() 21 | # parse time for training query update 22 | s_tr_query=parse("[{time} INFO] lecarb.workload.gen_workload: Start generate workload with {train_num:d} queries for train...", line) 23 | e_tr_query=parse("[{time} INFO] lecarb.workload.gen_workload: Start generate workload with {test_num:d} queries for valid...", line) 24 | if s_tr_query and s_tr_query['train_num'] == training_size: 25 | t_tr_query[0].append(dt.strptime(s_tr_query['time'], TIME_FMT)) 26 | if e_tr_query: 27 | t_tr_query[1].append(dt.strptime(e_tr_query['time'], TIME_FMT)) 28 | 29 | # parse time for training label update 30 | s_tr_label=parse("[{time} INFO] lecarb.workload.gen_label: Updating ground truth labels for the workload, with sample size {}...", line) 31 | e_tr_label=parse("[{time} INFO] lecarb.workload.gen_label: Dump labels to disk...", line) 32 | if s_tr_label: 33 | t_tr_label[0].append(dt.strptime(s_tr_label['time'], TIME_FMT)) 34 | if e_tr_label: 35 | t_tr_label[1].append(dt.strptime(e_tr_label['time'], TIME_FMT)) 36 | # print(t_tr_query, t_tr_label) 37 | 38 | time_gen_tr_query = 0 39 | time_gen_tr_label = 0 40 | if len(t_tr_query[0]) >= 1: 41 | time_gen_tr_query = (t_tr_query[1][0] - t_tr_query[0][0]).total_seconds() 42 | if len(t_tr_label[0]) >= 1: 43 | time_gen_tr_label = (t_tr_label[1][0] - t_tr_label[0][0]).total_seconds() 44 | return time_gen_tr_query, time_gen_tr_label 45 | 46 | def get_lw_nn_training_time(logfile): 47 | with open(logfile, 'r') as log_f: 48 | lines = log_f.readlines() 49 | for line in lines: 50 | line = line.strip() 51 | update_time=parse("[{} INFO] lecarb.estimator.lw.lw_nn: Training finished! 
Time spent since start: {train_time:f} mins", line) 52 | if update_time: 53 | return update_time['train_time'] 54 | return 0 55 | 56 | def get_postgres_time(logfile): 57 | with open(logfile, 'r') as logf: 58 | lines = logf.readlines() 59 | for line in lines: 60 | line = line.strip() 61 | # parse time for training query update 62 | update_time=parse("[{} INFO] lecarb.estimator.postgres: construct statistics finished, using {update_time:f} minutes, All statistics consumes {} MBs", line) 63 | if update_time: 64 | return update_time['update_time'] 65 | return 0 66 | 67 | def get_mysql_time(logfile): 68 | with open(logfile, 'r') as logf: 69 | lines = logf.readlines() 70 | for line in lines: 71 | line = line.strip() 72 | # parse time for training query update 73 | update_time=parse("[{} INFO] lecarb.estimator.mysql: construct statistics finished, using {update_time:f} minutes", line) 74 | if update_time: 75 | return update_time['update_time'] 76 | return 0 77 | -------------------------------------------------------------------------------- /hyper-params.md: -------------------------------------------------------------------------------- 1 | # Hyper-parameter Tuning 2 | 3 | This file contains the hyper-parameters we test and report in our paper (mapping to the parameters in Justfile commands). 4 | 5 | **NOTE:** Different hardwares may have different results, for example the max q-error on Census dataset of Naru (the same hyper-parameter and python & library version) on CPU machine is 66.0, P100 GPU machine (ComputeCanada Cedar) is 57.0 and K80 GPU machine (AWS p2.xlarge) is 58.0. The result we report for neural network methods are trained and tested on P100 GPU machine and others are on CPU. 6 | 7 | ## Static Environment 8 | 9 | ### Naru 10 | 11 | CMD: `train-naru` and `test-naru` 12 | 13 | Model Architectures: 14 | * Census 15 | * layers: 5, hc_hiddens: 16, embed_size: 8 16 | * layers: 4, hc_hiddens: 16, embed_size: 8 17 | * layers: 5, hc_hiddens: 32, embed_size: 4 18 | * layers: 4, hc_hiddens: 32, embed_size: 4 19 | * Forest 20 | * layers: 5, hc_hiddens: 32, embed_size: 8 21 | * layers: 4, hc_hiddens: 64, embed_size: 8 22 | * layers: 5, hc_hiddens: 64, embed_size: 4 23 | * layers: 4, hc_hiddens: 64, embed_size: 4 24 | * Power 25 | * layers: 5, hc_hiddens: 64, embed_size: 32 26 | * layers: 4, hc_hiddens: 64, embed_size: 32 27 | * layers: 5, hc_hiddens: 128, embed_size: 16 28 | * layers: 4, hc_hiddens: 128, embed_size: 16 29 | * DMV 30 | * layers: 5, hc_hiddens: 256, embed_size: 128 31 | * layers: 4, hc_hiddens: 512, embed_size: 128 32 | * layers: 5, hc_hiddens: 512, embed_size: 64 33 | * layers: 4, hc_hiddens: 512, embed_size: 64 34 | 35 | Others: 36 | * warmups: 0, 4000, 8000 37 | * epochs: 100 38 | * psample: 2000 39 | * we use natral order for all the dataset 40 | 41 | Selected Models: 42 | 43 | ```bash 44 | # census 45 | just train-naru census13 original 4 16 8 embed embed True 0 0 100 base 123 46 | just test-naru original-resmade_hid16,16,16,16_emb8_ep100_embedInembedOut_warm0-123 2000 census13 original base 123 47 | 48 | # forest 49 | just train-naru forest10 original 4 64 8 embed embed True 4000 0 100 base 123 50 | just test-naru original-resmade_hid64,64,64,64_emb8_ep100_embedInembedOut_warm4000-123 2000 forest10 original base 123 51 | 52 | # power 53 | just train-naru power7 original 5 128 16 embed embed True 4000 0 100 base 123 54 | just test-naru original-resmade_hid128,128,128,128,128_emb16_ep100_embedInembedOut_warm4000-123 2000 power7 original base 123 55 | 56 | # dmv 57 | just 
train-naru dmv11 original 4 512 128 embed embed True 4000 0 100 base 123 58 | just test-naru original-resmade_hid512,512,512,512_emb128_ep100_embedInembedOut_warm4000-123 2000 dmv11 original base 123 59 | ``` 60 | 61 | ### MSCN 62 | 63 | CMD: `train-mscn` and `test-mscn` 64 | 65 | Model Architectures: 66 | * Census 67 | * num_samples: 200, hid_units: 32 68 | * num_samples: 400, hid_units: 16 69 | * num_samples: 500, hid_units: 8 70 | * num_samples: 600, hid_units: 4 71 | * Forest 72 | * num_samples: 1000, hid_units: 64 73 | * num_samples: 3000, hid_units: 32 74 | * num_samples: 4000, hid_units: 16 75 | * num_samples: 5000, hid_units: 8 76 | * Power 77 | * num_samples: 1000, hid_units: 128 78 | * num_samples: 5000, hid_units: 64 79 | * num_samples: 9000, hid_units: 32 80 | * num_samples: 10000, hid_units: 16 81 | * DMV 82 | * num_samples: 1000, hid_units: 512 83 | * num_samples: 5000, hid_units: 512 84 | * num_samples: 8000, hid_units: 256 85 | * num_samples: 10000, hid_units: 256 86 | 87 | Others: 88 | * bs: 256, 512, 1024, 2048 89 | * epochs: 100 90 | 91 | Selected Models: 92 | 93 | ```bash 94 | # census 95 | just train-mscn census13 original base 500 8 100 256 100000 0 123 96 | just test-mscn original_base-mscn_hid8_sample500_ep100_bs256_100k-123 census13 original base 123 97 | 98 | # forest 99 | just train-mscn forest10 original base 3000 32 100 256 100000 0 123 100 | just test-mscn original_base-mscn_hid32_sample3000_ep100_bs256_100k-123 forest10 original base 123 101 | 102 | # power 103 | just train-mscn power7 original base 5000 64 100 256 100000 0 123 104 | just test-mscn original_base-mscn_hid64_sample5000_ep100_bs256_100k-123 power7 original base 123 105 | 106 | # dmv 107 | just train-mscn dmv11 original base 10000 256 100 256 100000 0 123 108 | just test-mscn original_base-mscn_hid256_sample10000_ep100_bs256_100k-123 dmv11 original base 123 109 | ``` 110 | 111 | ### LW-NN 112 | 113 | CMD: `train-lw-nn` and `test-lw-nn` 114 | 115 | Model Architectures: 116 | * Census 117 | * hid_units: 64_64_64_64 118 | * hid_units: 128_64_32_16 119 | * hid_units: 64_64_64 120 | * hid_units: 128_64_32 121 | * Forest 122 | * hid_units: 512_256 123 | * hid_units: 256_256_256 124 | * hid_units: 256_256_128_128 125 | * hid_units: 256_256_128_64 126 | * Power 127 | * hid_units: 512_512 128 | * hid_units: 512_256_128_64 129 | * hid_units: 256_256_256_256 130 | * hid_units: 512_512_256 131 | * DMV 132 | * hid_units: 2048_1024_512_256 133 | * hid_units: 1024_1024_1024_1024 134 | * hid_units: 2048_1024_1024 135 | * hid_units: 1024_1024_1024 136 | 137 | Others: 138 | * bs: 32, 128, 512 139 | * bins: 200 140 | * epochs: 500 141 | 142 | Selected Models: 143 | 144 | ```bash 145 | # census 146 | just train-lw-nn census13 original base 64_64_64 200 100000 128 0 123 147 | just test-lw-nn original_base-lwnn_hid64_64_64_bin200_ep500_bs128_100k-123 census13 original base True 123 148 | 149 | # forest 150 | just train-lw-nn forest10 original base 256_256_128_64 200 100000 32 0 123 151 | just test-lw-nn original_base-lwnn_hid256_256_128_64_bin200_ep500_bs32_100k-123 forest10 original base True 123 152 | 153 | # power 154 | just train-lw-nn power7 original base 512_512_256 200 100000 128 0 123 155 | just test-lw-nn original_base-lwnn_hid512_512_256_bin200_ep500_bs128_100k-123 power7 original base True 123 156 | 157 | # dmv 158 | just train-lw-nn dmv11 original base 2048_1024_512_256 200 100000 32 0 123 159 | just test-lw-nn original_base-lwnn_hid2048_1024_512_256_bin200_ep500_bs32_100k-123 dmv11 original base True 123 
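# Note: the first argument of test-lw-nn is the saved model name, which appears to encode
# the training configuration as {version}_{workload}-lwnn_hid{units}_bin{bins}_ep{epochs}_bs{batch size}_{train num}-{seed}
# (pattern inferred from the commands above; treat it as an assumption, not an official naming spec).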
160 | ``` 161 | 162 | ### LW-XGB 163 | 164 | CMD: `train-lw-tree` and `test-lw-tree` 165 | 166 | trees: 167 | * Census: 16, 32, 64 168 | * Forest: 128, 256, 512 169 | * Power: 256, 512, 1024 170 | * DMV: 2048, 4096, 8192 171 | 172 | Selected Models: 173 | 174 | ```bash 175 | # census 176 | just train-lw-tree census13 original base 64 200 100000 0 123 177 | just test-lw-tree original_base-lwxgb_tr64_bin200_100k-123 census13 original base True 123 178 | 179 | # forest 180 | just train-lw-tree forest10 original base 512 200 100000 0 123 181 | just test-lw-tree original_base-lwxgb_tr512_bin200_100k-123 forest10 original base True 123 182 | 183 | # power 184 | just train-lw-tree power7 original base 256 200 100000 0 123 185 | just test-lw-tree original_base-lwxgb_tr256_bin200_100k-123 power7 original base True 123 186 | 187 | # dmv 188 | just train-lw-tree dmv11 original base 8192 200 100000 0 123 189 | just test-lw-tree original_base-lwxgb_tr8192_bin200_100k-123 dmv11 original base True 123 190 | ``` 191 | 192 | ### DeepDB 193 | 194 | CMD: `train-deepdb` and `test-deepdb` 195 | 196 | Grid Search: 197 | * rdc_threshold: 0.2, 0.3, 0.4 198 | * ratio_min_instance_slice: 0.001, 0.005, 0.01, 0.05 199 | * hdf_sample_size: 1M, 10M 200 | 201 | Selected Models: 202 | 203 | ```bash 204 | # census 205 | just train-deepdb census13 original 1000000 0.4 0.01 0 base 123 206 | just test-deepdb original-spn_sample48842_rdc0.4_ms0.01-123 census13 original base 123 207 | 208 | # forest 209 | just train-deepdb forest10 original 1000000 0.4 0.005 0 base 123 210 | just test-deepdb original-spn_sample581012_rdc0.4_ms0.005-123 forest10 original base 123 211 | 212 | # power 213 | just train-deepdb power7 original 10000000 0.3 0.001 0 base 123 214 | just test-deepdb original-spn_sample2075259_rdc0.3_ms0.001-123 power7 original base 123 215 | 216 | # dmv 217 | just train-deepdb dmv11 original 1000000 0.2 0.001 0 base 123 218 | just test-deepdb original-spn_sample1000000_rdc0.2_ms0.001-123 dmv11 original base 123 219 | ``` 220 | 221 | ## Micro-Bencmark 222 | 223 | ### Preparation 224 | 225 | #### Data Generation 226 | 227 | CMD: `data-gen` 228 | * skew: 0.0, 0.2, 0.4, ..., 1.8, 2.0 229 | * corr: 0.0, 0.1, 0.2, ..., 0.9, 1.0 230 | * dom: 10, 100, 1000, 10000 231 | 232 | #### Workload Generation 233 | 234 | CMD: `wkld-gen-vood` and `wkld-label` 235 | 236 | Example: generate dataset and workload for dataset versions with 1000 domain values 237 | 238 | ```bash 239 | # 1. generate versions 240 | for c in 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0; do 241 | for s in 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8 2.0; do 242 | just data-gen $s $c 1000 243 | done 244 | done 245 | 246 | # 2. generate queryset (can use any version to generate this workload since we use independent center values and the domains are the same) 247 | wkld-gen-vood dom1000 skew0.0_corr0.0 248 | 249 | # 3. generate labels for each version 250 | for c in 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0; do 251 | for s in 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8 2.0; do 252 | just wkld-label dom1000 skew${s}_corr${c} vood 253 | done 254 | done 255 | ``` 256 | 257 | ### Model Parameters 258 | 259 | In this experiment, we train and test following models on every synthetic dataset (using `vood` workload) we generated. 
**Architecture used to report in paper** 260 | 261 | #### Naru 262 | 263 | CMD: `train-naru` and `test-naru` 264 | 265 | Model Architectures: 266 | * dom10 (domain size = 10) 267 | * layers: 5, hc_hiddens: 64, embed_size: 128 268 | * **layers: 4, hc_hiddens: 64, embed_size: 64** 269 | * layers: 5, hc_hiddens: 64, embed_size: 32 270 | * dom100 271 | * layers: 4, hc_hiddens: 32, embed_size: 128 272 | * layers: 5, hc_hiddens: 32, embed_size: 64 273 | * **layers: 5, hc_hiddens: 32, embed_size: 32** 274 | * dom1000 275 | * layers: 5, hc_hiddens: 16, embed_size: 16 276 | * layers: 5, hc_hiddens: 64, embed_size: 8 277 | * **layers: 4, hc_hiddens: 16, embed_size: 16** 278 | * dom10000 279 | * layers: 3, hc_hiddens: 64, embed_size: 2 280 | * **layers: 4, hc_hiddens: 32, embed_size: 2** 281 | * layers: 5, hc_hiddens: 32, embed_size: 2 282 | 283 | Others: 284 | * warmups: 0 285 | * epochs: 100 286 | 287 | #### MSCN 288 | 289 | CMD: `train-mscn` and `test-mscn` 290 | 291 | Model Architectures: 292 | * **num_samples: 1000, hid_units: 32** 293 | * num_samples: 3000, hid_units: 8 294 | * num_samples: 5000, hid_units: 4 295 | 296 | Others: 297 | * bs: 1024 298 | * epochs: 100 299 | * train_num: 100000 300 | 301 | #### LW-NN 302 | 303 | CMD: `train-lw-nn` and `test-lw-nn` 304 | 305 | Model Architectures: 306 | * hid_units: 256_128_64 307 | * hid_units: 128_128_128 308 | * **hid_units: 256_128_64_32** 309 | 310 | Others: 311 | * bs: 32 312 | * bins: 200 313 | * epochs: 500 314 | * train_num: 100000 315 | 316 | #### LW-Tree 317 | 318 | CMD: `train-lw-tree` and `test-lw-tree` 319 | 320 | * trees: 128 321 | * bins: 200 322 | * train_num: 100000 323 | 324 | #### DeepDB 325 | 326 | CMD: `train-deepdb` and `test-deepdb` 327 | 328 | * hdf_sample_size: 1000000 329 | * rdc_threshold: 0.3 330 | * ratio_min_instance_slice: 0.01 331 | -------------------------------------------------------------------------------- /lecarb/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging import getLogger 3 | 4 | logger = getLogger(__name__) 5 | logger.setLevel(logging.DEBUG) 6 | 7 | ch = logging.StreamHandler() 8 | ch.setLevel(logging.DEBUG) 9 | formatter = logging.Formatter("[{asctime} {levelname}] {name}: {message}", style="{") 10 | ch.setFormatter(formatter) 11 | logger.addHandler(ch) 12 | -------------------------------------------------------------------------------- /lecarb/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | import torch 4 | 5 | DATA_ROOT = Path(os.environ["DATA_ROOT"]) 6 | OUTPUT_ROOT = Path(os.environ["OUTPUT_ROOT"]) 7 | MODEL_ROOT = OUTPUT_ROOT / "model" 8 | RESULT_ROOT = OUTPUT_ROOT / "result" 9 | LOG_ROOT = OUTPUT_ROOT / "log" 10 | 11 | DATABASE_URL = os.environ["DATABASE_URL"] 12 | KDE_DATABASE_URL = os.environ["KDE_DATABASE_URL"] 13 | MYSQL_HOST = os.environ["MYSQL_HOST"] 14 | MYSQL_PORT = os.environ["MYSQL_PORT"] 15 | MYSQL_DB = os.environ["MYSQL_DB"] 16 | MYSQL_USER = os.environ["MYSQL_USER"] 17 | MYSQL_PSWD = os.environ["MYSQL_PSWD"] 18 | 19 | PKL_PROTO = 4 20 | 21 | DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' 22 | NUM_THREADS = int(os.environ.get("CPU_NUM_THREADS", os.cpu_count())) 23 | 24 | VALID_NUM_DATA_DRIVEN = 100 25 | -------------------------------------------------------------------------------- /lecarb/dataset/dataset.py: -------------------------------------------------------------------------------- 1 | import os 
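# dataset.py defines the Column and Table abstractions used throughout the benchmark:
# a Column records the vocabulary, min/max and NaN information of one attribute, and a
# Table wraps the pickled dataframe together with per-column statistics and the
# discretization / normalization helpers.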
2 | import copy 3 | import logging 4 | import pickle 5 | from collections import OrderedDict 6 | 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.metrics import mutual_info_score 10 | from scipy.stats import entropy 11 | 12 | from ..constants import DATA_ROOT, PKL_PROTO 13 | from ..dtypes import is_categorical 14 | 15 | L = logging.getLogger(__name__) 16 | 17 | class Column(object): 18 | def __init__(self, name, data): 19 | self.name = name 20 | self.dtype = data.dtype 21 | 22 | # parse vocabulary 23 | self.vocab, self.has_nan = self.__parse_vocab(data) 24 | self.vocab_size = len(self.vocab) 25 | self.minval = self.vocab[1] if self.has_nan else self.vocab[0] 26 | self.maxval = self.vocab[-1] 27 | 28 | def __repr__(self): 29 | return f'Column({self.name}, type={self.dtype}, vocab size={self.vocab_size}, min={self.minval}, max={self.maxval}, has NaN={self.has_nan})' 30 | 31 | def __parse_vocab(self, data): 32 | # pd.isnull returns true for both np.nan and np.datetime64('NaT'). 33 | is_nan = pd.isnull(data) 34 | contains_nan = np.any(is_nan) 35 | # NOTE: np.sort puts NaT values at beginning, and NaN values at end. 36 | # For our purposes we always add any null value to the beginning. 37 | vs = np.sort(np.unique(data[~is_nan])) 38 | if contains_nan: 39 | vs = np.insert(vs, 0, np.nan) 40 | return vs, contains_nan 41 | 42 | def discretize(self, data): 43 | """Transforms data values into integers using a Column's vocabulary""" 44 | 45 | # pd.Categorical() does not allow categories be passed in an array 46 | # containing np.nan. It makes it a special case to return code -1 47 | # for NaN values. 48 | if self.has_nan: 49 | bin_ids = pd.Categorical(data, categories=self.vocab[1:]).codes 50 | # Since nan/nat bin_id is supposed to be 0 but pandas returns -1, just 51 | # add 1 to everybody 52 | bin_ids = bin_ids + 1 53 | else: 54 | # This column has no nan or nat values. 
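# pd.Categorical(...).codes maps each value to its index in self.vocab, so the
# resulting bin ids are dense integers in [0, vocab_size); any value missing from
# the vocabulary would get code -1 and trip the assertion below.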
55 | bin_ids = pd.Categorical(data, categories=self.vocab).codes 56 | 57 | bin_ids = bin_ids.astype(np.int32, copy=False) 58 | assert (bin_ids >= 0).all(), (self, data, bin_ids) 59 | return bin_ids 60 | 61 | def normalize(self, data): 62 | """Normalize data to range [0, 1]""" 63 | minval = self.minval 64 | maxval = self.maxval 65 | # if column is not numerical, use descretized value 66 | if is_categorical(self.dtype): 67 | data = self.discretize(data) 68 | minval = 0 69 | maxval = self.vocab_size - 1 70 | data = np.array(data, dtype=np.float32) 71 | if minval >= maxval: 72 | L.warning(f"column {self.name} has min value {minval} >= max value{maxval}") 73 | return np.zeros(len(data)).astype(np.float32) 74 | val_norm = (data - minval) / (maxval - minval) 75 | return val_norm.astype(np.float32) 76 | 77 | class Table(object): 78 | def __init__(self, dataset, version): 79 | self.dataset = dataset 80 | self.version = version 81 | self.name = f"{self.dataset}_{self.version}" 82 | L.info(f"start building data {self.name}...") 83 | 84 | # load data 85 | self.data = pd.read_pickle(DATA_ROOT / self.dataset / f"{self.version}.pkl") 86 | self.data_size_mb = self.data.values.nbytes / 1024 / 1024 87 | self.row_num = self.data.shape[0] 88 | self.col_num = len(self.data.columns) 89 | 90 | # parse columns 91 | self.parse_columns() 92 | L.info(f"build finished: {self}") 93 | 94 | def parse_columns(self): 95 | self.columns = OrderedDict([(col, Column(col, self.data[col])) for col in self.data.columns]) 96 | 97 | def __repr__(self): 98 | return f"Table {self.name} ({self.row_num} rows, {self.data_size_mb:.2f}MB, columns:\n{os.linesep.join([repr(c) for c in self.columns.values()])})" 99 | 100 | def get_minmax_dict(self): 101 | minmax_dict = {} 102 | for i, col in enumerate(self.columns.values()): 103 | minmax_dict[i] = (col.minval, col.maxval) 104 | return minmax_dict 105 | 106 | def normalize(self, scale=1): 107 | data = copy.deepcopy(self.data) 108 | for cname, col in self.columns.items(): 109 | data[cname] = col.normalize(data[cname].values) * scale 110 | return data 111 | 112 | def digitalize(self): 113 | data = copy.deepcopy(self.data) 114 | for cname, col in self.columns.items(): 115 | if is_categorical(col.dtype): 116 | data[cname] = col.discretize(data[cname]) 117 | elif col.has_nan: 118 | data[cname].fillna(0, inplace=True) 119 | return data 120 | 121 | def get_max_muteinfo_order(self): 122 | order = [] 123 | 124 | # find the first column with maximum entropy 125 | max_entropy = float('-inf') 126 | first_col = None 127 | for c in self.columns.keys(): 128 | e = entropy(self.data[c].value_counts()) 129 | if e > max_entropy: 130 | first_col = c 131 | max_entropy = e 132 | assert first_col is not None, (first_col, max_entropy) 133 | order.append(first_col) 134 | sep = '|' 135 | chosen_data = self.data[first_col].astype(str) + sep 136 | 137 | # add the rest columns one by one by choosing the max mutual information with existing columns 138 | while len(order) < self.col_num: 139 | max_muinfo = float('-inf') 140 | next_col = None 141 | for c in self.columns.keys(): 142 | if c in order: continue 143 | m = mutual_info_score(chosen_data, self.data[c]) 144 | if m > max_muinfo: 145 | next_col = c 146 | max_muinfo = m 147 | assert next_col is not None, (next_col, max_entropy) 148 | order.append(next_col) 149 | # concate new chosen columns 150 | chosen_data = chosen_data + sep + self.data[next_col].astype(str) 151 | 152 | return order, [self.data.columns.get_loc(c) for c in order] 153 | 154 | def get_muteinfo(self, 
digital_data=None): 155 | data = digital_data if digital_data is not None else self.digitalize() 156 | muteinfo_dict = {} 157 | for c1 in self.columns.keys(): 158 | muteinfo_dict[c1] = {} 159 | for c2 in self.columns.keys(): 160 | if c1 != c2 and c2 in muteinfo_dict: 161 | assert c1 in muteinfo_dict[c2], muteinfo_dict.keys() 162 | muteinfo_dict[c1][c2] = muteinfo_dict[c2][c1] 163 | else: 164 | muteinfo_dict[c1][c2] = mutual_info_score(data[c1], data[c2]) 165 | return pd.DataFrame().from_dict(muteinfo_dict) 166 | 167 | def dump_table(table: Table) -> None: 168 | with open(DATA_ROOT / table.dataset / f"{table.version}.table.pkl", 'wb') as f: 169 | pickle.dump(table, f, protocol=PKL_PROTO) 170 | 171 | def load_table(dataset: str, version: str, overwrite: bool=False) -> Table: 172 | table_path = DATA_ROOT / dataset / f"{version}.table.pkl" 173 | 174 | if not overwrite and table_path.is_file(): 175 | L.info("table exists, load...") 176 | with open(table_path, 'rb') as f: 177 | table = pickle.load(f) 178 | L.info(f"load finished: {table}") 179 | return table 180 | 181 | table = Table(dataset, version) 182 | L.info("dump table to disk...") 183 | dump_table(table) 184 | return table 185 | 186 | def dump_table_to_num(dataset: str, version: str) -> None: 187 | table = load_table(dataset, version) 188 | num_data = table.digitalize() 189 | csv_path = DATA_ROOT / dataset / f"{version}_num.csv" 190 | L.info(f"dump csv file to {csv_path}") 191 | num_data.to_csv(csv_path, index=False) 192 | 193 | 194 | if __name__ == '__main__': 195 | # table = load_table('forest') 196 | # print(table.get_max_muteinfo_order()) 197 | # 7 1 8 6 5 9 0 4 3 2 198 | 199 | # table = load_table('census') 200 | # print(table.get_max_muteinfo_order()) 201 | # 4 3 2 0 6 12 7 5 1 13 9 10 8 11 202 | 203 | table = Table('census', 'original') 204 | print(table) 205 | # print(table.get_max_muteinfo_order()) 206 | # 4 0 1 2 3 5 8 7 6 207 | -------------------------------------------------------------------------------- /lecarb/dataset/gen_dataset.py: -------------------------------------------------------------------------------- 1 | import random 2 | import logging 3 | import numpy as np 4 | import pandas as pd 5 | from scipy.stats import truncnorm, truncexpon, genpareto 6 | from typing import Dict, Any 7 | 8 | from .dataset import load_table 9 | from ..constants import DATA_ROOT 10 | 11 | L = logging.getLogger(__name__) 12 | 13 | def get_truncated_normal(mean=0, sd=100, low=0, upp=1000): 14 | return truncnorm((low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd) 15 | 16 | def get_truncated_expon(scale=100, low=0, upp=1000): 17 | return truncexpon(b=(upp-low)/scale, loc=low, scale=scale) 18 | 19 | def generate_dataset( 20 | seed: int, dataset: str, version: str, 21 | params: Dict[str, Any], overwrite: bool 22 | ) -> None: 23 | path = DATA_ROOT / dataset 24 | path.mkdir(exist_ok=True) 25 | csv_path = path / f"{version}.csv" 26 | pkl_path = path / f"{version}.pkl" 27 | if not overwrite and csv_path.is_file(): 28 | L.info(f"Dataset path exists, do not continue") 29 | return 30 | 31 | row_num = params['row_num'] 32 | col_num = params['col_num'] 33 | dom = params['dom'] 34 | corr = params['corr'] 35 | skew = params['skew'] 36 | 37 | if col_num != 2: 38 | L.info("For now only support col=2!") 39 | exit(0) 40 | 41 | L.info(f"Start generate dataset with {col_num} columns and {row_num} rows using seed {seed}") 42 | random.seed(seed) 43 | np.random.seed(seed) 44 | 45 | # generate the first column according to skew 46 | col0 = np.arange(dom) # 
make sure every domain value has at least 1 value 47 | tmp = genpareto.rvs(skew-1, size=row_num-len(col0)) # c = skew - 1, so we can have c >= 0 48 | tmp = ((tmp - tmp.min()) / (tmp.max() - tmp.min())) * dom # rescale generated data to the range of domain 49 | col0 = np.concatenate((col0, np.clip(tmp.astype(int), 0, dom-1))) 50 | 51 | # generate the second column according to the first 52 | col1 = [] 53 | for c0 in col0: 54 | col1.append(c0 if np.random.uniform(0, 1) <= corr else np.random.choice(dom)) 55 | 56 | df = pd.DataFrame(data={'col0': col0, 'col1': col1}) 57 | 58 | L.info(f"Dump dataset {dataset} as version {version} to disk") 59 | df.to_csv(csv_path, index=False) 60 | df.to_pickle(pkl_path) 61 | load_table(dataset, version) 62 | L.info(f"Finish!") 63 | 64 | -------------------------------------------------------------------------------- /lecarb/dataset/manipulate_dataset.py: -------------------------------------------------------------------------------- 1 | import random 2 | import logging 3 | import pickle 4 | import numpy as np 5 | import math 6 | import pandas as pd 7 | from scipy.stats import truncnorm, truncexpon, genpareto 8 | from typing import Dict, Any, Tuple 9 | from copy import deepcopy 10 | 11 | from .dataset import load_table 12 | from ..constants import DATA_ROOT, PKL_PROTO 13 | 14 | L = logging.getLogger(__name__) 15 | 16 | # Independence data: Random by each column 17 | def get_random_data(dataset: str, version: str, overwrite=False) -> Tuple[pd.DataFrame, str]: 18 | rand_version = f"{version}_ind" 19 | random_file = DATA_ROOT / dataset / f"{rand_version}.pkl" 20 | if not overwrite and random_file.is_file(): 21 | L.info(f"Dataset path exists, using it") 22 | return pd.read_pickle(random_file), rand_version 23 | 24 | df = pd.read_pickle(DATA_ROOT / dataset / f"{version}.pkl") 25 | for col in df.columns: 26 | df[col] = df[col].sample(frac=1).reset_index(drop=True) 27 | pd.to_pickle(df, random_file, protocol=PKL_PROTO) 28 | return df, rand_version 29 | 30 | # Max Spearman correlation data: sort by each column 31 | def get_sorted_data(dataset: str, version: str, overwrite=False) -> Tuple[pd.DataFrame, str]: 32 | sort_version = f"{version}_cor" 33 | sorted_file = DATA_ROOT / dataset / f"{sort_version}.pkl" 34 | if not overwrite and sorted_file.is_file(): 35 | return pd.read_pickle(sorted_file), sort_version 36 | 37 | df = pd.read_pickle(DATA_ROOT / dataset / f"{version}.pkl") 38 | for col in df.columns: 39 | df[col] = df[col].sort_values().reset_index(drop=True) 40 | df = df.sample(frac=1).reset_index(drop=True) 41 | pd.to_pickle(df, sorted_file, protocol=PKL_PROTO) 42 | return df, sort_version 43 | 44 | # Get skew data by tuple level frequent rank. 
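# Each row is scored by the sum of its per-column value frequencies; the `sample_ratio`
# fraction of rows with the lowest scores (i.e. rows made up of rare values) is selected
# and replicated back up to the original table size, yielding a heavily skewed version
# of the data.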
45 | def get_skew_data(dataset: str = 'census', version: str = 'original', sample_ratio=0.0005, overwrite=False) -> Tuple[pd.DataFrame, str]: 46 | skew_version = f"{version}_skew" 47 | skew_file = DATA_ROOT / dataset / f"{skew_version}.pkl" 48 | if not overwrite and skew_file.is_file(): 49 | return pd.read_pickle(skew_file), skew_version 50 | 51 | df = pd.read_pickle(DATA_ROOT / dataset / f"{version}.pkl") 52 | 53 | 54 | rank_df = pd.DataFrame(0.0, index=range(len(df)), columns=['rank_sum']).astype(np.float32) 55 | for col in df.columns: 56 | rank_df['rank_sum'] += df[col].map(df[col].value_counts().div(len(rank_df))).astype(np.float32) 57 | print(f"{col} frequency calculation finished!") 58 | selected_id = rank_df.sort_values(by='rank_sum').head(round(len(df)*sample_ratio)).index 59 | sk_df = df.iloc[selected_id] 60 | sk_df = pd.concat([sk_df] * int(1/sample_ratio + 1), ignore_index=True).head(len(df)) 61 | pd.to_pickle(sk_df, skew_file, protocol=PKL_PROTO) 62 | return sk_df, skew_version 63 | 64 | 65 | 66 | def append_data(dataset: str, version_target: str, version_from: str, interval=0.2): 67 | df_target = pd.read_pickle(DATA_ROOT / dataset / f"{version_target}.pkl") 68 | df_from = pd.read_pickle(DATA_ROOT / dataset / f"{version_from}.pkl") 69 | 70 | row_num = len(df_from) 71 | l = 0 72 | r = l + interval 73 | if r <= 1: 74 | L.info(f"Start appending {version_target} with {version_from} in [{l}, {r}]") 75 | df_target = df_target.append(df_from[int(l*row_num): int(r*row_num)], ignore_index=True, sort=False) 76 | pd.to_pickle(df_target, DATA_ROOT / dataset / f"{version_target}+{version_from}_{r:.1f}.pkl") 77 | df_target.to_csv(DATA_ROOT / dataset / f"{version_target}+{version_from}_{r:.1f}.csv", index=False) 78 | load_table(dataset, f"{version_target}+{version_from}_{r:.1f}") 79 | else: 80 | L.info(f"Appending Fail! 
Batch size is too big!") 81 | 82 | 83 | 84 | def gen_appended_dataset( 85 | seed: int, dataset: str, version: str, 86 | params: Dict[str, Any], overwrite: bool 87 | ) -> None: 88 | random.seed(seed) 89 | np.random.seed(seed) 90 | update_type = params.get('type') 91 | batch_ratio = params.get('batch_ratio') 92 | L.info(f"Start generating appended data for {dataset}/{version}") 93 | 94 | if update_type == 'ind': 95 | _, rand_version = get_random_data(dataset, version, overwrite=overwrite) 96 | append_data(dataset, version, rand_version, interval=batch_ratio) 97 | elif update_type == 'cor': 98 | _, sort_version = get_sorted_data(dataset, version, overwrite=overwrite) 99 | append_data(dataset, version, sort_version, interval=batch_ratio) 100 | elif update_type == 'skew': 101 | _, skew_version = get_skew_data(dataset, version, 102 | sample_ratio=float(params['skew_size']), overwrite=overwrite) 103 | append_data(dataset, version, skew_version, interval=batch_ratio) 104 | else: 105 | raise NotImplementedError 106 | L.info("Finish updating data!") 107 | 108 | 109 | -------------------------------------------------------------------------------- /lecarb/dtypes.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for auxiliary type detection functions 3 | """ 4 | 5 | from typing import Any 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | CATEGORICAL_NUMPY_DTYPES = [np.bool, np.object] 11 | CATEGORICAL_PANDAS_DTYPES = [pd.CategoricalDtype, pd.PeriodDtype] 12 | CATEGORICAL_DTYPES = CATEGORICAL_NUMPY_DTYPES + CATEGORICAL_PANDAS_DTYPES 13 | 14 | NUMERICAL_NUMPY_DTYPES = [np.number, np.datetime64] 15 | NUMERICAL_PANDAS_DTYPES = [pd.DatetimeTZDtype] 16 | NUMERICAL_DTYPES = NUMERICAL_NUMPY_DTYPES + NUMERICAL_PANDAS_DTYPES 17 | 18 | 19 | def is_categorical(dtype: Any) -> bool: 20 | """ 21 | Given a type, return if that type is a categorical type 22 | """ 23 | 24 | if is_numerical(dtype): 25 | return False 26 | 27 | if isinstance(dtype, np.dtype): 28 | dtype = dtype.type 29 | 30 | return any(issubclass(dtype, c) for c in CATEGORICAL_NUMPY_DTYPES) 31 | else: 32 | return any(isinstance(dtype, c) for c in CATEGORICAL_PANDAS_DTYPES) 33 | 34 | 35 | def is_numerical(dtype: Any) -> bool: 36 | """ 37 | Given a type, return if that type is a numerical type 38 | """ 39 | if isinstance(dtype, np.dtype): 40 | dtype = dtype.type 41 | return any(issubclass(dtype, c) for c in NUMERICAL_NUMPY_DTYPES) 42 | else: 43 | return any(isinstance(dtype, c) for c in NUMERICAL_PANDAS_DTYPES) 44 | 45 | def is_discrete(dtype: Any) -> bool: 46 | """ 47 | Given a type, return if that type is a discrete type (categorical or integer) 48 | """ 49 | if is_categorical(dtype): 50 | return True 51 | 52 | assert isinstance(dtype, np.dtype), dtype 53 | dtype = dtype.type 54 | return issubclass(dtype, np.integer) 55 | 56 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/README.md: -------------------------------------------------------------------------------- 1 | Paper: [DeepDB: Learn from Data, not from Queries!](http://www.vldb.org/pvldb/vol13/p992-hilprecht.pdf) 2 | Code Reference: [repo](https://github.com/DataManagementLab/deepdb-public) 3 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sfu-db/AreCELearnedYet/aa52da7768023270bad884232972e0b77ec6534a/lecarb/estimator/deepdb/aqp_spn/__init__.py -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/convert_conditions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from spn.structure.StatisticalTypes import MetaType 3 | 4 | 5 | def _convert_range(range, pos): 6 | if range[pos] == -np.inf or range[pos] == np.inf: 7 | minusInf = True 8 | condition = 0 9 | else: 10 | minusInf = False 11 | condition = range[pos] 12 | return minusInf, condition 13 | 14 | 15 | def _convert_real(idx, condition, inverted_features): 16 | # method_params += [f'bool inverse{i}', f'bool leftMinusInf{i}', f'float leftCondition{i}', 17 | # f'bool rightMinusInf{i}', f'float rightCondition{i}', f'bool leftIncluded{i}', 18 | # f'bool rightIncluded{i}', f'float nullValue{i}'] 19 | 20 | inverse = idx in inverted_features 21 | if condition is not None: 22 | leftMinusInf, leftCondition = _convert_range(condition.ranges[0], 0) 23 | rightMinusInf, rightCondition = _convert_range(condition.ranges[0], 1) 24 | return inverse, leftMinusInf, leftCondition, rightMinusInf, rightCondition, condition.inclusive_intervals[0][0], \ 25 | condition.inclusive_intervals[0][1], condition.null_value 26 | 27 | return inverse, False, 0, False, 0, False, False, 0 28 | 29 | 30 | def _convert_categorical(condition): 31 | # method_params += [f'vector possibleValues{i}', f'int nullValueIdx{i}'] 32 | 33 | if condition is not None: 34 | if condition.is_not_null_condition: 35 | return condition.possible_values, condition.null_value 36 | else: 37 | return condition.possible_values, -1 38 | 39 | # leaves will anyway not be evaluated 40 | return [0], 0 41 | 42 | 43 | def convert_range(relevant_scope, featureScope, meta_types, conditions, inverted_features): 44 | """ 45 | Translates conditions for an expectation method call into parameters that can be passed to generated SPN code. 46 | :param relevant_scope: relevant_scope from expectation method 47 | :param featureScope: feature_scope from expectation method 48 | :param meta_types: types of the columns of the SPN 49 | :param conditions: conditions to be translated 50 | :param inverted_features: list indicating which indexes are inverted features (1/x) 51 | :return: Boolean indicating whether inference is supported by generated SPN. Parameters that have to be passed. 
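One parameter group is appended per column: DISCRETE columns contribute the list of
possible value indexes plus a null-value index, while REAL columns contribute the
inverted-feature flag, the left/right bounds with their inclusion and +/-infinity
flags, and the null value, mirroring the signature of the generated C++ method.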
52 | """ 53 | parameters = (relevant_scope, featureScope) 54 | 55 | for idx, condition in enumerate(conditions): 56 | if meta_types[idx] == MetaType.DISCRETE: 57 | parameters += _convert_categorical(condition) 58 | elif meta_types[idx] == MetaType.REAL: 59 | # several conditions currently not supported 60 | if condition is not None and len(condition.ranges) > 1: 61 | return False, None 62 | # conditions on feature column currently not supported in C++ 63 | if featureScope[idx] is None: 64 | return False, None 65 | parameters += _convert_real(idx, condition, inverted_features) 66 | 67 | return True, parameters -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/generate_code.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from enum import Enum 3 | from time import perf_counter 4 | 5 | import numpy as np 6 | from spn.structure.Base import assign_ids, Product, get_number_of_nodes 7 | from spn.structure.StatisticalTypes import MetaType 8 | 9 | from aqp_spn.aqp_leaves import Categorical, IdentityNumericLeaf, Sum 10 | from ensemble_compilation.spn_ensemble import read_ensemble 11 | 12 | import os 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class TemplatePath(Enum): 18 | current_file_path = __file__ 19 | current_file_dir = os.path.dirname(__file__) 20 | MASTER = os.path.join(current_file_dir, 'templates/master.cpp') 21 | CATEGORICAL = os.path.join(current_file_dir, 'templates/categorical_leave.cpp') 22 | IDENTITY = os.path.join(current_file_dir, 'templates/identity_leave.cpp') 23 | PRODUCT = os.path.join(current_file_dir, 'templates/product_node.cpp') 24 | SUM = os.path.join(current_file_dir, 'templates/sum_node.cpp') 25 | METHOD_MASTER = os.path.join(current_file_dir, 'templates/method_master.cpp') 26 | REGISTRATION_MASTER = os.path.join(current_file_dir, 'templates/registration_master.cpp') 27 | 28 | 29 | def replace_template(template_path, value_dictionary, depth): 30 | with open(template_path.value, 'r') as ftemp: 31 | templateString = ftemp.read() 32 | 33 | code_string = templateString.format(**value_dictionary) 34 | padding = ''.join([' '] * depth) 35 | return ''.join([padding + line for line in code_string.splitlines(True)]) 36 | 37 | 38 | def comma_seperated_list(value_list): 39 | return ', '.join([str(v) for v in value_list]) 40 | 41 | 42 | def generate_scope_check(scope): 43 | return ' || '.join([f'relevantScope[{node_scope}]' for node_scope in scope]) 44 | 45 | 46 | def generate_categorical_node(node, root_node, floating_data_type, depth): 47 | value_dictionary = { 48 | 'node_id': node.id, 49 | 'node_scope': node.scope[0], 50 | 'node_p': comma_seperated_list(node.p), 51 | 'final_assert': f'resultValue = nodeIntermediateResult[{node.id}];' if root_node == node else '', 52 | 'floating_data_type': floating_data_type 53 | } 54 | return replace_template(TemplatePath.CATEGORICAL, value_dictionary, depth) 55 | 56 | 57 | def nan_replacement(value): 58 | if np.isnan(value): 59 | return 0 60 | else: 61 | return value 62 | 63 | 64 | def generate_identity_node(node, root_node, floating_data_type, depth): 65 | value_dictionary = { 66 | 'node_id': node.id, 67 | 'node_scope': node.scope[0], 68 | 'null_value_prob': node.null_value_prob, 69 | 'unique_values': comma_seperated_list(node.unique_vals), 70 | 'prob_sum': comma_seperated_list(node.prob_sum), 71 | 'mean': nan_replacement(node.mean * (1 - node.null_value_prob)), 72 | 'inverted_mean': 
nan_replacement(node.inverted_mean * (1 - node.null_value_prob)), 73 | 'floating_data_type': floating_data_type, 74 | 'final_assert': f'resultValue = nodeIntermediateResult[{node.id}];' if root_node == node else '' 75 | } 76 | return replace_template(TemplatePath.IDENTITY, value_dictionary, depth) 77 | 78 | 79 | def generate_product_node(node, root_node, floating_data_type, depth): 80 | # if ({scope_check}) {{ 81 | # {subtree_code} 82 | # nodeIntermediateResult[{node_id}] = 1.0 83 | # {result_calculation} 84 | # }} 85 | 86 | result_calculation_lines = [] 87 | for child in node.children: 88 | result_calculation_lines += [f'if ({generate_scope_check(child.scope)}) ' 89 | f'{{nodeIntermediateResult[{node.id}] *= nodeIntermediateResult[{child.id}];}}'] 90 | 91 | value_dictionary = { 92 | 'node_id': node.id, 93 | 'scope_check': generate_scope_check(node.scope), 94 | 'subtree_code': '\n'.join( 95 | [generate_method_body(child, root_node, floating_data_type, depth) for child in node.children]), 96 | 'result_calculation': '\n '.join(result_calculation_lines), 97 | 'final_assert': f'resultValue = nodeIntermediateResult[{node.id}];' if root_node == node else '' 98 | } 99 | return replace_template(TemplatePath.PRODUCT, value_dictionary, depth) 100 | 101 | 102 | def generate_sum_node(node, root_node, floating_data_type, depth): 103 | # if ({scope_check}) {{ 104 | # {subtree_code} 105 | # {result_calculation} 106 | # {final_assert} 107 | # }} 108 | 109 | result_calculation_lines = [] 110 | for i, child in enumerate(node.children): 111 | result_calculation_lines += [f'nodeIntermediateResult[{child.id}] * {node.weights[i]}'] 112 | 113 | value_dictionary = { 114 | 'scope_check': generate_scope_check(node.scope), 115 | 'subtree_code': '\n'.join( 116 | [generate_method_body(child, root_node, floating_data_type, depth) for child in node.children]), 117 | 'result_calculation': f'nodeIntermediateResult[{node.id}]=' + ' + '.join(result_calculation_lines) + ';', 118 | 'final_assert': f'resultValue = nodeIntermediateResult[{node.id}];' if root_node == node else '' 119 | } 120 | return replace_template(TemplatePath.SUM, value_dictionary, depth) 121 | 122 | 123 | def generate_method_body(node, root_node, floating_data_type, depth): 124 | if isinstance(node, Categorical): 125 | return generate_categorical_node(node, root_node, floating_data_type, depth + 1) 126 | elif isinstance(node, IdentityNumericLeaf): 127 | return generate_identity_node(node, root_node, floating_data_type, depth + 1) 128 | elif isinstance(node, Product): 129 | return generate_product_node(node, root_node, floating_data_type, depth + 1) 130 | elif isinstance(node, Sum): 131 | return generate_sum_node(node, root_node, floating_data_type, depth + 1) 132 | else: 133 | raise NotImplementedError 134 | 135 | 136 | def generate_code(spn_id, spn, meta_types, floating_data_type): 137 | """ 138 | Generates inference code for an SPN 139 | :param target_path: the path the generated C++ code is written to 140 | :param floating_data_type: data type floating numbers are represented in generated C++ code 141 | :param spn: root node of an SPN 142 | :return: code string 143 | """ 144 | 145 | # make sure we have ids 146 | assign_ids(spn) 147 | 148 | # fill method body according to SPN structure 149 | method_body = generate_method_body(spn, spn, floating_data_type, 0) 150 | 151 | # build parameters used in generated c++ function 152 | method_params = [] 153 | passed_params = [] 154 | for i, type in enumerate(meta_types): 155 | if type == MetaType.DISCRETE: 156 | 
method_params += [f'vector<int> possibleValues{i}', f'int nullValueIdx{i}'] 157 | passed_params += [f'py::arg("possibleValues{i}")', f'py::arg("nullValueIdx{i}")'] 158 | elif type == MetaType.REAL: 159 | method_params += [f'bool inverse{i}', f'bool leftMinusInf{i}', f'float leftCondition{i}', 160 | f'bool rightMinusInf{i}', f'float rightCondition{i}', f'bool leftIncluded{i}', 161 | f'bool rightIncluded{i}', f'float nullValue{i}'] 162 | passed_params += [f'py::arg("inverse{i}")', f'py::arg("leftMinusInf{i}")', f'py::arg("leftCondition{i}")', 163 | f'py::arg("rightMinusInf{i}")', f'py::arg("rightCondition{i}")', 164 | f'py::arg("leftIncluded{i}")', f'py::arg("rightIncluded{i}")', f'py::arg("nullValue{i}")'] 165 | 166 | value_dictionary = { 167 | 'spn_id': spn_id, 168 | 'method_body': method_body, 169 | 'method_params': ', '.join(method_params), 170 | 'node_count': get_number_of_nodes(spn), 171 | 'passed_params': ', '.join(passed_params), 172 | 'floating_data_type': floating_data_type 173 | } 174 | generated_method = replace_template(TemplatePath.METHOD_MASTER, value_dictionary, 0) 175 | registrate_method = replace_template(TemplatePath.REGISTRATION_MASTER, value_dictionary, 0) 176 | 177 | return generated_method, registrate_method 178 | 179 | 180 | def generate_ensemble_code(spn_ensemble, floating_data_type='float', ensemble_path=None): 181 | registrations = [] 182 | methods = [] 183 | logger.debug(f"Starting code generation") 184 | for i, spn in enumerate(spn_ensemble.spns): 185 | spn.id = i 186 | gen_start = perf_counter() 187 | generated_method, registrate_method = generate_code(i, spn.mspn, spn.meta_types, floating_data_type) 188 | registrations.append(registrate_method) 189 | methods.append(generated_method) 190 | gen_end = perf_counter() 191 | logger.debug(f"Generated code for SPN {i + 1}/{len(spn_ensemble.spns)} in {gen_end - gen_start:.2f}s.") 192 | 193 | value_dictionary = { 194 | 'methods': '\n\n'.join(methods), 195 | 'registration': '\n\t'.join(registrations) 196 | } 197 | generated_code = replace_template(TemplatePath.MASTER, value_dictionary, 0) 198 | 199 | if ensemble_path is not None: 200 | spn_ensemble.save(ensemble_path) 201 | 202 | with open('optimized_inference.cpp', 'w') as f: 203 | f.write(generated_code) 204 | 205 | logger.debug(f"Finished code generation.") 206 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/templates/categorical_leave.cpp: -------------------------------------------------------------------------------- 1 | if (relevantScope[{node_scope}]) {{ 2 | // notNanPerNode[{node_id}] = true; 3 | {floating_data_type} probsNode{node_id}[] = {{ {node_p} }}; 4 | 5 | //not null condition 6 | if (nullValueIdx{node_scope} != -1) {{ 7 | nodeIntermediateResult[{node_id}] = 1 - probsNode{node_id}[nullValueIdx{node_scope}]; 8 | }} else {{ 9 | for (int &idx: possibleValues{node_scope}) {{ 10 | nodeIntermediateResult[{node_id}] += probsNode{node_id}[idx]; 11 | }} 12 | }} 13 | {final_assert} 14 | }} -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/templates/identity_leave.cpp: -------------------------------------------------------------------------------- 1 | if (relevantScope[{node_scope}]) {{ 2 | if (featureScope[{node_scope}]) {{ 3 | if (inverse{node_scope}) {{ 4 | nodeIntermediateResult[{node_id}] = {inverted_mean}; 5 | }} else {{ 6 | nodeIntermediateResult[{node_id}] = {mean}; 7 | }} 8 | }} else {{ 9 | 10 |
vector<{floating_data_type}> uniqueVals{node_id}{{ {unique_values} }}; 11 | vector<{floating_data_type}> probSum{node_id}{{ {prob_sum} }}; 12 | 13 | // search right and left bounds via binary search 14 | int leftIdx{node_id} = 0; 15 | if (!leftMinusInf{node_scope}) {{ 16 | vector<{floating_data_type}>::iterator leftBoundIdx{node_id}; 17 | leftBoundIdx{node_id} = std::lower_bound(uniqueVals{node_id}.begin(), uniqueVals{node_id}.end(), leftCondition{node_scope}); 18 | leftIdx{node_id} = leftBoundIdx{node_id} - uniqueVals{node_id}.begin(); 19 | }} 20 | 21 | int rightIdx{node_id} = uniqueVals{node_id}.size(); 22 | if (!rightMinusInf{node_scope}) {{ 23 | vector<{floating_data_type}>::iterator rightBoundIdx{node_id}; 24 | rightBoundIdx{node_id} = std::upper_bound(uniqueVals{node_id}.begin(), uniqueVals{node_id}.end(), rightCondition{node_scope}); 25 | rightIdx{node_id} = rightBoundIdx{node_id} - uniqueVals{node_id}.begin(); 26 | }} 27 | 28 | nodeIntermediateResult[{node_id}] = probSum{node_id}[rightIdx{node_id}] - probSum{node_id}[leftIdx{node_id}]; 29 | 30 | // exclude null value if it was included before 31 | if (((leftMinusInf{node_scope} || leftCondition{node_scope} < nullValue{node_scope}) && (rightMinusInf{node_scope} || rightCondition{node_scope} > nullValue{node_scope})) || 32 | (!leftMinusInf{node_scope} && (nullValue{node_scope} == leftCondition{node_scope}) && leftIncluded{node_scope}) || 33 | (!rightMinusInf{node_scope} && (nullValue{node_scope} == rightCondition{node_scope}) && rightIncluded{node_scope})) {{ 34 | nodeIntermediateResult[{node_id}] -= {null_value_prob}; // null value prob 35 | }} 36 | 37 | // left value should not be included in interval 38 | if (!leftIncluded{node_scope} && !leftMinusInf{node_scope} && leftCondition{node_scope} == uniqueVals{node_id}[leftIdx{node_id}]) {{ 39 | nodeIntermediateResult[{node_id}] -= probSum{node_id}[leftIdx{node_id} + 1] - probSum{node_id}[leftIdx{node_id}]; 40 | }} 41 | 42 | // same for right 43 | if (!rightIncluded{node_scope} && !rightMinusInf{node_scope} && rightCondition{node_scope} == uniqueVals{node_id}[rightIdx{node_id} - 1] && leftCondition{node_scope} != rightCondition{node_scope}) {{ 44 | nodeIntermediateResult[{node_id}] -= probSum{node_id}[rightIdx{node_id}] - probSum{node_id}[rightIdx{node_id} - 1]; 45 | }} 46 | }} 47 | {final_assert} 48 | }} -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/templates/master.cpp: -------------------------------------------------------------------------------- 1 | #include <pybind11/pybind11.h> 2 | #include <pybind11/stl.h> 3 | #include <algorithm> 4 | using namespace std; 5 | namespace py = pybind11; 6 | 7 | {methods} 8 | 9 | PYBIND11_MODULE(optimized_inference, m){{ 10 | m.doc() = "Generated RSPN ensemble code"; 11 | {registration} 12 | }} -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/templates/method_master.cpp: -------------------------------------------------------------------------------- 1 | {floating_data_type} spn{spn_id}(vector<bool> relevantScope, vector<bool> featureScope, {method_params}){{ 2 | {floating_data_type} resultValue = 0.0; 3 | // bool notNanPerNode[{node_count}] = {{ false }}; 4 | {floating_data_type} nodeIntermediateResult[{node_count}] = {{ 0 }}; 5 | 6 | {method_body} 7 | 8 | return resultValue; 9 | }} -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/templates/product_node.cpp:
-------------------------------------------------------------------------------- 1 | if ({scope_check}) {{ 2 | {subtree_code} 3 | nodeIntermediateResult[{node_id}] = 1.0; 4 | {result_calculation} 5 | {final_assert} 6 | }} -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/templates/registration_master.cpp: -------------------------------------------------------------------------------- 1 | m.def("spn{spn_id}", &spn{spn_id}, "Generate expectation on SPN", py::arg("relevantScope"), py::arg("featureScope"), {passed_params}); -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/code_generation/templates/sum_node.cpp: -------------------------------------------------------------------------------- 1 | if ({scope_check}) {{ 2 | {subtree_code} 3 | {result_calculation} 4 | {final_assert} 5 | }} -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/custom_spflow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-db/AreCELearnedYet/aa52da7768023270bad884232972e0b77ec6534a/lecarb/estimator/deepdb/aqp_spn/custom_spflow/__init__.py -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/custom_spflow/custom_transform_structure.py: -------------------------------------------------------------------------------- 1 | from spn.structure.Base import get_nodes_by_type, Product, Leaf, assign_ids 2 | 3 | from aqp_spn.aqp_leaves import Sum 4 | from aqp_spn.custom_spflow.custom_validity import is_valid 5 | 6 | 7 | def Prune(node, light=False): 8 | """ 9 | Prunes the SPN. Ensures that nodes have at least one child and that the types of a node and its children differ. 10 | Adapts weights and optionally bloom filters accordingly.
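For example (illustrative values only): a Sum node with weights [0.6, 0.4] whose second child is itself a Sum node with weights [0.5, 0.5] is collapsed into a single Sum node; the grandchildren are spliced into the parent and their weights are rescaled by the removed child's weight, yielding [0.6, 0.2, 0.2].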
11 | :param node: 12 | :return: 13 | """ 14 | 15 | # v, err = is_valid(node) 16 | # assert v, err 17 | nodes = get_nodes_by_type(node, (Product, Sum)) 18 | 19 | while len(nodes) > 0: 20 | n = nodes.pop() 21 | 22 | n_type = type(n) 23 | is_sum = n_type == Sum 24 | is_product = n_type == Product 25 | 26 | i = 0 27 | while i < len(n.children): 28 | c = n.children[i] 29 | 30 | # if my child has only one node, we can get rid of it and link directly to that grandchild 31 | # in this case, no bloom filters can be lost since we do not split 32 | if not isinstance(c, Leaf) and len(c.children) == 1: 33 | n.children[i] = c.children[0] 34 | continue 35 | 36 | # if the type is similar to the type of the child 37 | if n_type == type(c): 38 | 39 | if is_sum and not light: 40 | old_len = len(n.cluster_centers) 41 | len_child_cluster = len(c.cluster_centers) 42 | del n.cluster_centers[i] 43 | n.cluster_centers.extend(c.cluster_centers) 44 | 45 | assert old_len - 1 + len_child_cluster == len( 46 | n.cluster_centers), "cluster_center length mismatch, node " + str(n) + " " + str(c) 47 | 48 | del n.children[i] 49 | n.children.extend(c.children) 50 | 51 | if is_sum: 52 | w = n.weights[i] 53 | del n.weights[i] 54 | 55 | n.weights.extend([cw * w for cw in c.weights]) 56 | 57 | if is_product: 58 | # hence, child type is also product and we should not lose bloom filters 59 | if hasattr(n, 'binary_bloom_filters'): 60 | n.binary_bloom_filters = {**n.binary_bloom_filters, **c.binary_bloom_filters} 61 | 62 | continue 63 | 64 | i += 1 65 | if is_sum and i > 0: 66 | n.weights[0] = 1.0 - sum(n.weights[1:]) 67 | 68 | if isinstance(node, (Product, Sum)) and len(node.children) == 1: 69 | node = node.children[0] 70 | 71 | assign_ids(node) 72 | v, err = is_valid(node, light=light) 73 | assert v, err 74 | 75 | return node 76 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/custom_spflow/custom_validity.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on March 20, 2018 3 | 4 | @author: Alejandro Molina 5 | """ 6 | import logging 7 | 8 | import numpy as np 9 | from math import isclose 10 | from spn.structure.Base import get_nodes_by_type, Product 11 | 12 | from aqp_spn.aqp_leaves import Sum, IdentityNumericLeaf 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | def is_consistent(node): 18 | """ 19 | all children of a product node have different scope 20 | """ 21 | 22 | assert node is not None 23 | 24 | allchildscope = set() 25 | for prod_node in reversed(get_nodes_by_type(node, Product)): 26 | nscope = set(prod_node.scope) 27 | 28 | if len(prod_node.children) == 0: 29 | return False, "Product node %s has no children" % prod_node.id 30 | 31 | allchildscope.clear() 32 | sum_features = 0 33 | for child in prod_node.children: 34 | sum_features += len(child.scope) 35 | allchildscope.update(child.scope) 36 | 37 | if allchildscope != nscope or sum_features != len(allchildscope): 38 | return False, "children of (prod) node %s do not have exclusive scope" % prod_node.id 39 | 40 | return True, None 41 | 42 | 43 | def is_complete(node): 44 | """ 45 | all children of a sum node have same scope as the parent 46 | """ 47 | 48 | assert node is not None 49 | 50 | for sum_node in reversed(get_nodes_by_type(node, Sum)): 51 | nscope = set(sum_node.scope) 52 | 53 | if len(sum_node.children) == 0: 54 | return False, "Sum node %s has no children" % sum_node.id 55 | 56 | for child in sum_node.children: 57 | if nscope !=
set(child.scope): 58 | return False, "children of (sum) node %s do not have the same scope as parent" % sum_node.id 59 | 60 | return True, None 61 | 62 | 63 | def is_valid_prob_sum(prob_sum, unique_vals, card): 64 | # return True, Null 65 | length = len(prob_sum) - 1 66 | 67 | if len(prob_sum) != len(unique_vals) + 1: 68 | return False, "len(prob_sum) != len(unique_vals) + 1" 69 | last_prob_sum = 0 70 | cards = [] 71 | 72 | sum_card = 0 73 | for i in range(0, len(prob_sum)): 74 | if prob_sum[i] > 1.0001: 75 | return False, "prob_sum[" + str(i) + "] must be <= 1.000, actual value at position " + str(i) + ":" + str( 76 | prob_sum[i]) + ", len:" + str(len(prob_sum)) 77 | if last_prob_sum - 0.0000001 > prob_sum[i]: 78 | return False, "prob_sum values must be non-decreasing (last_prob_sum:" + str(last_prob_sum) + ", prob_sum[" + str( 79 | i) + "]:" + str(prob_sum[i]) + ")" 80 | num = (prob_sum[i] - last_prob_sum) * card 81 | if False and not isclose(num, round(num), abs_tol=0.05): 82 | err_msg = "wrong probability value at idx " + str(i) + " (" + str( 83 | num) + ")- does not fit to an integer cardinality value for value " + str(unique_vals[i]) 84 | 85 | return False, err_msg 86 | last_prob_sum = prob_sum[i] 87 | sum_card += round(num) 88 | cards.append(round(num)) 89 | 90 | if not isclose(prob_sum[length], 1, abs_tol=0.05): 91 | return False, "Last value of prob_sum must be 1.0" 92 | if sum_card != card: 93 | return False, "Cardinality of the single values (" + str( 94 | sum_card) + ") does not match the overall cardinality (" + str(card) + ")" 95 | 96 | return True, None 97 | 98 | 99 | def is_valid(node, check_ids=True, check_prob_sum=False, light=False): 100 | # 101 | if check_ids: 102 | val, err = has_valid_ids(node) 103 | if not val: 104 | return val, err 105 | 106 | for n in get_nodes_by_type(node): 107 | if len(n.scope) == 0: 108 | return False, "node %s has no scope" % n.id 109 | is_sum = isinstance(n, Sum) 110 | is_prod = isinstance(n, Product) 111 | is_float = isinstance(n, IdentityNumericLeaf) 112 | 113 | if is_sum: 114 | if len(n.children) != len(n.weights): 115 | return False, "node %s has different children/weights" % n.id 116 | 117 | if not light: 118 | if len(n.children) != len(n.cluster_centers): 119 | return False, "node %s has different children/cluster_centers (#cluster_centers: %d, #childs: %d)" % ( 120 | n.id, len(n.cluster_centers), len(n.children)) 121 | 122 | weight_sum = np.sum(n.weights) 123 | 124 | if not isclose(weight_sum, 1, abs_tol=0.05): 125 | return False, "Sum of weights is not equal to 1.0 (instead: " + str(weight_sum) + ")" 126 | 127 | if is_sum or is_prod: 128 | if len(n.children) == 0: 129 | return False, "node %s has no children" % n.id 130 | 131 | if is_float: 132 | ok, err = is_valid_prob_sum(n.prob_sum, n.unique_vals, n.cardinality) 133 | if not ok: 134 | return False, err 135 | if check_prob_sum: 136 | assert (hasattr(n, 'prob_num')), str(n) + " has no property prob_num" 137 | assert hasattr(n, 'unique_vals') 138 | if len(n.prob_sum) - 1 != len(n.unique_vals): 139 | # 140 | return False, "size of prob_sum does not match unique_vals (required: prob_sum -1 == unique_vals) " 141 | 142 | a, err = is_consistent(node) 143 | if not a: 144 | return a, err 145 | 146 | b, err = is_complete(node) 147 | if not b: 148 | return b, err 149 | 150 | return True, None 151 | 152 | 153 | def has_valid_ids(node): 154 | ids = set() 155 | all_nodes = get_nodes_by_type(node) 156 | for n in all_nodes: 157 | ids.add(n.id) 158 | 159 | if len(ids) != len(all_nodes): 160 | return False, "Nodes are
missing ids or there are repeated ids" 161 | 162 | if min(ids) != 0: 163 | return False, "Node ids not starting at 0" 164 | 165 | if max(ids) != len(ids) - 1: 166 | return False, "Node ids not consecutive" 167 | 168 | return True, None 169 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/custom_spflow/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | import numpy as np 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def default_slicer(data, cols, num_cond_cols=None): 10 | if num_cond_cols is None: 11 | if len(cols) == 1: 12 | return data[:, cols[0]].reshape((-1, 1)) 13 | 14 | return data[:, cols] 15 | else: 16 | return np.concatenate((data[:, cols], data[:, -num_cond_cols:]), axis=1) 17 | 18 | 19 | def compute_cartesian_product_completeness(col1, col2, ds_context, data, min_sample_size, max_sample_size, 20 | oversampling_cart_product=10, debug=False): 21 | """ 22 | Compute how many distinct value combinations appear for pair of columns in data. A low value is an indicator for 23 | functional dependency or some different form of dependency. 24 | :param col1: 25 | :param col2: 26 | :param ds_context: 27 | :param data: 28 | :param min_sample_size: 29 | :param max_sample_size: 30 | :param oversampling_cart_product: 31 | :param debug: 32 | :return: 33 | """ 34 | 35 | unique_tuples_start_t = time.perf_counter() 36 | len_cartesian_product = ds_context.no_unique_values[col1] * ds_context.no_unique_values[col2] 37 | sample_size = max(min(oversampling_cart_product * len_cartesian_product, max_sample_size), min_sample_size) 38 | 39 | sample_idx = np.random.randint(data.shape[0], size=sample_size) 40 | if sample_size < data.shape[0]: 41 | local_data_sample = data[sample_idx, :] 42 | else: 43 | local_data_sample = data 44 | value_combinations_sample = set( 45 | [(bin_data[0], bin_data[1],) for bin_data in 46 | default_slicer(local_data_sample, [col1, col2])]) 47 | cartesian_product_completeness = len(value_combinations_sample) / len_cartesian_product 48 | unique_tuples_end_t = time.perf_counter() 49 | if debug: 50 | logging.debug( 51 | f"Computed unique combination set for scope ({col1}, {col2}) in " 52 | f"{unique_tuples_end_t - unique_tuples_start_t} sec.") 53 | return cartesian_product_completeness, value_combinations_sample, len_cartesian_product 54 | 55 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/expectations.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from time import perf_counter 3 | 4 | import numpy as np 5 | from spn.algorithms.Inference import likelihood 6 | from spn.structure.Base import Product 7 | 8 | from aqp_spn.aqp_leaves import Sum 9 | from aqp_spn.code_generation.convert_conditions import convert_range 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def expectation(spn, feature_scope, inverted_features, ranges, node_expectation=None, node_likelihoods=None, 15 | use_generated_code=False, spn_id=None, meta_types=None, gen_code_stats=None): 16 | """Compute the Expectation: 17 | E[1_{conditions} * X_feature_scope] 18 | First factor is one if condition is fulfilled. For the second factor the variables in feature scope are 19 | multiplied. If inverted_features[i] is True, variable is taken to denominator. 
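For example (illustrative), with feature_scope = [j] and inverted_features = [False] this is E[1_{conditions} * X_j], while inverted_features = [True] gives E[1_{conditions} / X_j].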
20 | The conditional expectation would be E[1_{conditions} * X_feature_scope]/P(conditions) 21 | """ 22 | 23 | # evidence_scope = set([i for i, r in enumerate(ranges) if not np.isnan(r)]) 24 | evidence_scope = set([i for i, r in enumerate(ranges[0]) if r is not None]) 25 | evidence = ranges 26 | 27 | assert not (len(evidence_scope) > 0 and evidence is None) 28 | 29 | relevant_scope = set() 30 | relevant_scope.update(evidence_scope) 31 | relevant_scope.update(feature_scope) 32 | if len(relevant_scope) == 0: 33 | return np.ones((ranges.shape[0], 1)) 34 | 35 | if ranges.shape[0] == 1: 36 | 37 | applicable = True 38 | if use_generated_code: 39 | boolean_relevant_scope = [i in relevant_scope for i in range(len(meta_types))] 40 | boolean_feature_scope = [i in feature_scope for i in range(len(meta_types))] 41 | applicable, parameters = convert_range(boolean_relevant_scope, boolean_feature_scope, meta_types, ranges[0], 42 | inverted_features) 43 | 44 | # generated C++ code 45 | if use_generated_code and applicable: 46 | time_start = perf_counter() 47 | import optimized_inference 48 | 49 | spn_func = getattr(optimized_inference, f'spn{spn_id}') 50 | result = np.array([[spn_func(*parameters)]]) 51 | 52 | time_end = perf_counter() 53 | 54 | if gen_code_stats is not None: 55 | gen_code_stats.calls += 1 56 | gen_code_stats.total_time += (time_end - time_start) 57 | 58 | # logger.debug(f"\t\tGenerated Code Latency: {(time_end - time_start) * 1000:.3f}ms") 59 | return result 60 | 61 | # lightweight non-batch version 62 | else: 63 | return np.array( 64 | [[expectation_recursive(spn, feature_scope, inverted_features, relevant_scope, evidence, 65 | node_expectation, node_likelihoods)]]) 66 | # full batch version 67 | return expectation_recursive_batch(spn, feature_scope, inverted_features, relevant_scope, evidence, 68 | node_expectation, node_likelihoods) 69 | 70 | 71 | def expectation_recursive_batch(node, feature_scope, inverted_features, relevant_scope, evidence, node_expectation, 72 | node_likelihoods): 73 | if isinstance(node, Product): 74 | 75 | llchildren = np.concatenate( 76 | [expectation_recursive_batch(child, feature_scope, inverted_features, relevant_scope, evidence, 77 | node_expectation, node_likelihoods) 78 | for child in node.children if 79 | len(relevant_scope.intersection(child.scope)) > 0], axis=1) 80 | return np.nanprod(llchildren, axis=1).reshape(-1, 1) 81 | 82 | elif isinstance(node, Sum): 83 | if len(relevant_scope.intersection(node.scope)) == 0: 84 | return np.full((evidence.shape[0], 1), np.nan) 85 | 86 | llchildren = np.concatenate( 87 | [expectation_recursive_batch(child, feature_scope, inverted_features, relevant_scope, evidence, 88 | node_expectation, node_likelihoods) 89 | for child in node.children], axis=1) 90 | 91 | relevant_children_idx = np.where(np.isnan(llchildren[0]) == False)[0] 92 | if len(relevant_children_idx) == 0: 93 | return np.array([np.nan]) 94 | 95 | weights_normalizer = sum(node.weights[j] for j in relevant_children_idx) 96 | b = np.array(node.weights)[relevant_children_idx] / weights_normalizer 97 | 98 | return np.dot(llchildren[:, relevant_children_idx], b).reshape(-1, 1) 99 | 100 | else: 101 | if node.scope[0] in feature_scope: 102 | t_node = type(node) 103 | if t_node in node_expectation: 104 | exps = np.zeros((evidence.shape[0], 1)) 105 | 106 | feature_idx = feature_scope.index(node.scope[0]) 107 | inverted = inverted_features[feature_idx] 108 | 109 | exps[:] = node_expectation[t_node](node, evidence, inverted=inverted) 110 | return exps 111 | else: 112 
| raise Exception('Node type unknown: ' + str(t_node)) 113 | 114 | return likelihood(node, evidence, node_likelihood=node_likelihoods) 115 | 116 | 117 | def nanproduct(product, factor): 118 | if np.isnan(product): 119 | if not np.isnan(factor): 120 | return factor 121 | else: 122 | return np.nan 123 | else: 124 | if np.isnan(factor): 125 | return product 126 | else: 127 | return product * factor 128 | 129 | 130 | def expectation_recursive(node, feature_scope, inverted_features, relevant_scope, evidence, node_expectation, 131 | node_likelihoods): 132 | if isinstance(node, Product): 133 | 134 | product = np.nan 135 | for child in node.children: 136 | if len(relevant_scope.intersection(child.scope)) > 0: 137 | factor = expectation_recursive(child, feature_scope, inverted_features, relevant_scope, evidence, 138 | node_expectation, node_likelihoods) 139 | product = nanproduct(product, factor) 140 | return product 141 | 142 | elif isinstance(node, Sum): 143 | if len(relevant_scope.intersection(node.scope)) == 0: 144 | return np.nan 145 | 146 | llchildren = [expectation_recursive(child, feature_scope, inverted_features, relevant_scope, evidence, 147 | node_expectation, node_likelihoods) 148 | for child in node.children] 149 | 150 | relevant_children_idx = np.where(np.isnan(llchildren) == False)[0] 151 | 152 | if len(relevant_children_idx) == 0: 153 | return np.nan 154 | 155 | weights_normalizer = sum(node.weights[j] for j in relevant_children_idx) 156 | weighted_sum = sum(node.weights[j] * llchildren[j] for j in relevant_children_idx) 157 | 158 | return weighted_sum / weights_normalizer 159 | 160 | else: 161 | if node.scope[0] in feature_scope: 162 | t_node = type(node) 163 | if t_node in node_expectation: 164 | 165 | feature_idx = feature_scope.index(node.scope[0]) 166 | inverted = inverted_features[feature_idx] 167 | 168 | return node_expectation[t_node](node, evidence, inverted=inverted).item() 169 | else: 170 | raise Exception('Node type unknown: ' + str(t_node)) 171 | 172 | return node_likelihoods[type(node)](node, evidence).item() 173 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/group_by_combination.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | 4 | import numpy as np 5 | from spn.algorithms.Inference import likelihood 6 | from spn.structure.Base import get_nodes_by_type, Leaf, Product, eval_spn_bottom_up, assign_ids 7 | 8 | from aqp_spn.aqp_leaves import Sum 9 | from aqp_spn.custom_spflow.custom_transform_structure import Prune 10 | from aqp_spn.custom_spflow.custom_validity import is_valid 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def prod_group_by(node, children, data=None, dtype=np.float64): 16 | contains_probs = False 17 | contains_values = False 18 | contains_none_values = False 19 | contains_zero_prob = False 20 | group_by_scopes = [] 21 | # Check if only probabilities contained 22 | for child in children: 23 | # value 24 | if isinstance(child, tuple): 25 | contains_values = True 26 | 27 | scope, values = child 28 | group_by_scopes += scope 29 | if values is None: 30 | contains_none_values = True 31 | # probability 32 | else: 33 | contains_probs = True 34 | if (child == 0).any(): 35 | contains_zero_prob = True 36 | 37 | # Probability of subtree zero or no matching tuples 38 | if contains_zero_prob or contains_none_values: 39 | return [None], None 40 | # Cartesian product 41 | elif contains_values: 42 | result_values = 
None 43 | group_by_scopes.sort() 44 | for group_by_scope in group_by_scopes: 45 | matching_values = None 46 | matching_idx = None 47 | for child in children: 48 | if isinstance(child, tuple): 49 | scope, values = child 50 | if group_by_scope in scope: 51 | matching_values = values 52 | matching_idx = scope.index(group_by_scope) 53 | break 54 | assert matching_values is not None, "Matching values should not be None." 55 | if result_values is None: 56 | result_values = [(matching_value[matching_idx],) for matching_value in matching_values] 57 | else: 58 | result_values = [result_value + (matching_value[matching_idx],) for result_value in result_values for 59 | matching_value in matching_values] 60 | # assert len(result_values) <= len(group_by_scopes) 61 | old_len = len(result_values) 62 | if hasattr(node, 'binary_bloom_filters'): # , "For grouping product nodes must have bloom filters." 63 | for scope, bloom_filter in node.binary_bloom_filters.items(): 64 | if scope[0] in group_by_scopes and scope[1] in group_by_scopes: 65 | idx_left = group_by_scopes.index(scope[0]) 66 | idx_right = group_by_scopes.index(scope[1]) 67 | result_values = [result_value for result_value in result_values if 68 | (result_value[idx_left], result_value[idx_right],) in bloom_filter] 69 | if old_len > len(result_values): 70 | logger.debug( 71 | f"\t\tDue to bloom filters results were reduced by {(1 - len(result_values) / old_len) * 100}%") 72 | return group_by_scopes, set(result_values) 73 | # Only probabilities, normal inference 74 | elif contains_probs: 75 | llchildren = np.concatenate(children, axis=1) 76 | return np.nanprod(llchildren, axis=1).reshape(-1, 1) 77 | 78 | 79 | def sum_group_by(node, children, data=None, dtype=np.float64): 80 | """ 81 | Propagate expectations in sum node. 82 | 83 | :param node: sum node 84 | :param children: nodes below 85 | :param data: 86 | :param dtype: 87 | :return: 88 | """ 89 | 90 | # either all tuples or 91 | if isinstance(children[0], tuple): 92 | result_values = None 93 | group_by_scope = [None] 94 | for scope, values in children: 95 | if values is not None: 96 | group_by_scope = scope 97 | if result_values is None: 98 | result_values = values 99 | else: 100 | result_values = result_values.union(values) 101 | return group_by_scope, result_values 102 | 103 | # normal probability sum node code 104 | llchildren = np.concatenate(children, axis=1) 105 | relevant_children_idx = np.where(np.isnan(llchildren[0]) == False)[0] 106 | if len(relevant_children_idx) == 0: 107 | return np.array([np.nan]) 108 | 109 | assert llchildren.dtype == dtype 110 | 111 | weights_normalizer = sum(node.weights[j] for j in relevant_children_idx) 112 | b = np.array(node.weights, dtype=dtype)[relevant_children_idx] / weights_normalizer 113 | 114 | return np.dot(llchildren[:, relevant_children_idx], b).reshape(-1, 1) 115 | 116 | 117 | def group_by_combinations(spn, ds_context, feature_scope, ranges, node_distinct_vals=None, node_likelihoods=None): 118 | """ 119 | Computes the distinct value combinations for features given the range conditions. 
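For example (illustrative), feature_scope = [0, 2] yields the scope together with the set of distinct (column 0, column 2) value combinations that satisfy the given ranges.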
120 | """ 121 | evidence_scope = set([i for i, r in enumerate(ranges[0]) if r is not None]) 122 | evidence = ranges 123 | 124 | # make feature scope sorted 125 | feature_scope_unsorted = copy.copy(feature_scope) 126 | feature_scope.sort() 127 | # add range conditions to feature scope (makes checking with bloom filters easier) 128 | feature_scope = list(set(feature_scope) 129 | .union(evidence_scope.intersection(np.where(ds_context.no_unique_values <= 1200)[0]))) 130 | feature_scope.sort() 131 | inverted_order = [feature_scope.index(scope) for scope in feature_scope_unsorted] 132 | 133 | assert not (len(evidence_scope) > 0 and evidence is None) 134 | 135 | relevant_scope = set() 136 | relevant_scope.update(evidence_scope) 137 | relevant_scope.update(feature_scope) 138 | marg_spn = marginalize(spn, relevant_scope) 139 | 140 | def leaf_expectation(node, data, dtype=np.float64, **kwargs): 141 | 142 | if node.scope[0] in feature_scope: 143 | t_node = type(node) 144 | if t_node in node_distinct_vals: 145 | vals = node_distinct_vals[t_node](node, evidence) 146 | return vals 147 | else: 148 | raise Exception('Node type unknown: ' + str(t_node)) 149 | 150 | return likelihood(node, evidence, node_likelihood=node_likelihoods) 151 | 152 | node_expectations = {type(leaf): leaf_expectation for leaf in get_nodes_by_type(marg_spn, Leaf)} 153 | node_expectations.update({Sum: sum_group_by, Product: prod_group_by}) 154 | 155 | result = eval_spn_bottom_up(marg_spn, node_expectations, all_results={}, data=evidence, dtype=np.float64) 156 | if feature_scope_unsorted == feature_scope: 157 | return result 158 | scope, grouped_tuples = result 159 | return feature_scope_unsorted, set( 160 | [tuple(group_tuple[i] for i in inverted_order) for group_tuple in grouped_tuples]) 161 | 162 | 163 | def marginalize(node, keep, light=False): 164 | # keep must be a set of features that you want to keep 165 | # Loc.enter() 166 | keep = set(keep) 167 | 168 | # Loc.p('keep:', keep) 169 | 170 | def marg_recursive(node): 171 | # Loc.enter() 172 | new_node_scope = keep.intersection(set(node.scope)) 173 | # Loc.p("new_node_scope:", new_node_scope) 174 | if len(new_node_scope) == 0: 175 | # we are summing out this node 176 | # Loc.leave(None) 177 | return None 178 | 179 | if isinstance(node, Leaf): 180 | if len(node.scope) > 1: 181 | raise Exception("Leaf Node with |scope| > 1") 182 | # Loc.leave('Leaf.deepcopy()') 183 | if light: 184 | return node 185 | return copy.deepcopy(node) 186 | 187 | newNode = node.__class__() 188 | newNode.cardinality = node.cardinality 189 | 190 | if isinstance(node, Sum): 191 | newNode.weights.extend(node.weights) 192 | if not light: 193 | newNode.cluster_centers.extend(node.cluster_centers) 194 | if isinstance(node, Product): 195 | if hasattr(node, 'binary_bloom_filters'): 196 | newNode.binary_bloom_filters = node.binary_bloom_filters 197 | 198 | for c in node.children: 199 | new_c = marg_recursive(c) 200 | if new_c is None: 201 | continue 202 | newNode.children.append(new_c) 203 | 204 | newNode.scope.extend(new_node_scope) 205 | 206 | # Loc.leave() 207 | return newNode 208 | 209 | newNode = marg_recursive(node) 210 | 211 | if not light: 212 | assign_ids(newNode) 213 | newNode = Prune(newNode, light=light) 214 | 215 | valid, err = is_valid(newNode, light=light) 216 | assert valid, err 217 | # Loc.leave() 218 | return newNode 219 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/ranges.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class NominalRange: 5 | """ 6 | This class specifies the range for a nominal attribute. It contains a list of integers which 7 | represent the values which are in the range. 8 | 9 | e.g. possible_values = [5,2] 10 | """ 11 | 12 | def __init__(self, possible_values, null_value=None, is_not_null_condition=False): 13 | self.is_not_null_condition = is_not_null_condition 14 | self.possible_values = np.array(possible_values, dtype=np.int64) 15 | self.null_value = null_value 16 | 17 | def is_impossible(self): 18 | return len(self.possible_values) == 0 19 | 20 | def get_ranges(self): 21 | return self.possible_values 22 | 23 | 24 | class NumericRange: 25 | """ 26 | This class specifies the range for a numeric attribute. It contains a list of intervals which 27 | represents the values which are valid. Inclusive Intervals specifies whether upper and lower bound are included. 28 | 29 | e.g. ranges = [[10,15],[22,23]] if valid values are between 10 and 15 plus 22 and 23 (bounds inclusive) 30 | """ 31 | 32 | def __init__(self, ranges, inclusive_intervals=None, null_value=None, is_not_null_condition=False): 33 | self.is_not_null_condition = is_not_null_condition 34 | self.ranges = ranges 35 | self.null_value = null_value 36 | self.inclusive_intervals = inclusive_intervals 37 | if self.inclusive_intervals is None: 38 | self.inclusive_intervals = [] 39 | for interval in self.ranges: 40 | self.inclusive_intervals.append([True, True]) 41 | 42 | def is_impossible(self): 43 | return len(self.ranges) == 0 44 | 45 | def get_ranges(self): 46 | return self.ranges 47 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/util/Graphics.py: -------------------------------------------------------------------------------- 1 | from spn.io.Graphics import plot_spn 2 | 3 | def overwrite_plot_spn(spn, plotfile): 4 | import os 5 | try: 6 | os.remove(plotfile) 7 | except OSError as err: 8 | pass 9 | plot_spn(spn, plotfile) 10 | 11 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/aqp_spn/util/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-db/AreCELearnedYet/aa52da7768023270bad884232972e0b77ec6534a/lecarb/estimator/deepdb/aqp_spn/util/__init__.py -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/data_preparation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-db/AreCELearnedYet/aa52da7768023270bad884232972e0b77ec6534a/lecarb/estimator/deepdb/data_preparation/__init__.py -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_compilation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-db/AreCELearnedYet/aa52da7768023270bad884232972e0b77ec6534a/lecarb/estimator/deepdb/ensemble_compilation/__init__.py -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_compilation/graph_representation.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from enum import Enum 3 | 4 | 5 | class Table: 6 | """Represents a table with foreign key 
and primary key relationships""" 7 | 8 | def __init__(self, table_name, primary_key=["id"], table_nn_attribute=None, table_size=1000, csv_file_location=None, 9 | attributes=None, irrelevant_attributes=None, keep_fk_attributes=None, sample_rate=1.0, fd_list=None, 10 | no_compression=None): 11 | 12 | self.table_name = table_name 13 | self.table_size = table_size 14 | self.primary_key = primary_key 15 | 16 | self.csv_file_location = csv_file_location 17 | self.attributes = attributes 18 | self.irrelevant_attributes = irrelevant_attributes 19 | if irrelevant_attributes is None: 20 | self.irrelevant_attributes = [] 21 | self.keep_fk_attributes = keep_fk_attributes 22 | if keep_fk_attributes is None: 23 | self.keep_fk_attributes = [] 24 | self.no_compression = no_compression 25 | if no_compression is None: 26 | self.no_compression = [] 27 | 28 | if fd_list is None: 29 | self.fd_list = [] 30 | else: 31 | self.fd_list = [(table_name + '.' + fd_source, table_name + '.' + fd_dest) for fd_source, fd_dest in 32 | fd_list] 33 | 34 | # additional attribute indicating whether tuple is NULL (can occur since we learn SPN on FULL OUTER JOIN) 35 | if table_nn_attribute is None: 36 | self.table_nn_attribute = self.table_name + '_nn' 37 | 38 | # FK references 39 | self.outgoing_relationships = [] 40 | 41 | # referenced as FK 42 | self.incoming_relationships = [] 43 | self.sample_rate = sample_rate 44 | 45 | def children_fd_attributes(self, attribute): 46 | return [fd_source for fd_source, fd_dest in self.fd_list if fd_dest == attribute] 47 | 48 | def parent_fd_attributes(self, attribute): 49 | return [fd_dest for fd_source, fd_dest in self.fd_list if fd_source == attribute] 50 | 51 | 52 | class Relationship: 53 | """Foreign key primary key relationship""" 54 | 55 | def __init__(self, start, end, start_attr, end_attr, multiplier_attribute_name): 56 | self.start = start.table_name 57 | self.start_attr = start_attr 58 | 59 | self.end = end.table_name 60 | self.end_attr = end_attr 61 | 62 | # matching tuples in FULL OUTER JOIN 63 | self.multiplier_attribute_name = multiplier_attribute_name 64 | 65 | # matching tuples (not NULL) 66 | self.multiplier_attribute_name_nn = multiplier_attribute_name + '_nn' 67 | 68 | self.identifier = self.start + '.' + self.start_attr + \ 69 | ' = ' + self.end + '.' + self.end_attr 70 | 71 | # for start table we are outgoing relationship 72 | start.outgoing_relationships.append(self) 73 | end.incoming_relationships.append(self) 74 | 75 | 76 | class SchemaGraph: 77 | """Holds all tables and relationships""" 78 | 79 | def __init__(self): 80 | self.tables = [] 81 | self.relationships = [] 82 | self.table_dictionary = {} 83 | self.relationship_dictionary = {} 84 | 85 | def add_table(self, table): 86 | self.tables.append(table) 87 | self.table_dictionary[table.table_name] = table 88 | 89 | def add_relationship(self, start_name, start_attr, end_name, end_attr, multiplier_attribute_name=None): 90 | if multiplier_attribute_name is None: 91 | multiplier_attribute_name = 'mul_' + start_name + '.' 
+ start_attr 92 | 93 | relationship = Relationship(self.table_dictionary[start_name], 94 | self.table_dictionary[end_name], 95 | start_attr, 96 | end_attr, 97 | multiplier_attribute_name) 98 | 99 | self.relationships.append(relationship) 100 | self.relationship_dictionary[relationship.identifier] = relationship 101 | 102 | return relationship.identifier 103 | 104 | 105 | class QueryType(Enum): 106 | AQP = 0 107 | CARDINALITY = 1 108 | 109 | 110 | class AggregationType(Enum): 111 | SUM = 0 112 | AVG = 1 113 | COUNT = 2 114 | 115 | 116 | class AggregationOperationType(Enum): 117 | PLUS = 0 118 | MINUS = 1 119 | AGGREGATION = 2 120 | 121 | 122 | class Query: 123 | """Represents query""" 124 | 125 | def __init__(self, schema_graph, query_type=QueryType.CARDINALITY, features=None): 126 | self.query_type = query_type 127 | self.schema_graph = schema_graph 128 | self.table_set = set() 129 | self.relationship_set = set() 130 | self.table_where_condition_dict = {} 131 | self.conditions = [] 132 | self.aggregation_operations = [] 133 | self.group_bys = [] 134 | 135 | def remove_conditions_for_attributes(self, table, attributes): 136 | def conflicting(condition): 137 | return any([condition.startswith(attribute + ' ') or condition.startswith(attribute + '<') or 138 | condition.startswith(attribute + '>') or condition.startswith(attribute + '=') for 139 | attribute in attributes]) 140 | 141 | if self.table_where_condition_dict.get(table) is not None: 142 | self.table_where_condition_dict[table] = [condition for condition in 143 | self.table_where_condition_dict[table] 144 | if not conflicting(condition)] 145 | self.conditions = [(cond_table, condition) for cond_table, condition in self.conditions 146 | if not (cond_table == table and conflicting(condition))] 147 | 148 | def copy_cardinality_query(self): 149 | query = Query(self.schema_graph) 150 | query.table_set = copy.copy(self.table_set) 151 | query.relationship_set = copy.copy(self.relationship_set) 152 | query.table_where_condition_dict = copy.copy(self.table_where_condition_dict) 153 | query.conditions = copy.copy(self.conditions) 154 | return query 155 | 156 | def add_group_by(self, table, attribute): 157 | self.group_bys.append((table, attribute)) 158 | 159 | def add_aggregation_operation(self, operation): 160 | """ 161 | Adds operation to AQP query. 
162 | :param operation: (AggregationOperationType.AGGREGATION, operation_type, operation_factors) or (AggregationOperationType.MINUS, None, None) 163 | :return: 164 | """ 165 | self.aggregation_operations.append(operation) 166 | 167 | def add_join_condition(self, relationship_identifier): 168 | 169 | relationship = self.schema_graph.relationship_dictionary[relationship_identifier] 170 | self.table_set.add(relationship.start) 171 | self.table_set.add(relationship.end) 172 | 173 | self.relationship_set.add(relationship_identifier) 174 | 175 | def add_where_condition(self, table, condition): 176 | if self.table_where_condition_dict.get(table) is None: 177 | self.table_where_condition_dict[table] = [condition] 178 | else: 179 | self.table_where_condition_dict[table].append(condition) 180 | self.conditions.append((table, condition)) 181 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_compilation/physical_db.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | import pandas as pd 3 | 4 | from ensemble_compilation.utils import gen_full_join_query, print_conditions 5 | 6 | 7 | class DBConnection: 8 | 9 | def __init__(self, db_user="postgres", db_password="postgres", db_host="localhost", db_port="5432", db="shopdb"): 10 | self.db_user = db_user 11 | self.db_password = db_password 12 | self.db_host = db_host 13 | self.db_port = db_port 14 | self.db = db 15 | 16 | def vacuum(self): 17 | connection = psycopg2.connect(user=self.db_user, 18 | password=self.db_password, 19 | host=self.db_host, 20 | port=self.db_port, 21 | database=self.db) 22 | old_isolation_level = connection.isolation_level 23 | connection.set_isolation_level(0) 24 | query = "VACUUM" 25 | cursor = connection.cursor() 26 | cursor.execute(query) 27 | connection.commit() 28 | connection.set_isolation_level(old_isolation_level) 29 | 30 | def get_dataframe(self, sql): 31 | connection = psycopg2.connect(user=self.db_user, 32 | password=self.db_password, 33 | host=self.db_host, 34 | port=self.db_port, 35 | database=self.db) 36 | return pd.read_sql(sql, connection) 37 | 38 | def submit_query(self, sql): 39 | """Submits query and ignores result.""" 40 | 41 | connection = psycopg2.connect(user=self.db_user, 42 | password=self.db_password, 43 | host=self.db_host, 44 | port=self.db_port, 45 | database=self.db) 46 | cursor = connection.cursor() 47 | cursor.execute(sql) 48 | connection.commit() 49 | 50 | def get_result(self, sql): 51 | """Fetches exactly one row of result set.""" 52 | 53 | connection = psycopg2.connect(user=self.db_user, 54 | password=self.db_password, 55 | host=self.db_host, 56 | port=self.db_port, 57 | database=self.db) 58 | cursor = connection.cursor() 59 | 60 | cursor.execute(sql) 61 | record = cursor.fetchone() 62 | result = record[0] 63 | 64 | if connection: 65 | cursor.close() 66 | connection.close() 67 | 68 | return result 69 | 70 | def get_result_set(self, sql, return_columns=False): 71 | """Fetches all rows of result set.""" 72 | 73 | connection = psycopg2.connect(user=self.db_user, 74 | password=self.db_password, 75 | host=self.db_host, 76 | port=self.db_port, 77 | database=self.db) 78 | cursor = connection.cursor() 79 | 80 | cursor.execute(sql) 81 | rows = cursor.fetchall() 82 | columns = [desc[0] for desc in cursor.description] 83 | 84 | if connection: 85 | cursor.close() 86 | connection.close() 87 | 88 | if return_columns: 89 | return rows, columns 90 | 91 | return rows 92 | 93 | 94 | class 
TrueCardinalityEstimator: 95 | """Queries the database to return true cardinalities.""" 96 | 97 | def __init__(self, schema_graph, db_connection): 98 | self.schema_graph = schema_graph 99 | self.db_connection = db_connection 100 | 101 | def true_cardinality(self, query): 102 | full_join_query = gen_full_join_query(self.schema_graph, query.relationship_set, query.table_set, "JOIN") 103 | 104 | where_cond = print_conditions(query.conditions, seperator='AND') 105 | if where_cond != "": 106 | where_cond = "WHERE " + where_cond 107 | sql_query = full_join_query.format("COUNT(*)", where_cond) 108 | cardinality = self.db_connection.get_result(sql_query) 109 | return sql_query, cardinality 110 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_compilation/probabilistic_query.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from ensemble_compilation.utils import print_conditions 4 | 5 | 6 | class FactorType(Enum): 7 | INDICATOR_EXP = 0 8 | EXPECTATION = 1 9 | 10 | 11 | class IndicatorExpectation: 12 | """ 13 | Represents E[1_{conditions} * 1/ denominator_multipliers]. 14 | """ 15 | 16 | def __init__(self, denominator_multipliers, conditions, nominator_multipliers=None, spn=None, inverse=False, 17 | table_set=None): 18 | self.nominator_multipliers = nominator_multipliers 19 | if self.nominator_multipliers is None: 20 | self.nominator_multipliers = [] 21 | self.denominator_multipliers = denominator_multipliers 22 | self.conditions = conditions 23 | self.spn = spn 24 | self.min_val = 0 25 | self.inverse = inverse 26 | self.table_set = table_set 27 | if table_set is None: 28 | self.table_set = set() 29 | # if self.spn is not None: 30 | # self.min_val = 1 / self.spn.full_join_size 31 | 32 | def contains_groupby(self, group_bys): 33 | for table, attribute in group_bys: 34 | for cond_table, condition in self.conditions: 35 | if cond_table == table and condition.startswith(attribute): 36 | return True 37 | return False 38 | 39 | def matches(self, other_expectation, ignore_inverse=False, ignore_spn=False): 40 | if self.inverse != other_expectation.inverse and not ignore_inverse: 41 | return False 42 | if set(self.nominator_multipliers) != set(other_expectation.nominator_multipliers): 43 | return False 44 | if set(self.denominator_multipliers) != set(other_expectation.denominator_multipliers): 45 | return False 46 | if set(self.conditions) != set(other_expectation.conditions): 47 | return False 48 | if not ignore_spn and self.table_set != other_expectation.table_set: 49 | return False 50 | return True 51 | 52 | def __hash__(self): 53 | return hash((FactorType.INDICATOR_EXP, self.inverse, frozenset(self.nominator_multipliers), 54 | frozenset(self.denominator_multipliers), frozenset(self.conditions), frozenset(self.table_set))) 55 | 56 | def is_inverse(self, other_expectation): 57 | return self.inverse != other_expectation.inverse and self.matches(other_expectation, ignore_inverse=True) 58 | 59 | def __str__(self): 60 | """ 61 | Prints Expectation of multipliers for conditions. 62 | E(multipliers * 1_{c_1 Λ… Λc_n}) 63 | """ 64 | 65 | if self.inverse: 66 | formula = " / E(" 67 | else: 68 | formula = " * E(" 69 | 70 | for i, (table, normalizer) in enumerate(self.nominator_multipliers): 71 | formula += table + "." 
+ normalizer 72 | if i < len(self.nominator_multipliers) - 1: 73 | formula += "*" 74 | if len(self.nominator_multipliers) == 0: 75 | formula += "1" 76 | 77 | if len(self.denominator_multipliers) > 0: 78 | formula += "/(" 79 | 80 | # 1/multiplier 81 | for i, (table, normalizer) in enumerate(self.denominator_multipliers): 82 | formula += table + "." + normalizer 83 | if i < len(self.denominator_multipliers) - 1: 84 | formula += "*" 85 | formula += ")" 86 | 87 | # |c_1 Λ… Λc_n 88 | if len(self.conditions) > 0: 89 | formula += "* 1_{" 90 | formula += print_conditions(self.conditions) 91 | formula += "}" 92 | formula += ")" 93 | 94 | return formula 95 | 96 | def print_conditions(self, seperator='Λ'): 97 | return print_conditions(self.conditions, seperator=seperator) 98 | 99 | 100 | class Expectation: 101 | """ 102 | Represents conditional expectation of feature with normalizing multipliers. 103 | """ 104 | 105 | def __init__(self, features, normalizing_multipliers, conditions, spn=None): 106 | self.features = features 107 | self.normalizing_multipliers = normalizing_multipliers 108 | self.conditions = conditions 109 | self.spn = spn 110 | self.min_val = 1 111 | 112 | def matches(self, other_expectation, ignore_spn=False): 113 | if set(self.features) != set(other_expectation.features): 114 | return False 115 | if set(self.normalizing_multipliers) != set(other_expectation.normalizing_multipliers): 116 | return False 117 | if set(self.conditions) != set(other_expectation.conditions): 118 | return False 119 | if not ignore_spn and self.spn != other_expectation.spn: 120 | return False 121 | return True 122 | 123 | def __hash__(self): 124 | return hash((FactorType.EXPECTATION, frozenset(self.features), frozenset(self.normalizing_multipliers), 125 | frozenset(self.conditions), self.spn)) 126 | 127 | def __str__(self): 128 | """ 129 | Prints Expectation of feature for conditions. 130 | E(feature | c_1 Λ… Λc_n) (norm by multipliers). 131 | """ 132 | 133 | formula = " * E(" 134 | # features 135 | for i, (table, multiplier) in enumerate(self.features): 136 | formula += table + "." + multiplier 137 | if i < len(self.features) - 1: 138 | formula += "*" 139 | 140 | # /(multipliers) 141 | if len(self.normalizing_multipliers) > 0: 142 | formula += " /(" 143 | # 1/multiplier 144 | for i, (table, normalizer) in enumerate(self.normalizing_multipliers): 145 | formula += table + "." 
+ normalizer 146 | if i < len(self.normalizing_multipliers) - 1: 147 | formula += "*" 148 | formula += ")" 149 | 150 | # |c_1 Λ… Λc_n 151 | if len(self.conditions) > 0: 152 | formula += "| " 153 | formula += print_conditions(self.conditions) 154 | 155 | formula += ")" 156 | 157 | return formula 158 | 159 | def print_conditions(self, seperator='Λ'): 160 | return print_conditions(self.conditions, seperator=seperator) 161 | 162 | 163 | class Probability: 164 | 165 | def __init__(self, conditions): 166 | self.conditions = conditions 167 | 168 | def matches(self, other_probability): 169 | if set(self.conditions) != set(other_probability.conditions): 170 | return False 171 | return True 172 | 173 | def __str__(self): 174 | """ 175 | Prints Probability of conditions 176 | """ 177 | 178 | formula = "" 179 | if len(self.conditions) > 0: 180 | formula += " * P(" 181 | formula += print_conditions(self.conditions) 182 | formula += ")" 183 | 184 | return formula 185 | 186 | def print_conditions(self, seperator='Λ'): 187 | return print_conditions(self.conditions, seperator=seperator) 188 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_compilation/utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | 4 | def print_conditions(conditions, seperator='Λ'): 5 | """Pretty prints a set of conditions with a custom seperator.""" 6 | 7 | formula = "" 8 | for i, (table, condition) in enumerate(conditions): 9 | formula += table + "." + condition 10 | if i < len(conditions) - 1: 11 | formula += ' ' + seperator + ' ' 12 | 13 | return formula 14 | 15 | 16 | def gen_full_join_query(schema_graph, relationship_set, table_set, join_type): 17 | """ 18 | Creates the full outer join to for a relationship set for join_type FULL OUTER JOIN or JOIN 19 | """ 20 | 21 | from_clause = "" 22 | if len(relationship_set) == 0: 23 | assert(len(table_set) == 1) 24 | 25 | from_clause = list(table_set)[0] 26 | 27 | else: 28 | included_tables = set() 29 | relationships = copy.copy(relationship_set) 30 | 31 | while relationships: 32 | # first relation to be included 33 | if len(included_tables) == 0: 34 | relationship = relationships.pop() 35 | relationship_obj = schema_graph.relationship_dictionary[relationship] 36 | included_tables.add(relationship_obj.start) 37 | included_tables.add(relationship_obj.end) 38 | from_clause += relationship_obj.start + " " + join_type + " " + relationship_obj.end + " ON " + relationship 39 | else: 40 | # search in suitable relations 41 | relationship_to_add = None 42 | for relationship in relationships: 43 | relationship_obj = schema_graph.relationship_dictionary[relationship] 44 | if (relationship_obj.start in included_tables and relationship_obj.end not in included_tables) or \ 45 | (relationship_obj.end in included_tables and relationship_obj.start not in included_tables): 46 | relationship_to_add = relationship 47 | if relationship_to_add is None: 48 | raise ValueError("Query not a tree") 49 | # add it to where formula 50 | relationship_obj = schema_graph.relationship_dictionary[relationship_to_add] 51 | if (relationship_obj.start in included_tables and relationship_obj.end not in included_tables): 52 | from_clause += " " + join_type + " " + relationship_obj.end + " ON " + relationship_to_add 53 | included_tables.add(relationship_obj.end) 54 | relationships.remove(relationship_to_add) 55 | elif (relationship_obj.end in included_tables and relationship_obj.start not in 
included_tables): 56 | from_clause += " " + join_type + " " + relationship_obj.start + " ON " + relationship_to_add 57 | included_tables.add(relationship_obj.start) 58 | relationships.remove(relationship_to_add) 59 | 60 | return "SELECT {} FROM " + from_clause + " {}" -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_creation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-db/AreCELearnedYet/aa52da7768023270bad884232972e0b77ec6534a/lecarb/estimator/deepdb/ensemble_creation/__init__.py -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_creation/naive.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from aqp_spn.aqp_spn import AQPSPN 4 | from data_preparation.join_data_preparation import JoinDataPreparator 5 | from ensemble_compilation.spn_ensemble import SPNEnsemble 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | RATIO_MIN_INSTANCE_SLICE = 1 / 100 10 | 11 | 12 | def create_naive_all_split_ensemble(schema, hdf_path, sample_size, ensemble_path, dataset, bloom_filters, 13 | rdc_threshold, max_table_data, post_sampling_factor, incremental_learning_rate): 14 | meta_data_path = hdf_path + '/meta_data.pkl' 15 | prep = JoinDataPreparator(meta_data_path, schema, max_table_data=max_table_data) 16 | spn_ensemble = SPNEnsemble(schema) 17 | 18 | logger.info(f"Creating naive ensemble.") 19 | 20 | for table_obj in schema.tables: 21 | logger.info(f"Learning SPN for {table_obj.table_name}.") 22 | if incremental_learning_rate > 0: 23 | df_samples, df_inc_samples, meta_types, null_values, full_join_est = prep.generate_n_samples_with_incremental_part( 24 | sample_size, 25 | single_table=table_obj.table_name, 26 | post_sampling_factor=post_sampling_factor, 27 | incremental_learning_rate=incremental_learning_rate) 28 | logger.debug(f"Requested {sample_size} samples and got {len(df_samples)} + {len(df_inc_samples)} " 29 | f"(for incremental learning)") 30 | else: 31 | df_samples, meta_types, null_values, full_join_est = prep.generate_n_samples(sample_size, 32 | single_table=table_obj.table_name, 33 | post_sampling_factor=post_sampling_factor) 34 | 35 | # learn spn 36 | aqp_spn = AQPSPN(meta_types, null_values, full_join_est, schema, None, full_sample_size=len(df_samples), 37 | table_set={table_obj.table_name}, column_names=list(df_samples.columns), 38 | table_meta_data=prep.table_meta_data) 39 | min_instance_slice = RATIO_MIN_INSTANCE_SLICE * min(sample_size, len(df_samples)) 40 | logger.debug(f"Using min_instance_slice parameter {min_instance_slice}.") 41 | logger.info(f"SPN training phase with {len(df_samples)} samples") 42 | aqp_spn.learn(df_samples.values, min_instances_slice=min_instance_slice, bloom_filters=bloom_filters, 43 | rdc_threshold=rdc_threshold) 44 | if incremental_learning_rate > 0: 45 | logger.info(f"additional incremental SPN training phase with {len(df_inc_samples)} samples " 46 | f"({incremental_learning_rate}%)") 47 | aqp_spn.learn_incremental(df_inc_samples.values) 48 | spn_ensemble.add_spn(aqp_spn) 49 | 50 | ensemble_path += '/ensemble_single_' + dataset + '_' + str(sample_size) + '.pkl' 51 | logger.info(f"Saving ensemble to {ensemble_path}") 52 | spn_ensemble.save(ensemble_path) 53 | 54 | 55 | def naive_every_relationship_ensemble(schema, hdf_path, sample_size, ensemble_path, dataset, bloom_filters, 56 | 
rdc_threshold, max_table_data, post_sampling_factor, 57 | incremental_learning_rate=0): 58 | meta_data_path = hdf_path + '/meta_data.pkl' 59 | prep = JoinDataPreparator(meta_data_path, schema, max_table_data=max_table_data) 60 | spn_ensemble = SPNEnsemble(schema) 61 | 62 | logger.info(f"Creating naive ensemble for every relationship.") 63 | for relationship_obj in schema.relationships: 64 | logger.info(f"Learning SPN for {relationship_obj.identifier}.") 65 | 66 | if incremental_learning_rate > 0: 67 | df_samples, df_inc_samples, meta_types, null_values, full_join_est = prep.generate_n_samples_with_incremental_part( 68 | sample_size, relationship_list=[relationship_obj.identifier], post_sampling_factor=post_sampling_factor, 69 | incremental_learning_rate=incremental_learning_rate) 70 | else: 71 | df_samples, meta_types, null_values, full_join_est = prep.generate_n_samples( 72 | sample_size, relationship_list=[relationship_obj.identifier], post_sampling_factor=post_sampling_factor) 73 | logger.debug(f"Requested {sample_size} samples and got {len(df_samples)}") 74 | 75 | # learn spn 76 | aqp_spn = AQPSPN(meta_types, null_values, full_join_est, schema, 77 | [relationship_obj.identifier], full_sample_size=len(df_samples), 78 | column_names=list(df_samples.columns), table_meta_data=prep.table_meta_data) 79 | min_instance_slice = RATIO_MIN_INSTANCE_SLICE * min(sample_size, len(df_samples)) 80 | logger.debug(f"Using min_instance_slice parameter {min_instance_slice}.") 81 | logger.info(f"SPN training phase with {len(df_samples)} samples") 82 | aqp_spn.learn(df_samples.values, min_instances_slice=min_instance_slice, bloom_filters=bloom_filters, 83 | rdc_threshold=rdc_threshold) 84 | if incremental_learning_rate > 0: 85 | logger.info(f"additional incremental SPN training phase with {len(df_inc_samples)} samples " 86 | f"({incremental_learning_rate}%)") 87 | aqp_spn.learn_incremental(df_inc_samples) 88 | spn_ensemble.add_spn(aqp_spn) 89 | 90 | ensemble_path += '/ensemble_relationships_' + dataset + '_' + str(sample_size) + '.pkl' 91 | logger.info(f"Saving ensemble to {ensemble_path}") 92 | spn_ensemble.save(ensemble_path) 93 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/ensemble_creation/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | def create_random_join(schema, no_relationships): 5 | assert no_relationships >= 0, "No_relationships must be greater equal 0" 6 | 7 | start_tables = list(schema.tables) 8 | random.shuffle(start_tables) 9 | start_table_obj = start_tables[0] 10 | 11 | merged_tables = {start_table_obj.table_name} 12 | relationships = set() 13 | 14 | for i in range(no_relationships): 15 | 16 | possible_next_relationships = list() 17 | 18 | for relationship_obj in schema.relationships: 19 | # already in random relationships 20 | if relationship_obj.identifier in relationships: 21 | continue 22 | 23 | if relationship_obj.start in merged_tables and \ 24 | relationship_obj.end not in merged_tables: 25 | possible_next_relationships.append((relationship_obj.identifier, relationship_obj.end)) 26 | 27 | elif relationship_obj.end in merged_tables and \ 28 | relationship_obj.start not in merged_tables: 29 | possible_next_relationships.append((relationship_obj.identifier, relationship_obj.start)) 30 | 31 | random.shuffle(possible_next_relationships) 32 | if len(possible_next_relationships) == 0: 33 | return list(relationships), merged_tables 34 | 35 | relationship, 
table = possible_next_relationships[0] 36 | merged_tables.add(table) 37 | relationships.add(relationship) 38 | 39 | return list(relationships), merged_tables 40 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/evaluation/cardinality_evaluation.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from time import perf_counter 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from ensemble_compilation.graph_representation import QueryType 8 | from ensemble_compilation.physical_db import DBConnection, TrueCardinalityEstimator 9 | from ensemble_compilation.spn_ensemble import read_ensemble 10 | from evaluation.utils import parse_query, save_csv 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def compute_ground_truth(query_filename, target_path, physical_db_name): 16 | """ 17 | Queries database for each query and stores result rows in csv file. 18 | :param query_filename: where to take queries from 19 | :param target_path: where to store dictionary 20 | :param physical_db_name: name of the database 21 | :return: 22 | """ 23 | 24 | db_connection = DBConnection(db=physical_db_name) 25 | 26 | # read all queries 27 | with open(query_filename) as f: 28 | queries = f.readlines() 29 | 30 | csv_rows = [] 31 | for query_no, query_str in enumerate(queries): 32 | logger.debug(f"Computing ground truth for cardinality query {query_no}: {query_str}") 33 | query_str = query_str.strip() 34 | cardinality_true = db_connection.get_result(query_str) 35 | 36 | csv_rows.append({'query_no': query_no, 37 | 'query': query_str, 38 | 'cardinality_true': cardinality_true}) 39 | 40 | save_csv(csv_rows, target_path) 41 | 42 | 43 | class GenCodeStats: 44 | 45 | def __init__(self): 46 | self.calls = 0 47 | self.total_time = 0.0 48 | 49 | 50 | def evaluate_cardinalities(ensemble_location, physical_db_name, query_filename, target_csv_path, schema, 51 | rdc_spn_selection, pairwise_rdc_path, use_generated_code=False, 52 | true_cardinalities_path='./benchmarks/job-light/sql/job_light_true_cardinalities.csv', 53 | max_variants=1, merge_indicator_exp=False, exploit_overlapping=False, min_sample_ratio=0): 54 | """ 55 | Loads ensemble and evaluates cardinality for every query in query_filename 56 | :param exploit_overlapping: 57 | :param min_sample_ratio: 58 | :param max_variants: 59 | :param merge_indicator_exp: 60 | :param target_csv_path: 61 | :param query_filename: 62 | :param true_cardinalities_path: 63 | :param ensemble_location: 64 | :param physical_db_name: 65 | :param schema: 66 | :return: 67 | """ 68 | if true_cardinalities_path is not None: 69 | df_true_card = pd.read_csv(true_cardinalities_path) 70 | else: 71 | # True cardinality via DB 72 | db_connection = DBConnection(db=physical_db_name) 73 | true_estimator = TrueCardinalityEstimator(schema, db_connection) 74 | 75 | # load ensemble 76 | spn_ensemble = read_ensemble(ensemble_location, build_reverse_dict=True) 77 | 78 | csv_rows = [] 79 | q_errors = [] 80 | 81 | # read all queries 82 | with open(query_filename) as f: 83 | queries = f.readlines() 84 | 85 | if use_generated_code: 86 | spn_ensemble.use_generated_code() 87 | 88 | latencies = [] 89 | for query_no, query_str in enumerate(queries): 90 | 91 | query_str = query_str.strip() 92 | logger.debug(f"Predicting cardinality for query {query_no}: {query_str}") 93 | 94 | query = parse_query(query_str.strip(), schema) 95 | assert query.query_type == QueryType.CARDINALITY 96 | 97 | if 
df_true_card is None: 98 | assert true_estimator is not None 99 | _, cardinality_true = true_estimator.true_cardinality(query) 100 | else: 101 | cardinality_true = df_true_card.loc[df_true_card['query_no'] == query_no, ['cardinality_true']].values[0][0] 102 | 103 | # only relevant for generated code 104 | gen_code_stats = GenCodeStats() 105 | 106 | card_start_t = perf_counter() 107 | _, factors, cardinality_predict, factor_values = spn_ensemble \ 108 | .cardinality(query, rdc_spn_selection=rdc_spn_selection, pairwise_rdc_path=pairwise_rdc_path, 109 | merge_indicator_exp=merge_indicator_exp, max_variants=max_variants, 110 | exploit_overlapping=exploit_overlapping, return_factor_values=True, 111 | gen_code_stats=gen_code_stats) 112 | card_end_t = perf_counter() 113 | latency_ms = (card_end_t - card_start_t) * 1000 114 | 115 | logger.debug(f"\t\tLatency: {latency_ms:.2f}ms") 116 | logger.debug(f"\t\tTrue: {cardinality_true}") 117 | logger.debug(f"\t\tPredicted: {cardinality_predict}") 118 | 119 | q_error = max(cardinality_predict / cardinality_true, cardinality_true / cardinality_predict) 120 | if cardinality_predict == 0 and cardinality_true == 0: 121 | q_error = 1.0 122 | 123 | logger.debug(f"Q-Error was: {q_error}") 124 | q_errors.append(q_error) 125 | csv_rows.append({'query_no': query_no, 126 | 'query': query_str, 127 | 'cardinality_predict': cardinality_predict, 128 | 'cardinality_true': cardinality_true, 129 | 'latency_ms': latency_ms, 130 | 'generated_spn_calls': gen_code_stats.calls, 131 | 'latency_generated_code': gen_code_stats.total_time * 1000}) 132 | latencies.append(latency_ms) 133 | 134 | # print percentiles of published JOB-light 135 | q_errors = np.array(q_errors) 136 | q_errors.sort() 137 | logger.info(f"{q_errors[-10:]}") 138 | # https://arxiv.org/pdf/1809.00677.pdf 139 | ibjs_vals = [1.59, 150, 3198, 14309, 590] 140 | mcsn_vals = [3.82, 78.4, 362, 927, 57.9] 141 | for i, percentile in enumerate([50, 90, 95, 99]): 142 | logger.info(f"Q-Error {percentile}%-Percentile: {np.percentile(q_errors, percentile)} (vs. " 143 | f"MCSN: {mcsn_vals[i]} and IBJS: {ibjs_vals[i]})") 144 | 145 | logger.info(f"Q-Mean wo inf {np.mean(q_errors[np.isfinite(q_errors)])} (vs. 
" 146 | f"MCSN: {mcsn_vals[-1]} and IBJS: {ibjs_vals[-1]})") 147 | logger.info(f"Latency avg: {np.mean(latencies):.2f}ms") 148 | 149 | # write to csv 150 | save_csv(csv_rows, target_csv_path) 151 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/evaluation/confidence_interval_evaluation.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | import pickle 4 | from time import perf_counter 5 | 6 | import math 7 | import scipy 8 | 9 | from ensemble_compilation.graph_representation import AggregationType 10 | from ensemble_compilation.spn_ensemble import read_ensemble, logger 11 | from evaluation.utils import parse_query, all_operations_of_type, save_csv 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def evaluate_confidence_intervals(ensemble_location, query_filename, target_path, schema, ground_truth_path, 17 | confidence_sample_size, rdc_spn_selection, pairwise_rdc_path, 18 | max_variants=5, merge_indicator_exp=False, 19 | exploit_overlapping=False, min_sample_ratio=0, sample_size=10000000, 20 | true_result_upsampling_factor=300): # 100 21 | """ 22 | Loads ensemble and computes metrics for confidence interval evaluation 23 | :param ensemble_location: 24 | :param query_filename: 25 | :param target_csv_path: 26 | :param schema: 27 | :param max_variants: 28 | :param merge_indicator_exp: 29 | :param exploit_overlapping: 30 | :param min_sample_ratio: 31 | :return: 32 | """ 33 | 34 | spn_ensemble = read_ensemble(ensemble_location, build_reverse_dict=True) 35 | csv_rows = [] 36 | 37 | # read all queries 38 | with open(query_filename) as f: 39 | queries = f.readlines() 40 | # read ground truth 41 | with open(ground_truth_path, 'rb') as handle: 42 | ground_truth = pickle.load(handle) 43 | 44 | for query_no, query_str in enumerate(queries): 45 | 46 | query_str = query_str.strip() 47 | logger.info(f"Evaluating the confidence intervals for query {query_no}: {query_str}") 48 | 49 | query = parse_query(query_str.strip(), schema) 50 | aqp_start_t = perf_counter() 51 | confidence_intervals, aqp_result = spn_ensemble.evaluate_query(query, rdc_spn_selection=rdc_spn_selection, 52 | pairwise_rdc_path=pairwise_rdc_path, 53 | merge_indicator_exp=merge_indicator_exp, 54 | max_variants=max_variants, 55 | exploit_overlapping=exploit_overlapping, 56 | debug=False, 57 | confidence_intervals=True, 58 | confidence_sample_size=confidence_sample_size) 59 | aqp_end_t = perf_counter() 60 | latency = aqp_end_t - aqp_start_t 61 | logger.info(f"\t\t{'total_time:':<32}{latency} secs") 62 | 63 | true_result = ground_truth[query_no] 64 | 65 | type_all_ops = None 66 | if all_operations_of_type(AggregationType.SUM, query): 67 | type_all_ops = AggregationType.SUM 68 | elif all_operations_of_type(AggregationType.AVG, query): 69 | type_all_ops = AggregationType.AVG 70 | elif all_operations_of_type(AggregationType.COUNT, query): 71 | type_all_ops = AggregationType.COUNT 72 | 73 | if isinstance(aqp_result, list): 74 | for result_row in true_result: 75 | group_by_attributes = result_row[:-3] 76 | matching_aqp_rows = [(matching_idx, aqp_row) for matching_idx, aqp_row in enumerate(aqp_result) 77 | if aqp_row[:-1] == group_by_attributes] 78 | assert len(matching_aqp_rows) <= 1, "Multiple possible group by attributes found." 
79 | if len(matching_aqp_rows) == 1: 80 | matching_idx, matching_aqp_row = matching_aqp_rows[0] 81 | true_aggregate, std, count = result_row[-3:] 82 | 83 | if count <= 1: 84 | # std is not defined in this case 85 | continue 86 | 87 | interval = confidence_intervals[matching_idx] 88 | aqp_std, true_std, relative_confidence_interval_error, true_result, aqp_aggregate = evaluate_stds( 89 | matching_aqp_row[-1], 90 | interval, count, 91 | sample_size, std, 92 | true_aggregate, type_all_ops, 93 | true_result_upsampling_factor) 94 | 95 | logger.debug(f"\t\taqp_std: {aqp_std}") 96 | logger.debug(f"\t\ttrue_std: {true_std}") 97 | 98 | csv_rows.append({'query_no': query_no, 99 | 'latency': latency, 100 | 'aqp_std': aqp_std, 101 | 'aqp_aggregate': aqp_aggregate, 102 | 'true_std': true_std, 103 | 'true_aggregate': true_result, 104 | 'count': count, 105 | 'relative_confidence_interval_error': relative_confidence_interval_error 106 | }) 107 | else: 108 | true_aggregate, std, count = true_result[0][-3:] 109 | 110 | aqp_std, true_std, relative_confidence_interval_error, true_result, aqp_aggregate = evaluate_stds( 111 | aqp_result, confidence_intervals, 112 | count, sample_size, std, 113 | true_aggregate, 114 | type_all_ops, 115 | true_result_upsampling_factor) 116 | logger.debug(f"\t\taqp_std: {aqp_std}") 117 | logger.debug(f"\t\ttrue_std: {true_std}") 118 | 119 | csv_rows.append({'query_no': query_no, 120 | 'latency': latency, 121 | 'aqp_std': aqp_std, 122 | 'aqp_aggregate': aqp_aggregate, 123 | 'true_std': true_std, 124 | 'true_aggregate': true_result, 125 | 'count': count, 126 | 'relative_confidence_interval_error': relative_confidence_interval_error 127 | }) 128 | 129 | save_csv(csv_rows, target_path) 130 | 131 | 132 | def evaluate_stds(aqp_result, confidence_intervals, count, sample_size, std, true_result, type_all_ops, 133 | true_result_upsampling_factor): 134 | std = float(std) 135 | count = float(count) 136 | true_result = float(true_result) 137 | confidence_upper_bound = confidence_intervals[1] 138 | ci_length = confidence_upper_bound - aqp_result 139 | aqp_std = ci_length # / scipy.stats.norm.ppf(0.95) 140 | if type_all_ops == AggregationType.AVG: 141 | # for normal random variable std/sqrt(n) 142 | true_std = std / math.sqrt(count) 143 | 144 | elif type_all_ops == AggregationType.COUNT: 145 | # for bernoulli: sqrt(n*p*(1-p)) 146 | 147 | bernoulli_p = count / sample_size 148 | true_std = math.sqrt(sample_size * bernoulli_p * (1 - bernoulli_p)) * true_result_upsampling_factor 149 | true_result *= true_result_upsampling_factor 150 | 151 | elif type_all_ops == AggregationType.SUM: 152 | # model sum as product of 1_c * X 153 | 154 | bernoulli_p = count / sample_size 155 | bernoulli_std = math.sqrt(sample_size * bernoulli_p * (1 - bernoulli_p)) 156 | 157 | rv_exp = true_result / count 158 | rv_std = std / math.sqrt(count) 159 | 160 | true_std = math.sqrt((bernoulli_std ** 2 + bernoulli_p ** 2) * (rv_std ** 2 + rv_exp ** 2) - 161 | bernoulli_p ** 2 * rv_exp ** 2) * true_result_upsampling_factor 162 | true_result *= true_result_upsampling_factor 163 | 164 | true_std *= scipy.stats.norm.ppf(0.95) 165 | relative_confidence_interval_error = abs(aqp_std - true_std) / true_result 166 | return aqp_std, true_std, relative_confidence_interval_error, true_result, aqp_result 167 | -------------------------------------------------------------------------------- /lecarb/estimator/deepdb/evaluation/spn_statistics.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 
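# --- Editor's note (added commentary, not part of the original file) ---
# This module summarizes learned SPN ensembles: it walks every file under spn_path whose
# name starts with "ensemble" (skipping .zip archives), records per-SPN statistics
# (learn time, sample size, min_instances_slice, rdc_threshold, join/table/column counts,
# and the node count against a rough upper bound of about 200 nodes per column), appends
# the HDF preprocessing time read from build_time_path, and writes all rows to a CSV at
# target_csv_path.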
import logging 3 | import os 4 | 5 | from spn.structure.Base import Node, get_nodes_by_type 6 | 7 | from ensemble_compilation.spn_ensemble import read_ensemble 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def evaluate_spn_statistics(spn_path, target_csv_path, build_time_path): 13 | csv_list = [] 14 | 15 | # SPN learn times 16 | for filename in os.listdir(spn_path): 17 | logger.debug(f'Reading {filename}') 18 | if not filename.startswith("ensemble") or filename.endswith('.zip'): 19 | continue 20 | 21 | spn_ensemble = read_ensemble(os.path.join(spn_path, filename)) 22 | for spn in spn_ensemble.spns: 23 | num_nodes = len(get_nodes_by_type(spn.mspn, Node)) 24 | upper_bound = 200 * len(spn.column_names) - 1 25 | # assert num_nodes <= upper_bound, "Num of nodes upper bound is wrong" 26 | csv_list.append((filename, spn.learn_time, spn.full_sample_size, spn.min_instances_slice, spn.rdc_threshold, 27 | len(spn.relationship_set), len(spn.table_set), 28 | " - ".join([table for table in spn.table_set]), 29 | len(spn.column_names), 30 | num_nodes, 31 | upper_bound)) 32 | 33 | # HDF create times 34 | with open(build_time_path) as f: 35 | hdf_preprocessing_time = int(f.readlines()[0]) 36 | csv_list += [('generate_hdf', hdf_preprocessing_time, 0, 0, 0, 0, 0, "")] 37 | 38 | with open(target_csv_path, 'w', newline='') as f: 39 | writer = csv.writer(f) 40 | writer.writerow( 41 | ['filename', 'learn_time', 'full_sample_size', 'min_instances_slice', 'rdc_threshold', 'no_joins', 42 | 'no_tables', 'tables', 'no_columns', 'structure_stats', 'upper_bound']) 43 | writer.writerows(csv_list) 44 | -------------------------------------------------------------------------------- /lecarb/estimator/estimator.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | import numpy as np 4 | from typing import Tuple, Any 5 | from ..workload.workload import Query, query_2_triple 6 | from ..dataset.dataset import Table 7 | 8 | L = logging.getLogger(__name__) 9 | 10 | class Estimator(object): 11 | """Base class for a cardinality estimator.""" 12 | def __init__(self, table: Table, **kwargs: Any) -> None: 13 | self.table = table 14 | self.params = dict(kwargs) 15 | 16 | def __repr__(self) -> str: 17 | pstr = ';'.join([f"{p}={v}" for p, v in self.params.items()]) 18 | return f"{self.__class__.__name__.lower()}-{pstr}" 19 | 20 | def query(self, query: Query) -> Tuple[float, float]: 21 | """return est_card, dur_ms""" 22 | raise NotImplementedError 23 | 24 | def in_between(data: Any, val: Tuple[Any, Any]) -> bool: 25 | assert len(val) == 2 26 | lrange, rrange = val 27 | return np.greater_equal(data, lrange) & np.less_equal(data, rrange) 28 | 29 | OPS = { 30 | '>': np.greater, 31 | '<': np.less, 32 | '>=': np.greater_equal, 33 | '<=': np.less_equal, 34 | '=': np.equal, 35 | '[]': in_between 36 | } 37 | 38 | class Oracle(Estimator): 39 | def __init__(self, table): 40 | super(Oracle, self).__init__(table=table) 41 | 42 | def query(self, query): 43 | columns, operators, values = query_2_triple(query, with_none=False, split_range=False) 44 | start_stmp = time.time() 45 | bitmap = np.ones(self.table.row_num, dtype=bool) 46 | for c, o, v in zip(columns, operators, values): 47 | bitmap &= OPS[o](self.table.data[c], v) 48 | card = bitmap.sum() 49 | dur_ms = (time.time() - start_stmp) * 1e3 50 | return card, dur_ms 51 | 52 | # from pandasql import sqldf <- too slow 53 | # def query(self, query): 54 | # sql = query_2_sql(query, self.table) 55 | # data = 
self.table.data 56 | # start_stmp = time.time() 57 | # df = sqldf(sql, locals()) 58 | # card = df.iloc[0, 0] 59 | # dur_ms = (time.time() - start_stmp) * 1e3 60 | # return card, dur_ms 61 | -------------------------------------------------------------------------------- /lecarb/estimator/feedback_kde.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | from typing import Any, Dict 4 | import psycopg2 5 | 6 | from .estimator import Estimator 7 | from .utils import run_test 8 | from ..workload.workload import query_2_kde_sql, load_queryset 9 | from ..dataset.dataset import load_table 10 | from ..constants import KDE_DATABASE_URL 11 | 12 | L = logging.getLogger(__name__) 13 | 14 | class FeedbackKDE(Estimator): 15 | def __init__(self, table, ratio, train_num, seed): 16 | super(FeedbackKDE, self).__init__(table=table, version=table.version, ratio=ratio, train_num=train_num, seed=seed) 17 | self.sample_num = int(table.row_num * ratio) 18 | L.info(f"Going to collect {self.sample_num} samples") 19 | 20 | self.conn = psycopg2.connect(KDE_DATABASE_URL) 21 | self.conn.set_session('read uncommitted', autocommit=True) 22 | self.cursor = self.conn.cursor() 23 | 24 | # Make sure that debug mode is deactivated and that all model traces are removed (unless we want to reuse the model): 25 | self.cursor.execute(f"SELECT setseed({1/seed});") 26 | # self.cursor.execute("SET kde_debug TO true;") 27 | self.cursor.execute("SET kde_debug TO false;") 28 | self.cursor.execute("SET ocl_use_gpu TO true;") 29 | self.cursor.execute("SET kde_error_metric TO Quadratic;") 30 | 31 | # Remove all existing model traces if we don't reuse the model. 32 | self.cursor.execute("DELETE FROM pg_kdemodels;") 33 | self.cursor.execute("DELETE FROM pg_kdefeedback;") 34 | self.cursor.execute("SELECT pg_stat_reset();") 35 | 36 | # KDE-specific parameters. 37 | self.cursor.execute(f"SET kde_samplesize TO {self.sample_num};") 38 | self.cursor.execute("SET kde_enable TO true;") 39 | self.cursor.execute("SET kde_collect_feedback TO true;") 40 | 41 | def train_batch(self, queries): 42 | for i, query in enumerate(queries): 43 | self.cursor.execute(query_2_kde_sql(query, self.table)) 44 | if (i + 1) % 100 == 0: 45 | L.info(f"{i+1} queries done") 46 | L.info("Finishing running all training queries") 47 | 48 | self.cursor.execute("SET kde_collect_feedback TO false;") # We don't need further feedback collection. 
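# --- Editor's note (added commentary, not part of the original file) ---
# At this point every training query has been executed once with kde_collect_feedback
# enabled, so the selectivity feedback now lives inside the (modified) PostgreSQL server.
# The statements below switch Feedback-KDE into optimization mode: bandwidth tuning is
# enabled over a feedback window spanning all training queries, the per-column statistics
# target is set, and ANALYZE is run so the server (presumably) builds its KDE model over
# the configured sample; the sample is then dumped to a CSV via kde_dump_sample.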
49 | self.cursor.execute("SET kde_enable_bandwidth_optimization TO true;") 50 | self.cursor.execute(f"SET kde_optimization_feedback_window TO {len(queries)};") 51 | 52 | stat_cnt = 100 53 | for c in self.table.columns.values(): 54 | self.cursor.execute(f"alter table \"{self.table.name}\" alter column {c.name} set statistics {stat_cnt};") 55 | 56 | self.cursor.execute(f"analyze \"{self.table.name}\"({','.join(self.table.columns.keys())});") 57 | 58 | sample_file = f"/tmp/sample_{self.table.name}.csv" 59 | self.cursor.execute(f"SELECT kde_dump_sample('{self.table.name}', '{sample_file}');") 60 | 61 | def query(self, query): 62 | sql = f"explain(format json) {query_2_kde_sql(query, self.table)}" 63 | 64 | start_stmp = time.time() 65 | self.cursor.execute(sql) 66 | dur_ms = (time.time() - start_stmp) * 1e3 67 | res = self.cursor.fetchall() 68 | card = res[0][0][0]['Plan']['Plan Rows'] 69 | # L.info(card) 70 | return card, dur_ms 71 | 72 | def test_kde(seed: int, dataset: str, version: str, workload:str, params: Dict[str, Any], overwrite: bool): 73 | """ 74 | params: 75 | version: the version of table that postgres construct statistics, might not be the same with the one we test on 76 | ratio: ratio of the sample size 77 | train_num: number of queries use to train 78 | """ 79 | # prioriy: params['version'] (build statistics from another dataset) > version (build statistics on the same dataset) 80 | table = load_table(dataset, params.get('version') or version) 81 | train_num = params['train_num'] 82 | 83 | L.info("load training workload...") 84 | queries = load_queryset(dataset, workload)['train'][:train_num] 85 | 86 | L.info("construct postgres estimator...") 87 | estimator = FeedbackKDE(table, ratio=params['ratio'], train_num=train_num, seed=seed) 88 | 89 | L.info(f"start training with {train_num} queries...") 90 | start_stmp = time.time() 91 | estimator.train_batch(queries) 92 | dur_min = (time.time() - start_stmp) / 60 93 | L.info(f"built kde estimator: {estimator}, using {dur_min:1f} minutes") 94 | 95 | run_test(dataset, version, workload, estimator, overwrite) 96 | 97 | 98 | -------------------------------------------------------------------------------- /lecarb/estimator/lw/README.md: -------------------------------------------------------------------------------- 1 | Implementation of paper [Selectivity Estimation for Range Predicates using Lightweight Models](http://www.vldb.org/pvldb/vol12/p1044-dutt.pdf) 2 | -------------------------------------------------------------------------------- /lecarb/estimator/lw/common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import logging 4 | 5 | from ..postgres import Postgres 6 | from ...workload.workload import load_queryset, load_labels, query_2_sqls, query_2_vector 7 | from ...constants import DATA_ROOT, PKL_PROTO 8 | 9 | L = logging.getLogger(__name__) 10 | 11 | # selectivity_list (np.array): selectivity for each attribute 12 | def AVI(sel_list): 13 | return np.prod(sel_list) if len(sel_list) > 0 else 1.0 14 | 15 | def EBO(sel_list): 16 | s = 1.0 17 | sorted_slist = np.sort(sel_list) 18 | for i in range(min(4, sel_list.size)): 19 | s = s * np.power(sorted_slist[i], 1 / (i+1)) 20 | return s 21 | 22 | def MinSel(sel_list): 23 | return sel_list.min() if len(sel_list) > 0 else 1.0 24 | 25 | def encode_query(table, query, pg_est): 26 | range_features = query_2_vector(query, table, upper=1000) 27 | sqls = query_2_sqls(query, table) 28 | sel_list = [] 29 | for sql in 
sqls: 30 | pred, _ = pg_est.query_sql(sql) 31 | sel_list.append(pred / table.row_num) 32 | sel_list = np.array(sel_list) 33 | ce_features = np.round(np.array([AVI(sel_list), EBO(sel_list), MinSel(sel_list)]) * table.row_num) 34 | 35 | return np.concatenate([range_features, encode_label(ce_features)]) 36 | 37 | def encode_label(label): 38 | # +1 before log2 to deal with ground truth = 0 scenario 39 | return np.log2(label + 1) 40 | 41 | def decode_label(label): 42 | return np.power(2, label) - 1 43 | 44 | def encode_queries(table, queryset, labels, pg_est): 45 | X = [] 46 | y = [] 47 | gt = [] 48 | 49 | for query, label in zip(queryset, labels): 50 | features = encode_query(table, query, pg_est) 51 | log2l = encode_label(label.cardinality) 52 | X.append(features) 53 | y.append(log2l) 54 | gt.append(label.cardinality) 55 | 56 | return np.array(X), np.array(y), np.array(gt) 57 | 58 | def load_lw_dataset(table, workload, seed, bins): 59 | query_path = DATA_ROOT / table.dataset / "lw" 60 | query_path.mkdir(exist_ok=True) 61 | 62 | file_path = query_path / f"{table.version}_{workload}_{bins}_{seed}.pkl" 63 | if file_path.is_file(): 64 | L.info(f"features already built in file {file_path}") 65 | with open(file_path, 'rb') as f: 66 | return pickle.load(f) 67 | 68 | pg_est = Postgres(table, bins, seed) 69 | L.info(f"Start loading queryset:{workload} and labels for version {table.version} of dataset {table.dataset}...") 70 | queryset = load_queryset(table.dataset, workload) 71 | labels = load_labels(table.dataset, table.version, workload) 72 | 73 | lw_dataset = {} 74 | for group in queryset.keys(): 75 | L.info(f"Start encode group: {group} with {len(labels[group])} queries...") 76 | lw_dataset[group] = encode_queries(table, queryset[group], labels[group], pg_est) 77 | 78 | with open(file_path, 'wb') as f: 79 | pickle.dump(lw_dataset, f, protocol=PKL_PROTO) 80 | 81 | return lw_dataset 82 | -------------------------------------------------------------------------------- /lecarb/estimator/lw/lw_nn.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | from typing import Dict, Any, Tuple 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | from torch.utils.data import DataLoader, Dataset 9 | 10 | from .model import LWNNModel 11 | from .common import load_lw_dataset, encode_query, decode_label 12 | from ..postgres import Postgres 13 | from ..estimator import Estimator 14 | from ..utils import report_model, evaluate, run_test 15 | from ...dataset.dataset import load_table 16 | from ...workload.workload import Query 17 | from ...constants import DEVICE, MODEL_ROOT, NUM_THREADS 18 | 19 | L = logging.getLogger(__name__) 20 | 21 | class Args: 22 | def __init__(self, **kwargs): 23 | self.bs = 32 24 | self.epochs = 500 25 | self.lr = 0.001 # default value in both pytorch and keras 26 | self.hid_units = '128_64_32' 27 | self.bins = 200 28 | self.train_num = 10000 29 | 30 | # overwrite parameters from user 31 | self.__dict__.update(kwargs) 32 | 33 | class LWQueryDataset(Dataset): 34 | def __init__(self, X, y, gt): 35 | super(LWQueryDataset, self).__init__() 36 | self.X = X 37 | self.y = y 38 | self.gt = gt 39 | def __len__(self): 40 | return len(self.y) 41 | def __getitem__(self, idx): 42 | return self.X[idx], self.y[idx], self.gt[idx] 43 | 44 | def make_dataset(dataset, num=-1): 45 | X, y, gt = dataset 46 | L.info(f"{X.shape}, {y.shape}, {gt.shape}") 47 | if num <= 0: 48 | return LWQueryDataset(X, y, gt) 49 | else: 50 | 
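# --- Editor's note (added commentary, not part of the original file) ---
# A positive `num` keeps only the first `num` encoded queries as the training/validation
# budget. Each vector was produced by encode_query() in common.py: the range features from
# query_2_vector (presumably two normalized bounds per column, which would match
# fea_num = table.col_num*2+3 used below) followed by the three log-scaled combiner
# estimates AVI, EBO and MinSel derived from per-attribute Postgres selectivity estimates.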
return LWQueryDataset(X[:num], y[:num], gt[:num]) 51 | 52 | def train_lw_nn(seed, dataset, version, workload, params, sizelimit): 53 | # uniform thread number 54 | torch.set_num_threads(NUM_THREADS) 55 | assert NUM_THREADS == torch.get_num_threads(), torch.get_num_threads() 56 | L.info(f"torch threads: {torch.get_num_threads()}") 57 | 58 | torch.manual_seed(seed) 59 | np.random.seed(seed) 60 | 61 | # convert parameter dict of lw(nn) 62 | L.info(f"params: {params}") 63 | args = Args(**params) 64 | 65 | table = load_table(dataset, version) 66 | 67 | # create model 68 | fea_num = table.col_num*2+3 69 | model = LWNNModel(fea_num, args.hid_units).to(DEVICE) 70 | model_size = report_model(model) 71 | 72 | # check size limit 73 | if sizelimit > 0 and model_size > (sizelimit * table.data_size_mb): 74 | L.info(f"Exceeds size limit {model_size:.2f}MB > {sizelimit} x {table.data_size_mb}, do not conintue training!") 75 | return 76 | L.info(f'Overall LWNN model size = {model_size:.2f}MB') 77 | 78 | # load dataset 79 | dataset = load_lw_dataset(table, workload, seed, args.bins) 80 | train_dataset = make_dataset(dataset['train'], num=args.train_num) 81 | valid_dataset = make_dataset(dataset['valid'], num=args.train_num//10) 82 | 83 | L.info(f"Number of training samples: {len(train_dataset)}") 84 | L.info(f"Number of validation samples: {len(valid_dataset)}") 85 | train_loader = DataLoader(train_dataset, batch_size=args.bs) 86 | valid_loader = DataLoader(valid_dataset, batch_size=args.bs) 87 | 88 | # Train model 89 | state = { 90 | 'seed': seed, 91 | 'args': args, 92 | 'device': DEVICE, 93 | 'threads': torch.get_num_threads(), 94 | 'dataset': table.dataset, 95 | 'version': table.version, 96 | 'workload': workload, 97 | 'model_size': model_size, 98 | 'fea_num': fea_num, 99 | } 100 | model_path = MODEL_ROOT / table.dataset 101 | model_path.mkdir(parents=True, exist_ok=True) 102 | model_file = model_path / f"{table.version}_{workload}-{model.name()}_bin{args.bins}_ep{args.epochs}_bs{args.bs}_{args.train_num//1000}k-{seed}.pt" 103 | 104 | optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) 105 | mse_loss = nn.MSELoss(reduction='none') 106 | best_valid_loss = float('inf') 107 | 108 | start_stmp = time.time() 109 | valid_time = 0 110 | for epoch in range(args.epochs): 111 | train_loss = torch.tensor([]) 112 | model.train() 113 | for _, data in enumerate(train_loader): 114 | inputs, labels, _ = data 115 | inputs = inputs.to(DEVICE).float() 116 | labels = labels.to(DEVICE).float() 117 | 118 | optimizer.zero_grad() 119 | preds = model(inputs).reshape(-1) 120 | 121 | loss = mse_loss(preds, labels) 122 | loss.mean().backward() 123 | optimizer.step() 124 | train_loss = torch.cat([train_loss, loss.cpu()]) 125 | dur_min = (time.time() - start_stmp) / 60 126 | L.info(f"Epoch {epoch+1}, loss: {train_loss.mean()}, time since start: {dur_min:.1f} mins") 127 | 128 | L.info(f"Test on valid set...") 129 | valid_stmp = time.time() 130 | valid_loss = torch.tensor([]) 131 | valid_preds = torch.tensor([]) 132 | valid_gts = torch.tensor([]) 133 | model.eval() 134 | for _, data in enumerate(valid_loader): 135 | inputs, labels, gts = data 136 | inputs = inputs.to(DEVICE).float() 137 | labels = labels.to(DEVICE).float() 138 | 139 | with torch.no_grad(): 140 | preds = model(inputs).reshape(-1) 141 | valid_preds = torch.cat([valid_preds, preds.cpu()]) 142 | valid_gts = torch.cat([valid_gts, gts.float()]) 143 | 144 | loss = mse_loss(preds, labels) 145 | valid_loss = torch.cat([valid_loss, loss.cpu()]) 146 | 147 | valid_loss = 
valid_loss.mean() 148 | L.info(f'Valid loss is {valid_loss:.4f}') 149 | valid_preds = np.maximum(np.round(decode_label(valid_preds)), 0.0) 150 | L.info("Q-Error on validation set:") 151 | _, metrics = evaluate(valid_preds, valid_gts) 152 | 153 | if valid_loss < best_valid_loss: 154 | L.info('best valid loss for now!') 155 | best_valid_loss = valid_loss 156 | state['model_state_dict'] = model.state_dict() 157 | state['optimizer_state_dict'] = optimizer.state_dict() 158 | state['valid_error'] = {workload: metrics} 159 | state['train_time'] = (valid_stmp-start_stmp-valid_time) / 60 160 | state['current_epoch'] = epoch 161 | torch.save(state, model_file) 162 | 163 | valid_time += time.time() - valid_stmp 164 | 165 | L.info(f"Training finished! Time spent since start: {(time.time()-start_stmp)/60:.2f} mins") 166 | L.info(f"Model saved to {model_file}, best valid: {state['valid_error']}") 167 | 168 | class LWNN(Estimator): 169 | def __init__(self, model, model_name, pg_est, table): 170 | super(LWNN, self).__init__(table=table, model=model_name) 171 | self.model = model.to(DEVICE) 172 | self.model.eval() 173 | self.pg_est = pg_est 174 | 175 | def query(self, query): 176 | if isinstance(query, Query): 177 | query = encode_query(self.table, query, self.pg_est) 178 | return self.query_vector(query) 179 | 180 | def query_vector(self, vec): 181 | start_stmp = time.time() 182 | with torch.no_grad(): 183 | pred = self.model(torch.FloatTensor(vec).to(DEVICE)).cpu().item() 184 | dur_ms = (time.time() - start_stmp) * 1e3 185 | return np.maximum(np.round(decode_label(pred)), 0.0), dur_ms 186 | 187 | def load_lw_nn(dataset: str, model_name: str) -> Tuple[Estimator, Dict[str, Any]]: 188 | model_file = MODEL_ROOT / dataset / f"{model_name}.pt" 189 | L.info(f"load model from {model_file} ...") 190 | state = torch.load(model_file, map_location=DEVICE) 191 | args = state['args'] 192 | 193 | table = load_table(dataset, state['version']) 194 | # load model 195 | model = LWNNModel(state['fea_num'], args.hid_units).to(DEVICE) 196 | report_model(model) 197 | L.info(f"Overall LWNN model size = {state['model_size']:.2f}MB") 198 | model.load_state_dict(state['model_state_dict']) 199 | pg_est = Postgres(table, args.bins, state['seed']) 200 | 201 | estimator = LWNN(model, model_name, pg_est, table) 202 | return estimator, state 203 | 204 | def test_lw_nn(dataset: str, version: str, workload: str, params: Dict[str, Any], overwrite: bool) -> None: 205 | """ 206 | params: 207 | model: model file name 208 | use_cache: load processed vectors directly instead of build from queries 209 | """ 210 | # uniform thread number 211 | torch.set_num_threads(NUM_THREADS) 212 | assert NUM_THREADS == torch.get_num_threads(), torch.get_num_threads() 213 | L.info(f"Torch threads: {torch.get_num_threads()}") 214 | 215 | model_file = MODEL_ROOT / dataset / f"{params['model']}.pt" 216 | L.info(f"Load model from {model_file} ...") 217 | state = torch.load(model_file, map_location=DEVICE) 218 | args = state['args'] 219 | 220 | # load corresonding version of table 221 | table = load_table(dataset, state['version']) 222 | 223 | # load model 224 | model = LWNNModel(state['fea_num'], args.hid_units).to(DEVICE) 225 | report_model(model) 226 | L.info(f"Overall LWNN model size = {state['model_size']:.2f}MB") 227 | model.load_state_dict(state['model_state_dict']) 228 | 229 | if params['use_cache']: 230 | # do not need to connect postgres in this case 231 | estimator = LWNN(model, params['model'], None, table) 232 | L.info(f"Load and build lw(nn) estimator: 
{estimator}") 233 | 234 | # test table might has different version with train 235 | test_table = load_table(dataset, version) 236 | lw_dataset = load_lw_dataset(test_table, workload, state['seed'], args.bins) 237 | X, _, gt = lw_dataset['test'] 238 | run_test(dataset, version, workload, estimator, overwrite, lw_vec=(X, gt)) 239 | else: 240 | pg_est = Postgres(table, args.bins, state['seed']) 241 | estimator = LWNN(model, params['model'], pg_est, table) 242 | L.info(f"Load and build lw(nn) estimator: {estimator}") 243 | 244 | run_test(dataset, version, workload, estimator, overwrite) 245 | 246 | 247 | -------------------------------------------------------------------------------- /lecarb/estimator/lw/lw_tree.py: -------------------------------------------------------------------------------- 1 | import time 2 | import logging 3 | from typing import Dict, Any, Tuple 4 | import pickle 5 | 6 | import numpy as np 7 | import xgboost as xgb 8 | 9 | from .common import load_lw_dataset, encode_query, decode_label 10 | from ..postgres import Postgres 11 | from ..estimator import Estimator 12 | from ..utils import evaluate, run_test 13 | from ...dataset.dataset import load_table 14 | from ...workload.workload import Query 15 | from ...constants import MODEL_ROOT, NUM_THREADS, PKL_PROTO 16 | 17 | L = logging.getLogger(__name__) 18 | 19 | class Args: 20 | def __init__(self, **kwargs): 21 | self.trees = 16 22 | self.bins = 200 23 | self.train_num = 10000 24 | 25 | # overwrite parameters from user 26 | self.__dict__.update(kwargs) 27 | 28 | def train_lw_tree(seed, dataset, version, workload, params, sizelimit): 29 | np.random.seed(seed) 30 | 31 | # convert parameter dict of lw(nn) 32 | L.info(f"params: {params}") 33 | args = Args(**params) 34 | valid_num = args.train_num // 10 35 | 36 | table = load_table(dataset, version) 37 | dataset = load_lw_dataset(table, workload, seed, args.bins) 38 | train_X, train_y, _ = dataset['train'] 39 | valid_X, valid_y, valid_gt = dataset['valid'] 40 | 41 | # Train model 42 | model_path = MODEL_ROOT / table.dataset 43 | model_path.mkdir(parents=True, exist_ok=True) 44 | model_file = model_path / f"{table.version}_{workload}-lwxgb_tr{args.trees}_bin{args.bins}_{args.train_num//1000}k-{seed}.pkl" 45 | 46 | L.info(f"Start training...") 47 | start_stmp = time.time() 48 | model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=args.trees, random_state=seed, n_jobs=NUM_THREADS) 49 | model.fit(train_X[:args.train_num], train_y[:args.train_num], eval_set=[(valid_X[:valid_num], valid_y[:valid_num])]) 50 | dur_min = (time.time() - start_stmp) / 60 51 | L.info(f"Finish training, time since start: {dur_min:.4f} mins") 52 | 53 | L.info(f"Run on valid set...") 54 | preds = np.maximum(np.round(decode_label(model.predict(valid_X[:valid_num]))), 0.0) 55 | gts = valid_gt[:valid_num] 56 | L.info("Q-Error on validation set:") 57 | _, metrics = evaluate(preds, gts) 58 | 59 | state = { 60 | 'seed': seed, 61 | 'args': args, 62 | 'device': 'cpu', 63 | 'threads': NUM_THREADS, 64 | 'dataset': table.dataset, 65 | 'version': table.version, 66 | 'workload': workload, 67 | 'model': model, 68 | 'train_time': dur_min, 69 | 'valid_error': {workload: metrics} 70 | # 'model_size': model_size, 71 | } 72 | with open(model_file, 'wb') as f: 73 | pickle.dump(state, f, protocol=PKL_PROTO) 74 | 75 | L.info(f'All finished! 
Time spent since training start: {(time.time()-start_stmp)/60:.2f} mins') 76 | L.info(f"Model saved to {model_file}") 77 | 78 | class LWTree(Estimator): 79 | def __init__(self, model, model_name, pg_est, table): 80 | super(LWTree, self).__init__(table=table, model=model_name) 81 | self.model = model 82 | self.pg_est = pg_est 83 | 84 | def query(self, query): 85 | if isinstance(query, Query): 86 | query = encode_query(self.table, query, self.pg_est) 87 | return self.query_vector(np.expand_dims(query, axis=0)) 88 | 89 | def query_vector(self, vec): 90 | start_stmp = time.time() 91 | pred = self.model.predict(vec).item() 92 | dur_ms = (time.time() - start_stmp) * 1e3 93 | return np.maximum(np.round(decode_label(pred)), 0.0), dur_ms 94 | 95 | 96 | def load_lw_tree(dataset: str, model_name: str) -> Tuple[Estimator, Dict[str, Any]]: 97 | model_file = MODEL_ROOT / dataset / f"{model_name}.pkl" 98 | L.info(f"load model from {model_file} ...") 99 | with open(model_file, 'rb') as f: 100 | state = pickle.load(f) 101 | 102 | # load model 103 | args = state['args'] 104 | model = state['model'] 105 | table = load_table(dataset, state['version']) 106 | pg_est = Postgres(table, args.bins, state['seed']) 107 | 108 | estimator = LWTree(model, model_name, pg_est, table) 109 | return estimator, state 110 | 111 | def test_lw_tree(dataset: str, version: str, workload: str, params: Dict[str, Any], overwrite: bool) -> None: 112 | """ 113 | params: 114 | model: model file name 115 | use_cache: load processed vectors directly instead of build from queries 116 | """ 117 | # uniform thread number 118 | model_file = MODEL_ROOT / dataset / f"{params['model']}.pkl" 119 | L.info(f"Load model from {model_file} ...") 120 | with open(model_file, 'rb') as f: 121 | state = pickle.load(f) 122 | 123 | # load corresonding version of table 124 | table = load_table(dataset, state['version']) 125 | 126 | # load model 127 | args = state['args'] 128 | model = state['model'] 129 | pg_est = Postgres(table, args.bins, state['seed']) 130 | estimator = LWTree(model, params['model'], pg_est, table) 131 | 132 | L.info(f"Load and built lw(tree) estimator: {estimator}") 133 | if params['use_cache']: 134 | # test table might has different version with train 135 | test_table = load_table(dataset, version) 136 | lw_dataset = load_lw_dataset(test_table, workload, state['seed'], args.bins) 137 | X, _, gt = lw_dataset['test'] 138 | run_test(dataset, version, workload, estimator, overwrite, lw_vec=(X, gt)) 139 | else: 140 | run_test(dataset, version, workload, estimator, overwrite) 141 | 142 | 143 | -------------------------------------------------------------------------------- /lecarb/estimator/lw/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | class LWNNLayer(nn.Module): 4 | def __init__(self, input_len, output_len): 5 | super().__init__() 6 | self.layer = nn.Sequential( 7 | nn.Linear(input_len, output_len), 8 | nn.ReLU(inplace=True), 9 | ) 10 | 11 | def forward(self, X): 12 | return self.layer(X) 13 | 14 | class LWNNModel(nn.Module): 15 | def __init__(self, input_len, hid_units): 16 | super().__init__() 17 | self.hid_units = hid_units 18 | 19 | self.hid_layers = nn.Sequential() 20 | for l, output_len in enumerate([int(u) for u in hid_units.split('_')]): 21 | self.hid_layers.add_module('layer_{}'.format(l), LWNNLayer(input_len, output_len)) 22 | input_len = output_len 23 | 24 | self.final = nn.Linear(input_len, 1) 25 | 26 | def forward(self, X): 27 | mid_out = 
self.hid_layers(X) 28 | pred = self.final(mid_out) 29 | 30 | return pred 31 | 32 | def name(self): 33 | return f"lwnn_hid{self.hid_units}" 34 | -------------------------------------------------------------------------------- /lecarb/estimator/mscn/README.md: -------------------------------------------------------------------------------- 1 | Paper: [Learned Cardinalities: Estimating Correlated Joins with Deep Learning](https://arxiv.org/pdf/1809.00677.pdf) 2 | Code Reference: [repo](https://github.com/andreaskipf/learnedcardinalities) 3 | -------------------------------------------------------------------------------- /lecarb/estimator/mscn/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | # Define model architecture 6 | # removed all join related components since we only do cardinality estimation on single table 7 | 8 | class SetConv(nn.Module): 9 | def __init__(self, sample_feats, predicate_feats, hid_units): 10 | super(SetConv, self).__init__() 11 | self.sample_feats = sample_feats 12 | self.hid_units = hid_units 13 | 14 | self.sample_mlp1 = nn.Linear(sample_feats, hid_units) 15 | self.sample_mlp2 = nn.Linear(hid_units, hid_units) 16 | self.predicate_mlp1 = nn.Linear(predicate_feats, hid_units) 17 | self.predicate_mlp2 = nn.Linear(hid_units, hid_units) 18 | self.out_mlp1 = nn.Linear(hid_units * 2, hid_units) 19 | self.out_mlp2 = nn.Linear(hid_units, 1) 20 | 21 | def forward(self, samples, predicates, sample_mask, predicate_mask): 22 | # samples has shape [batch_size x num_joins+1 x sample_feats] 23 | # predicates has shape [batch_size x num_predicates x predicate_feats] 24 | # joins has shape [batch_size x num_joins x join_feats] 25 | 26 | hid_sample = F.relu(self.sample_mlp1(samples)) 27 | hid_sample = F.relu(self.sample_mlp2(hid_sample)) 28 | hid_sample = hid_sample * sample_mask # Mask 29 | hid_sample = torch.sum(hid_sample, dim=1, keepdim=False) 30 | sample_norm = sample_mask.sum(1, keepdim=False) 31 | hid_sample = hid_sample / sample_norm # Calculate average only over non-masked parts 32 | 33 | hid_predicate = F.relu(self.predicate_mlp1(predicates)) 34 | hid_predicate = F.relu(self.predicate_mlp2(hid_predicate)) 35 | hid_predicate = hid_predicate * predicate_mask 36 | hid_predicate = torch.sum(hid_predicate, dim=1, keepdim=False) 37 | predicate_norm = predicate_mask.sum(1, keepdim=False) 38 | hid_predicate = hid_predicate / predicate_norm 39 | 40 | hid = torch.cat((hid_sample, hid_predicate), 1) 41 | hid = F.relu(self.out_mlp1(hid)) 42 | out = torch.sigmoid(self.out_mlp2(hid)) 43 | return out 44 | 45 | def name(self): 46 | return f"mscn_hid{self.hid_units}_sample{self.sample_feats}" 47 | -------------------------------------------------------------------------------- /lecarb/estimator/mysql.py: -------------------------------------------------------------------------------- 1 | import time 2 | import mysql.connector 3 | import logging 4 | from typing import Any, Dict 5 | import numpy as np 6 | 7 | from .estimator import Estimator 8 | from .utils import run_test 9 | from ..workload.workload import query_2_sql 10 | from ..dataset.dataset import load_table 11 | from ..constants import MYSQL_HOST, MYSQL_PORT, MYSQL_DB, MYSQL_USER, MYSQL_PSWD 12 | 13 | L = logging.getLogger(__name__) 14 | 15 | class MySQL(Estimator): 16 | def __init__(self, table, bucket, seed): 17 | super(MySQL, self).__init__(table=table, version=table.version, bucket=bucket, seed=seed) 18 | 
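# --- Editor's note (added commentary, not part of the original file) ---
# The constructor connects with the MYSQL_* constants imported above and then builds
# histograms on every column via ANALYZE TABLE ... UPDATE HISTOGRAM ... WITH <bucket> BUCKETS.
# At estimation time (query() below), EXPLAIN is run on the translated predicate and the
# cardinality is derived as round(0.01 * res[0][10] * row_num), where res[0][10] is
# presumably the "filtered" percentage column of MySQL's EXPLAIN output.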
19 | self.conn = mysql.connector.connect(user=MYSQL_USER, password=MYSQL_PSWD, host=MYSQL_HOST, port=MYSQL_PORT, database=MYSQL_DB) 20 | self.conn.autocommit = True 21 | self.cursor = self.conn.cursor() 22 | 23 | # construct statistics 24 | start_stmp = time.time() 25 | self.cursor.execute(f"analyze table `{self.table.name}` update histogram on " 26 | f"{','.join([c.name for c in table.columns.values()])} " 27 | f"with {bucket} buckets;") 28 | rows = self.cursor.fetchall() 29 | L.info(f"{rows}") 30 | dur_min = (time.time() - start_stmp) / 60 31 | 32 | L.info(f"construct statistics finished, using {dur_min:.4f} minutes") 33 | 34 | def query(self, query): 35 | sql = 'explain {}'.format(query_2_sql(query, self.table, aggregate=False, dbms='mysql')) 36 | # L.info('sql: {}'.format(sql)) 37 | 38 | start_stmp = time.time() 39 | self.cursor.execute(sql) 40 | dur_ms = (time.time() - start_stmp) * 1e3 41 | res = self.cursor.fetchall() 42 | assert len(res) == 1, res 43 | # test 1 44 | card = np.round(0.01 * res[0][10] * self.table.row_num) 45 | # test 2 46 | # card = np.round(0.01 * res[0][10] * res[0][9]) 47 | # L.info(card) 48 | return card, dur_ms 49 | 50 | def test_mysql(seed: int, dataset: str, version: str, workload:str, params: Dict[str, Any], overwrite: bool): 51 | """ 52 | params: 53 | version: the version of table that mysql construct statistics, might not be the same with the one we test on 54 | bucket: number of bucket for each histogram 55 | """ 56 | # prioriy: params['version'] (build statistics from another dataset) > version (build statistics on the same dataset) 57 | table = load_table(dataset, params.get('version') or version) 58 | 59 | L.info("construct mysql estimator...") 60 | estimator = MySQL(table, params['bucket'], seed=seed) 61 | L.info(f"built mysql estimator: {estimator}") 62 | 63 | run_test(dataset, version, workload, estimator, overwrite) 64 | 65 | 66 | -------------------------------------------------------------------------------- /lecarb/estimator/naru/README.md: -------------------------------------------------------------------------------- 1 | Paper: [Deep Unsupervised Cardinality Estimation](http://www.vldb.org/pvldb/vol13/p279-yang.pdf) 2 | Code Reference: [repo](https://github.com/naru-project/naru) 3 | -------------------------------------------------------------------------------- /lecarb/estimator/postgres.py: -------------------------------------------------------------------------------- 1 | import time 2 | import psycopg2 3 | import logging 4 | from typing import Any, Dict 5 | 6 | from .estimator import Estimator 7 | from .utils import run_test 8 | from ..workload.workload import query_2_sql 9 | from ..dataset.dataset import load_table 10 | from ..constants import DATABASE_URL 11 | 12 | L = logging.getLogger(__name__) 13 | 14 | class Postgres(Estimator): 15 | def __init__(self, table, stat_target, seed): 16 | super(Postgres, self).__init__(table=table, version=table.version, stat=stat_target, seed=seed) 17 | 18 | self.conn = psycopg2.connect(DATABASE_URL) 19 | self.conn.autocommit = True 20 | self.cursor = self.conn.cursor() 21 | 22 | # construct statistics 23 | start_stmp = time.time() 24 | self.cursor.execute('select setseed({});'.format(1 / seed)) 25 | for c in table.columns.values(): 26 | self.cursor.execute('alter table \"{}\" alter column {} set statistics {};'.format( 27 | table.name, c.name, stat_target)) 28 | self.cursor.execute('analyze \"{}\";'.format(self.table.name)) 29 | self.conn.commit() 30 | dur_min = (time.time() - start_stmp) / 60 31 | 
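# --- Editor's note (added commentary, not part of the original file) ---
# The loop above sets each column's statistics target to `stat_target` (PostgreSQL's
# default_statistics_target is 100, so larger values yield larger MCV lists and histograms)
# and ANALYZE rebuilds pg_stats accordingly; the block below measures how much space those
# statistics occupy by summing pg_column_size over the table's pg_stats rows.
#
# A minimal usage sketch (hypothetical dataset/version names, not from the original code):
#   table = load_table('census13', 'original')
#   est = Postgres(table, stat_target=10000, seed=123)
#   card, dur_ms = est.query(q)   # q is a lecarb workload Query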
32 | # get size 33 | self.cursor.execute('select sum(pg_column_size(pg_stats)) from pg_stats where tablename=\'{}\''.format(self.table.name)) 34 | size = self.cursor.fetchall()[0][0] 35 | # self.cursor.execute('select sum(pg_column_size(pg_stats_ext)) from pg_stats_ext where tablename=\'{}\''.format(self.table.name)) 36 | # res = self.cursor.fetchall()[0][0] 37 | # might not have content in ext table 38 | # if res is not None: 39 | # size += res 40 | size = size / 1024 / 1024 # MB 41 | 42 | L.info(f"construct statistics finished, using {dur_min:.4f} minutes, All statistics consumes {size:.2f} MBs") 43 | 44 | def query(self, query): 45 | sql = 'explain(format json) {}'.format(query_2_sql(query, self.table, aggregate=False)) 46 | # L.info('sql: {}'.format(sql)) 47 | 48 | start_stmp = time.time() 49 | self.cursor.execute(sql) 50 | dur_ms = (time.time() - start_stmp) * 1e3 51 | res = self.cursor.fetchall() 52 | card = res[0][0][0]['Plan']['Plan Rows'] 53 | # L.info(card) 54 | return card, dur_ms 55 | 56 | def query_sql(self, sql): 57 | sql = 'explain(format json) {}'.format(sql) 58 | # L.info('sql: {}'.format(sql)) 59 | 60 | start_stmp = time.time() 61 | self.cursor.execute(sql) 62 | res = self.cursor.fetchall() 63 | card = res[0][0][0]['Plan']['Plan Rows'] 64 | # L.info(card) 65 | dur_ms = (time.time() - start_stmp) * 1e3 66 | return card, dur_ms 67 | 68 | def test_postgres(seed: int, dataset: str, version: str, workload:str, params: Dict[str, Any], overwrite: bool): 69 | """ 70 | params: 71 | version: the version of table that postgres construct statistics, might not be the same with the one we test on 72 | stat_target: size of the statistics limit 73 | """ 74 | # prioriy: params['version'] (build statistics from another dataset) > version (build statistics on the same dataset) 75 | table = load_table(dataset, params.get('version') or version) 76 | 77 | L.info("construct postgres estimator...") 78 | estimator = Postgres(table, stat_target=params['stat_target'], seed=seed) 79 | L.info(f"built postgres estimator: {estimator}") 80 | 81 | run_test(dataset, version, workload, estimator, overwrite) 82 | 83 | 84 | -------------------------------------------------------------------------------- /lecarb/estimator/sample.py: -------------------------------------------------------------------------------- 1 | 2 | import time 3 | import logging 4 | from typing import Any, Dict 5 | import numpy as np 6 | from .estimator import Estimator, OPS 7 | from .utils import run_test 8 | from ..workload.workload import query_2_triple 9 | from ..dataset.dataset import load_table 10 | 11 | L = logging.getLogger(__name__) 12 | 13 | class Sampling(Estimator): 14 | def __init__(self, table, ratio, seed): 15 | super(Sampling, self).__init__(table=table, version=table.version, ratio=ratio, seed=seed) 16 | self.sample = table.data.sample(frac=ratio, random_state=seed) 17 | self.sample_num = len(self.sample) 18 | 19 | def query(self, query): 20 | columns, operators, values = query_2_triple(query, with_none=False, split_range=False) 21 | start_stmp = time.time() 22 | bitmap = np.ones(self.sample_num, dtype=bool) 23 | for c, o, v in zip(columns, operators, values): 24 | bitmap &= OPS[o](self.sample[c], v) 25 | card = np.round((self.table.row_num / self.sample_num) * bitmap.sum()) 26 | dur_ms = (time.time() - start_stmp) * 1e3 27 | return card, dur_ms 28 | 29 | def test_sample(seed: int, dataset: str, version: str, workload: str, params: Dict[str, Any], overwrite: bool) -> None: 30 | """ 31 | params: 32 | version: the version 
of table that the sample draw from, might not be the same with the one we test on 33 | ratio: the ratio of the sample 34 | """ 35 | # prioriy: params['version'] (draw sample from another dataset) > version (draw and test on the same dataset) 36 | table = load_table(dataset, params.get('version') or version) 37 | 38 | L.info("construct sampling estimator...") 39 | estimator = Sampling(table, ratio=params['ratio'] or 0.01, seed=seed) 40 | L.info(f"built sampling estimator: {estimator}") 41 | 42 | run_test(dataset, version, workload, estimator, overwrite) 43 | 44 | 45 | -------------------------------------------------------------------------------- /lecarb/estimator/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import ray 3 | import logging 4 | import numpy as np 5 | import pandas as pd 6 | import torch 7 | from scipy.stats.mstats import gmean 8 | 9 | # from .lw.lw_nn import LWNN 10 | # from .lw.lw_tree import LWTree 11 | from .estimator import Estimator 12 | from ..constants import NUM_THREADS, RESULT_ROOT 13 | from ..workload.workload import load_queryset, load_labels 14 | from ..dataset.dataset import load_table 15 | 16 | L = logging.getLogger(__name__) 17 | 18 | def report_model(model, blacklist=None): 19 | ps = [] 20 | for name, p in model.named_parameters(): 21 | if blacklist is None or blacklist not in name: 22 | ps.append(np.prod(p.size())) 23 | num_params = sum(ps) 24 | mb = num_params * 4 / 1024 / 1024 25 | L.info(f'Number of model parameters: {num_params} (~= {mb:.2f}MB)') 26 | L.info(model) 27 | return mb 28 | 29 | def qerror(est_card, card): 30 | if est_card == 0 and card == 0: 31 | return 1.0 32 | if est_card == 0: 33 | return card 34 | if card == 0: 35 | return est_card 36 | if est_card > card: 37 | return est_card / card 38 | else: 39 | return card / est_card 40 | 41 | def rmserror(preds, labels, total_rows): 42 | return np.sqrt(np.mean(np.square(preds/total_rows-labels/total_rows))) 43 | 44 | def evaluate(preds, labels, total_rows=-1): 45 | errors = [] 46 | for i in range(len(preds)): 47 | errors.append(qerror(float(preds[i]), float(labels[i]))) 48 | 49 | metrics = { 50 | 'max': np.max(errors), 51 | '99th': np.percentile(errors, 99), 52 | '95th': np.percentile(errors, 95), 53 | '90th': np.percentile(errors, 90), 54 | 'median': np.median(errors), 55 | 'mean': np.mean(errors), 56 | 'gmean': gmean(errors) 57 | } 58 | 59 | if total_rows > 0: 60 | metrics['rms'] = rmserror(preds, labels, total_rows) 61 | L.info(f"{metrics}") 62 | return np.array(errors), metrics 63 | 64 | def evaluate_errors(errors): 65 | metrics = { 66 | 'max': np.max(errors), 67 | '99th': np.percentile(errors, 99), 68 | '95th': np.percentile(errors, 95), 69 | '90th': np.percentile(errors, 90), 70 | 'median': np.median(errors), 71 | 'mean': np.mean(errors), 72 | 'gmean': gmean(errors) 73 | } 74 | L.info(f"{metrics}") 75 | return metrics 76 | 77 | def report_errors(dataset, result_file): 78 | df = pd.read_csv(RESULT_ROOT / dataset / result_file) 79 | evaluate_errors(df['error']) 80 | 81 | def report_dynamic_errors(dataset, old_new_file, new_new_file, max_t, current_t): 82 | ''' 83 | max_t: Time limit for update 84 | current_t: Model's update time. 
85 | old_new_path: Result file of applying stale model on new workload 86 | new_new_path: Result file of applying updated model on new workload 87 | ''' 88 | old_new_path = RESULT_ROOT / dataset / old_new_file 89 | new_new_path = RESULT_ROOT / dataset / new_new_file 90 | if max_t > current_t: 91 | try: 92 | o_n = pd.read_csv(old_new_path) 93 | n_n = pd.read_csv(new_new_path) 94 | assert len(o_n) == len(n_n), "In current version, the workload test size should be same." 95 | o_n_s = o_n.sample(frac = current_t / max_t) 96 | n_n_s = n_n.sample(frac = 1 - current_t / max_t) 97 | mixed_df = pd.concat([o_n_s, n_n_s], ignore_index=True, sort=False) 98 | return evaluate_errors(mixed_df['error']) 99 | except OSError: 100 | print('Cannot open file.') 101 | return -1 102 | 103 | def lazy_derive(origin_result_file, result_file, r, labels): 104 | L.info("Already have the original result, directly derive the new prediction!") 105 | df = pd.read_csv(origin_result_file) 106 | with open(result_file, 'w') as f: 107 | writer = csv.writer(f) 108 | writer.writerow(['id', 'error', 'predict', 'label', 'dur_ms']) 109 | for index, row in df.iterrows(): 110 | p = np.round(row['predict'] * r) 111 | l = labels[index].cardinality 112 | writer.writerow([int(row['id']), qerror(p, l), p, l, row['dur_ms']]) 113 | L.info("Done infering all predictions from previous result") 114 | 115 | def run_test(dataset: str, version: str, workload: str, estimator: Estimator, overwrite: bool, lazy: bool=True, lw_vec=None, query_async=False) -> None: 116 | # for inference speed. 117 | torch.backends.cudnn.deterministic = False 118 | torch.backends.cudnn.benchmark = True 119 | 120 | # uniform thread number 121 | torch.set_num_threads(NUM_THREADS) 122 | assert NUM_THREADS == torch.get_num_threads(), torch.get_num_threads() 123 | L.info(f"torch threads: {torch.get_num_threads()}") 124 | 125 | L.info(f"Start loading queryset:{workload} and labels for version {version} of dataset {dataset}...") 126 | # only keep test queries 127 | queries = load_queryset(dataset, workload)['test'] 128 | labels = load_labels(dataset, version, workload)['test'] 129 | 130 | if lw_vec is not None: 131 | X, gt = lw_vec 132 | # assert isinstance(estimator, LWNN) or isinstance(estimator, LWTree), estimator 133 | assert len(X) == len(queries), len(X) 134 | assert np.array_equal(np.array([l.cardinality for l in labels]), gt) 135 | L.info("Hack for LW's method, use processed vector instead of raw query") 136 | queries = X 137 | 138 | # prepare file path, do not proceed if result already exists 139 | result_path = RESULT_ROOT / f"{dataset}" 140 | result_path.mkdir(parents=True, exist_ok=True) 141 | result_file = result_path / f"{version}-{workload}-{estimator}.csv" 142 | if not overwrite and result_file.is_file(): 143 | L.info(f"Already have the result {result_file}, do not run again!") 144 | exit(0) 145 | 146 | r = 1.0 147 | if version != estimator.table.version: 148 | test_row = load_table(dataset, version).row_num 149 | r = test_row / estimator.table.row_num 150 | L.info(f"Testing on a different data version, need to adjust the prediction according to the row number ratio {r} = {test_row} / {estimator.table.row_num}!") 151 | 152 | origin_result_file = RESULT_ROOT / dataset / f"{estimator.table.version}-{workload}-{estimator}.csv" 153 | if lazy and origin_result_file.is_file(): 154 | return lazy_derive(origin_result_file, result_file, r, labels) 155 | 156 | if query_async: 157 | L.info("Start test estimator asynchronously...") 158 | for i, query in enumerate(queries): 
159 | estimator.query_async(query, i) 160 | 161 | L.info('Waiting for queries to finish...') 162 | stats = ray.get([w.get_stats.remote() for w in estimator.workers]) 163 | 164 | errors = [] 165 | latencys = [] 166 | with open(result_file, 'w') as f: 167 | writer = csv.writer(f) 168 | writer.writerow(['id', 'error', 'predict', 'label', 'dur_ms']) 169 | for i, label in enumerate(labels): 170 | r = stats[i%estimator.num_workers][i//estimator.num_workers] 171 | assert i == r.i, r 172 | error = qerror(r.est_card, label.cardinality) 173 | errors.append(error) 174 | latencys.append(r.dur_ms) 175 | writer.writerow([i, error, r.est_card, label.cardinality, r.dur_ms]) 176 | 177 | L.info(f"Test finished, {np.mean(latencys)} ms/query in average") 178 | evaluate_errors(errors) 179 | return 180 | 181 | L.info("Start test estimator on test queries...") 182 | errors = [] 183 | latencys = [] 184 | with open(result_file, 'w') as f: 185 | writer = csv.writer(f) 186 | writer.writerow(['id', 'error', 'predict', 'label', 'dur_ms']) 187 | for i, data in enumerate(zip(queries, labels)): 188 | query, label = data 189 | est_card, dur_ms = estimator.query(query) 190 | est_card = np.round(r * est_card) 191 | error = qerror(est_card, label.cardinality) 192 | errors.append(error) 193 | latencys.append(dur_ms) 194 | writer.writerow([i, error, est_card, label.cardinality, dur_ms]) 195 | if (i+1) % 1000 == 0: 196 | L.info(f"{i+1} queries finished") 197 | L.info(f"Test finished, {np.mean(latencys)} ms/query in average") 198 | evaluate_errors(errors) 199 | 200 | 201 | -------------------------------------------------------------------------------- /lecarb/workload/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sfu-db/AreCELearnedYet/aa52da7768023270bad884232972e0b77ec6534a/lecarb/workload/__init__.py -------------------------------------------------------------------------------- /lecarb/workload/dump_quicksel.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import logging 3 | from pathlib import Path 4 | from typing import Dict, Any 5 | import numpy as np 6 | 7 | from .workload import load_queryset, load_labels, query_2_quicksel_vector, new_query 8 | from ..dtypes import is_discrete, is_categorical 9 | from ..dataset.dataset import load_table 10 | from ..estimator.estimator import Oracle 11 | from ..constants import DATA_ROOT 12 | 13 | L = logging.getLogger(__name__) 14 | 15 | def dump_quicksel_query_files(dataset: str, version: str, workload: str, overwrite: bool) -> None: 16 | result_path = DATA_ROOT / dataset / "quicksel" 17 | result_path.mkdir(exist_ok=True) 18 | if not overwrite and Path(result_path / f"{workload}-{version}-train.csv").is_file() and Path(result_path / f"{workload}-{version}-test.csv").is_file(): 19 | L.info("Already has quicksel workload file dumped, do not continue") 20 | return 21 | 22 | table = load_table(dataset, version) 23 | queryset = load_queryset(dataset, workload) 24 | labels = load_labels(dataset, version, workload) 25 | 26 | discrete_cols = set() 27 | for col_name, col in table.columns.items(): 28 | # hard code for power dataset since all these columns are actually integers 29 | if dataset[:5] == 'power' and col_name in ['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']: 30 | discrete_cols.add(col_name) 31 | continue 32 | if is_discrete(col.dtype): 33 | discrete_cols.add(col_name) 34 | L.info(f"Detect discrete columns: {discrete_cols}") 35 | 36 | for 
group in ('train', 'test'): 37 | L.info(f"Start dump {workload} for {dataset}-{version}") 38 | result_file = result_path / f"{workload}-{version}-{group}.csv" 39 | with open(result_file, 'w') as f: 40 | writer = csv.writer(f) 41 | for query, label in zip(queryset[group], labels[group]): 42 | vec = query_2_quicksel_vector(query, table, discrete_cols).tolist() 43 | vec.append(label.selectivity) 44 | writer.writerow(vec) 45 | L.info(f"File dumped to {result_file}") 46 | 47 | def generate_quicksel_permanent_assertions(dataset: str, version: str, params: Dict[str, Dict[str, Any]], overwrite: bool) -> None: 48 | result_path = DATA_ROOT / dataset / "quicksel" 49 | result_path.mkdir(exist_ok=True) 50 | result_file = result_path / f"{version}-permanent.csv" 51 | if not overwrite and result_file.is_file(): 52 | L.info("Already has permanent assertions generated, do not continue") 53 | return 54 | 55 | count = params['count']+1 56 | 57 | table = load_table(dataset, version) 58 | oracle = Oracle(table) 59 | 60 | discrete_cols = set() 61 | for col_name, col in table.columns.items(): 62 | # hard code for power dataset since all these columns are actually integers 63 | if dataset[:5] == 'power' and col_name in ['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']: 64 | discrete_cols.add(col_name) 65 | continue 66 | if is_discrete(col.dtype): 67 | discrete_cols.add(col_name) 68 | L.info(f"Detect discrete columns: {discrete_cols}") 69 | 70 | with open(result_file, 'w') as f: 71 | writer = csv.writer(f) 72 | writer.writerow([0.0, 1.0] * table.col_num + [1.0]) 73 | for col_id, col in enumerate(table.columns.values()): 74 | L.info(f"Start generate permanent queries on column {col.name}") 75 | # hard code for power dataset since all these columns are actually integers 76 | if is_discrete(col.dtype) or (dataset[:5] == 'power' and col.name in ['Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3']): 77 | if is_categorical(col.dtype): 78 | L.info("Categorical column") 79 | if col.vocab_size <= count: 80 | for i in range(col.vocab_size): 81 | query = new_query(table, ncols=1) 82 | query.predicates[col.name] = ('=', col.vocab[i]) 83 | card, _ = oracle.query(query) 84 | # vec = query_2_quicksel_vector(query, table, discrete_cols).tolist() 85 | # vec.append(card/table.row_num) 86 | vec = [0.0, 1.0] * table.col_num 87 | vec.append(card/table.row_num) 88 | vec[col_id*2] = i/col.vocab_size 89 | vec[col_id*2+1] = (i+1)/col.vocab_size 90 | writer.writerow(vec) 91 | L.info(f"# {i}: {query.predicates[col.name]}, card={card}\n\t{vec}") 92 | else: 93 | minval = 0 94 | maxval = col.vocab_size 95 | norm_range = np.linspace(0.0, 1.0, count, dtype=np.float32) 96 | prange = minval + (maxval - minval) * norm_range 97 | for i in range(len(prange)-1): 98 | val0 = col.vocab[np.ceil(prange[i]).astype(int)] 99 | val1 = col.vocab[np.ceil(prange[i+1]).astype(int)-1] 100 | assert np.greater_equal(np.array(val1).astype(object), val0), (val1, val0) 101 | query = new_query(table, ncols=1) 102 | query.predicates[col.name] = ('[]', (val0, val1)) 103 | card, _ = oracle.query(query) 104 | # vec = query_2_quicksel_vector(query, table, discrete_cols).tolist() 105 | # vec.append(card/table.row_num) 106 | 107 | vec = [0.0, 1.0] * table.col_num 108 | vec.append(card/table.row_num) 109 | vec[col_id*2] = norm_range[i] 110 | vec[col_id*2+1] = norm_range[i+1] 111 | writer.writerow(vec) 112 | L.info(f"# {i}: {query.predicates[col.name]}, card={card}\n\t{vec}") 113 | else: 114 | L.info("Integer column") 115 | minval = col.minval 116 | maxval = col.maxval + 
1 117 | norm_range = np.linspace(0.0, 1.0, count, dtype=np.float32) 118 | prange = minval + (maxval - minval) * norm_range 119 | for i in range(len(prange)-1): 120 | val0 = np.ceil(prange[i]) 121 | val1 = np.ceil(prange[i+1])-1 122 | assert val1 >= val0, (val0, val1) 123 | query = new_query(table, ncols=1) 124 | query.predicates[col.name] = ('[]', (val0, val1)) 125 | card, _ = oracle.query(query) 126 | # vec = query_2_quicksel_vector(query, table, discrete_cols).tolist() 127 | # vec.append(card/table.row_num) 128 | 129 | vec = [0.0, 1.0] * table.col_num 130 | vec.append(card/table.row_num) 131 | vec[col_id*2] = norm_range[i] 132 | vec[col_id*2+1] = norm_range[i+1] 133 | writer.writerow(vec) 134 | L.info(f"# {i}: {query.predicates[col.name]}, card={card}\n\t{vec}") 135 | else: 136 | L.info("Real-value column") 137 | norm_range = np.linspace(0.0, 1.0, count, dtype=np.float32) 138 | prange = col.minval + (col.maxval - col.minval) * norm_range 139 | for i in range(len(prange)-1): 140 | query = new_query(table, ncols=1) 141 | query.predicates[col.name] = ('[]', (prange[i], prange[i+1])) 142 | card, _ = oracle.query(query) 143 | # vec = query_2_quicksel_vector(query, table, discrete_cols).tolist() 144 | # vec.append(card/table.row_num) 145 | vec = [0.0, 1.0] * table.col_num 146 | vec.append(card/table.row_num) 147 | vec[col_id*2] = norm_range[i] 148 | vec[col_id*2+1] = norm_range[i+1] 149 | writer.writerow(vec) 150 | L.info(f"# {i}: {query.predicates[col.name]}, card={card}\n\t{vec}") 151 | -------------------------------------------------------------------------------- /lecarb/workload/gen_label.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Dict 3 | 4 | from .workload import Label, Query, load_queryset, dump_labels 5 | from ..estimator.estimator import Oracle 6 | from ..estimator.sample import Sampling 7 | from ..dataset.dataset import Table, load_table 8 | 9 | L = logging.getLogger(__name__) 10 | 11 | def generate_labels_for_queries(table: Table, queryset: Dict[str, List[Query]]) -> Dict[str, List[Label]]: 12 | oracle = Oracle(table) 13 | labels = {} 14 | for group, queries in queryset.items(): 15 | l = [] 16 | for i, q in enumerate(queries): 17 | card, _ = oracle.query(q) 18 | l.append(Label(cardinality=card, selectivity=card/table.row_num)) 19 | if (i+1) % 1000 == 0: 20 | L.info(f"{i+1} labels generated for {group}") 21 | labels[group] = l 22 | 23 | return labels 24 | 25 | def generate_labels(dataset: str, version: str, workload: str) -> None: 26 | 27 | L.info("Load table...") 28 | table = load_table(dataset, version) 29 | 30 | L.info("Load queryset from disk...") 31 | queryset = load_queryset(dataset, workload) 32 | 33 | L.info("Start generate ground truth labels for the workload...") 34 | labels = generate_labels_for_queries(table, queryset) 35 | 36 | L.info("Dump labels to disk...") 37 | dump_labels(dataset, version, workload, labels) 38 | 39 | def update_labels_for_queries(table: Table, queryset: Dict[str, List[Query]], seed: int, sampling_ratio: float=0.05) -> Dict[str, List[Label]]: 40 | sample_ester = Sampling(table, sampling_ratio, seed) 41 | labels = {} 42 | for group, queries in queryset.items(): 43 | l = [] 44 | for i, q in enumerate(queries): 45 | card, _ = sample_ester.query(q) 46 | l.append(Label(cardinality=card, selectivity=card/table.row_num)) 47 | if (i+1) % 1000 == 0: 48 | L.info(f"{i+1} labels generated for {group}") 49 | labels[group] = l 50 | return labels 51 | 52 | def 
update_labels(seed: int, dataset: str, version: str, workload: str, sampling_ratio: float=0.05) -> None: 53 | 54 | L.info("Load table...") 55 | table = load_table(dataset, version) 56 | 57 | L.info("Load queryset from disk...") 58 | queryset = load_queryset(dataset, workload) 59 | 60 | L.info("Updating ground truth labels for the workload, with sample size {}...".format(sampling_ratio)) 61 | labels = update_labels_for_queries(table, queryset, seed, sampling_ratio) 62 | 63 | L.info("Dump labels to disk...") 64 | dump_labels(dataset, version, workload, labels) 65 | 66 | -------------------------------------------------------------------------------- /lecarb/workload/gen_workload.py: -------------------------------------------------------------------------------- 1 | import random 2 | import logging 3 | import numpy as np 4 | from typing import Dict, Any 5 | import copy 6 | 7 | from . import generator 8 | from .generator import QueryGenerator 9 | from .gen_label import generate_labels_for_queries 10 | from .workload import dump_queryset, dump_labels 11 | from ..dataset.dataset import load_table 12 | 13 | L = logging.getLogger(__name__) 14 | 15 | def get_focused_table(table, ref_table, win_ratio): 16 | focused_table = copy.deepcopy(table) 17 | win_size = int(win_ratio * len(ref_table.data)) 18 | focused_table.data = focused_table.data.tail(win_size).reset_index(drop=True) 19 | focused_table.parse_columns() 20 | return focused_table 21 | 22 | def generate_workload( 23 | seed: int, dataset: str, version: str, 24 | name: str, no_label: bool, old_version: str, win_ratio: str, 25 | params: Dict[str, Dict[str, Any]] 26 | ) -> None: 27 | 28 | random.seed(seed) 29 | np.random.seed(seed) 30 | 31 | attr_funcs = {getattr(generator, f"asf_{a}"): v for a, v in params['attr'].items()} 32 | center_funcs = {getattr(generator, f"csf_{c}"): v for c, v in params['center'].items()} 33 | width_funcs = {getattr(generator, f"wsf_{w}"): v for w, v in params['width'].items()} 34 | 35 | L.info("Load table...") 36 | table = load_table(dataset, version) 37 | if old_version and win_ratio: 38 | L.info(f"According to {old_version}, generate queries for updated data in {version}...") 39 | win_ratio = float(win_ratio) 40 | assert 0 List[str]: ... 19 | 20 | def asf_pred_number(table: Table, params: Dict[str, Any]) -> List[str]: 21 | if 'whitelist' in params: 22 | attr_domain = params['whitelist'] 23 | else: 24 | blacklist = params.get('blacklist') or [] 25 | attr_domain = [c for c in list(table.data.columns) if c not in blacklist] 26 | nums = params.get('nums') 27 | nums = nums or range(1, len(attr_domain)+1) 28 | num_pred = np.random.choice(nums) 29 | assert num_pred <= len(attr_domain) 30 | return np.random.choice(attr_domain, size=num_pred, replace=False) 31 | 32 | def asf_comb(table: Table, params: Dict[str, Any]) -> List[str]: 33 | assert 'comb' in params and type(params['comb']) == list, params 34 | for c in params['comb']: 35 | assert c in table.columns, c 36 | return params['comb'] 37 | 38 | def asf_naru(table: Table, params: Dict[str, Any]) -> List[str]: 39 | num_filters = np.random.randint(5, 12) 40 | return np.random.choice(table.data.columns, size=num_filters, replace=False) 41 | 42 | """====== Center Selection Functions ======""" 43 | 44 | class CenterSelFunc(Protocol): 45 | def __call__(self, table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: ... 
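# --- Illustrative sketch (not part of the original module) --------------------------
# Attribute, center, and width selectors are plain callables matching the protocols in
# this file; gen_workload.py resolves them by name using the asf_/csf_/wsf_ prefixes
# (e.g. getattr(generator, f"csf_{name}")). A hypothetical custom center selector that
# always centers predicates on each (numeric) column's minimum value could look like:
#
#     def csf_minval_example(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]:
#         return [table.columns[a].minval for a in attrs]
#
# and would then be referenced from the workload params as center: {'minval_example': 1.0}.
# -------------------------------------------------------------------------------------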
46 | 47 | DOMAIN_CACHE = {} 48 | # This domain version makes sure that query's cardinality > 0 49 | def csf_domain(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: 50 | global DOMAIN_CACHE 51 | key = tuple(sorted(attrs)) 52 | if key not in DOMAIN_CACHE: 53 | data_from = params.get('data_from') or 0 54 | DOMAIN_CACHE[key] = table.data[data_from:][attrs].drop_duplicates().index 55 | assert len(DOMAIN_CACHE[key]) > 0, key 56 | # L.debug(f'Cache size: {len(DOMAIN_CACHE)}') 57 | row_id = np.random.choice(DOMAIN_CACHE[key]) 58 | return [table.data.at[row_id, a] for a in attrs] 59 | 60 | ROW_CACHE = None 61 | GLOBAL_COUNTER = 1000 62 | def csf_distribution(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: 63 | global GLOBAL_COUNTER 64 | global ROW_CACHE 65 | if GLOBAL_COUNTER >= 1000: 66 | data_from = params.get('data_from') or 0 67 | ROW_CACHE = np.random.choice(range(data_from, len(table.data)), size=1000) 68 | GLOBAL_COUNTER = 0 69 | row_id = ROW_CACHE[GLOBAL_COUNTER] 70 | GLOBAL_COUNTER += 1 71 | # data_from = params.get('data_from') or 0 72 | # row_id = np.random.choice(range(data_from, len(table.data))) 73 | return [table.data.at[row_id, a] for a in attrs] 74 | 75 | def csf_ood(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: 76 | row_ids = np.random.choice(len(table.data), len(attrs)) 77 | return [table.data.at[i, a] for i, a in zip(row_ids, attrs)] 78 | 79 | def csf_vocab_ood(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: 80 | centers = [] 81 | for a in attrs: 82 | col = table.columns[a] 83 | centers.append(np.random.choice(col.vocab)) 84 | return centers 85 | 86 | def csf_domain_ood(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: 87 | centers = [] 88 | for a in attrs: 89 | col = table.columns[a] 90 | if is_categorical(col.dtype): # randomly pick one point from domain for categorical 91 | centers.append(np.random.choice(col.vocab)) 92 | else: # uniformly pick one point from domain for numerical 93 | centers.append(random.uniform(col.minval, col.maxval)) 94 | return centers 95 | 96 | def csf_naru(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: 97 | row_id = np.random.randint(0, len(table.data)) 98 | return [table.data.at[row_id, a] for a in attrs] 99 | 100 | def csf_naru_ood(table: Table, attrs: List[str], params: Dict[str, Any]) -> List[Any]: 101 | row_ids = np.random.choice(len(table.data), len(attrs)) 102 | return [table.data.at[i, a] for i, a in zip(row_ids, attrs)] 103 | 104 | """====== Width Selection Functions ======""" 105 | 106 | class WidthSelFunc(Protocol): 107 | def __call__(self, table: Table, attrs: List[str], centers: List[Any], params: Dict[str, Any]) -> Query: ... 
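# Note on parse_range below (illustrative, assuming a numeric column with minval=0 and
# maxval=100): a center/width pair that spills past the domain boundary is clamped to a
# one-sided predicate, e.g. parse_range(col, -10, 30) -> ('<=', 30) and
# parse_range(col, 50, 120) -> ('>=', 50), while a range fully inside the domain stays
# two-sided: parse_range(col, 20, 80) -> ('[]', (20, 80)). Because the None shortcut is
# commented out, a range covering the whole domain currently collapses to ('<=', right).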
108 | 109 | def parse_range(col: Column, left: Any, right: Any) -> Optional[Tuple[str, Any]]: 110 | # if left <= col.minval and right >= col.maxval: 111 | # return None 112 | # if left == right: 113 | # return ('=', left) 114 | if left <= col.minval: 115 | return ('<=', right) 116 | if right >= col.maxval: 117 | return ('>=', left) 118 | return ('[]', (left, right)) 119 | 120 | def wsf_uniform(table: Table, attrs: List[str], centers: List[Any], params: Dict[str, Any]) -> Query: 121 | query = new_query(table, ncols=len(attrs)) 122 | for a, c in zip(attrs, centers): 123 | # NaN/NaT literal can only be assigned to = operator 124 | if pd.isnull(c) or is_categorical(table.columns[a].dtype): 125 | query.predicates[a] = ('=', c) 126 | continue 127 | col = table.columns[a] 128 | width = random.uniform(0, col.maxval-col.minval) 129 | query.predicates[a] = parse_range(col, c-width/2, c+width/2) 130 | return query 131 | 132 | def wsf_exponential(table: Table, attrs: List[str], centers: List[Any], params: Dict[str, Any]) -> Query: 133 | query = new_query(table, ncols=len(attrs)) 134 | for a, c in zip(attrs, centers): 135 | # NaN/NaT literal can only be assigned to = operator 136 | if pd.isnull(c) or is_categorical(table.columns[a].dtype): 137 | query.predicates[a] = ('=', c) 138 | continue 139 | col = table.columns[a] 140 | lmd = 1 / ((col.maxval - col.minval) / 10) 141 | width = random.expovariate(lmd) 142 | query.predicates[a] = parse_range(col, c-width/2, c+width/2) 143 | return query 144 | 145 | def wsf_naru(table: Table, attrs: List[str], centers: List[Any], params: Dict[str, Any]) -> Query: 146 | query = new_query(table, ncols=len(attrs)) 147 | ops = np.random.choice(['>=', '<=', '='], size=len(attrs)) 148 | for a, c, o in zip(attrs, centers, ops): 149 | if table.columns[a].vocab_size >= 10: 150 | query.predicates[a] = (o, c) 151 | else: 152 | query.predicates[a] = ('=', c) 153 | return query 154 | 155 | def wsf_equal(table: Table, attrs: List[str], centers: List[Any], params: Dict[str, Any]) -> Query: 156 | query = new_query(table, ncols=len(attrs)) 157 | for a, c in zip(attrs, centers): 158 | query.predicates[a] = ('=', c) 159 | return query 160 | 161 | class QueryGenerator(object): 162 | table: Table 163 | attr: Dict[AttributeSelFunc, float] 164 | center: Dict[CenterSelFunc, float] 165 | width: Dict[WidthSelFunc, float] 166 | attr_params: Dict[str, Any] 167 | center_params: Dict[str, Any] 168 | width_params: Dict[str, Any] 169 | 170 | def __init__( 171 | self, table: Table, 172 | attr: Dict[AttributeSelFunc, float], 173 | center: Dict[CenterSelFunc, float], 174 | width: Dict[WidthSelFunc, float], 175 | attr_params: Dict[str, Any], 176 | center_params: Dict[str, Any], 177 | width_params: Dict[str, Any] 178 | ) -> None: 179 | self.table = table 180 | self.attr = attr 181 | self.center = center 182 | self.width = width 183 | self.attr_params = attr_params 184 | self.center_params = center_params 185 | self.width_params = width_params 186 | 187 | def generate(self) -> Query: 188 | attr_func = np.random.choice(list(self.attr.keys()), p=list(self.attr.values())) 189 | # L.info(f'start generate attr {attr_func.__name__}') 190 | attr_lst = attr_func(self.table, self.attr_params) 191 | 192 | center_func = np.random.choice(list(self.center.keys()), p=list(self.center.values())) 193 | # L.info(f'start generate center points {center_func.__name__}') 194 | center_lst = center_func(self.table, attr_lst, self.center_params) 195 | 196 | width_func = np.random.choice(list(self.width.keys()), 
p=list(self.width.values())) 197 | # L.info(f'start generate widths {width_func.__name__}') 198 | return width_func(self.table, attr_lst, center_lst, self.width_params) 199 | -------------------------------------------------------------------------------- /lecarb/workload/merge_workload.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .workload import load_queryset, load_labels, dump_queryset, dump_labels 3 | 4 | L = logging.getLogger(__name__) 5 | 6 | def merge_workload(dataset: str, version: str, workload: str, count: int=10) -> None: 7 | queryset = {'train': [], 'valid': [], 'test': []} 8 | labels = {'train': [], 'valid': [], 'test': []} 9 | 10 | for i in range(count): 11 | L.info(f"Merge queryset {workload}_{i}...") 12 | qs = load_queryset(dataset, f"{workload}_{i}") 13 | ls = load_labels(dataset, version, f"{workload}_{i}") 14 | for k in queryset.keys(): 15 | # print(f"{k}: {ls[k][0]}") 16 | queryset[k] += qs[k] 17 | labels[k] += ls[k] 18 | 19 | for k in queryset.keys(): 20 | L.info(f"Final queryset has {len(queryset[k])} queries with {len(labels[k])} labels") 21 | 22 | L.info("Dump queryset and labels...") 23 | dump_queryset(dataset, workload, queryset) 24 | dump_labels(dataset, version, workload, labels) 25 | L.info(f"Done, run: rm data/{dataset}/workload/{workload}_[0-9]* to remove temporary files") 26 | -------------------------------------------------------------------------------- /lecarb/workload/workload.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from collections import OrderedDict 3 | from typing import Dict, NamedTuple, Optional, Tuple, List, Any 4 | import pickle 5 | import numpy as np 6 | 7 | from ..dtypes import is_categorical 8 | from ..constants import DATA_ROOT, PKL_PROTO 9 | from ..dataset.dataset import Table, load_table 10 | 11 | class Query(NamedTuple): 12 | """predicates on each attribute are conjunctive""" 13 | predicates: Dict[str, Optional[Tuple[str, Any]]] 14 | ncols: int 15 | 16 | class Label(NamedTuple): 17 | cardinality: int 18 | selectivity: float 19 | 20 | def new_query(table: Table, ncols) -> Query: 21 | return Query(predicates=OrderedDict.fromkeys(table.data.columns, None), 22 | ncols=ncols) 23 | 24 | def query_2_triple(query: Query, with_none: bool=True, split_range: bool=False 25 | ) -> Tuple[List[str], List[str], List[Any]]: 26 | """return 3 lists with the same length: cols (column names), ops (predicate operators), vals (predicate literals)""" 27 | cols = [] 28 | ops = [] 29 | vals = [] 30 | for c, p in query.predicates.items(): 31 | if p is not None: 32 | if split_range is True and p[0] == '[]': 33 | cols.append(c) 34 | ops.append('>=') 35 | vals.append(p[1][0]) 36 | cols.append(c) 37 | ops.append('<=') 38 | vals.append(p[1][1]) 39 | else: 40 | cols.append(c) 41 | ops.append(p[0]) 42 | vals.append(p[1]) 43 | elif with_none: 44 | cols.append(c) 45 | ops.append(None) 46 | vals.append(None) 47 | return cols, ops, vals 48 | 49 | def query_2_sql(query: Query, table: Table, aggregate=True, split=False, dbms='postgres'): 50 | preds = [] 51 | for col, pred in query.predicates.items(): 52 | if pred is None: 53 | continue 54 | op, val = pred 55 | if is_categorical(table.data[col].dtype): 56 | val = f"\'{val}\'" if not isinstance(val, tuple) else tuple(f"\'{v}\'" for v in val) 57 | if op == '[]': 58 | if split: 59 | preds.append(f"{col} >= {val[0]}") 60 | preds.append(f"{col} <= {val[1]}") 61 | else: 62 | preds.append(f"({col} between {val[0]}
and {val[1]})") 63 | else: 64 | preds.append(f"{col} {op} {val}") 65 | 66 | if dbms == 'mysql': 67 | return f"SELECT {'COUNT(*)' if aggregate else '*'} FROM `{table.name}` WHERE {' AND '.join(preds)}" 68 | return f"SELECT {'COUNT(*)' if aggregate else '*'} FROM \"{table.name}\" WHERE {' AND '.join(preds)}" 69 | 70 | def query_2_kde_sql(query: Query, table: Table): 71 | preds = [] 72 | for col, pred in query.predicates.items(): 73 | if pred is None: 74 | continue 75 | op, val = pred 76 | if is_categorical(table.data[col].dtype): 77 | assert op =='=' and not isinstance(val, tuple), val 78 | val = table.columns[col].discretize(val).item() 79 | if op == '[]': 80 | preds.append(f"{col} >= {val[0]}") 81 | preds.append(f"{col} <= {val[1]}") 82 | else: 83 | preds.append(f"{col} {op} {val}") 84 | 85 | return f"SELECT * FROM \"{table.name}\" WHERE {' AND '.join(preds)}" 86 | 87 | def query_2_deepdb_sql(query: Query, table: Table, aggregate=True, split=False): 88 | preds = [] 89 | for col, pred in query.predicates.items(): 90 | if pred is None: 91 | continue 92 | op, val = pred 93 | if op == '[]': 94 | val = table.columns[col].normalize(list(val)) 95 | assert len(val) == 2, val 96 | if split: 97 | preds.append(f"{col} >= {val[0]}") 98 | preds.append(f"{col} <= {val[1]}") 99 | else: 100 | preds.append(f"({col} between {val[0]} and {val[1]})") 101 | else: 102 | val = table.columns[col].normalize(val).item() 103 | preds.append(f"{col} {op} {val}") 104 | 105 | return f"SELECT {'COUNT(*)' if aggregate else '*'} FROM \"{table.name}\" WHERE {' AND '.join(preds)}" 106 | 107 | def query_2_sqls(query: Query, table: Table): 108 | sqls = [] 109 | for col, pred in query.predicates.items(): 110 | if pred is None: 111 | continue 112 | op, val = pred 113 | if is_categorical(table.data[col].dtype): 114 | val = f"\'{val}\'" if not isinstance(val, tuple) else tuple(f"\'{v}\'" for v in val) 115 | 116 | if op == '[]': 117 | sqls.append(f"SELECT * FROM \"{table.name}\" WHERE {col} between {val[0]} and {val[1]}") 118 | else: 119 | sqls.append(f"SELECT * FROM \"{table.name}\" WHERE {col} {op} {val}") 120 | return sqls 121 | 122 | 123 | def query_2_vector(query: Query, table: Table, upper: int=1): 124 | vec = [] 125 | for col, pred in query.predicates.items(): 126 | if pred is None: 127 | vec.extend([0.0, 1.0]) 128 | continue 129 | op, val = pred 130 | if op == '[]': 131 | vec.extend([table.columns[col].normalize(val[0]).item(), table.columns[col].normalize(val[1]).item()]) 132 | elif op == '>=': 133 | vec.extend([table.columns[col].normalize(val).item(), 1.0]) 134 | elif op == '<=': 135 | vec.extend([0.0, table.columns[col].normalize(val).item()]) 136 | elif op == '=': 137 | vec.extend([table.columns[col].normalize(val).item()] * 2) 138 | else: 139 | raise NotImplementedError 140 | return np.array(vec) * upper 141 | 142 | def query_2_quicksel_vector(query: Query, table: Table, discrete_cols=set()): 143 | vec = [] 144 | for col_name, pred in query.predicates.items(): 145 | if pred is None: 146 | vec.extend([0.0, 1.0]) 147 | continue 148 | op, val = pred 149 | col = table.columns[col_name] 150 | 151 | # adjust predicate to a proper range for discrete columns 152 | if col_name in discrete_cols: 153 | if is_categorical(col.dtype): 154 | val = col.discretize(val) 155 | minval = 0 156 | maxval = col.vocab_size 157 | vocab = np.arange(col.vocab_size) 158 | else: # integer values 159 | minval = col.minval 160 | maxval = col.maxval + 1 161 | vocab = col.vocab 162 | 163 | if op == '=': 164 | val = (val, val) 165 | elif op == '>=': 
166 | val = (val, maxval) 167 | elif op == '<=': 168 | val = (minval, val) 169 | else: 170 | assert op == '[]' 171 | 172 | vocab = np.append(vocab, maxval) 173 | # argmax return 0 if no value in array satisfies 174 | val0 = vocab[np.argmax(vocab >= val[0])] if val[0] < maxval else maxval 175 | val1 = vocab[np.argmax(vocab > val[1])] if val[1] < maxval else maxval 176 | assert val0 <= val1, (val0, val1) 177 | assert val0 >= minval and val0 <= maxval, (val0, minval, maxval) 178 | assert val1 >= minval and val1 <= maxval, (val1, minval, maxval) 179 | # normalize to [0, 1] 180 | vec.extend([(val0-minval)/(maxval-minval), (val1-minval)/(maxval-minval)]) 181 | 182 | # directly normalize continous columns 183 | else: 184 | if op == '>=': 185 | vec.extend([col.normalize(val).item(), 1.0]) 186 | elif op == '<=': 187 | vec.extend([0.0, col.normalize(val).item()]) 188 | elif op == '[]': 189 | vec.extend([col.normalize(val[0]).item(), col.normalize(val[1]).item()]) 190 | else: 191 | raise NotImplementedError 192 | return np.array(vec) 193 | 194 | 195 | def dump_queryset(dataset: str, name: str, queryset: Dict[str, List[Query]]) -> None: 196 | query_path = DATA_ROOT / dataset / "workload" 197 | query_path.mkdir(exist_ok=True) 198 | with open(query_path / f"{name}.pkl", 'wb') as f: 199 | pickle.dump(queryset, f, protocol=PKL_PROTO) 200 | 201 | def load_queryset(dataset: str, name: str) -> Dict[str, List[Query]]: 202 | query_path = DATA_ROOT / dataset / "workload" 203 | with open(query_path / f"{name}.pkl", 'rb') as f: 204 | return pickle.load(f) 205 | 206 | def dump_labels(dataset: str, version: str, name: str, labels: Dict[str, List[Label]]) -> None: 207 | label_path = DATA_ROOT / dataset / "workload" 208 | with open(label_path / f"{name}-{version}-label.pkl", 'wb') as f: 209 | pickle.dump(labels, f, protocol=PKL_PROTO) 210 | 211 | def load_labels(dataset: str, version: str, name: str) -> Dict[str, List[Label]]: 212 | label_path = DATA_ROOT / dataset / "workload" 213 | with open(label_path / f"{name}-{version}-label.pkl", 'rb') as f: 214 | return pickle.load(f) 215 | 216 | def dump_sqls(dataset: str, version: str, workload: str, group: str='test'): 217 | table = load_table(dataset, version) 218 | queryset = load_queryset(dataset, workload) 219 | labels = load_labels(dataset, version, workload) 220 | 221 | with open('test.csv', 'w') as f: 222 | writer = csv.writer(f) 223 | for query, label in zip(queryset[group], labels[group]): 224 | sql = query_2_sql(query, table, aggregate=False, dbms='sqlserver') 225 | writer.writerow([sql, label.cardinality]) 226 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "lecarb" 3 | version = "0.1.0" 4 | description = "Are We Ready For Learned Cardinality Estimation?" 
5 | authors = ["Weiyuan Wu ", "Xiaoying Wang ", "Changbo Qu "] 6 | 7 | [tool.poetry.dependencies] 8 | python = "^3.7" 9 | contexttimer = "0.3" 10 | pandas = "1.0.3" 11 | numpy = "1.18.1" 12 | docopt = "0.6" 13 | psycopg2-binary = "2.8.4" 14 | scikit-learn = "0.23.0" 15 | torch = "1.5.0" 16 | sqlparse = "0.3.1" 17 | dask = "1.2.0" 18 | toolz = "0.9.0" 19 | cloudpickle = "1.2.1" 20 | tables = "3.5.1" 21 | spflow = "0.0.34" 22 | bloom-filter = "1.3" 23 | python-dotenv = "0.13.0" 24 | jupyter = "1.0.0" 25 | jupyterlab = "2.1.4" 26 | seaborn = "0.11.0" 27 | scipy = "1.4.1" 28 | xgboost = "1.1.1" 29 | ray = "^0.8.7" 30 | Cython = "^0.29.21" 31 | pomegranate = "^0.13.4" 32 | pyodbc = "^4.0.30" 33 | mysql-connector-python = "^8.0.21" 34 | PyQt5 = "5.15.1" 35 | 36 | [tool.poetry.dev-dependencies] 37 | mypy = "0.770" 38 | black = "19.10b0" 39 | pylint = "2.4.4" 40 | ipython = "7.13.0" 41 | 42 | [build-system] 43 | requires = ["poetry>=0.12"] 44 | build-backend = "poetry.masonry.api" 45 | --------------------------------------------------------------------------------
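A minimal, self-contained sketch (illustrative only, not shipped in the repository) of the q-error metric that lecarb/estimator/utils.py uses to score every estimator: it is the larger of the over- and under-estimation ratios, so a perfect estimate scores 1.0.

def qerror(est_card, card):
    # mirrors the definition in lecarb/estimator/utils.py
    if est_card == 0 and card == 0:
        return 1.0
    if est_card == 0:
        return card
    if card == 0:
        return est_card
    return est_card / card if est_card > card else card / est_card

# e.g. estimating 1500 rows when the true cardinality is 300 gives a q-error of 5.0,
# and over- and under-estimation by the same factor are penalized equally:
assert qerror(1500, 300) == qerror(300, 1500) == 5.0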